SQL server: consistent snapshot and lsn selection #32979
@@ -17,11 +17,49 @@
//! 2. [`CdcStream::into_stream`] returns a [`futures::Stream`] of [`CdcEvent`]s
//!    optionally from the [`Lsn`] returned in step 1.
//!
//! Internally we get a snapshot by setting our transaction isolation level to
//! [`TransactionIsolationLevel::Snapshot`], getting the current maximum LSN with
//! [`crate::inspect::get_max_lsn`] and then running a `SELECT *`. We've observed that by
//! using [`TransactionIsolationLevel::Snapshot`] the LSN remains stable for the entire
//! transaction.
//! The snapshot process is responsible for identifying a [`Lsn`] that provides
//! a point-in-time view of the data for the table(s) being copied. Similarly to
//! MySQL, Microsoft SQL Server, as far as we know, does not provide an API to
//! achieve this.
//!
//! SQL Server `SNAPSHOT` isolation guarantees that a reader will only
//! see writes committed before the transaction began. More specifically, this
//! snapshot is implemented using row versions that are visible based on the
//! transaction sequence number (`XSN`). The `XSN` is set at the first
//! read or write, not at `BEGIN TRANSACTION`, see [here](https://learn.microsoft.com/en-us/sql/relational-databases/sql-server-transaction-locking-and-row-versioning-guide?view=sql-server-ver17).
//! This provides us a suitable starting point for capturing the table data.
//! To force a `XSN` to be assigned, experiments have shown that a table must
//! be read. We choose a well-known table that we should already have access to,
//! [cdc.change_tables](https://learn.microsoft.com/en-us/sql/relational-databases/system-tables/cdc-change-tables-transact-sql?view=sql-server-ver17),
//! and read a single value from it.
//!
//! Due to the asynchronous nature of CDC, we can assume that the [`Lsn`]
//! returned from any CDC tables or CDC functions will always be stale
//! in relation to the source table that CDC is tracking. The system table
//! [sys.dm_tran_database_transactions](https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-tran-database-transactions-transact-sql?view=sql-server-ver17)
//! will contain a [`Lsn`] for any transaction that performs a write operation.
//! Creating a savepoint using [SAVE TRANSACTION](https://learn.microsoft.com/en-us/sql/t-sql/language-elements/save-transaction-transact-sql?view=sql-server-ver17)
//! is sufficient to generate a [`Lsn`] in this case.
//!
//! Unfortunately, it isn't sufficient to just fix the [`Lsn`] and `XSN`. There exists
//! the possibility that a write transaction may have inserted a row into one
//! of the tables in the snapshot, but that write has not committed. In this case,
//! the `INSERT` has already been written to the transaction log at a [`Lsn`]
//! less than the one captured, but the snapshot will *not* observe that `INSERT`
//! because the transaction has not committed, and may not commit until after
//! the snapshot is complete. In order to force a clear delineation of updates,
//! the upstream tables in the snapshot must be locked. This lock only needs
//! to exist long enough to establish the [`Lsn`] and `XSN`.
Something doesn't check out for me in this paragraph. This sounds totally fine? If that transaction commits after the snapshot is complete then in the CDC stream that write will be associated with an LSN that is higher than the snapshot LSN we captured, and so it will be correctly applied on top of the snapshot, which as you say will not observe the write. I think the danger, and the reason why we need locks, is that we have no way of establishing a transaction MVCC snapshot and an LSN boundary atomically. With postgres we get this API by creating a temp replication slot as the first statement of the transaction with

Sorry, that was me not proofreading. I unfortunately wrote it in parts. I trimmed it down. Thanks for catching that Petros!
//!
//! SQL Server supports exclusive table locks, but those will only be released
//! once the outermost transaction completes. For this reason, this module
//! uses two connections for the snapshot process. The first connection is used
//! to initiate a transaction and lock the upstream tables. While the first
//! connection maintains the locks, the second connection starts a transaction
//! with [`TransactionIsolationLevel::Snapshot`] isolation and creates a
//! savepoint. Once the savepoint is created and the [`Lsn`] is captured, the first
//! connection rolls back the transaction. The snapshot is created by the second

Suggested change (depends on the discussion in the other comment)

//! connection within the existing transaction.
//!
//! After completing the snapshot we use [`crate::inspect::get_changes_asc`] which will return
//! all changes between a `[lower, upper)` bound of [`Lsn`]s.
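As a rough illustration of that two-connection dance, the steps map onto plain T-SQL roughly as below. This is a hedged sketch rather than the crate's exact statements: the table name `dbo.my_table`, the `TABLOCKX, HOLDLOCK` hint, and reading `database_transaction_begin_lsn` are assumptions on my part (the crate's `lock_table_exclusive` and `get_lsn` helpers may issue something slightly different).

```sql
-- Connection 1 (fence): block writers with an exclusive table lock.
BEGIN TRANSACTION;
SELECT TOP 0 * FROM dbo.my_table WITH (TABLOCKX, HOLDLOCK);
-- ...keep this transaction open while connection 2 pins its XSN and LSN...

-- Connection 2 (snapshot): pin the XSN and capture an LSN behind the fence.
SET TRANSACTION ISOLATION LEVEL SNAPSHOT;
BEGIN TRANSACTION;
SELECT TOP 1 object_id FROM cdc.change_tables;   -- first read assigns the XSN
SAVE TRANSACTION _mz_snap_;                      -- log write assigns an LSN
SELECT database_transaction_begin_lsn            -- assumed column; get_lsn may differ
FROM sys.dm_tran_database_transactions
WHERE transaction_id = CURRENT_TRANSACTION_ID();

-- Connection 1: the fence is no longer needed once the XSN and LSN are fixed.
ROLLBACK TRANSACTION;

-- Connection 2: run the SELECT * snapshots, then roll back this transaction too.
```

Note that connection 2 is not blocked by connection 1's exclusive lock: under `SNAPSHOT` isolation it reads row versions rather than taking shared locks, which is exactly why the fence only stops writers.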
@@ -36,6 +74,7 @@ use futures::{Stream, StreamExt};
use mz_ore::retry::RetryResult;
use proptest_derive::Arbitrary;
use serde::{Deserialize, Serialize};
use tiberius::numeric::Numeric;

use crate::{Client, SqlServerError, TransactionIsolationLevel};
@@ -56,7 +95,7 @@ pub struct CdcStream<'a> {
    ///
    /// Note: When CDC is first enabled in an instance of SQL Server it can take a moment
    /// for it to "completely" startup. Before starting a `TRANSACTION` for our snapshot
    /// we'll wait this duration for SQL Server to report an LSN and thus indicate CDC is
    /// we'll wait this duration for SQL Server to report a [`Lsn`] and thus indicate CDC is
    /// ready to go.
    max_lsn_wait: Duration,
}
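For reference, the readiness check described here amounts to polling SQL Server's CDC max-LSN function until it stops returning `NULL`. A minimal T-SQL sketch of the same idea (the actual retry loop in this crate lives in Rust and is bounded by `max_lsn_wait`):

```sql
-- sys.fn_cdc_get_max_lsn() returns NULL until the CDC capture job has
-- processed at least one transaction, i.e. until CDC is actually ready.
WHILE sys.fn_cdc_get_max_lsn() IS NULL
BEGIN
    WAITFOR DELAY '00:00:01';
END;
SELECT sys.fn_cdc_get_max_lsn() AS max_lsn;
```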
@@ -95,7 +134,7 @@ impl<'a> CdcStream<'a> {
        self
    }

    /// The max duration we'll wait for SQL Server to return an LSN before taking a
    /// The max duration we'll wait for SQL Server to return a [`Lsn`] before taking a
    /// snapshot.
    ///
    /// When CDC is first enabled in SQL Server it can take a moment before it is fully
@@ -122,6 +161,8 @@ impl<'a> CdcStream<'a> {
        ),
        SqlServerError,
    > {
        static SAVEPOINT_NAME: &str = "_mz_snap_";

        // Determine what table we need to snapshot.
        let instances = self
            .capture_instances
@@ -140,13 +181,57 @@ impl<'a> CdcStream<'a> {
        // the upstream DB is ready for CDC.
        self.wait_for_ready().await?;

        tracing::info!("Upstream is ready");

nit: might want to add some additional context here and also are we sure we want this at info level?

added a comment

ah I see! would we ever see these concurrently on the same node? would it be clear which database is ready?

🤦 - just realized i didn't include

+1 on adding context. The standard format we use is

        // The client that will be used for fencing does not need any special isolation level
        // as it will just be locking the table(s).
        let mut fencing_client = self.client.new_connection().await?;
        let mut fence_txn = fencing_client.transaction().await?;

        // TODO (maz): we should consider a timeout or a lock + snapshot per-table instead of collectively

what telemetry do we need to be able to make a decision about the necessity of locking per table? I would guess we need to know whether the table would have been locked for less time that way?

follow up: maybe we can batch with a configurable number of tables per batch? we might want to track with a histogram/metric roughly how long we are locking the tables. seems like this only becomes a problem if there's a very large number of tables or a very slow lock acquisition.

I think we know this is better, right? I would just create an issue for it so that we do it. (or, if it's not a big refactor, do it now)

If possible, sure. Created a GA task! this PR has plenty of action already 😁

Sounds good!

        for (_capture_instance, schema, table) in &tables {
            tracing::trace!(%schema, %table, "locking table");
            fence_txn.lock_table_exclusive(&*schema, &*table).await?;
Do we need an exclusive lock in the end? We don't care if other people concurrently read the table with us, we only want to block writes.

I think it's the lesser of 2 evils, can add a comment. There is an UPDLOCK, but it only applies to rows/pages. My understanding is we'd have to read the entire table.

I was thinking of

My concern with

If you're not opposed, I'd like to tackle the lock improvement after this PR. Relaxing the locking will improve the user experience, but ultimately does not affect correctness. A quick test of using

Ah sorry, the

Weird, this works fine for me (writes are prevented). This is what I'm doing:

If that's the case (can't repro this myself) we might have a problem if the table in question is empty during the initial snapshot. Can you share the commands you ran in your quick test to at least record it here for when we relax the lock?

Not opposed, let's make an issue in the epic to not forget

ok, I think I know why. The issue appears to be if the session that establishes the locks is using

So it looks like we can use

I inadvertently was running the commands in the wrong sessions:

session 2 (read committed isolation, and no explicit transaction)

both sessions show the same in the

Oh my! Why does it have to be so subtle!? 💀 So locks are ignored in snapshot isolation. Makes total sense..
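To make the subtlety concrete, here is a hypothetical two-session experiment (the actual commands from this thread were trimmed from the page, so the table `dbo.t1` and the statements below are my own reconstruction): an exclusive table lock blocks writers, while a `SNAPSHOT`-isolation reader proceeds anyway because it reads row versions instead of taking shared locks.

```sql
-- Session A: take and hold an exclusive lock on a (hypothetical) table.
BEGIN TRANSACTION;
SELECT TOP 0 * FROM dbo.t1 WITH (TABLOCKX, HOLDLOCK);

-- Session B (default READ COMMITTED): this write blocks until session A ends.
INSERT INTO dbo.t1 (id) VALUES (42);

-- Session C (SNAPSHOT isolation, requires ALLOW_SNAPSHOT_ISOLATION ON):
-- this read returns immediately with the last committed row versions.
SET TRANSACTION ISOLATION LEVEL SNAPSHOT;
BEGIN TRANSACTION;
SELECT COUNT(*) FROM dbo.t1;
ROLLBACK TRANSACTION;
```

That asymmetry is what the fencing connection relies on: it only needs to stop concurrent writers; snapshot readers, including our own snapshot transaction, are unaffected.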
        }
        tracing::info!(?tables, "Locked tables");

nit: same comment as above.

ditto!

same as above, let's add context about the timely worker and source id here

        self.client
            .set_transaction_isolation(TransactionIsolationLevel::Snapshot)
            .await?;
        let txn = self.client.transaction().await?;
        let mut txn = self.client.transaction().await?;
        // Creating a savepoint forces a write to the transaction log, which will
        // assign a LSN, but it does not force a transaction sequence number to be
        // assigned as far as I can tell. I have not observed any entries added to
        // `sys.dm_tran_active_snapshot_database_transactions` when creating a savepoint
        // or when reading system views to retrieve the LSN.
        //
        // We choose cdc.change_tables because it is a system table that will exist
        // when CDC is enabled, it has a well known schema, and as a CDC client,
        // we should be able to read from it already.
        let res = txn
            .simple_query("SELECT TOP 1 object_id FROM cdc.change_tables")
            .await?;
        if res.len() != 1 {
            Err(SqlServerError::InvariantViolated(
                "No objects found in cdc.change_tables".into(),
            ))?
        }

        // Because the tables are exclusively locked, any write operation has either
        // completed, or is blocked. The LSN and XSN acquired now will represent a
        // consistent point-in-time view, such that any committed write will be
        // visible to this snapshot and the LSN of such a write will be less than
        // or equal to the LSN captured here.
        txn.create_savepoint(SAVEPOINT_NAME).await?;
        tracing::info!(%SAVEPOINT_NAME, "Created savepoint");
        let lsn = txn.get_lsn().await?;

We can run this statement after we have committed the fencing transaction to reduce the amount of time we hold the locks for. The LSN is already established and it's fine to read it outside the critical section

indeed!

        // Once the XSN is established and the LSN captured, the tables no longer
        // need to be locked. Any writes that happen to the upstream tables
        // will have a LSN higher than our captured LSN, and will be read from CDC.
        fence_txn.rollback().await?;

I think we should commit this transaction instead of rolling it back. What I'm thinking is that when we wrap up the fencing transaction we actually want to ensure that the server didn't decide to cancel it or otherwise mess with it while we weren't looking, since in that case the lock we think we have might not be there. I don't know if SQL does this too but for example pg cancels a transaction if it can't serialize it and you only learn it at commit time.

I'm not opposed to changing it. My reasoning for rollback is that this transaction (and the snapshot transaction) should never make any changes to the database. It prevents us accidentally committing things due to some innocuous change in the future. That's entirely based on my own battle scars 😁 A rollback without an active transaction does fail in SQL server

In the case of PG, when a transaction is canceled, are you saying that only the

I tried to construct a case using

        // Get the current LSN of the database.
        let lsn = crate::inspect::get_max_lsn(txn.client).await?;
        tracing::info!(?tables, ?lsn, "starting snapshot");

        // Get the size of each table we're about to snapshot.
@@ -177,10 +262,10 @@ impl<'a> CdcStream<'a> {
                tracing::trace!(%capture_instance, %schema_name, %table_name, "snapshot end");
            }

            // Slightly awkward, but if the commit fails we need to conform to
            // Slightly awkward, but if the rollback fails we need to conform to
            // type of the stream.
            if let Err(e) = txn.commit().await {
                yield ("commit".into(), Err(e));
            if let Err(e) = txn.rollback().await {

Didn't follow this change. Is there something we want to undo here?

same as the above statement with commit vs. rollback. We do not intend to make any changes to the DB, so we roll back the transaction that read all the data.

                yield ("rollback".into(), Err(e));
            }
        };
@@ -355,7 +440,7 @@ impl<'a> CdcStream<'a> {
        }
    }

        // Ensure all of the capture instances are reporting an LSN.
        // Ensure all of the capture instances are reporting a LSN.

I think "an LSN" is correct because the rule depends on the pronunciation "el-es-en". Here is a similar example where it's "an LED" instead of "a LED": https://en.wikipedia.org/wiki/Light-emitting_diode We also write "a UV lamp" even though it's "an ultraviolet lamp" (example courtesy of ChatGPT)

fair.. though I feel this is the same type of nonsense that leads to people saying ATM machine! English is hard.

        for instance in self.capture_instances.keys() {
            let (_client, min_result) = mz_ore::retry::Retry::default()
                .max_duration(self.max_lsn_wait)
@@ -471,7 +556,7 @@ pub struct Lsn {
impl Lsn {
    const SIZE: usize = 10;

    /// Interpret the provided bytes as an [`Lsn`].
    /// Interpret the provided bytes as a [`Lsn`].
    pub fn try_from_bytes(bytes: &[u8]) -> Result<Self, String> {
        if bytes.len() != Self::SIZE {
            return Err(format!("incorrect length, expected 10 got {}", bytes.len()));
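For orientation, the 10-byte wire format splits into a 4-byte VLF id, a 4-byte block id, and a 2-byte record id, which the `Lsn` struct's `vlf_id`/`block_id`/`record_id` fields mirror. A small T-SQL illustration with a made-up value, assuming the big-endian byte order that layout implies:

```sql
-- 0x0000002D000022500001 is an illustrative binary(10) LSN:
--   bytes 0..4 -> vlf_id, bytes 4..8 -> block_id, bytes 8..10 -> record_id.
SELECT
    CAST(0x0000002D AS INT) AS vlf_id,     -- 45
    CAST(0x00002250 AS INT) AS block_id,   -- 8784
    CAST(0x0001     AS INT) AS record_id;  -- 1
```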
@@ -545,6 +630,40 @@ impl TryFrom<&[u8]> for Lsn {
    }
}

impl TryFrom<Numeric> for Lsn {
    type Error = String;

    fn try_from(value: Numeric) -> Result<Self, Self::Error> {
        if value.dec_part() != 0 {
            return Err(format!(
                "LSN expect Numeric(25,0), but found decimal portion {}",
                value.dec_part()
            ));
        }
        let mut decimal_lsn = value.int_part();
        // LSN is composed of 4 bytes : 4 bytes : 2 bytes
        // and MS provided the method to decode that here
        // https://github.com/microsoft/sql-server-samples/blob/master/samples/features/ssms-templates/Sql/Change%20Data%20Capture/Enumeration/Create%20Function%20fn_convertnumericlsntobinary.sql

        let vlf_id = u32::try_from(decimal_lsn / 10_i128.pow(15))
            .map_err(|e| format!("Failed to decode vlf_id for lsn {decimal_lsn}: {e:?}"))?;
        decimal_lsn -= i128::try_from(vlf_id).unwrap() * 10_i128.pow(15);

A direct

Suggested change

        let block_id = u32::try_from(decimal_lsn / 10_i128.pow(5))
            .map_err(|e| format!("Failed to decode block_id for lsn {decimal_lsn}: {e:?}"))?;
        decimal_lsn -= i128::try_from(block_id).unwrap() * 10_i128.pow(5);

        let record_id = u16::try_from(decimal_lsn)
            .map_err(|e| format!("Failed to decode record_id for lsn {decimal_lsn}: {e:?}"))?;

        Ok(Lsn {
            vlf_id,
            block_id,
            record_id,
        })
    }
}

impl columnation::Columnation for Lsn {
    type InnerRegion = columnation::CopyRegion<Lsn>;
}
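As a worked example of the decoding above: the decimal LSN 45_0000008784_00001 used in the tests below splits into vlf_id = 45, block_id = 8784, record_id = 1. The same arithmetic sketched in T-SQL, loosely following Microsoft's fn_convertnumericlsntobinary sample (the variable names here are mine):

```sql
-- Split a NUMERIC(25,0) LSN into its 4-byte : 4-byte : 2-byte components.
DECLARE @lsn       NUMERIC(25,0) = 45000000878400001;  -- i.e. 45:8784:1
DECLARE @vlf_id    BIGINT        = CONVERT(BIGINT, FLOOR(@lsn / 1000000000000000));
DECLARE @rest      NUMERIC(25,0) = @lsn - @vlf_id * 1000000000000000;
DECLARE @block_id  BIGINT        = CONVERT(BIGINT, FLOOR(@rest / 100000));
DECLARE @record_id BIGINT        = CONVERT(BIGINT, @rest - @block_id * 100000);
SELECT @vlf_id AS vlf_id, @block_id AS block_id, @record_id AS record_id;
-- vlf_id = 45, block_id = 8784, record_id = 1
```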
@@ -589,7 +708,7 @@ impl timely::order::PartialOrder for Lsn {
    }
}
impl timely::order::TotalOrder for Lsn {}

/// Structured format of an [`Lsn`].
/// Structured format of a [`Lsn`].
///
/// Note: The derived impl of [`PartialOrd`] and [`Ord`] relies on the field
/// ordering so do not change it.
@@ -695,6 +814,7 @@ impl Operation {
mod tests {
    use super::Lsn;
    use proptest::prelude::*;
    use tiberius::numeric::Numeric;

    #[mz_ore::test]
    fn smoketest_lsn_ordering() {
@@ -776,4 +896,37 @@ mod tests {
            test_case(random_bytes, num_increment)
        })
    }

    #[mz_ore::test]
    fn test_numeric_lsn_ordering() {
        let a = Lsn::try_from(Numeric::new_with_scale(45_0000008784_00001_i128, 0)).unwrap();
        let b = Lsn::try_from(Numeric::new_with_scale(45_0000008784_00002_i128, 0)).unwrap();
        let c = Lsn::try_from(Numeric::new_with_scale(45_0000008785_00002_i128, 0)).unwrap();
        let d = Lsn::try_from(Numeric::new_with_scale(49_0000008784_00002_i128, 0)).unwrap();
        assert!(a < b);
        assert!(b < c);
        assert!(c < d);
        assert!(a < d);

        assert_eq!(a, a);
        assert_eq!(b, b);
        assert_eq!(c, c);
        assert_eq!(d, d);
    }

    #[mz_ore::test]
    fn test_numeric_lsn_invalid() {
        let with_decimal = Numeric::new_with_scale(1, 20);
        assert!(Lsn::try_from(with_decimal).is_err());

        for v in [
            4294967296_0000000000_00000_i128, // vlf_id is too large
            1_4294967296_00000_i128,          // block_id is too large
            1_0000000001_65536_i128,          // record_id is too large
            -49_0000008784_00002_i128,        // negative is invalid
        ] {
            let invalid_lsn = Numeric::new_with_scale(v, 0);
            assert!(Lsn::try_from(invalid_lsn).is_err());
        }
    }
} |