|
17 | 17 | //! 2. [`CdcStream::into_stream`] returns a [`futures::Stream`] of [`CdcEvent`]s
|
18 | 18 | //! optionally from the [`Lsn`] returned in step 1.
|
19 | 19 | //!
|
20 | | -//! Internally we get a snapshot by setting our transaction isolation level to
21 | | -//! [`TransactionIsolationLevel::Snapshot`], getting the current maximum LSN with
22 | | -//! [`crate::inspect::get_max_lsn`] and then running a `SELECT *`. We've observed that by
23 | | -//! using [`TransactionIsolationLevel::Snapshot`] the LSN remains stable for the entire
24 | | -//! transaction.
| 20 | +//! The snapshot process is responsible for identifying a [`Lsn`] that provides
| 21 | +//! a point-in-time view of the data for the table(s) being copied. Similarly to
| 22 | +//! MySQL, Microsoft SQL Server does not, as far as we know, provide an API to
| 23 | +//! achieve this.
| 24 | +//!
| 25 | +//! SQL Server `SNAPSHOT` isolation provides guarantees that a reader will only
| 26 | +//! see writes committed before the transaction began. More specifically, this
| 27 | +//! snapshot is implemented using versions that are visible based on the
| 28 | +//! transaction sequence number (`XSN`). The `XSN` is set at the first
| 29 | +//! read or write, not at `BEGIN TRANSACTION`; see [here](https://learn.microsoft.com/en-us/sql/relational-databases/sql-server-transaction-locking-and-row-versioning-guide?view=sql-server-ver17).
| 30 | +//! This provides us with a suitable starting point for capturing the table data.
| 31 | +//!
| 32 | +//! Due to the asynchronous nature of CDC, we can assume that the [`Lsn`]
| 33 | +//! returned from any CDC tables or CDC functions will always be stale
| 34 | +//! in relation to the source table that CDC is tracking. The system table
| 35 | +//! [sys.dm_tran_database_transactions](https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-tran-database-transactions-transact-sql?view=sql-server-ver17)
| 36 | +//! will contain a [`Lsn`] for any transaction that performs a write operation.
| 37 | +//! Creating a savepoint using [SAVE TRANSACTION](https://learn.microsoft.com/en-us/sql/t-sql/language-elements/save-transaction-transact-sql?view=sql-server-ver17)
| 38 | +//! is sufficient to generate a [`Lsn`] in this case, and will additionally
| 39 | +//! establish the `XSN` used to determine data visibility.
| 40 | +//!
| 41 | +//! Unfortunately, it isn't sufficient to just create a savepoint. There exists
| 42 | +//! the possibility that a write transaction may have inserted a row into one
| 43 | +//! of the tables in the snapshot, but that write has not committed. In this case,
| 44 | +//! the `INSERT` has already been written to the transaction log at a [`Lsn`]
| 45 | +//! less than the one captured, but the snapshot will *not* include that `INSERT`
| 46 | +//! because the transaction has not committed, and may not commit until after
| 47 | +//! the snapshot is complete. In order to force a clear delineation of updates,
| 48 | +//! the upstream tables in the snapshot must be locked. This lock only needs
| 49 | +//! to exist long enough to establish the [`Lsn`] and `XSN`.
| 50 | +//!
| 51 | +//! SQL Server supports exclusive table locks, but those will only be released
| 52 | +//! once the outermost transaction completes. For this reason, this module
| 53 | +//! uses two connections for the snapshot process. The first connection is used
| 54 | +//! to initiate a transaction and lock the upstream tables. While the first
| 55 | +//! connection maintains the locks, the second connection starts a transaction
| 56 | +//! with [`TransactionIsolationLevel::Snapshot`] isolation and creates a
| 57 | +//! savepoint. Once the savepoint is created and the [`Lsn`] is captured, the first
| 58 | +//! connection rolls back the transaction. The snapshot is created by the second
| 59 | +//! connection within the existing transaction.
25 | 60 | //!
|
26 | 61 | //! After completing the snapshot we use [`crate::inspect::get_changes_asc`] which will return
|
27 | 62 | //! all changes between a `[lower, upper)` bound of [`Lsn`]s.
|
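To make the ordering described above concrete, here is a condensed sketch of the two-connection flow. The helper names (`new_connection`, `transaction`, `lock_table_exclusive`, `set_transaction_isolation`, `create_savepoint`, `get_lsn`, `rollback`) are the ones used in the hunks below; `client` and `tables` stand in for the surrounding state, and error handling is elided, so read this as an outline rather than the exact implementation:

```rust
// Connection 1: fence writers by taking an exclusive lock on each upstream table.
let mut fencing_client = client.new_connection().await?;
let mut fence_txn = fencing_client.transaction().await?;
for (_capture_instance, schema, table) in &tables {
    fence_txn.lock_table_exclusive(&*schema, &*table).await?;
}

// Connection 2: a SNAPSHOT-isolation transaction. Creating a savepoint forces a
// write to the transaction log, pinning both the XSN (data visibility) and an
// LSN while the tables are quiesced.
client
    .set_transaction_isolation(TransactionIsolationLevel::Snapshot)
    .await?;
let mut txn = client.transaction().await?;
txn.create_savepoint(SAVEPOINT_NAME).await?;
let lsn = txn.get_lsn().await?;

// Connection 1: release the locks. Any write that proceeds now lands at an LSN
// greater than `lsn` and will be observed later through CDC.
fence_txn.rollback().await?;

// Connection 2: run the `SELECT *` snapshot inside the still-open transaction,
// then read subsequent changes in `[lsn, upper)` from the CDC functions.
```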
@@ -57,7 +92,7 @@ pub struct CdcStream<'a> {
|
57 | 92 | ///
|
58 | 93 | /// Note: When CDC is first enabled in an instance of SQL Server it can take a moment
|
59 | 94 | /// for it to "completely" startup. Before starting a `TRANSACTION` for our snapshot
|
60 | | - /// we'll wait this duration for SQL Server to report an LSN and thus indicate CDC is
| 95 | + /// we'll wait this duration for SQL Server to report a [`Lsn`] and thus indicate CDC is
61 | 96 | /// ready to go.
|
62 | 97 | max_lsn_wait: Duration,
|
63 | 98 | }
|
@@ -96,7 +131,7 @@ impl<'a> CdcStream<'a> {
|
96 | 131 | self
|
97 | 132 | }
|
98 | 133 |
|
99 | | - /// The max duration we'll wait for SQL Server to return an LSN before taking a
| 134 | + /// The max duration we'll wait for SQL Server to return a [`Lsn`] before taking a
100 | 135 | /// snapshot.
|
101 | 136 | ///
|
102 | 137 | /// When CDC is first enabled in SQL Server it can take a moment before it is fully
|
@@ -143,45 +178,39 @@ impl<'a> CdcStream<'a> {
|
143 | 178 | // the upstream DB is ready for CDC.
|
144 | 179 | self.wait_for_ready().await?;
|
145 | 180 |
|
146 | | - let mut fencing_client = self.client.new_connection().await?;
| 181 | + tracing::info!("Upstream is ready");
| 182 | +
147 | 183 | // The client that will be used for fencing does not need any special isolation level
148 | | - // as it will be just be locking the tables
| 184 | + // as it will just be locking the table(s).
| 185 | + let mut fencing_client = self.client.new_connection().await?;
149 | 186 | let mut fence_txn = fencing_client.transaction().await?;
|
150 | | - // lock all the tables we are planning to snapshot so that we can ensure that
151 | | - // writes that might be in progress are properly ordered before or after this snapshot
152 | | - // in addition to the LSN being properly ordered.
153 | | - // TODO (maz): we should considering a timeout here because we may lock some tables,
154 | | - // and the next table may be locked for some extended period, resulting in a traffic
155 | | - // jam.
| 187 | +
| 188 | + // TODO (maz): we should consider a timeout or a lock + snapshot per-table instead of collectively
156 | 189 | for (_capture_instance, schema, table) in &tables {
|
157 | 190 | tracing::trace!(%schema, %table, "locking table");
|
158 | | - crate::inspect::lock_table(&mut fence_txn, &*schema, &*table).await?;
| 191 | + fence_txn.lock_table_exclusive(&*schema, &*table).await?;
159 | 192 | }
|
| 193 | + tracing::info!(?tables, "Locked tables");
160 | 194 |
|
161 | 195 | self.client
|
162 | 196 | .set_transaction_isolation(TransactionIsolationLevel::Snapshot)
|
163 | 197 | .await?;
|
164 | 198 | let mut txn = self.client.transaction().await?;
|
165 | | - // The result here is not important, what we are doing is establishing a transaction sequence number (XSN)
166 | | - // while using SNAPSHOT isolation that will be concurrent with a quiesced set of tables that we
167 | | - // wish to snapshot. Regardless what you might read in *many* articles on Microsoft's site, the XSN is not
168 | | - // set at BEGIN TRANSACTION, but at the first read/write.
169 | | - // The choice of table is driven by a few factors:
170 | | - // - it's a system table, we know it must exist and it will have a well defined schema
171 | | - // - MZ is a CDC client, so should be be able to read from it
172 | | - let res = txn
173 | | - .simple_query("SELECT TOP 1 object_id FROM cdc.change_tables;")
174 | | - .await?;
175 | | - // TODO (maz): nicer error if, somehow, there are no more change tables
176 | | - assert_eq!(res.len(), 1);
177 | 199 |
|
178 | | - // Creating a savepoint forces a write to the transaction log, which will assign an LSN to this transaction
179 | | - // which is concurrent with the tables that are currently in a consistent state.
| 200 | + // Creating a savepoint forces a write to the transaction log, which will
| 201 | + // assign a LSN and a transaction sequence number (XSN).
| 202 | + // Because the tables are exclusively locked, any write operation has either
| 203 | + // completed or is blocked. The LSN and XSN acquired now represent a
| 204 | + // consistent point-in-time view, such that any committed write will be
| 205 | + // visible to this snapshot and the LSN of such a write will be less than
| 206 | + // or equal to the LSN captured here.
180 | 207 | txn.create_savepoint(SAVEPOINT_NAME).await?;
|
| 208 | + tracing::info!(%SAVEPOINT_NAME, "Created savepoint");
| 209 | + let lsn = txn.get_lsn().await?;
181 | 210 |
|
182 | | - let lsn = crate::inspect::get_lsn(&mut txn).await?;
183 | | -
184 | | - // once we have the snapshot, we can rollback the fencing transaction to allow access to the tables.
| 211 | + // Once the XSN is established and the LSN captured, the tables no longer
| 212 | + // need to be locked. Any writes that happen to the upstream tables
| 213 | + // will have a LSN higher than our captured LSN, and will be read from CDC.
185 | 214 | fence_txn.rollback().await?;
|
186 | 215 |
|
187 | 216 | tracing::info!(?tables, ?lsn, "starting snapshot");
|
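The module-level comment points at `sys.dm_tran_database_transactions` as the source of a transaction's [`Lsn`]. As an illustration of that DMV only, and not a claim about how `get_lsn` is actually implemented, a query along these lines can read the last LSN written by the current transaction once the savepoint exists; the column choice, and the conversion of its `numeric(25,0)` value into the 10-byte form used by this module, are assumptions and the conversion is elided:

```rust
// Hypothetical query against the DMV named in the module docs; reading it
// requires VIEW SERVER STATE. `simple_query` is the same helper used elsewhere
// in this module.
let rows = txn
    .simple_query(
        "SELECT database_transaction_last_lsn \
         FROM sys.dm_tran_database_transactions \
         WHERE transaction_id = CURRENT_TRANSACTION_ID() \
           AND database_id = DB_ID();",
    )
    .await?;
```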
@@ -392,7 +421,7 @@ impl<'a> CdcStream<'a> {
|
392 | 421 | }
|
393 | 422 | }
|
394 | 423 |
|
395 | | - // Ensure all of the capture instances are reporting an LSN.
| 424 | + // Ensure all of the capture instances are reporting a LSN.
396 | 425 | for instance in self.capture_instances.keys() {
|
397 | 426 | let (_client, min_result) = mz_ore::retry::Retry::default()
|
398 | 427 | .max_duration(self.max_lsn_wait)
|
@@ -508,7 +537,7 @@ pub struct Lsn {
|
508 | 537 | impl Lsn {
|
509 | 538 | const SIZE: usize = 10;
|
510 | 539 |
|
511 | | - /// Interpret the provided bytes as an [`Lsn`].
| 540 | + /// Interpret the provided bytes as a [`Lsn`].
512 | 541 | pub fn try_from_bytes(bytes: &[u8]) -> Result<Self, String> {
|
513 | 542 | if bytes.len() != Self::SIZE {
|
514 | 543 | return Err(format!("incorrect length, expected 10 got {}", bytes.len()));
|
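`try_from_bytes` expects exactly `Lsn::SIZE` (10) bytes. For orientation: SQL Server conventionally renders an LSN as three colon-separated parts, and the usual reading of the `binary(10)` form is a 4-byte virtual log file sequence number, a 4-byte log block offset, and a 2-byte record (slot) number, in big-endian order. Below is a small sketch under that assumption; the field names are illustrative and not taken from this module:

```rust
/// Illustrative decode of a 10-byte SQL Server LSN into its three conventional
/// parts, assuming a big-endian <vlf>:<block>:<record> layout.
fn decode_lsn(bytes: &[u8; 10]) -> (u32, u32, u16) {
    let vlf_id = u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
    let block_id = u32::from_be_bytes([bytes[4], bytes[5], bytes[6], bytes[7]]);
    let record_id = u16::from_be_bytes([bytes[8], bytes[9]]);
    (vlf_id, block_id, record_id)
}
```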
@@ -660,7 +689,7 @@ impl timely::order::PartialOrder for Lsn {
|
660 | 689 | }
|
661 | 690 | impl timely::order::TotalOrder for Lsn {}
|
662 | 691 |
|
663 | | -/// Structured format of an [`Lsn`].
| 692 | +/// Structured format of a [`Lsn`].
664 | 693 | ///
|
665 | 694 | /// Note: The derived impl of [`PartialOrd`] and [`Ord`] relies on the field
|
666 | 695 | /// ordering so do not change it.
|
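The warning about field ordering exists because `#[derive(PartialOrd, Ord)]` compares struct fields lexicographically in declaration order. A minimal, standalone illustration (not this module's actual type):

```rust
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
struct Example {
    major: u32, // compared first
    minor: u16, // only consulted when `major` ties
}

fn main() {
    let a = Example { major: 1, minor: 9 };
    let b = Example { major: 2, minor: 0 };
    // `a < b` because `major` is declared first; reordering the fields would
    // change how values sort.
    assert!(a < b);
}
```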