Commit 16cf235

q10 authored and facebook-github-bot committed
Simplify the SplitState application for optimizers in TBE SSD (#4492)
Summary:
Pull Request resolved: #4492

Currently, the split state application for optimizer states in TBE SSD presumes Exact Rowwise Adagrad, which has only one optimizer state. This change extends the split application to support optimizers with more than one state, such as Partial Rowwise Adam. The code draws inspiration from `construct_split_state()` in non-SSD TBE, but is simpler and more declarative, and thus more ergonomic to use.

Reviewed By: sryap, emlin, ionuthristodorescu

Differential Revision: D76709101

fbshipit-source-id: e258713a7c0fd5aac8de0e07fbe6e0c4d7e76947
1 parent 66eb6cf commit 16cf235
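
To see why a single offset table no longer suffices: for Partial Rowwise Adam, `momentum1` is an elementwise state (one value per embedding element) while `momentum2` is rowwise (one value per row), so their split offsets come from different cumulative sums. A minimal sketch of the two tables, using two hypothetical embedding tables:

```python
import itertools

# Hypothetical embedding specs: (rows, dims) per table
embedding_specs = [(100, 16), (200, 32)]
rows = [r for r, _ in embedding_specs]

# Rowwise states (e.g. momentum2 in Partial Rowwise Adam): one value per row
row_count_cumsum = [0] + list(itertools.accumulate(rows))
# -> [0, 100, 300]

# Elementwise states (e.g. momentum1 in Partial Rowwise Adam): one value per element
table_size_cumsum = [0] + list(
    itertools.accumulate(r * d for r, d in embedding_specs)
)
# -> [0, 1600, 8000]
```

For each state, the last cumulative entry is the total device allocation (zero when optimizer offloading is enabled) and the remaining entries are the per-table offsets.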

2 files changed: +57 -15 lines changed

fbgemm_gpu/fbgemm_gpu/split_embedding_configs.py

Lines changed: 50 additions & 1 deletion
```diff
@@ -8,10 +8,16 @@
 # pyre-strict
 
 import enum
-from typing import Any, Dict  # noqa: F401
+import itertools
+from typing import Any, Dict, List, Tuple  # noqa: F401
 
 import torch
 
+from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
+    EmbeddingLocation,
+    SplitState,
+)
+
 
 @enum.unique
 class EmbOptimType(enum.Enum):
@@ -68,6 +74,49 @@ def state_size_nbytes(
         else:
             return 0
 
+    def ssd_state_splits(
+        self,
+        embedding_specs: List[Tuple[int, int]],  # Tuple of (rows, dims)
+        optimizer_state_dtypes: Dict[str, "SparseType"] = {},  # noqa: B006
+        enable_optimizer_offloading: bool = False,
+    ) -> List[Tuple[SplitState, str, torch.dtype]]:
+        """
+        Returns the split planning for the optimizer states
+        """
+        (rows, _) = zip(*embedding_specs)
+        T_ = len(embedding_specs)
+
+        # This is the cumulative row counts for rowwise states
+        row_count_cumsum: List[int] = [0] + list(itertools.accumulate(rows))
+        # This is the cumulative element counts for elementwise states
+        table_size_cumsum: List[int] = [0] + list(
+            itertools.accumulate([r * d for r, d in embedding_specs])
+        )
+
+        if self == EmbOptimType.EXACT_ROWWISE_ADAGRAD:
+            params = {"momentum1": row_count_cumsum}
+        elif self == EmbOptimType.PARTIAL_ROWWISE_ADAM:
+            params = {"momentum1": table_size_cumsum, "momentum2": row_count_cumsum}
+        else:
+            params = {}
+
+        return [
+            (
+                SplitState(
+                    dev_size=(
+                        cumsum_table[-1] if not enable_optimizer_offloading else 0
+                    ),
+                    host_size=0,
+                    uvm_size=0,
+                    placements=[EmbeddingLocation.DEVICE for _ in range(T_)],
+                    offsets=cumsum_table[:-1],
+                ),
+                name,
+                self._extract_dtype(optimizer_state_dtypes, name),
+            )
+            for (name, cumsum_table) in params.items()
+        ]
+
     def dtype(self) -> torch.dtype:
         """
         Returns the dtype of the optimizer state
```

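For illustration, a sketch of calling the new method directly; the table shapes and dtypes below are hypothetical, and `EmbOptimType` / `SparseType` are imported from the same module as in the diff above:

```python
from fbgemm_gpu.split_embedding_configs import EmbOptimType, SparseType

# Hypothetical table shapes: (rows, dims)
embedding_specs = [(100, 16), (200, 32)]

splits = EmbOptimType.PARTIAL_ROWWISE_ADAM.ssd_state_splits(
    embedding_specs,
    optimizer_state_dtypes={"momentum1": SparseType.FP32, "momentum2": SparseType.FP32},
    enable_optimizer_offloading=False,
)

# Expect two (SplitState, name, dtype) templates:
#   momentum1 -> elementwise offsets [0, 1600], dev_size 8000
#   momentum2 -> rowwise offsets [0, 100], dev_size 300
for split_state, name, dtype in splits:
    print(name, split_state.offsets, split_state.dev_size, dtype)
```
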
fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 7 additions & 14 deletions
```diff
@@ -852,21 +852,14 @@ def __init__(
             dtype=table_embedding_dtype,
         )
 
-        momentum1_offsets = [0] + list(itertools.accumulate(rows))
-        self._apply_split(
-            SplitState(
-                dev_size=(
-                    self.total_hash_size if not self.enable_optimizer_offloading else 0
-                ),
-                host_size=0,
-                uvm_size=0,
-                placements=[EmbeddingLocation.DEVICE for _ in range(T_)],
-                offsets=momentum1_offsets[:-1],
-            ),
-            "momentum1",
+        # Create the optimizer state tensors
+        for template in self.optimizer.ssd_state_splits(
+            self.embedding_specs,
+            self.optimizer_state_dtypes,
+            self.enable_optimizer_offloading,
+        ):
             # pyre-fixme[6]: For 3rd argument expected `Type[dtype]` but got `dtype`.
-            dtype=torch.float32,
-        )
+            self._apply_split(*template)
 
         # For storing current iteration data
         self.current_iter_data: Optional[IterData] = None
```

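With a single-state optimizer the new loop degenerates to the old behaviour: Exact Rowwise Adagrad yields exactly one `momentum1` template whose offsets are the rowwise cumulative sums, so the refactor is a strict generalization of the previously hard-coded `_apply_split` call. A sketch under the same hypothetical specs as above:

```python
from fbgemm_gpu.split_embedding_configs import EmbOptimType, SparseType

templates = EmbOptimType.EXACT_ROWWISE_ADAGRAD.ssd_state_splits(
    [(100, 16), (200, 32)],  # hypothetical (rows, dims)
    optimizer_state_dtypes={"momentum1": SparseType.FP32},
)

# Exactly one template, matching the previously hand-written momentum1 call:
#   momentum1 -> rowwise offsets [0, 100], dev_size 300
for split_state, name, dtype in templates:
    print(name, split_state.offsets, split_state.dev_size, dtype)
```
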