
Commit 7d8feca

q10 authored and facebook-github-bot committed
Expand split_optimizer_states() to support multiple optimizer states (#4495)
Summary:
Pull Request resolved: #4495
X-link: facebookresearch/FBGEMM#1548

- Expand `split_optimizer_states()` to support multiple optimizer states. This is necessary for unit tests involving new optimizers such as Partial Rowwise Adam to work.

There are 4 cases to handle when attempting to fetch the split optimizer states:

1. The no-KV ZCH case
2. The KV ZCH case, but where `self.load_state_dict` is `True` (i.e. fall back to `self._cached_kvzch_data`)
3. The KV ZCH case, where `self.load_state_dict` is `False`, and `self.enable_optimizer_offloading` is `False`
4. The KV ZCH case, where `self.load_state_dict` is `False`, and `self.enable_optimizer_offloading` is `True`

This diff completes the handling of returning optimizer states from SSD TBE for the non-KV ZCH case (case 1). The rest will be implemented in subsequent diffs along the stack.

Reviewed By: emlin, ionuthristodorescu

Differential Revision: D77337646

fbshipit-source-id: d010b347009867cc936dc177802adbf31066526b
1 parent 2168495 commit 7d8feca
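
As a quick orientation (not part of the diff below), here is a minimal caller-side sketch of what the `List[Tensor]` → `List[List[Tensor]]` change means in practice. The `emb` handle is an assumption: an already-constructed `SSDTableBatchedEmbeddingBags` instance, with `split_optimizer_states()` called using its default arguments.

```python
# Hypothetical caller-side sketch of the new nested return shape; `emb` is
# assumed to be an SSDTableBatchedEmbeddingBags instance constructed elsewhere.
from typing import Dict, List

import torch


def momentum1_for_table(emb, t: int) -> torch.Tensor:
    # Previously: split_optimizer_states()[t] was the momentum1 tensor itself.
    # Now: each table yields a list of state tensors, so [0] selects momentum1
    # for EXACT_ROWWISE_ADAGRAD.
    states: List[List[torch.Tensor]] = emb.split_optimizer_states()
    return states[t][0]


def adam_states_for_table(emb, t: int) -> Dict[str, torch.Tensor]:
    # For PARTIAL_ROWWISE_ADAM, two states come back per table, in the order
    # (momentum1, momentum2) documented in the new docstring.
    momentum1, momentum2 = emb.split_optimizer_states()[t]
    return {"momentum1": momentum1, "momentum2": momentum2}
```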

File tree

2 files changed: +146, -40 lines

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 92 additions & 20 deletions
```diff
@@ -2230,28 +2230,77 @@ def forward(
     @torch.jit.ignore
     def _split_optimizer_states_non_kv_zch(
         self,
-    ) -> List[torch.Tensor]:
+    ) -> List[List[torch.Tensor]]:
         """
-        Returns a list of optimizer states, split by table. So far, we only support EXACT_ROWWISE_ADAGRAD,
-        so only momentum1 state is returned.
+        Returns a list of optimizer states (view), split by table.
+
+        Returns:
+            A list of list of states. Shape = (the number of tables, the number
+            of states).
+
+            The following shows the list of states (in the returned order) for
+            each optimizer:
+
+            (1) `EXACT_ROWWISE_ADAGRAD`: `momentum1` (rowwise)
+
+            (1) `PARTIAL_ROWWISE_ADAM`: `momentum1`, `momentum2` (rowwise)
         """
+
         logging.info("_split_optimizer_states_non_kv_zch")
-        (rows, _) = zip(*self.embedding_specs)
 
-        rows_cumsum = [0] + list(itertools.accumulate(rows))
+        # Row count per table
+        (rows, dims) = zip(*self.embedding_specs)
+        # Cumulative row counts per table for rowwise states
+        row_count_cumsum: List[int] = [0] + list(itertools.accumulate(rows))
+        # Cumulative element counts per table for elementwise states
+        elem_count_cumsum: List[int] = [0] + list(
+            itertools.accumulate([r * d for r, d in self.embedding_specs])
+        )
+
+        # pyre-ignore[53]
+        def _slice(tensor: Tensor, t: int, rowwise: bool) -> Tensor:
+            d: int = dims[t]
+            e: int = rows[t]
+
+            if not rowwise:
+                # Optimizer state is element-wise - compute the table offset for
+                # the table, view the slice as 2D tensor
+                return tensor.detach()[
+                    elem_count_cumsum[t] : elem_count_cumsum[t + 1]
+                ].view(-1, d)
+            else:
+                # Optimizer state is row-wise - fetch elements in range and view
+                # slice as 1D
+                return tensor.detach()[
+                    row_count_cumsum[t] : row_count_cumsum[t + 1]
+                ].view(e)
 
-        return [
-            self.momentum1_dev.detach()[rows_cumsum[t] : rows_cumsum[t + 1]].view(row)
-            for t, row in enumerate(rows)
-        ]
+        if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD:
+            return [
+                [_slice(self.momentum1_dev, t, rowwise=True)]
+                for t, _ in enumerate(rows)
+            ]
+        elif self.optimizer == OptimType.PARTIAL_ROWWISE_ADAM:
+            return [
+                [
+                    _slice(self.momentum1_dev, t, rowwise=False),
+                    # pyre-ignore[6]
+                    _slice(self.momentum2_dev, t, rowwise=True),
+                ]
+                for t, _ in enumerate(rows)
+            ]
+        else:
+            raise NotImplementedError(
+                f"Getting optimizer states is not supported for {self.optimizer}"
+            )
 
     @torch.jit.export
     def split_optimizer_states(
         self,
         sorted_id_tensor: Optional[List[torch.Tensor]] = None,
         no_snapshot: bool = True,
         should_flush: bool = False,
-    ) -> List[torch.Tensor]:
+    ) -> List[List[torch.Tensor]]:
         """
         Returns a list of optimizer states split by table. So far, we only support EXACT_ROWWISE_ADAGRAD,
         so only momentum1 state is returned.
@@ -2277,7 +2326,16 @@ def split_optimizer_states(
                 self._cached_kvzch_data is not None
                 and self._cached_kvzch_data.cached_optimizer_state_per_table
             ), "optimizer state is not initialized for load checkpointing"
-            return self._cached_kvzch_data.cached_optimizer_state_per_table
+
+            # NOTE: This is a temporary hack to have split_optimizer_states return a
+            # List[List[Tensor]] instead of List[Tensor] to match the behavior of
+            # _split_optimizer_states_non_kv_zch. This should be removed after
+            # proper support for multiple optimizers is added for the
+            # enable_optimizer_offloading=True case.
+            return [
+                [opt]
+                for opt in self._cached_kvzch_data.cached_optimizer_state_per_table
+            ]
 
         logging.info(
             f"split_optimizer_states for KV ZCH: {no_snapshot=}, {should_flush=}"
@@ -2401,7 +2459,13 @@ def split_optimizer_states(
             f"KV ZCH tables split_optimizer_states query latency: {(time.time() - start_time) * 1000} ms, "
             f"num ids list: {None if not sorted_id_tensor else [ids.numel() for ids in sorted_id_tensor]}"
         )
-        return opt_list
+
+        # NOTE: This is a temporary hack to have split_optimizer_states return a
+        # List[List[Tensor]] instead of List[Tensor] to match the behavior of
+        # _split_optimizer_states_non_kv_zch. This should be removed after
+        # proper support for multiple optimizers is added for the
+        # enable_optimizer_offloading=True case.
+        return [[opt] for opt in opt_list]
 
     @torch.jit.export
     def get_offloaded_optimizer_states(
@@ -2438,14 +2502,22 @@ def get_optimizer_state(
         Returns a list of optimizer states split by table. So far, we only support EXACT_ROWWISE_ADAGRAD
         so only momentum1 state is returned.
         """
-        return [
-            ({"momentum1": states})
-            for states in self.split_optimizer_states(
-                sorted_id_tensor=sorted_id_tensor,
-                no_snapshot=no_snapshot,
-                should_flush=should_flush,
+        states_list = self.split_optimizer_states(
+            sorted_id_tensor=sorted_id_tensor,
+            no_snapshot=no_snapshot,
+            should_flush=should_flush,
+        )
+
+        if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD:
+            keys = ["momentum1"]
+        elif self.optimizer == OptimType.PARTIAL_ROWWISE_ADAM:
+            keys = ["momentum1", "momentum2"]
+        else:
+            raise NotImplementedError(
+                f"Getting optimizer states is not supported for {self.optimizer}"
             )
-        ]
+
+        return [dict(zip(keys, states)) for states in states_list]
 
     @torch.jit.export
     def debug_split_embedding_weights(self) -> List[torch.Tensor]:
@@ -2460,7 +2532,7 @@ def debug_split_embedding_weights(self) -> List[torch.Tensor]:
         splits = []
         get_event = torch.cuda.Event()
 
-        for t, (row, dim) in enumerate(self.embedding_specs):
+        for t, (row, _) in enumerate(self.embedding_specs):
             weights = torch.empty(
                 (row, self.max_D), dtype=self.weights_precision.as_dtype()
             )
```
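
To make the row-wise vs. element-wise slicing performed by the new `_slice` helper easier to follow, here is a small self-contained sketch with made-up table shapes; only the cumulative-offset arithmetic mirrors the code above, and the flat buffers merely stand in for `momentum1_dev` / `momentum2_dev`.

```python
# Standalone illustration of the slicing scheme used by _slice() above.
# Table shapes are arbitrary toy values; row-wise states are viewed as 1D per
# table, element-wise states as (rows, dim) per table.
import itertools
from typing import List, Tuple

import torch

embedding_specs: List[Tuple[int, int]] = [(4, 2), (3, 5)]  # (rows, dim) per table
rows, dims = zip(*embedding_specs)

row_count_cumsum = [0] + list(itertools.accumulate(rows))
elem_count_cumsum = [0] + list(itertools.accumulate(r * d for r, d in embedding_specs))

# Flat buffers standing in for the device-side optimizer state tensors
rowwise_state = torch.arange(sum(rows), dtype=torch.float32)
elementwise_state = torch.arange(sum(r * d for r, d in embedding_specs), dtype=torch.float32)

for t, (r, d) in enumerate(embedding_specs):
    per_row = rowwise_state[row_count_cumsum[t] : row_count_cumsum[t + 1]].view(r)
    per_elem = elementwise_state[elem_count_cumsum[t] : elem_count_cumsum[t + 1]].view(-1, d)
    print(f"table {t}: rowwise {tuple(per_row.shape)}, elementwise {tuple(per_elem.shape)}")
```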

fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py

Lines changed: 54 additions & 20 deletions
```diff
@@ -757,7 +757,7 @@ def execute_ssd_forward_(
 
     def split_optimizer_states_(
         self, emb: SSDTableBatchedEmbeddingBags
-    ) -> List[torch.Tensor]:
+    ) -> List[List[torch.Tensor]]:
         _, bucket_asc_ids_list, _ = emb.split_embedding_weights(
             no_snapshot=False, should_flush=True
         )
@@ -962,7 +962,7 @@ def test_ssd_backward_adagrad(
             # pyre-fixme[16]: Optional type has no attribute `float`.
             ref_optimizer_state = emb_ref[f].weight.grad.float().to_dense().pow(2)
             torch.testing.assert_close(
-                split_optimizer_states[t].float(),
+                split_optimizer_states[t][0].float(),
                 ref_optimizer_state.mean(dim=1),
                 atol=tolerance,
                 rtol=tolerance,
@@ -978,7 +978,7 @@ def test_ssd_backward_adagrad(
                 emb_r.weight.float(),
                 value=-lr,
                 tensor1=emb_r.weight.grad.float().to_dense(),
-                tensor2=split_optimizer_states[t]
+                tensor2=split_optimizer_states[t][0]
                 .float()
                 .sqrt_()
                 .add_(eps)
@@ -1113,7 +1113,10 @@ def test_ssd_emb_state_dict(
                 emb_r.weight.float(),
                 value=-lr,
                 tensor1=emb_r.weight.grad.float().to_dense(), # pyre-ignore[16]
-                tensor2=split_optimizer_states[table_index]
+                # NOTE: The [0] index is a hack since the test is fixed to use
+                # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+                # be upgraded in the future to support multiple optimizers
+                tensor2=split_optimizer_states[table_index][0]
                 .float()
                 .sqrt_()
                 .add_(eps)
@@ -1188,7 +1191,8 @@ def execute_ssd_cache_pipeline_( # noqa C901
         )
 
         optimizer_states_ref = [
-            s.clone().float() for s in self.split_optimizer_states_(emb)
+            [s.clone().float() for s in states]
+            for states in self.split_optimizer_states_(emb)
         ]
 
         Es = [emb.embedding_specs[t][0] for t in range(T)]
@@ -1334,8 +1338,11 @@ def _prefetch(b_it: int) -> int:
         # Compare optimizer states
         split_optimizer_states = self.split_optimizer_states_(emb)
         for f, t in self.get_physical_table_arg_indices_(emb.feature_table_map):
-            optim_state_r = optimizer_states_ref[t]
-            optim_state_t = split_optimizer_states[t]
+            optim_state_r = optimizer_states_ref[t][0]
+            # NOTE: The [0] index is a hack since the test is fixed to use
+            # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+            # be upgraded in the future to support multiple optimizers
+            optim_state_t = split_optimizer_states[t][0]
             emb_r = emb_ref[f]
 
             optim_state_r.add_(
@@ -1753,7 +1760,10 @@ def test_kv_emb_state_dict(
                 dim=1
             )
             torch.testing.assert_close(
-                split_optimizer_states[t].float(),
+                # NOTE: The [0] index is a hack since the test is fixed to use
+                # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+                # be upgraded in the future to support multiple optimizers
+                split_optimizer_states[t][0].float(),
                 ref_opt_mean.cpu(),
                 atol=tolerance,
                 rtol=tolerance,
@@ -1799,8 +1809,11 @@ def test_kv_emb_state_dict(
                 .to_dense()[bucket_asc_ids_list[table_index].view(-1)]
             )
             self.assertLess(table_index, len(emb_state_dict_list))
-            assert len(split_optimizer_states[table_index]) == num_ids
-            opt = split_optimizer_states[table_index]
+            assert len(split_optimizer_states[table_index][0]) == num_ids
+            # NOTE: The [0] index is a hack since the test is fixed to use
+            # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+            # be upgraded in the future to support multiple optimizers
+            opt = split_optimizer_states[table_index][0]
             new_ref_weight = torch.addcdiv(
                 emb_r_w.float(),
                 value=-lr,
@@ -1985,7 +1998,10 @@ def test_kv_opt_state_w_offloading(
             # pyre-fixme[16]: Undefined attribute: `Optional` has no attribute `__getitem__`.
             ref_kv_opt = ref_optimizer_state[bucket_asc_ids_list[t]].view(-1)
             torch.testing.assert_close(
-                split_optimizer_states[t].float(),
+                # NOTE: The [0] index is a hack since the test is fixed to use
+                # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+                # be upgraded in the future to support multiple optimizers
+                split_optimizer_states[t][0].float(),
                 ref_kv_opt,
                 atol=tolerance,
                 rtol=tolerance,
@@ -2031,8 +2047,11 @@ def test_kv_opt_state_w_offloading(
                 .to_dense()[bucket_asc_ids_list[table_index].view(-1)]
             )
             self.assertLess(table_index, len(emb_state_dict_list))
-            assert len(split_optimizer_states[table_index]) == num_ids
-            opt = split_optimizer_states[table_index]
+            # NOTE: The [0] index is a hack since the test is fixed to use
+            # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+            # be upgraded in the future to support multiple optimizers
+            assert len(split_optimizer_states[table_index][0]) == num_ids
+            opt = split_optimizer_states[table_index][0]
             new_ref_weight = torch.addcdiv(
                 emb_r_w.float(),
                 value=-lr,
@@ -2221,7 +2240,10 @@ def test_kv_state_dict_w_backend_return_whole_row(
             # pyre-fixme[16]: Undefined attribute: `Optional` has no attribute `__getitem__`.
             ref_kv_opt = ref_optimizer_state[bucket_asc_ids_list[t]].view(-1)
             opt = (
-                split_optimizer_states[t]
+                # NOTE: The [0] index is a hack since the test is fixed to use
+                # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+                # be upgraded in the future to support multiple optimizers
+                split_optimizer_states[t][0]
                 .narrow(0, 0, bucket_asc_ids_list[t].size(0))
                 .view(-1)
                 .view(torch.float32)
@@ -2276,7 +2298,10 @@ def test_kv_state_dict_w_backend_return_whole_row(
                 .to_dense()[bucket_asc_ids_list[table_index].view(-1)]
            )
             self.assertLess(table_index, len(emb_state_dict_list))
-            assert split_optimizer_states[table_index].size(0) == num_ids
+            # NOTE: The [0] index is a hack since the test is fixed to use
+            # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+            # be upgraded in the future to support multiple optimizers
+            assert split_optimizer_states[table_index][0].size(0) == num_ids
             new_ref_weight = torch.addcdiv(
                 emb_r_w.float(),
                 value=-lr,
@@ -2501,9 +2526,12 @@ def test_apply_kv_state_dict(
                 # pyre-fixme[16]: Undefined attribute: Item `torch._tensor.Tensor` of `typing.Uni...
                 emb_state_dict_list[i].full_tensor()
             )
+            # NOTE: The [0] index is a hack since the test is fixed to use
+            # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+            # be upgraded in the future to support multiple optimizers
             # pyre-ignore [16]
             emb2._cached_kvzch_data.cached_optimizer_state_per_table[i].copy_(
-                split_optimizer_states[i]
+                split_optimizer_states[i][0]
             )
             # pyre-ignore [16]
             emb2._cached_kvzch_data.cached_id_tensor_per_table[i].copy_(
@@ -2547,8 +2575,8 @@ def test_apply_kv_state_dict(
                 rtol=tolerance,
             )
             torch.testing.assert_close(
-                split_optimizer_states[t][sorted_ids.indices],
-                split_optimizer_states2[t][sorted_ids2.indices],
+                split_optimizer_states[t][0][sorted_ids.indices],
+                split_optimizer_states2[t][0][sorted_ids2.indices],
                 atol=tolerance,
                 rtol=tolerance,
             )
@@ -2820,7 +2848,10 @@ def copy_weights_hook(
             # pyre-fixme[16]: Optional type has no attribute `float`.
             ref_optimizer_state = emb_ref[f].weight.grad.float().to_dense().pow(2)
             torch.testing.assert_close(
-                split_optimizer_states[t].float(),
+                # NOTE: The [0] index is a hack since the test is fixed to use
+                # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+                # be upgraded in the future to support multiple optimizers
+                split_optimizer_states[t][0].float(),
                 ref_optimizer_state.mean(dim=1),
                 atol=tolerance,
                 rtol=tolerance,
@@ -3036,7 +3067,10 @@ def copy_opt_states_hook(
                 cursor += local_idxes.numel()
 
                 torch.testing.assert_close(
-                    split_optimizer_states[t][indices].float(),
+                    # NOTE: The [0] index is a hack since the test is fixed to use
+                    # EXACT_ROWWISE_ADAGRAD optimizer. The test in general should
+                    # be upgraded in the future to support multiple optimizers
+                    split_optimizer_states[t][0][indices].float(),
                     opt_states_per_tb.cpu().float(),
                     atol=tolerance,
                     rtol=tolerance,
```
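
The test changes above hard-code the `[0]` index because these tests run EXACT_ROWWISE_ADAGRAD; the name-keyed view produced by `get_optimizer_state()` avoids that positional indexing. Below is a toy, self-contained sketch of the `dict(zip(keys, states))` mapping that function performs, with made-up tensors standing in for real optimizer state.

```python
# Toy illustration of the keys-to-states mapping added to get_optimizer_state():
# per-table state lists become {name: tensor} dicts.
from typing import Dict, List

import torch

# Pretend output of split_optimizer_states() for two tables under
# PARTIAL_ROWWISE_ADAM: [momentum1 (element-wise), momentum2 (row-wise)] each.
states_list: List[List[torch.Tensor]] = [
    [torch.zeros(4, 2), torch.zeros(4)],
    [torch.zeros(3, 5), torch.zeros(3)],
]
keys = ["momentum1", "momentum2"]

optimizer_state: List[Dict[str, torch.Tensor]] = [
    dict(zip(keys, states)) for states in states_list
]
assert optimizer_state[0]["momentum2"].shape == (4,)
```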
