Skip to content

Commit d01549a

Browse files
q10 authored and facebook-github-bot committed
Support optimizer state offloading for partial rowwise adam optimizer (#4477)
Summary: Pull Request resolved: #4477 X-link: facebookresearch/FBGEMM#1534 Support optimizer state offloading for partial rowwise adam optimizer in the backend C++ code. This does not yet expose support in the frontend Python code, which requires a lot more code changes. The existing non-offloading codepath should not be affected by the changes. This is a re-land of D76491848, but with the backend code enabled instead of the frontend, which was breaking downstream compatibility tests Reviewed By: bobbyliujb, cthi Differential Revision: D78177062 fbshipit-source-id: 72f636d7231409750c5f4d5a6ddfab32c33abbf1
1 parent b3052b7 commit d01549a

File tree

1 file changed

+39
-5
lines changed

1 file changed

+39
-5
lines changed

fbgemm_gpu/codegen/genscript/optimizers.py

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,24 +1120,58 @@ def partial_rowwise_adam() -> Dict[str, Any]:
11201120
"""
11211121
)
11221122
split_precomputation += """
1123+
1124+
// Define the optimizer state (for use with optimizer offloading)
1125+
struct OptimizerState {
1126+
// momentum2 is a single value so it will be accessed directly as a struct field
1127+
momentum2_ph_t momentum2;
1128+
1129+
// momentum1 is an array of D values, so a method to return a pointer given the offset is defined instead
1130+
DEVICE_INLINE momentum1_ph_t* momentum1_ptr() {
1131+
// Re-cast the address to momentum1_ph_t* and return
1132+
return reinterpret_cast<momentum1_ph_t *>(
1133+
// Cast the address this to momentum2_t* and increment by 1 to skip over the momentum2 value
1134+
reinterpret_cast<momentum2_ph_t *>(this) + 1
1135+
);
1136+
}
1137+
};
1138+
1139+
// Fetch the pointer to the optimizer state along the cache row
1140+
[[maybe_unused]] auto* optimizer = weight_row_template.template optimizer_state_ptr<OptimizerState>();
1141+
1142+
// Fetch the pointer to the momentum1 value
1143+
// Define the fetch here instead of in split_weight_update to avoid conditionals inside a loop
1144+
auto* momentum1_start = enable_optimizer_offloading ?
1145+
(optimizer->momentum1_ptr()) :
1146+
(&momentum1[idx * D]);
1147+
11231148
const at::acc_type<cache_t, true> g_avg_square =
11241149
GROUP_REDUCE_ALL_SUM(g_local_sum_square, at::acc_type<cache_t, true>) / D;
11251150
11261151
at::acc_type<cache_t, true> v_hat_t;
11271152
v_hat_t = 0.0;
11281153
if (threadIdx.x == 0) {
1129-
at::acc_type<cache_t, true> v_t = momentum2[idx] * beta2 + g_avg_square * (1.0 - beta2);
1130-
momentum2[idx] = v_t;
1154+
auto v_t = g_avg_square * (1.0 - beta2);
1155+
1156+
if (enable_optimizer_offloading) {
1157+
v_t += optimizer->momentum2 * beta2;
1158+
optimizer->momentum2 = v_t;
1159+
} else {
1160+
v_t += momentum2[idx] * beta2;
1161+
momentum2[idx] = v_t;
1162+
}
1163+
11311164
v_hat_t = v_t / (1.0 - powf(beta2, iter));
11321165
}
11331166
v_hat_t = SHFL_SYNC(v_hat_t, 0);
11341167
"""
11351168

11361169
split_weight_update = """
1137-
Vec4T<momentum1_ph_t> m_t(&momentum1[idx * D + d]);
1170+
auto* momentum1_ptr = momentum1_start + d;
1171+
Vec4T<momentum1_ph_t> m_t(momentum1_ptr);
11381172
m_t.mul_(beta1);
11391173
m_t.fma_(grad, 1.0 - beta1);
1140-
m_t.store(&momentum1[idx * D + d]);
1174+
m_t.store(momentum1_ptr);
11411175
11421176
weight_new.acc.x -= learning_rate * (m_t.acc.x / (1.0 - powf(beta1, iter)) / (sqrtf(v_hat_t) + eps) + weight_decay * weight_new.acc.x);
11431177
weight_new.acc.y -= learning_rate * (m_t.acc.y / (1.0 - powf(beta1, iter)) / (sqrtf(v_hat_t) + eps) + weight_decay * weight_new.acc.y);
@@ -1179,7 +1213,7 @@ def partial_rowwise_adam() -> Dict[str, Any]:
11791213
"has_gpu_support": True,
11801214
"has_vbe_support": False,
11811215
"has_global_weight_decay_support": False,
1182-
"has_ssd_support": False,
1216+
"has_ssd_support": True,
11831217
}
11841218

11851219

0 commit comments

Comments (0)