
Commit fe9946e

q10 authored and facebook-github-bot committed
Optimize some code out of compilation in the table lookup kernel (#4371)
Summary:
Pull Request resolved: #4371
X-link: facebookresearch/FBGEMM#1440

- Optimize some code out of compilation in the table lookup kernel

Reviewed By: spcyppt

Differential Revision: D76865732

fbshipit-source-id: a90c14567ea8899a1f7ffc43988bfba720507b6e
1 parent 311f6f9 commit fe9946e

File tree: 4 files changed (+8 / -17 lines)


fbgemm_gpu/codegen/genscript/jinja_environment.py

Lines changed: 3 additions & 3 deletions
@@ -111,7 +111,7 @@ def generate_optimized_grad_sum_loop_access(
     smem_blob = blob.format(grad_vec="smem_grad_sum[d_vec]")
     reg_blob = blob.format(grad_vec="grad_sum[vec]")
     gen_blob = """
-        if (kUseVecBlocking) {
+        if constexpr (kUseVecBlocking) {
           // max_vecs is not known at compile time
           for (int32_t vec = 0;
                vec < max_vecs &&
@@ -121,8 +121,8 @@ def generate_optimized_grad_sum_loop_access(
             [[maybe_unused]] const int32_t d = d_vec * VEC_WIDTH;
             {smem_blob}
           }
-        }
-        else {
+
+        } else {
           // kFixedMaxVecsPerThread is known at compile time
           #pragma unroll kFixedMaxVecsPerThread
           for (int32_t vec = 0;
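The swap from `if` to `if constexpr` matters here because `kUseVecBlocking` is a compile-time constant in each instantiation of the generated kernel: the compiler now discards the untaken loop variant during template instantiation instead of carrying both through compilation. Below is a minimal standalone CUDA sketch of that effect; the function name and the raw `grad` pointer are placeholders for illustration, not FBGEMM code.

#include <cstdint>

// Sketch only: shows how `if constexpr` removes the unused branch from each
// template instantiation. Not the generated FBGEMM kernel.
template <bool kUseVecBlocking, int32_t kFixedMaxVecsPerThread>
__device__ inline float accumulate_sketch(const float* grad, int32_t max_vecs) {
  float sum = 0.0f;
  if constexpr (kUseVecBlocking) {
    // max_vecs is not known at compile time, so the loop bound is dynamic
    // (this mirrors the smem_grad_sum path above).
    for (int32_t vec = 0; vec < max_vecs; ++vec) {
      sum += grad[vec];
    }
  } else {
    // kFixedMaxVecsPerThread is known at compile time; the loop unrolls fully,
    // and the dynamic-bound branch above is never emitted for this specialization.
    #pragma unroll kFixedMaxVecsPerThread
    for (int32_t vec = 0; vec < kFixedMaxVecsPerThread; ++vec) {
      sum += grad[vec];
    }
  }
  return sum;
}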

fbgemm_gpu/codegen/genscript/optimizers.py

Lines changed: 3 additions & 12 deletions
@@ -1053,18 +1053,12 @@ def adam() -> Dict[str, Any]:
 
     split_weight_update = """
       Vec4T<cache_t> m_t(&momentum1[idx * D + d]);
-      m_t.acc.x *= beta1;
-      m_t.acc.y *= beta1;
-      m_t.acc.z *= beta1;
-      m_t.acc.w *= beta1;
+      m_t.mul_(beta1);
       m_t.fma_(grad, 1.0 - beta1);
       m_t.store(&momentum1[idx * D + d]);
 
       Vec4T<cache_t> v_t(&momentum2[idx * D + d]);
-      v_t.acc.x *= beta2;
-      v_t.acc.y *= beta2;
-      v_t.acc.z *= beta2;
-      v_t.acc.w *= beta2;
+      v_t.mul_(beta2);
 
       grad.acc.x *= grad.acc.x;
       grad.acc.y *= grad.acc.y;
@@ -1141,10 +1135,7 @@ def partial_rowwise_adam() -> Dict[str, Any]:
 
     split_weight_update = """
       Vec4T<momentum1_ph_t> m_t(&momentum1[idx * D + d]);
-      m_t.acc.x *= beta1;
-      m_t.acc.y *= beta1;
-      m_t.acc.z *= beta1;
-      m_t.acc.w *= beta1;
+      m_t.mul_(beta1);
       m_t.fma_(grad, 1.0 - beta1);
       m_t.store(&momentum1[idx * D + d]);
 
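In the generated Adam and partial-rowwise-Adam update strings, the four per-lane multiplies collapse into a single `mul_()` call on FBGEMM's `Vec4T`. The struct below is only a hedged stand-in for the lane-wise semantics that call expresses; it is not the real `Vec4T` from FBGEMM's CUDA headers.

// Illustrative stand-in, not the real FBGEMM Vec4T.
struct Vec4TSketch {
  float4 acc;  // four packed lanes, matching the acc.x/.y/.z/.w accesses above

  // Scalar multiply applied to every lane -- what m_t.mul_(beta1) expresses
  // in one statement instead of four.
  __device__ void mul_(float scale) {
    acc.x *= scale;
    acc.y *= scale;
    acc.z *= scale;
    acc.w *= scale;
  }
};

The generated source gets shorter and the intent clearer, while the compiler still emits the same four lane multiplies after inlining.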
fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu

Lines changed: 1 addition & 1 deletion
@@ -1178,7 +1178,7 @@ Tensor {{ embedding_cuda_op }}(
   int32_t num_warp_per_row_groups = kBackwardMaxThreads / kThreadGroupSize;
   int32_t warp_per_row_smem_bytes = 0;
 
-  if (kUseVecBlocking) {
+  if constexpr (kUseVecBlocking) {
     warp_per_row_smem_bytes = compute_num_groups_and_dynamic_smem_bytes(
         &num_warp_per_row_groups,
         // Use max_D to compute shmem_bytes (for smem_grad_sum)

fbgemm_gpu/codegen/training/optimizer/embedding_optimizer_split_device_kernel_template.cuh

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel(
   emb_t* __restrict__ weights {nullptr};
   cache_t* __restrict__ cache_weights {nullptr};
   int32_t D_emb = D;
-  if (kIsInt8) {
+  if constexpr (kIsInt8) {
     D_emb += kINT8QparamsBytes;
   }
   const auto weights_placement = static_cast<PlacementType>(weights_placements[t]);
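In the table-update kernel, this branch guards the extra per-row quantization-parameter bytes that only INT8 tables carry; with `if constexpr`, instantiations where `kIsInt8` is false never contain the addition at all. A small sketch under assumed names follows; the constant's value is a placeholder, not FBGEMM's actual `kINT8QparamsBytes`.

#include <cstdint>

// Sketch only: row stride selection resolved at compile time.
constexpr int32_t kInt8QparamsBytesSketch = 8;  // placeholder value

template <bool kIsInt8>
__device__ inline int32_t row_stride_sketch(int32_t D) {
  int32_t D_emb = D;
  if constexpr (kIsInt8) {
    // Compiled only into the kIsInt8 == true specialization.
    D_emb += kInt8QparamsBytesSketch;
  }
  return D_emb;
}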
