Skip to content

Commit 0b21002

Browse files
Junnan Wan authored and facebook-github-bot committed
Support scale_bias_last on tbe lookup kernel (#4363)
Summary: Pull Request resolved: #4363 X-link: facebookresearch/FBGEMM#1428 Check https://fb.workplace.com/groups/fbgemmusers/permalink/23950680467919409/ for context With scale_bias_last=true, the TBE tensor could be in same shape between publish and inference runtime which makes model loading much easier (no need to process each row). Reviewed By: sryap Differential Revision: D76615824 fbshipit-source-id: 528bc00955156de38f1ef9bc058a9350ad0d75ee
1 parent a70fc5d commit 0b21002

File tree

2 files changed

+108
-45
lines changed

2 files changed

+108
-45
lines changed

fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -91,12 +91,12 @@ void pruned_hashmap_insert_{{ wdesc }}_cpu(
9191
continue;
9292
}
9393
const auto capacity = table_end - table_start;
94-
94+
9595
for (const auto b : c10::irange(B)) {
9696
const auto indices_start = offsets_acc[t * B + b];
9797
const auto indices_end = offsets_acc[t * B + b + 1];
9898
const auto L = indices_end - indices_start;
99-
99+
100100
for (const auto l : c10::irange(L)) {
101101
const auto idx = indices_acc[indices_start + l];
102102
const auto dense_idx = dense_indices_acc[indices_start + l];
@@ -109,20 +109,20 @@ void pruned_hashmap_insert_{{ wdesc }}_cpu(
109109
while (true) {
110110
const auto ht_idx = table_start + static_cast<int64_t>(slot);
111111
const auto slot_sparse_idx = hash_table_acc[ht_idx][0];
112-
112+
113113
// Empty slot
114114
if (slot_sparse_idx == -1) {
115115
hash_table_acc[ht_idx][0] = static_cast<hash_t>(idx);
116116
hash_table_acc[ht_idx][1] = static_cast<hash_t>(dense_idx);
117117
break;
118118
}
119-
119+
120120
// Already exists (shouldn't happen in practice)
121121
if (slot_sparse_idx == idx) {
122122
hash_table_acc[ht_idx][1] = static_cast<hash_t>(dense_idx);
123123
break;
124124
}
125-
125+
126126
// Linear probe
127127
slot = (slot + 1) % capacity;
128128
}
@@ -158,7 +158,8 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
158158
{% endif %}
159159
int64_t output_dtype,
160160
int64_t fp8_exponent_bits,
161-
int64_t fp8_exponent_bias
161+
int64_t fp8_exponent_bias,
162+
bool scale_bias_last
162163
) {
163164
TENSOR_ON_CPU(dev_weights);
164165
TENSOR_ON_CPU(uvm_weights);
@@ -273,8 +274,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
273274
if (output_is_int8) {
274275
TORCH_CHECK(weight_ty == SparseType::INT8, "int8 output are only supported for int8 weights");
275276
}
277+
const int32_t scale_bias_size = (weight_ty == SparseType::INT8 && scale_bias_last) ? 8 : 4;
276278
// default to 1 byte alignment for CPU TBE
277-
const int32_t D_bytes = nbit::padded_row_size_in_bytes(D, weight_ty, row_alignment);
279+
const int32_t D_bytes = nbit::padded_row_size_in_bytes(D, weight_ty, row_alignment, scale_bias_size);
278280

279281
int tt;
280282
for (tt = t + 1; tt < T && weights_offsets_acc[tt] == weights_offsets_acc[t]; ++tt);
@@ -352,7 +354,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
352354
/*exponent_bias=*/fp8_exponent_bias,
353355
{% endif %}
354356
{% if has_asmjit %}
355-
/*scale_bias_last=*/false,
357+
/*scale_bias_last=*/scale_bias_last,
356358
{% endif %}
357359
{% if use_base %}
358360
/*no_bag=*/nobag_op,
@@ -466,12 +468,12 @@ Tensor pruned_hashmap_lookup_{{ wdesc }}_cpu(
466468
for (const auto l : c10::irange(L)) {
467469
dense_indices_acc[indices_start + l] = indices_acc[indices_start + l];
468470
}
469-
471+
470472
} else {
471473
for (const auto l : c10::irange(L)) {
472474
const auto idx = indices_acc[indices_start + l];
473475
auto slot = pruned_hash_function(static_cast<utdx_t>(idx)) % capacity;
474-
476+
475477
while (true) {
476478
const auto ht_idx = table_start + static_cast<int64_t>(slot);
477479
const auto slot_sparse_idx = hash_table_acc[ht_idx][0];
@@ -486,7 +488,7 @@ Tensor pruned_hashmap_lookup_{{ wdesc }}_cpu(
486488
dense_indices_acc[indices_start + l] = static_cast<index_t>(hash_table_acc[ht_idx][1]);
487489
break;
488490
}
489-
491+
490492
// Linear probe
491493
slot = (slot + 1) % capacity;
492494
}
@@ -496,7 +498,7 @@ Tensor pruned_hashmap_lookup_{{ wdesc }}_cpu(
496498
}
497499
});
498500
});
499-
501+
500502
return dense_indices;
501503
}
502504

fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp

Lines changed: 94 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ Tensor int_nbit_split_embedding_codegen_forward_unweighted_cpu(
4343
int64_t row_alignment,
4444
int64_t output_dtype,
4545
int64_t fp8_exponent_bits,
46-
int64_t fp8_exponent_bias);
46+
int64_t fp8_exponent_bias,
47+
bool scale_bias_last);
4748

4849
Tensor int_nbit_split_embedding_codegen_forward_weighted_cpu(
4950
Tensor dev_weights,
@@ -60,7 +61,8 @@ Tensor int_nbit_split_embedding_codegen_forward_weighted_cpu(
6061
Tensor indice_weights,
6162
int64_t output_dtype,
6263
int64_t fp8_exponent_bits,
63-
int64_t fp8_exponent_bias);
64+
int64_t fp8_exponent_bias,
65+
bool scale_bias_last);
6466

6567
Tensor int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu(
6668
Tensor dev_weights,
@@ -75,10 +77,10 @@ Tensor int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu(
7577
int64_t row_alignment,
7678
int64_t output_dtype,
7779
int64_t fp8_exponent_bits,
78-
int64_t fp8_exponent_bias);
80+
int64_t fp8_exponent_bias,
81+
bool scale_bias_last);
7982

80-
///@ingroup embedding-cpu
81-
Tensor int_nbit_split_embedding_codegen_lookup_function_cpu(
83+
Tensor int_nbit_split_embedding_codegen_lookup_function_cpu_impl(
8284
Tensor dev_weights,
8385
Tensor uvm_weights, // to match the interface of CUDA op using UVM
8486
Tensor weights_placements, // to match the interface of CUDA op using UVM
@@ -103,10 +105,12 @@ Tensor int_nbit_split_embedding_codegen_lookup_function_cpu(
103105
std::optional<int64_t> row_alignment,
104106
std::optional<int64_t> max_float8_D,
105107
std::optional<int64_t> fp8_exponent_bits,
106-
std::optional<int64_t> fp8_exponent_bias) {
108+
std::optional<int64_t> fp8_exponent_bias,
109+
std::optional<bool> scale_bias_last) {
107110
if (offsets.scalar_type() != indices.scalar_type()) {
108111
offsets = offsets.toType(indices.scalar_type());
109112
}
113+
auto scale_bias_last_val = scale_bias_last ? *scale_bias_last : true;
110114
if (static_cast<PoolingMode>(pooling_mode) == PoolingMode::NONE) {
111115
std::vector<int64_t> max_D_list{
112116
max_int2_D,
@@ -117,53 +121,110 @@ Tensor int_nbit_split_embedding_codegen_lookup_function_cpu(
117121
max_float32_D};
118122
int64_t max_D = *std::max_element(max_D_list.begin(), max_D_list.end());
119123
return int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu(
120-
dev_weights,
121-
uvm_weights,
122-
weights_placements,
123-
weights_offsets,
124-
weights_tys,
124+
std::move(dev_weights),
125+
std::move(uvm_weights),
126+
std::move(weights_placements),
127+
std::move(weights_offsets),
128+
std::move(weights_tys),
125129
max_D,
126-
indices,
127-
offsets,
130+
std::move(indices),
131+
std::move(offsets),
128132
pooling_mode,
129133
row_alignment ? *row_alignment : 1,
130134
output_dtype,
131135
fp8_exponent_bits ? *fp8_exponent_bits : -1,
132-
fp8_exponent_bias ? *fp8_exponent_bias : -1);
136+
fp8_exponent_bias ? *fp8_exponent_bias : -1,
137+
scale_bias_last_val);
133138
}
134139
if (!indice_weights || indice_weights->numel() == 0) {
135140
return int_nbit_split_embedding_codegen_forward_unweighted_cpu(
136-
dev_weights,
137-
uvm_weights,
138-
weights_placements,
139-
weights_offsets,
140-
weights_tys,
141-
D_offsets,
141+
std::move(dev_weights),
142+
std::move(uvm_weights),
143+
std::move(weights_placements),
144+
std::move(weights_offsets),
145+
std::move(weights_tys),
146+
std::move(D_offsets),
142147
total_D,
143-
indices,
144-
offsets,
148+
std::move(indices),
149+
std::move(offsets),
145150
pooling_mode,
146151
row_alignment ? *row_alignment : 1,
147152
output_dtype,
148153
fp8_exponent_bits ? *fp8_exponent_bits : -1,
149-
fp8_exponent_bias ? *fp8_exponent_bias : -1);
154+
fp8_exponent_bias ? *fp8_exponent_bias : -1,
155+
scale_bias_last_val);
150156
}
151157
return int_nbit_split_embedding_codegen_forward_weighted_cpu(
152-
dev_weights,
153-
uvm_weights,
154-
weights_placements,
155-
weights_offsets,
156-
weights_tys,
157-
D_offsets,
158+
std::move(dev_weights),
159+
std::move(uvm_weights),
160+
std::move(weights_placements),
161+
std::move(weights_offsets),
162+
std::move(weights_tys),
163+
std::move(D_offsets),
158164
total_D,
159-
indices,
160-
offsets,
165+
std::move(indices),
166+
std::move(offsets),
161167
pooling_mode,
162168
row_alignment ? *row_alignment : 1,
163-
*indice_weights,
169+
std::move(*indice_weights),
164170
output_dtype,
165171
fp8_exponent_bits ? *fp8_exponent_bits : -1,
166-
fp8_exponent_bias ? *fp8_exponent_bias : -1);
172+
fp8_exponent_bias ? *fp8_exponent_bias : -1,
173+
scale_bias_last_val);
174+
}
175+
176+
///@ingroup embedding-cpu
177+
Tensor int_nbit_split_embedding_codegen_lookup_function_cpu(
178+
Tensor dev_weights,
179+
Tensor uvm_weights, // to match the interface of CUDA op using UVM
180+
Tensor weights_placements, // to match the interface of CUDA op using UVM
181+
Tensor weights_offsets,
182+
Tensor weights_tys,
183+
Tensor D_offsets,
184+
int64_t total_D,
185+
int64_t max_int2_D,
186+
int64_t max_int4_D,
187+
int64_t max_int8_D,
188+
int64_t max_float16_D,
189+
int64_t max_float32_D,
190+
Tensor indices,
191+
Tensor offsets,
192+
int64_t pooling_mode,
193+
std::optional<Tensor> indice_weights,
194+
int64_t output_dtype,
195+
std::optional<Tensor>
196+
lxu_cache_weights, // Not used, to match cache interface for CUDA op
197+
std::optional<Tensor>
198+
lxu_cache_locations, // Not used, to match cache interface for CUDA op
199+
std::optional<int64_t> row_alignment,
200+
std::optional<int64_t> max_float8_D,
201+
std::optional<int64_t> fp8_exponent_bits,
202+
std::optional<int64_t> fp8_exponent_bias) {
203+
return int_nbit_split_embedding_codegen_lookup_function_cpu_impl(
204+
std::move(dev_weights),
205+
std::move(uvm_weights),
206+
std::move(weights_placements),
207+
std::move(weights_offsets),
208+
std::move(weights_tys),
209+
std::move(D_offsets),
210+
total_D,
211+
max_int2_D,
212+
max_int4_D,
213+
max_int8_D,
214+
max_float16_D,
215+
max_float32_D,
216+
std::move(indices),
217+
std::move(offsets),
218+
pooling_mode,
219+
std::move(indice_weights),
220+
output_dtype,
221+
std::move(lxu_cache_weights),
222+
std::move(lxu_cache_locations),
223+
std::move(row_alignment),
224+
std::move(max_float8_D),
225+
std::move(fp8_exponent_bits),
226+
std::move(fp8_exponent_bias),
227+
false);
167228
}
168229

169230
///@ingroup embedding-cpu

0 commit comments

Comments (0)