
Commit 462a8b3

cthi authored and facebook-github-bot committed
Don't compile additional kernels without bias for cutlass FP8 (pytorch#4409)
Summary:
Pull Request resolved: pytorch#4409
X-link: facebookresearch/FBGEMM#1480

Currently we compile multiple variants of FP8 rowwise and FP8 rowwise batched. One of these is due to `USE_BIAS`. While examining the torch core implementation of FP8 rowwise (which has drifted slightly from the FBGEMM version), [I noticed it doesn't do this](https://github.com/pytorch/pytorch/blob/644cc58dfffe1b5bd15688495551b49462c163f6/aten/src/ATen/native/cuda/RowwiseScaledMM.cu#L253). In [this torch PR](pytorch/pytorch#134113) it was found that CUTLASS will simply skip that part of the epilogue if the argument is nullptr. I tested this out and it seems to be the case: correctness and performance both seem to check out. This should reduce the footprint of these two kernels by 33%.

As a side note, this really highlights one big win we could get by merging the implementations: the kernels would benefit from more eyes on them.

Reviewed By: jiawenliu64

Differential Revision: D77447346

fbshipit-source-id: ea3fbc61d06d0199eed9c40bfcdd06aa9735335b
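As a minimal sketch of the pattern this change relies on (hypothetical struct and function names, not the actual FBGEMM/CUTLASS code): the same epilogue-argument slot is filled with either a real bias pointer or nullptr, and CUTLASS EVT skips the bias node in the nullptr case, so a single kernel instantiation can serve both the bias and no-bias call paths.

```cpp
// Hypothetical sketch of the nullptr-bias pattern; the real code fills
// CUTLASS EVT epilogue arguments, shown in the diff below.
#include <cstdio>
#include <optional>

struct EpilogueThreadArgs {
  const float* bias;     // stand-in for the EVT bias argument; the node is skipped when nullptr
  const float* w_scale;  // row-wise scale applied to the weights
  const float* x_scale;  // row-wise scale applied to the activations
};

EpilogueThreadArgs make_epilogue_args(
    const float* w_scale,
    const float* x_scale,
    std::optional<const float*> bias) {
  // One code path for both variants, mirroring the committed
  // bias.has_value() ? ptr : nullptr expression.
  return EpilogueThreadArgs{bias.value_or(nullptr), w_scale, x_scale};
}

int main() {
  float ws[2] = {1.f, 1.f}, xs[2] = {1.f, 1.f}, b[2] = {0.5f, 0.5f};
  EpilogueThreadArgs with_bias = make_epilogue_args(ws, xs, b);
  EpilogueThreadArgs no_bias = make_epilogue_args(ws, xs, std::nullopt);
  std::printf("bias ptr with bias: %p, without: %p\n",
              static_cast<const void*>(with_bias.bias),
              static_cast<const void*>(no_bias.bias));
  return 0;
}
```

Counting the wrapper's instantiations in this diff, each tile configuration previously emitted 12 `f8f8bf16_rowwise_impl` variants (bf16 bias, fp32 bias, and no bias, each crossed with fast-accum on/off and e5m2/e4m3 input) and now emits 8, which is presumably where the ~33% footprint figure comes from.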
1 parent 387e8a0 · commit 462a8b3

File tree

3 files changed: +187 −358 lines changed


fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise/f8f8bf16_rowwise_common.cuh

Lines changed: 79 additions & 175 deletions
@@ -36,7 +36,6 @@ template <
     bool PONG,
     bool COOP,
     bool FAST_ACCUM,
-    bool USE_BIAS,
     typename INPUT_DTYPE,
     typename BIAS_DTYPE>
 at::Tensor f8f8bf16_rowwise_impl(
@@ -158,10 +157,7 @@ at::Tensor f8f8bf16_rowwise_impl(
 
   using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
       cutlass::multiplies,
-      cute::conditional_t< // Second stage output type.
-          USE_BIAS,
-          ElementBias,
-          ElementOutput>,
+      ElementBias, // Second stage output type.
       ElementComputeEpilogue, // Second stage input types.
       cutlass::FloatRoundStyle::round_to_nearest>;

@@ -174,11 +170,8 @@ at::Tensor f8f8bf16_rowwise_impl(
       ElementBias, // Final stage input types.
       cutlass::FloatRoundStyle::round_to_nearest>;
 
-  using EVTComputeBias =
-      cutlass::epilogue::fusion::Sm90EVT<ComputeBias, Bias, EVTCompute1>;
-
   using EpilogueEVT =
-      cute::conditional_t<USE_BIAS, EVTComputeBias, EVTCompute1>;
+      cutlass::epilogue::fusion::Sm90EVT<ComputeBias, Bias, EVTCompute1>;
 
   using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized;
   using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
@@ -273,38 +266,26 @@ at::Tensor f8f8bf16_rowwise_impl(
        (ElementOutput*)Y.data_ptr<at::BFloat16>(),
        stride_output}};
 
-  if constexpr (USE_BIAS) {
-    arguments.epilogue.thread = {
-        {reinterpret_cast<ElementBias*>(bias.value().data_ptr())}, // bias
-        // compute_1
-        {
-            {reinterpret_cast<ElementComputeEpilogue*>(
-                w_scale.data_ptr())}, // x_scale
-            // compute_0
-            {
-                {reinterpret_cast<ElementComputeEpilogue*>(
-                    x_scale.data_ptr())}, // w_scale
-                {}, // Accumulator
-                {} // Multiplies
-            },
-            {}, // Multiplies
-        },
-        {}, // Plus
-    };
-  } else {
-    arguments.epilogue.thread = {
-        {reinterpret_cast<ElementComputeEpilogue*>(
-            w_scale.data_ptr())}, // x_scale
-        // compute_0
-        {
-            {reinterpret_cast<ElementComputeEpilogue*>(
-                x_scale.data_ptr())}, // w_scale
-            {}, // Accumulator
-            {} // Multiplies
-        },
-        {}, // Multiplies
-    };
-  }
+  arguments.epilogue.thread = {
+      {bias.has_value()
+           ? reinterpret_cast<ElementBias*>(bias.value().data_ptr())
+           : nullptr}, // bias. Note Cutlass EVT will skip node if argument is
+                       // nullptr
+      // compute_1
+      {
+          {reinterpret_cast<ElementComputeEpilogue*>(
+              w_scale.data_ptr())}, // x_scale
+          // compute_0
+          {
+              {reinterpret_cast<ElementComputeEpilogue*>(
+                  x_scale.data_ptr())}, // w_scale
+              {}, // Accumulator
+              {} // Multiplies
+          },
+          {}, // Multiplies
+      },
+      {}, // Plus
+  };
 
   Gemm gemm;

@@ -367,144 +348,71 @@ at::Tensor f8f8bf16_rowwise_wrapper(
         bias.value().dtype() == at::kBFloat16,
         "Bias type must be bfloat16 or float32 if provided.");
   }
-  bool use_bias = bias.has_value();
-  bool bf16_bias = use_bias && bias.value().dtype() == at::kBFloat16;
+  bool bf16_bias = bias.has_value() && bias.value().dtype() == at::kBFloat16;
 
   // Templatize based on input dtype.
   bool use_e5m2 = XQ.dtype() == at::kFloat8_e5m2;
 
-  if (use_bias) {
-    if (bf16_bias) {
-      if (use_fast_accum) {
-        if (use_e5m2) {
-          return f8f8bf16_rowwise_impl<
-              TB_M,
-              TB_N,
-              TB_K,
-              TBS_M,
-              TBS_N,
-              TBS_K,
-              ARCH,
-              PONG,
-              COOP,
-              true,
-              true,
-              cutlass::float_e5m2_t,
-              cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, output);
-        } else {
-          return f8f8bf16_rowwise_impl<
-              TB_M,
-              TB_N,
-              TB_K,
-              TBS_M,
-              TBS_N,
-              TBS_K,
-              ARCH,
-              PONG,
-              COOP,
-              true,
-              true,
-              cutlass::float_e4m3_t,
-              cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, output);
-        }
+  if (bf16_bias) {
+    if (use_fast_accum) {
+      if (use_e5m2) {
+        return f8f8bf16_rowwise_impl<
+            TB_M,
+            TB_N,
+            TB_K,
+            TBS_M,
+            TBS_N,
+            TBS_K,
+            ARCH,
+            PONG,
+            COOP,
+            true,
+            cutlass::float_e5m2_t,
+            cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, output);
       } else {
-        if (use_e5m2) {
-          return f8f8bf16_rowwise_impl<
-              TB_M,
-              TB_N,
-              TB_K,
-              TBS_M,
-              TBS_N,
-              TBS_K,
-              ARCH,
-              PONG,
-              COOP,
-              false,
-              true,
-              cutlass::float_e5m2_t,
-              cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, output);
-        } else {
-          return f8f8bf16_rowwise_impl<
-              TB_M,
-              TB_N,
-              TB_K,
-              TBS_M,
-              TBS_N,
-              TBS_K,
-              ARCH,
-              PONG,
-              COOP,
-              false,
-              true,
-              cutlass::float_e4m3_t,
-              cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, output);
-        }
+        return f8f8bf16_rowwise_impl<
+            TB_M,
+            TB_N,
+            TB_K,
+            TBS_M,
+            TBS_N,
+            TBS_K,
+            ARCH,
+            PONG,
+            COOP,
+            true,
+            cutlass::float_e4m3_t,
+            cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, output);
       }
     } else {
-      if (use_fast_accum) {
-        if (use_e5m2) {
-          return f8f8bf16_rowwise_impl<
-              TB_M,
-              TB_N,
-              TB_K,
-              TBS_M,
-              TBS_N,
-              TBS_K,
-              ARCH,
-              PONG,
-              COOP,
-              true,
-              true,
-              cutlass::float_e5m2_t,
-              float>(XQ, WQ, x_scale, w_scale, bias, output);
-        } else {
-          return f8f8bf16_rowwise_impl<
-              TB_M,
-              TB_N,
-              TB_K,
-              TBS_M,
-              TBS_N,
-              TBS_K,
-              ARCH,
-              PONG,
-              COOP,
-              true,
-              true,
-              cutlass::float_e4m3_t,
-              float>(XQ, WQ, x_scale, w_scale, bias, output);
-        }
+      if (use_e5m2) {
+        return f8f8bf16_rowwise_impl<
+            TB_M,
+            TB_N,
+            TB_K,
+            TBS_M,
+            TBS_N,
+            TBS_K,
+            ARCH,
+            PONG,
+            COOP,
+            false,
+            cutlass::float_e5m2_t,
+            cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, output);
       } else {
-        if (use_e5m2) {
-          return f8f8bf16_rowwise_impl<
-              TB_M,
-              TB_N,
-              TB_K,
-              TBS_M,
-              TBS_N,
-              TBS_K,
-              ARCH,
-              PONG,
-              COOP,
-              false,
-              true,
-              cutlass::float_e5m2_t,
-              float>(XQ, WQ, x_scale, w_scale, bias, output);
-        } else {
-          return f8f8bf16_rowwise_impl<
-              TB_M,
-              TB_N,
-              TB_K,
-              TBS_M,
-              TBS_N,
-              TBS_K,
-              ARCH,
-              PONG,
-              COOP,
-              false,
-              true,
-              cutlass::float_e4m3_t,
-              float>(XQ, WQ, x_scale, w_scale, bias, output);
-        }
+        return f8f8bf16_rowwise_impl<
+            TB_M,
+            TB_N,
+            TB_K,
+            TBS_M,
+            TBS_N,
+            TBS_K,
+            ARCH,
+            PONG,
+            COOP,
+            false,
+            cutlass::float_e4m3_t,
+            cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, output);
       }
     }
   } else {
@@ -521,7 +429,6 @@ at::Tensor f8f8bf16_rowwise_wrapper(
             PONG,
             COOP,
             true,
-            false,
             cutlass::float_e5m2_t,
             float>(XQ, WQ, x_scale, w_scale, bias, output);
       } else {
@@ -536,7 +443,6 @@ at::Tensor f8f8bf16_rowwise_wrapper(
             PONG,
             COOP,
             true,
-            false,
             cutlass::float_e4m3_t,
             float>(XQ, WQ, x_scale, w_scale, bias, output);
       }
@@ -553,7 +459,6 @@ at::Tensor f8f8bf16_rowwise_wrapper(
             PONG,
             COOP,
             false,
-            false,
             cutlass::float_e5m2_t,
             float>(XQ, WQ, x_scale, w_scale, bias, output);
       } else {
@@ -568,7 +473,6 @@ at::Tensor f8f8bf16_rowwise_wrapper(
             PONG,
             COOP,
             false,
-            false,
             cutlass::float_e4m3_t,
             float>(XQ, WQ, x_scale, w_scale, bias, output);
       }
