Skip to content

Commit 936ec59

Browse files
Xiao Sun and facebook-github-bot
authored and committed
reduced the epsilon for scale calculation to 1e-20 (#1628)
Summary: Pull Request resolved: #1628. Previously the scale was calculated as max_int8 / (eps + max_val), with eps = 1e-8. This was acceptable for forward tensors, since their values need to be larger than that epsilon to make a difference. With fp8, however, the gradients are much smaller; eps = 1e-8 would prevent the tensor from scaling up enough to cover the entire range of fp8, so we change it to 1e-20. Reviewed By: brad-mengchi Differential Revision: D43665395 fbshipit-source-id: 0214e22af7739ef5cf7561b15e02830c35320402
1 parent ce83746 commit 936ec59

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

fbgemm_gpu/src/quantize_ops.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ __global__ inline void _float_to_fused8bitrowwise_cuda_kernel(
8383
const int nrows,
8484
const int ncols,
8585
std::uint8_t* __restrict__ output) {
86-
constexpr float kEpsilon = 1e-8f;
86+
constexpr float kEpsilon = 1e-20f;
8787

8888
const int ncols_aligned = (ncols + 4 - 1) / 4 * 4;
8989
const int output_columns = ncols_aligned + 2 * sizeof(float);
@@ -128,7 +128,7 @@ __global__ inline void _float_to_FP8rowwise_cuda_kernel(
128128
const int ncols,
129129
std::uint8_t* __restrict__ output,
130130
bool forward) {
131-
constexpr float kEpsilon = 1e-8f;
131+
constexpr float kEpsilon = 1e-20f;
132132
int ebit;
133133
int bias;
134134
float max_pos;
@@ -253,7 +253,7 @@ __global__ inline void _get_FP8_qparam_cuda_kernel(
253253
max_pos = 0.875;
254254
}
255255
// starting values for future reductions
256-
constexpr float kEpsilon = 1e-8f;
256+
constexpr float kEpsilon = 1e-20f;
257257
float maximum_element = kEpsilon;
258258
// always a power of 2 up to size 32. Multiple rows can share the same warp
259259
// when smaller than 32.
@@ -299,7 +299,7 @@ __global__ inline void _compute_8bit_quantize_cuda_kernel(
299299
const int nrows,
300300
const int ncols,
301301
std::uint8_t* const __restrict__ output) {
302-
constexpr float kEpsilon = 1e-8f;
302+
constexpr float kEpsilon = 1e-20f;
303303

304304
const int ncols_aligned = (ncols + 4 - 1) / 4 * 4;
305305
const int output_columns = ncols_aligned + 2 * sizeof(float);
@@ -332,7 +332,7 @@ __global__ inline void _compute_FP8_quantize_cuda_kernel(
332332
const int ncols,
333333
std::uint8_t* const __restrict__ output,
334334
bool forward) {
335-
constexpr float kEpsilon = 1e-8f;
335+
constexpr float kEpsilon = 1e-20f;
336336
int ebit;
337337
int bias;
338338
float max_pos;

0 commit comments

Comments
 (0)