Skip to content

Commit 936ec59

Browse files
Xiao Sun and facebook-github-bot
authored and committed
reduced the epsilon for scale calculation to 1e-20 (#1628)
Summary: Pull Request resolved: #1628. Previously the scale was calculated as max_int8 / (eps + max_val), with eps = 1e-8. This was acceptable for forward tensors, since their values need to be larger than that epsilon to make a difference. With fp8, however, the gradients are much smaller; eps = 1e-8 would prevent the tensor from scaling up enough to cover the entire range of fp8, so we change it to 1e-20. Reviewed By: brad-mengchi Differential Revision: D43665395 fbshipit-source-id: 0214e22af7739ef5cf7561b15e02830c35320402
1 parent ce83746 commit 936ec59

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

fbgemm_gpu/src/quantize_ops.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ __global__ inline void _float_to_fused8bitrowwise_cuda_kernel(
8383
const int nrows,
8484
const int ncols,
8585
std::uint8_t* __restrict__ output) {
86-
constexpr float kEpsilon = 1e-8f;
86+
constexpr float kEpsilon = 1e-20f;
8787

8888
const int ncols_aligned = (ncols + 4 - 1) / 4 * 4;
8989
const int output_columns = ncols_aligned + 2 * sizeof(float);
@@ -128,7 +128,7 @@ __global__ inline void _float_to_FP8rowwise_cuda_kernel(
128128
const int ncols,
129129
std::uint8_t* __restrict__ output,
130130
bool forward) {
131-
constexpr float kEpsilon = 1e-8f;
131+
constexpr float kEpsilon = 1e-20f;
132132
int ebit;
133133
int bias;
134134
float max_pos;
@@ -253,7 +253,7 @@ __global__ inline void _get_FP8_qparam_cuda_kernel(
253253
max_pos = 0.875;
254254
}
255255
// starting values for future reductions
256-
constexpr float kEpsilon = 1e-8f;
256+
constexpr float kEpsilon = 1e-20f;
257257
float maximum_element = kEpsilon;
258258
// always a power of 2 up to size 32. Multiple rows can share the same warp
259259
// when smaller than 32.
@@ -299,7 +299,7 @@ __global__ inline void _compute_8bit_quantize_cuda_kernel(
299299
const int nrows,
300300
const int ncols,
301301
std::uint8_t* const __restrict__ output) {
302-
constexpr float kEpsilon = 1e-8f;
302+
constexpr float kEpsilon = 1e-20f;
303303

304304
const int ncols_aligned = (ncols + 4 - 1) / 4 * 4;
305305
const int output_columns = ncols_aligned + 2 * sizeof(float);
@@ -332,7 +332,7 @@ __global__ inline void _compute_FP8_quantize_cuda_kernel(
332332
const int ncols,
333333
std::uint8_t* const __restrict__ output,
334334
bool forward) {
335-
constexpr float kEpsilon = 1e-8f;
335+
constexpr float kEpsilon = 1e-20f;
336336
int ebit;
337337
int bias;
338338
float max_pos;

0 commit comments

Comments
 (0)