From 117aaa99c4011e8911a93a036db3b8517f5a1e36 Mon Sep 17 00:00:00 2001 From: jay-tux Date: Fri, 22 Nov 2024 17:35:59 +0100 Subject: [PATCH] Avoid zero-division --- .../Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl1.h | 4 ++-- .../Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl2.h | 2 +- preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl1.h | 4 ++-- preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl2.h | 2 +- preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl1.h | 4 ++-- preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl2.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl1.h b/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl1.h index 024fb787..a648a771 100644 --- a/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl1.h +++ b/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl1.h @@ -31,7 +31,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ float32x4_t abssum = vabsq_f32(vec_bs); temp_max = vmaxq_f32(abssum, temp_max); }} - float32_t scales = 127 / vmaxvq_f32(temp_max); + float32_t scales = 127 / (vmaxvq_f32(temp_max) + 1e-10f); *lut_scales = scales; #elif defined __AVX2__ __m256 max_vec = _mm256_set1_ps(0.f); @@ -45,7 +45,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif }} diff --git a/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl2.h b/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl2.h index 88dc9e2a..7a46a037 100644 --- a/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl2.h +++ b/preset_kernels/Llama3-8B-1.58-100B-tokens/bitnet-lut-kernels-tl2.h @@ -82,7 +82,7 @@ inline int32_t per_tensor_quant(int k, void* lut_scales_, void* b_) { __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif return 0; diff --git a/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl1.h b/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl1.h index 3f3f551b..d7146793 100644 --- a/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl1.h +++ b/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl1.h @@ -31,7 +31,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ float32x4_t abssum = vabsq_f32(vec_bs); temp_max = vmaxq_f32(abssum, temp_max); }} - float32_t scales = 127 / vmaxvq_f32(temp_max); + float32_t scales = 127 / (vmaxvq_f32(temp_max) + 1e-10f); *lut_scales = scales; #elif defined __AVX2__ __m256 max_vec = _mm256_set1_ps(0.f); @@ -45,7 +45,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif }} diff --git a/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl2.h b/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl2.h index 678b0f32..4e0c689d 100644 --- a/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl2.h +++ b/preset_kernels/bitnet_b1_58-3B/bitnet-lut-kernels-tl2.h @@ -82,7 +82,7 @@ inline int32_t per_tensor_quant(int k, void* lut_scales_, void* b_) { __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif return 0; diff --git a/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl1.h b/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl1.h index d38806b5..636bb702 100644 --- a/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl1.h +++ b/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl1.h @@ -31,7 +31,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ float32x4_t abssum = vabsq_f32(vec_bs); temp_max = vmaxq_f32(abssum, temp_max); }} - float32_t scales = 127 / vmaxvq_f32(temp_max); + float32_t scales = 127 / (vmaxvq_f32(temp_max) + 1e-10f); *lut_scales = scales; #elif defined __AVX2__ __m256 max_vec = _mm256_set1_ps(0.f); @@ -45,7 +45,7 @@ void per_tensor_quant(int k, void* lut_scales_, void* b_) {{ __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif }} diff --git a/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl2.h b/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl2.h index 92bda56b..5e3a4f79 100644 --- a/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl2.h +++ b/preset_kernels/bitnet_b1_58-large/bitnet-lut-kernels-tl2.h @@ -82,7 +82,7 @@ inline int32_t per_tensor_quant(int k, void* lut_scales_, void* b_) { __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec)); max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1)); max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1)); - float scales = 127 / _mm_cvtss_f32(max1); + float scales = 127 / (_mm_cvtss_f32(max1) + 1e-10f); *lut_scales = scales; #endif return 0;