Skip to content

Commit 10d0d28

Browse files
committed
metal : disable fast math in all quantize kernels (llama/14528)
ggml-ci
1 parent af304ef commit 10d0d28

File tree

1 file changed

+3
-0
lines changed

1 file changed

+3
-0
lines changed

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,7 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r
109109
}
110110

111111
void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
112+
#pragma METAL fp math_mode(safe)
112113
float amax = 0.0f; // absolute max
113114
float max = 0.0f;
114115

@@ -167,6 +168,7 @@ void quantize_q4_1(device const float * src, device block_q4_1 & dst) {
167168
}
168169

169170
void quantize_q5_0(device const float * src, device block_q5_0 & dst) {
171+
#pragma METAL fp math_mode(safe)
170172
float amax = 0.0f; // absolute max
171173
float max = 0.0f;
172174

@@ -461,6 +463,7 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re
461463
}
462464

463465
void quantize_q8_0(device const float * src, device block_q8_0 & dst) {
466+
#pragma METAL fp math_mode(safe)
464467
float amax = 0.0f; // absolute max
465468

466469
for (int j = 0; j < QK8_0; j++) {

0 commit comments

Comments
 (0)