
Commit 52352ec

Remove the useless KVQ FA combination K=q8_0 / V=F16, and fixes

Also, the quantized K cache without FA now works only with q6_0, Ikawrakow's newest KV quant. That's fine: it's the best K mode anyway. The older quants from 2023 and iq4_nl/F16 produce NaN, so they are disabled. Also an overdue fix for csshift/noshift in the KVQ logic, and a cleanup of fattn.cu.
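For reference, a minimal sketch of the resulting rule, written as a hypothetical helper; the function name and placement are illustrative only (not code from this commit), and it assumes the fork's GGML_TYPE_Q6_0 enum value:

#include "ggml.h"

// Hypothetical guard summarizing the K/V cache rules after this commit.
static bool kv_cache_types_allowed(ggml_type type_k, ggml_type type_v, bool flash_attn) {
    if (!flash_attn) {
        // Without FA, a quantized K cache only works as q6_0; F16 K stays fine.
        return type_k == GGML_TYPE_F16 || type_k == GGML_TYPE_Q6_0;
    }
    // With FA, the K=q8_0 / V=F16 kernel instances are no longer built.
    if (type_k == GGML_TYPE_Q8_0 && type_v == GGML_TYPE_F16) {
        return false;
    }
    return true; // remaining pairs go through the fattn.cu dispatch below
}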
1 parent 6d818bd commit 52352ec

4 files changed: +32, -46 lines

CMakeLists.txt

Lines changed: 4 additions & 4 deletions
@@ -168,8 +168,8 @@ if (LLAMA_CUBLAS)
 #       list(APPEND GGML_SOURCES_CUDA ${SRCS})
         file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
         list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
+        # list(APPEND GGML_SOURCES_CUDA ${SRCS})
 #       file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu")
 #       list(APPEND GGML_SOURCES_CUDA ${SRCS})
 #       file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu")

@@ -323,8 +323,8 @@ if (LLAMA_HIPBLAS)
 #       list(APPEND GGML_SOURCES_ROCM ${SRCS})
         file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
         list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
+        # list(APPEND GGML_SOURCES_ROCM ${SRCS})
 #       file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu")
 #       list(APPEND GGML_SOURCES_ROCM ${SRCS})
 #       file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu")
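Each file matched by these globs pins one (head size, K type, V type) kernel instantiation, so removing a glob drops that combination from the build; the matching runtime dispatch lives in fattn.cu below. Going by the upstream llama.cpp convention (the fork's macro name may differ), such an instance file is essentially a one-liner:

// ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);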

Makefile

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ ifdef LLAMA_CUDA_FA_ALL_QUANTS
 # OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_1.cu))
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q6_0.cu))
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
-OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu))
+# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu))
 # OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu))
 # OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu))
 # OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_0.cu))
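The Makefile has to stay in step with the two CMake blocks above and with the dispatch cases in fattn.cu: if a FATTN_VEC_*_CASE stayed enabled while its template instance were no longer compiled, the build would presumably fail at link time with an undefined ggml_cuda_flash_attn_ext_vec_*_case symbol.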

ggml/src/ggml-cuda/fattn.cu

Lines changed: 2 additions & 12 deletions
@@ -235,24 +235,15 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_F16)

     FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)

-
     //FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
     //FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_IQ4_NL)

-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_NL)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_IQ4_NL)
-
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_IQ4_NL)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_Q5_0)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_Q6_0)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q6_0)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q6_0)
 #else
     FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)

@@ -262,7 +253,6 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)

-
     //FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
     //FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)

@@ -369,7 +359,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
     //FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
     //FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
     //FATTN_VEC_F32_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    //FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
     FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     //FATTN_VEC_F32_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_F16)
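The FATTN_VEC_*_CASE lines above act as a compile-time dispatch table over head size and K/V cache types. In upstream llama.cpp the macro expands roughly as follows (paraphrased; the fork's exact definition may differ), which is why commenting a case out removes the runtime path and makes the matching template instance unnecessary:

#define FATTN_VEC_F16_CASE(D, type_K, type_V)                               \
    if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) {    \
        ggml_cuda_flash_attn_ext_vec_f16_case<D, type_K, type_V>(ctx, dst); \
        return;                                                             \
    }                                                                       \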
