
Commit 52352ec

Remove the useless KVQ FA combination K=q8_0 / V=F16, and fixes

Also, the quantized K cache without FA now works only with q6_0, Ikawrakow's newest KV quant. That's fine: it's the best K mode anyway. The older quants from 2023 and iq4_nl/F16 produce NaN, so they are disabled. Also an overdue fix for csshift/noshift in the KVQ logic, and a cleanup of fattn.cu.
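For reference, a minimal sketch of the resulting rule, written as a hypothetical helper; the function name and placement are illustrative only (not code from this commit), and it assumes the fork's GGML_TYPE_Q6_0 enum value:

#include "ggml.h"

// Hypothetical guard summarizing the K/V cache rules after this commit.
static bool kv_cache_types_allowed(ggml_type type_k, ggml_type type_v, bool flash_attn) {
    if (!flash_attn) {
        // Without FA, a quantized K cache only works as q6_0; F16 K stays fine.
        return type_k == GGML_TYPE_F16 || type_k == GGML_TYPE_Q6_0;
    }
    // With FA, the K=q8_0 / V=F16 kernel instances are no longer built.
    if (type_k == GGML_TYPE_Q8_0 && type_v == GGML_TYPE_F16) {
        return false;
    }
    return true; // remaining pairs go through the fattn.cu dispatch below
}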
1 parent 6d818bd commit 52352ec

4 files changed: +32, -46 lines

CMakeLists.txt

Lines changed: 4 additions & 4 deletions
@@ -168,8 +168,8 @@ if (LLAMA_CUBLAS)
 #       list(APPEND GGML_SOURCES_CUDA ${SRCS})
         file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
         list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
+        # list(APPEND GGML_SOURCES_CUDA ${SRCS})
 #       file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu")
 #       list(APPEND GGML_SOURCES_CUDA ${SRCS})
 #       file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu")

@@ -323,8 +323,8 @@ if (LLAMA_HIPBLAS)
 #       list(APPEND GGML_SOURCES_ROCM ${SRCS})
         file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
         list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        # file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu")
+        # list(APPEND GGML_SOURCES_ROCM ${SRCS})
 #       file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu")
 #       list(APPEND GGML_SOURCES_ROCM ${SRCS})
 #       file(GLOB SRCS "ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu")
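Each file matched by these globs pins one (head size, K type, V type) kernel instantiation, so removing a glob drops that combination from the build; the matching runtime dispatch lives in fattn.cu below. Going by the upstream llama.cpp convention (the fork's macro name may differ), such an instance file is essentially a one-liner:

// ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);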

Makefile

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ ifdef LLAMA_CUDA_FA_ALL_QUANTS
 # OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q5_1.cu))
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q6_0.cu))
 OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
-OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu))
+# OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-f16.cu))
 # OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_0.cu))
 # OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q4_1.cu))
 # OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-q5_0.cu))
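The Makefile has to stay in step with the two CMake blocks above and with the dispatch cases in fattn.cu: if a FATTN_VEC_*_CASE stayed enabled while its template instance were no longer compiled, the build would presumably fail at link time with an undefined ggml_cuda_flash_attn_ext_vec_*_case symbol.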

ggml/src/ggml-cuda/fattn.cu

Lines changed: 2 additions & 12 deletions
@@ -235,24 +235,15 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_F16)

     FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)

-
     //FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
     //FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_IQ4_NL)

-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_NL)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_IQ4_NL)
-
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_IQ4_NL)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_Q5_0)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_Q6_0)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q6_0)
-    //FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q6_0)
 #else
     FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)

@@ -262,7 +253,6 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)

-
     //FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
     //FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)

@@ -369,7 +359,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
     //FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
     //FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
     //FATTN_VEC_F32_CASE(128, GGML_TYPE_Q6_0, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    //FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
     FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     //FATTN_VEC_F32_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_F16)
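The FATTN_VEC_*_CASE lines above act as a compile-time dispatch table over head size and K/V cache types. In upstream llama.cpp the macro expands roughly as follows (paraphrased; the fork's exact definition may differ), which is why commenting a case out removes the runtime path and makes the matching template instance unnecessary:

#define FATTN_VEC_F16_CASE(D, type_K, type_V)                               \
    if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) {    \
        ggml_cuda_flash_attn_ext_vec_f16_case<D, type_K, type_V>(ctx, dst); \
        return;                                                             \
    }                                                                       \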
