@@ -152,7 +152,7 @@ template <typename T,
           uint32_t HEAD_DIM,
           uint32_t BLOCK_SIZE,
           uint32_t NUM_WARPS=4>
-__global__ void append_dequant_cache_kv_c16(
+__global__ void append_cache_kv_c16(
     const T *__restrict__ cache_k,
     const T *__restrict__ cache_v,
     T *__restrict__ k_out,
@@ -174,7 +174,7 @@ __global__ void append_dequant_cache_kv_c16(
   const uint32_t batch_id = batch_ids[tile_idx];
   const uint32_t start_kv_idx = tile_ids_per_batch[tile_idx] * BLOCK_SIZE;
   const uint32_t end_idx = seq_lens_decoder[batch_id] - start_kv_idx;
-  if (seq_lens_this_time <= 0) {
+  if (seq_lens_this_time[batch_id] <= 0) {
     return;
   }

@@ -250,8 +250,8 @@ __global__ void append_dequant_cache_kv_c16(
       if (row_idx + 8 < end_idx) {
         k_tile_ptr1[0] = frag_dq_T[2];
         k_tile_ptr1[1] = frag_dq_T[3];
-        k_tile_ptr0[8] = frag_dq_T[6];
-        k_tile_ptr0[9] = frag_dq_T[7];
+        k_tile_ptr1[8] = frag_dq_T[6];
+        k_tile_ptr1[9] = frag_dq_T[7];
       }
       k_smem_offset_r = k_smem.advance_offset_by_column<2, num_vecs_per_head>(
           k_smem_offset_r, fy);
@@ -311,8 +311,8 @@ __global__ void append_dequant_cache_kv_c16(
       if (row_idx + 8 < end_idx) {
         v_tile_ptr1[0] = frag_dq_T[2];
         v_tile_ptr1[1] = frag_dq_T[3];
-        v_tile_ptr0[8] = frag_dq_T[6];
-        v_tile_ptr0[9] = frag_dq_T[7];
+        v_tile_ptr1[8] = frag_dq_T[6];
+        v_tile_ptr1[9] = frag_dq_T[7];
       }
       v_smem_offset_r = v_smem.advance_offset_by_column<2, num_vecs_per_head>(
           v_smem_offset_r, fy);
@@ -328,7 +328,7 @@ template <typename T,
           uint32_t BLOCK_SIZE,
           uint32_t NUM_WARPS=4,
           bool IS_FP8=false>
-__global__ void append_dequant_cache_kv_c8(
+__global__ void append_cache_kv_c8(
     const CacheT *__restrict__ cache_k,
     const CacheT *__restrict__ cache_v,
     T *__restrict__ k_out,
@@ -527,7 +527,7 @@ __global__ void append_dequant_cache_kv_c8(
 }

 template <typename T, uint32_t HEAD_DIM, uint32_t BLOCK_SIZE>
-void AppendDequantCache(
+void AppendCacheKV(
     const paddle::Tensor &cache_k,
     const paddle::Tensor &cache_v,
     const paddle::Tensor &cache_k_dequant_scales,
@@ -553,7 +553,7 @@ void AppendDequantCache(
   dim3 blocks(32, NUM_WARPS);
   if (cache_quant_type == "none") {
     const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(T) * 2;
-    auto kernel_func = append_dequant_cache_kv_c16<NV_TYPE, NV_TYPE, HEAD_DIM, BLOCK_SIZE, NUM_WARPS>;
+    auto kernel_func = append_cache_kv_c16<NV_TYPE, NV_TYPE, HEAD_DIM, BLOCK_SIZE, NUM_WARPS>;

     if (smem_size >= 48 * 1024) {
       cudaFuncSetAttribute(kernel_func,
@@ -577,9 +577,9 @@ void AppendDequantCache(
   } else if (cache_quant_type == "cache_int8" || cache_quant_type == "cache_fp8") {
     const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(uint8_t) * 2;

-    auto kernel_func = append_dequant_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, false>;
+    auto kernel_func = append_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, false>;
     if (cache_quant_type == "cache_fp8") {
-      kernel_func = append_dequant_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, true>;
+      kernel_func = append_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, true>;
     }
     if (smem_size >= 48 * 1024) {
       cudaFuncSetAttribute(kernel_func,
@@ -757,7 +757,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
   }

   if (token_num < kv_token_num) {
-    AppendDequantCache<data_t, 128, 64>(
+    AppendCacheKV<data_t, 128, 64>(
         key_cache,
         value_cache,
         cache_k_dequant_scales.get(),
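For context on the launch path touched by the last two host-side hunks: CUDA kernels get at most 48KB of dynamic shared memory by default, so when the requested smem_size is larger the code raises the per-kernel limit with cudaFuncSetAttribute before launching the selected instantiation. The standalone sketch below reproduces that dispatch pattern under stated assumptions; demo_append_cache_kv_c8, its stub body, and the launch shapes are illustrative placeholders and not code from this diff.

// Standalone sketch of the quant-type dispatch + dynamic-shared-memory
// opt-in pattern used by AppendCacheKV above. Names and kernel body are
// placeholders for illustration only.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

template <bool IS_FP8>
__global__ void demo_append_cache_kv_c8(const uint8_t *cache_k, float *k_out) {
  // Dynamic shared-memory staging buffer, sized at launch time.
  extern __shared__ uint8_t smem[];
  (void)smem;  // unused in this stub
  // A real kernel would stage cache_k tiles into smem, dequantize them
  // (int8 or fp8 depending on IS_FP8), and write the result to k_out.
  if (cache_k == nullptr || k_out == nullptr) return;
}

int main() {
  constexpr uint32_t HEAD_DIM = 128, BLOCK_SIZE = 64, NUM_WARPS = 4;
  const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(uint8_t) * 2;

  // Pick the template instantiation by quant type, mirroring the
  // "cache_int8" / "cache_fp8" branch in the diff.
  const bool use_fp8 = true;
  auto kernel_func = demo_append_cache_kv_c8<false>;
  if (use_fp8) {
    kernel_func = demo_append_cache_kv_c8<true>;
  }

  // Requests above the 48KB default must be enabled explicitly, which is
  // why the diff guards cudaFuncSetAttribute with this size check.
  if (smem_size >= 48 * 1024) {
    cudaFuncSetAttribute(kernel_func,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         smem_size);
  }

  dim3 grids(1);
  dim3 blocks(32, NUM_WARPS);
  kernel_func<<<grids, blocks, smem_size>>>(nullptr, nullptr);
  printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}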