@@ -1598,7 +1598,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
1598
1598
const int warpid = threadIdx .x / WARP_SIZE;
1599
1599
const int laneid = threadIdx .x % WARP_SIZE;
1600
1600
const int lane2id = laneid % 2 ;
1601
- const int lane4id = laneid % 4 ;
1602
1601
const int lane16id = laneid % 16 ;
1603
1602
const int rowid = laneid / 16 ;
1604
1603
@@ -1745,7 +1744,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
1745
1744
const cache_t * k_ptr2 = k_ptr + kblock_number * kv_block_stride;
1746
1745
const int klocal_token_idx =
1747
1746
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
1748
- const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
1749
1747
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
1750
1748
const cache_t * k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
1751
1749
@@ -2368,7 +2366,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
2368
2366
const int warpid = threadIdx .x / WARP_SIZE;
2369
2367
const int laneid = threadIdx .x % WARP_SIZE;
2370
2368
const int lane2id = laneid % 2 ;
2371
- const int lane4id = laneid % 4 ;
2372
2369
const int lane16id = laneid % 16 ;
2373
2370
const int rowid = laneid / 16 ;
2374
2371
@@ -2514,7 +2511,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
2514
2511
const cache_t * k_ptr2 = k_ptr + kblock_number * kv_block_stride;
2515
2512
const int klocal_token_idx =
2516
2513
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
2517
- const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
2518
2514
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
2519
2515
const cache_t * k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
2520
2516
0 commit comments