Commit 692d1a5

Update src/llmcompressor/modifiers/quantization/cache.py

Reshape key/value states only when the quantization strategy is per-channel; per-tensor scales are the same with or without the reshape.

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

Parent: 0d948ea
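
The rationale in the commit message ("per tensor are same") can be checked directly: a per-tensor scale is derived from one statistic over the whole tensor, so the transpose/flatten reshape cannot change it. A minimal sketch, assuming a max-based observer and illustrative shapes (this is not code from the repo):

    import torch

    # [batch_size, num_heads, seq_len, head_dim] -- illustrative sizes
    key_states = torch.randn(2, 8, 16, 64)

    # Per-tensor observer: a single max over every element.
    scale_before = key_states.abs().amax()

    # The reshape that the diff now applies only for the CHANNEL strategy.
    reshaped = key_states.transpose(1, 2).flatten(2)

    # Reshaping permutes elements but removes none, so the max is unchanged.
    assert torch.equal(scale_before, reshaped.abs().amax())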

src/llmcompressor/modifiers/quantization/cache.py

Lines changed: 5 additions & 4 deletions
@@ -97,10 +97,11 @@ def update(
         # reshape for per channel scenario
         num_heads = key_states.shape[1]
         head_dim = key_states.shape[-1]
-        # from [batch_size, num_heads, seq_len - residual_length, head_dim]
-        # to [batch_size, seq_len - residual_length, num_heads * head_dim]
-        key_states = key_states.transpose(1, 2).flatten(2)
-        value_states = value_states.transpose(1, 2).flatten(2)
+        if self.quantization_args.strategy == QuantizationStrategy.CHANNEL:
+            # from [batch_size, num_heads, seq_len - residual_length, head_dim]
+            # to [batch_size, seq_len - residual_length, num_heads * head_dim]
+            key_states = key_states.transpose(1, 2).flatten(2)
+            value_states = value_states.transpose(1, 2).flatten(2)
 
         q_key_states = self._quantize(
             key_states.contiguous(), KVCacheScaleType.KEY, layer_idx
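
For the per-channel strategy the reshape does matter: flattening heads into the last dimension gives a channel-wise observer one scale per (head, head_dim) channel rather than one per head_dim position shared across heads. A minimal sketch with hypothetical shapes, not the observer code from cache.py:

    import torch

    batch_size, num_heads, seq_len, head_dim = 2, 8, 16, 64
    key_states = torch.randn(batch_size, num_heads, seq_len, head_dim)

    # from [batch_size, num_heads, seq_len, head_dim]
    # to   [batch_size, seq_len, num_heads * head_dim]
    flat = key_states.transpose(1, 2).flatten(2)

    # Channel-wise statistics over the flattened layout: one value per
    # num_heads * head_dim channel, shared across batch and sequence.
    per_channel_max = flat.abs().amax(dim=(0, 1))
    print(per_channel_max.shape)  # torch.Size([512])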
