|
10 | 10 | from vllm_ascend.quantization.w8a8 import (AscendC8KVCacheMethod,
|
11 | 11 | AscendW8A8FusedMoEMethod,
|
12 | 12 | AscendW8A8LinearMethod,
|
13 |
| - fused_experts, native_grouped_topk, |
| 13 | + fused_experts, fused_experts_310p, |
| 14 | + native_grouped_topk, |
14 | 15 | quant_per_tensor, select_experts)
|
15 | 16 |
|
16 | 17 |
|
@@ -111,6 +112,25 @@ def test_apply_with_x_is_int8(self, mock_npu_quant_matmul):
|
111 | 112 | expected_y_output += bias
|
112 | 113 | self.assertTrue(torch.equal(output, expected_y_output))
|
113 | 114 |
|
| 115 | + @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=True) |
| 116 | + @patch("torch_npu.npu_quant_matmul") |
| 117 | + def test_apply_with_x_is_310p(self, mock_npu_quant_matmul, mock_is_310p): |
| 118 | + layer = MagicMock() |
| 119 | + layer.aclnn_input_scale = 0.1 |
| 120 | + layer.aclnn_input_offset = 0.2 |
| 121 | + layer.weight = torch.randn(128, 256) |
| 122 | + layer.deq_scale = 0.3 |
| 123 | + |
| 124 | + x = torch.randint(-128, 127, (32, 128), dtype=torch.int8) |
| 125 | + bias = torch.randn(256) |
| 126 | + |
| 127 | + expected_y_output = torch.randn(32, 256) |
| 128 | + mock_npu_quant_matmul.return_value = expected_y_output |
| 129 | + |
| 130 | + output = self.method.apply(layer, x, bias) |
| 131 | + expected_y_output += bias |
| 132 | + self.assertTrue(torch.equal(output, expected_y_output)) |
| 133 | + |
114 | 134 | @patch('torch_npu.npu_format_cast')
|
115 | 135 | def test_process_weights_after_loading(self, mock_npu_format_cast):
|
116 | 136 | layer = MagicMock()
|
@@ -221,6 +241,36 @@ def test_apply_with_other_expert_count(self, mock_fused_experts,
|
221 | 241 | mock_fused_experts.assert_called_once()
|
222 | 242 | self.assertEqual(result.shape, (32, self.hidden_size))
|
223 | 243 |
|
| 244 | + @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=True) |
| 245 | + @patch('vllm_ascend.quantization.w8a8.select_experts') |
| 246 | + @patch('vllm_ascend.quantization.w8a8.fused_experts_310p') |
| 247 | + def test_apply_is_310p(self, mock_fused_experts_310p, mock_select_experts, |
| 248 | + mock_is_310p): |
| 249 | + # Setup |
| 250 | + mock_layer = MagicMock() |
| 251 | + x = torch.randn(32, self.hidden_size) |
| 252 | + router_logits = torch.randn(32, 128) # 128 experts |
| 253 | + top_k = 2 |
| 254 | + |
| 255 | + # Mock return values |
| 256 | + mock_select_experts.return_value = (torch.randn(32, top_k), |
| 257 | + torch.randint(0, 128, (32, top_k))) |
| 258 | + mock_fused_experts_310p.return_value = torch.randn( |
| 259 | + 32, self.hidden_size) |
| 260 | + |
| 261 | + # Test |
| 262 | + result = self.moe_method.apply(layer=mock_layer, |
| 263 | + x=x, |
| 264 | + router_logits=router_logits, |
| 265 | + top_k=top_k, |
| 266 | + renormalize=True, |
| 267 | + global_num_experts=128) |
| 268 | + |
| 269 | + # Assertions |
| 270 | + mock_select_experts.assert_called_once() |
| 271 | + mock_fused_experts_310p.assert_called_once() |
| 272 | + self.assertEqual(result.shape, (32, self.hidden_size)) |
| 273 | + |
224 | 274 |
|
225 | 275 | class TestAscendC8KVCacheMethod(TestBase):
|
226 | 276 |
|
@@ -255,7 +305,22 @@ def test_create_weights(self):
|
255 | 305 | expected_shape = (self.layer.num_kv_heads * self.layer.head_size, )
|
256 | 306 | self.assertEqual(param.shape, expected_shape)
|
257 | 307 |
|
258 |
| - def test_process_weights_after_loading(self): |
| 308 | + @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=False) |
| 309 | + def test_process_weights_after_loading_not_310p(self, mock_is_310p): |
| 310 | + key_data = torch.ones(4 * 64) |
| 311 | + value_data = torch.ones(4 * 64) * 2 |
| 312 | + |
| 313 | + self.layer.key_antiquant_scale.data = key_data |
| 314 | + self.layer.value_antiquant_scale.data = value_data |
| 315 | + |
| 316 | + self.method.process_weights_after_loading(self.layer) |
| 317 | + |
| 318 | + self.assertEqual(self.method.antiquant_scale_comb.shape, (2, 256)) |
| 319 | + self.assertTrue(torch.all(self.method.antiquant_scale_comb[0] == 1)) |
| 320 | + self.assertTrue(torch.all(self.method.antiquant_scale_comb[1] == 2)) |
| 321 | + |
| 322 | + @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=True) |
| 323 | + def test_process_weights_after_loading_is_310p(self, mock_is_310p): |
259 | 324 | key_data = torch.ones(4 * 64)
|
260 | 325 | value_data = torch.ones(4 * 64) * 2
|
261 | 326 |
|
@@ -527,6 +592,67 @@ def test_fused_experts_without_expert_map(self, mock_swiglu,
|
527 | 592 | )
|
528 | 593 |
|
529 | 594 |
|
class TestFusedExperts310(TestBase):
    """Tests for the standalone fused_experts_310p kernel wrapper."""

    @patch('torch_npu.npu_quant_grouped_matmul_dequant')
    @patch("vllm_ascend.quantization.w8a8.quant_per_tensor")
    @patch('vllm_ascend.quantization.w8a8.get_ep_group')
    @patch('torch_npu.npu_swiglu')
    def test_fused_experts_310p_with_expert_map(self, mock_swiglu,
                                                mock_get_ep_group,
                                                mock_quant_per_tensor,
                                                mock_matmul_dequant):
        """With an expert map and a single EP rank, the output keeps the
        (tokens, hidden) shape and the grouped matmul-dequant kernel runs
        twice (up/gate projection, then down projection)."""
        num_tokens, hidden_size = 32, 128
        intermediate_size, num_experts, top_k = 256, 4, 1

        hidden_states = torch.randn(num_tokens, hidden_size)

        # Up/gate projection weights and scales (gate+up fused, hence 2x).
        w1 = torch.randn(num_experts, intermediate_size * 2, hidden_size)
        w1_scale = torch.tensor([0.1])
        w1_input_scale = torch.tensor([[0.2, 0.2], [0.2, 0.2]])

        # Down projection weights and scales.
        w2 = torch.randn(num_experts, hidden_size, intermediate_size)
        w2_scale = torch.tensor([0.1])
        w2_input_scale = torch.tensor([0.2])

        topk_weights = torch.rand(num_tokens, top_k)
        topk_ids = torch.randint(0, num_experts, (num_tokens, top_k))
        expert_map = torch.arange(num_experts)

        # Single-rank expert parallelism.
        mock_get_ep_group.return_value.world_size = 1

        # Stub quantization, activation, and the matmul-dequant kernel.
        mock_quant_per_tensor.return_value = torch.randint(
            -128, 127, hidden_states.shape, dtype=torch.int8)
        mock_swiglu.return_value = torch.randn(num_tokens * top_k,
                                               intermediate_size)
        mock_matmul_dequant.return_value = hidden_states

        output = fused_experts_310p(
            hidden_states=hidden_states,
            w1=w1,
            w1_scale=w1_scale,
            w1_input_scale=w1_input_scale,
            w2=w2,
            w2_scale=w2_scale,
            w2_input_scale=w2_input_scale,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            top_k=top_k,
            global_num_experts=num_experts,
            expert_map=expert_map,
        )

        self.assertEqual(output.shape, (num_tokens, hidden_size))
        self.assertEqual(mock_matmul_dequant.call_count, 2)

| 655 | + |
530 | 656 | class TestSelectExperts(TestBase):
|
531 | 657 |
|
532 | 658 | def setUp(self):
|
|
0 commit comments