Skip to content

Commit c024f5d

Browse files
committed
Fix CI: add a CPU path for integer token counting (histc is CUDA-only for this case) and a CPU int8 weight-only MoE quantization test
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent 59b3fab commit c024f5d

File tree

3 files changed

+40
-9
lines changed

3 files changed

+40
-9
lines changed

test/quantization/test_moe_quant.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,25 @@ def test_int8wo_base(self, name, num_tokens, fullgraph):
169169
fullgraph=fullgraph,
170170
)
171171

172+
@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "Test only enabled for 2.5+")
173+
@parameterized.expand(
174+
[
175+
("single_token", 1, True),
176+
("multiple_tokens", 8, False),
177+
]
178+
)
179+
def test_int8wo_base_cpu(self, name, num_tokens, fullgraph):
180+
config = Int8WeightOnlyConfig()
181+
tensor_impl_class = PlainAQTTensorImpl
182+
183+
self._test_impl_moe_quant(
184+
config=config,
185+
num_tokens=num_tokens,
186+
tensor_impl_class=tensor_impl_class,
187+
fullgraph=fullgraph,
188+
device="cpu",
189+
)
190+
172191
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
173192
@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "Test only enabled for 2.5+")
174193
@parameterized.expand(

torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -646,8 +646,8 @@ def test_moe_quant_intx(self):
646646
from torchao.quantization.utils import compute_error
647647

648648
with torch.device("cpu"):
649-
model = MOEFeedForwardAOQuantizable(512, 256, 8, 2).to(torch.bfloat16)
650-
x = torch.randn(1, 512, dtype=torch.bfloat16)
649+
model = MOEFeedForwardAOQuantizable(512, 256, 8, 2, empty_init=False).to(torch.bfloat16)
650+
x = torch.randn(8, 512, dtype=torch.bfloat16)
651651

652652
out = model(x).clone()
653653

@@ -661,7 +661,15 @@ def test_moe_quant_intx(self):
661661
out_q = model(x).clone()
662662
assert isinstance(model.experts.w1, FakeExtraDimTensor)
663663

664-
assert compute_error(out_q, out) > 30, "error bad accuracy but everything ran"
664+
mod_c = torch.compile(model, mode="reduce-overhead")
665+
666+
mod_c(x)
667+
mod_c(x)
668+
669+
out_qc = mod_c(x).clone()
670+
671+
self.assertGreater(compute_error(out_q, out), 30)
672+
self.assertGreater(compute_error(out_qc, out), 30)
665673

666674

667675
if __name__ == "__main__":

torchao/quantization/prototype/moe_quant/quantizable_moe_modules.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,12 +120,16 @@ def forward(
120120
ordered_token_indices = (
121121
ordered_token_activations.div(top_k).floor().to(torch.int64)
122122
) # [T]
123-
num_tokens_per_expert = torch.histc(
124-
expert_indices,
125-
bins=self.num_experts + 1,
126-
min=-1,
127-
max=self.num_experts,
128-
) # [E+1] (added leading 0 so can be used for indexing)
123+
if not expert_indices.is_cuda: # histc doesn't work on cpu for integers
124+
num_tokens_per_expert = torch.bincount(expert_indices.view(-1)+1, minlength=self.num_experts+1)
125+
else:
126+
num_tokens_per_expert = torch.histc(
127+
expert_indices,
128+
bins=self.num_experts + 1,
129+
min=-1,
130+
max=self.num_experts,
131+
) # [E+1] (added leading 0 so can be used for indexing)
132+
# num_tokens_per_expert = torch.bincount(expert_indices.view(-1)+1, minlength=self.num_experts+1)
129133
cum_tokens_per_expert = num_tokens_per_expert.cumsum(0).to(
130134
torch.int64
131135
) # [E+1]

0 commit comments

Comments
 (0)