Skip to content

Commit db8d603

Browse files
committed
align with ipex code
1 parent 47bee0d commit db8d603

File tree

5 files changed

+13
-8
lines changed

5 files changed

+13
-8
lines changed

bitsandbytes/backends/triton/ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def quantize_4bit(
8383
n = A.numel()
8484

8585
# TODO: Support when weight matrix is not divisible by blocksize
86-
torch._check(n % blocksize == 0, lambda: f"n must be divisible by blocksize, got {n} and {blocksize}")
86+
# torch._check(n % blocksize == 0, lambda: f"n must be divisible by blocksize, got {n} and {blocksize}")
8787

8888
blocks = -(n // -(blocksize * 2))
8989

bitsandbytes/backends/xpu/ops.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66
from ..._ops import register_kernel
77
from ..utils import ipex_xpu
88

9-
if torch.__version__ >= (2, 7):
9+
# With default torch, error:
10+
# NotImplementedError: The operator 'aten::_int_mm' for XPU
11+
if ipex_xpu and torch.__version__ >= (2, 7):
12+
1013
@register_kernel("bitsandbytes::int8_linear_matmul", "xpu")
1114
def _(A: torch.Tensor, B: torch.Tensor):
1215
return torch._int_mm(
@@ -16,6 +19,7 @@ def _(A: torch.Tensor, B: torch.Tensor):
1619

1720

1821
if ipex_xpu:
22+
1923
@register_kernel("bitsandbytes::dequantize_nf4_ipex", "xpu")
2024
def _(
2125
A: torch.Tensor,

bitsandbytes/nn/modules.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -677,7 +677,7 @@ def to(self, *args, **kwargs):
677677
if device is not None and device.type != "meta" and self.data.device.type == "cpu":
678678
if device.type != "cpu" or self.data.dtype != torch.int8:
679679
return self._quantize(device)
680-
elif self.data.dtype == torch.int8 and device.type in ("cpu", "xpu"):
680+
elif self.data.dtype == torch.int8 and device.type in ("cpu", "xpu") and (ipex_cpu or ipex_xpu):
681681
self.CB = self.data
682682

683683
new_param = Int8Params(

tests/test_functional.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,11 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
137137
abserr = sum(diffs) / len(diffs)
138138
relerr = sum(reldiffs) / len(reldiffs)
139139
if signed:
140-
threshold_abserr = 0.0036 if device in ("cpu", "xpu") else 0.0035
140+
threshold_abserr = 0.0036 if device in ("cpu", "xpu") and (F.ipex_cpu or F.ipex_xpu) else 0.0035
141141
assert abserr < 0.0036
142142
assert relerr < 0.015
143143
else:
144-
assert abserr < 0.00175 if device in ("cpu", "xpu") else 0.0023
144+
assert abserr < 0.00175 if device in ("cpu", "xpu") and (F.ipex_cpu or F.ipex_xpu) else 0.0023
145145
assert relerr < 0.012
146146
assert A2.dtype == dtype
147147

@@ -172,7 +172,7 @@ def test_blockwise_cpu_large(self, hidden, blocksize):
172172
@pytest.mark.parametrize("bits", range(2, 9), ids=id_formatter("bits"))
173173
@pytest.mark.parametrize("method", ["linear", "fp8", "dynamic", "quantile"])
174174
def test_few_bit_quant(self, device, bits, method):
175-
if device in ("cpu", "xpu") and bits != 8:
175+
if device in ("cpu", "xpu") and bits != 8 and (F.ipex_cpu or F.ipex_xpu):
176176
pytest.skip("CPU/XPU implementation only supports 8 bits")
177177

178178
abserrs = []

tests/test_ops.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import torch
55

66
import bitsandbytes
7+
from bitsandbytes.functional import ipex_xpu
78
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
89

910
# torch.library.opcheck is only available in torch 2.4 and later.
@@ -144,7 +145,7 @@ def test_dequantize_blockwise(self, device, dtype, blocksize):
144145
assert out.device == A.device
145146

146147
# TODO: Enable it
147-
if device == "xpu":
148+
if device == "xpu" and ipex_xpu:
148149
pytest.skip("XPU implementation have torch.op inside torch.op, it will fail on op check")
149150

150151
opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype))
@@ -170,7 +171,7 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
170171
if storage_dtype != torch.uint8:
171172
pytest.xfail("opcheck fails for storage_dtype != torch.uint8")
172173

173-
opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype))
174+
opcheck(torch.ops.bitsandbytes.quantize_4bit.default, (A, blocksize, quant_type, storage_dtype))
174175

175176
@pytest.mark.parametrize("device", get_available_devices())
176177
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))

0 commit comments

Comments (0)