bitsandbytes-foundation
diff --git a/‎bitsandbytes/__init__.py
Lines changed: 3 additions & 0 deletions b/‎bitsandbytes/__init__.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎bitsandbytes/backends/xpu/__init__.py b/‎bitsandbytes/backends/xpu/__init__.py
diff --git a/‎bitsandbytes/backends/xpu/ops.py
Lines changed: 250 additions & 0 deletions b/‎bitsandbytes/backends/xpu/ops.py
Lines changed: 250 additions & 0 deletions
@@ -34,6 +34,9 @@
 if torch.cuda.is_available():
     from .backends.cuda import ops as cuda_ops
 
+if torch.xpu.is_available():
+    from .backends.xpu import ops as xpu_ops
+
 
 def _import_backends():
     """
 
@@ -0,0 +1,250 @@
+from collections.abc import Sequence
+import warnings
+
+import torch
+
+from ..._ops import register_kernel
+
+try:
+    from . import triton_kernels
+
+    triton_available = True
+except ImportError:
+    triton_available = False
+
+
+@torch.compile
+def quantize_blockwise_torch(A, blocksize, code, absmax, quantized_out):
+    n = A.numel()
+    rem = n % blocksize
+    has_rem = rem > 0
+    blocks = n // blocksize + has_rem
+    A_reshaped = A.reshape(n)
+    A_com = A_reshaped[: n - rem]
+    A_com_reshaped = A_com.reshape(n // blocksize, blocksize)
+    absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0]
+    scaled_A = torch.clamp(A_com_reshaped / absmax[: blocks - has_rem].view(-1, 1), -1, 1)
+    scaled_A = scaled_A.reshape(-1)
+    if has_rem:
+        absmax[-1] = torch.abs(A_reshaped[n - rem :]).max()
+        scaled_A_rem = torch.clamp((A_reshaped[n - rem :] / absmax[-1]), -1, 1)
+        scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
+
+    diff = torch.abs(scaled_A.unsqueeze(-1) - code.to(scaled_A.device))
+    quantized_out = torch.argmin(diff, dim=-1).to(torch.uint8).to(scaled_A.device).reshape(A.shape)
+    return quantized_out, absmax
+
+
+def quantize_blockwise(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+    torch._check_is_size(blocksize)
+    # torch._check(A.dtype == torch.float32, lambda: f"A must be float32 on xpu, got {A.dtype}")
+
+    n = A.numel()
+    blocks = -(n // -blocksize)
+
+    absmax = torch.empty((blocks,), device=A.device, dtype=A.dtype)
+    out = torch.empty_like(A.flatten(), dtype=torch.uint8)
+
+    # out, absmax = quantize_blockwise_torch(A, blocksize, code, absmax, out)
+
+    triton_kernels.quantize_blockwise_triton(A, blocksize, code, blocks, absmax, out)
+    out = out.reshape(A.shape)
+    return out, absmax
+
+
+def dequantize_blockwise(
+    A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype
+) -> torch.Tensor:
+    torch._check_is_size(blocksize)
+    torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+    # torch._check(dtype == torch.float32, lambda: f"dtype must be float32 on xpu, got {dtype}")
+
+    out = torch.empty_like(A, dtype=dtype, device=A.device)
+    triton_kernels.dequant_int8_fp16(
+        A,
+        code,
+        absmax,
+        out,
+        blocksize,
+    )
+
+    return out
+
+
+_NF4_QUANT_TABLE = torch.tensor(
+    [
+        -1.0,
+        -0.6961928009986877,
+        -0.5250730514526367,
+        -0.39491748809814453,
+        -0.28444138169288635,
+        -0.18477343022823334,
+        -0.09105003625154495,
+        0.0,
+        0.07958029955625534,
+        0.16093020141124725,
+        0.24611230194568634,
+        0.33791524171829224,
+        0.44070982933044434,
+        0.5626170039176941,
+        0.7229568362236023,
+        1.0,
+    ],
+    dtype=torch.float32,
+    device="xpu",
+)
+
+_FP4_QUANT_TABLE = torch.tensor(
+    [
+        0.0000,
+        0.0052,
+        0.6667,
+        1.0000,
+        0.3333,
+        0.5000,
+        0.1667,
+        0.2500,
+        0.0000,
+        -0.0052,
+        -0.6667,
+        -1.0000,
+        -0.3333,
+        -0.5000,
+        -0.1667,
+        -0.2500,
+    ],
+    dtype=torch.float32,
+    device="xpu",
+)
+
+
+@register_kernel("bitsandbytes::quantize_4bit", "xpu")
+def _(
+    A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
+) -> tuple[torch.Tensor, torch.Tensor]:
+    torch._check_is_size(blocksize)
+    # torch._check(quant_type == "nf4", lambda: f"quant_type must be nf4 on CPU, got {quant_type}")
+    torch._check(
+        A.dtype in [torch.bfloat16, torch.float16, torch.float32],
+        lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
+    )
+
+    n = A.numel()
+
+    # TODO: Support when weight matrix is not divisible by blocksize
+    torch._check(n % blocksize == 0, lambda: f"n must be divisible by blocksize, got {n} and {blocksize}")
+
+    # Divide into blocks and normalize
+    blocks = A.reshape(-1, blocksize)
+    absmax = blocks.abs().max(dim=1).values.float()
+    scaled = blocks / absmax.unsqueeze(-1)
+
+    # Quantize with the lookup table
+    if quant_type == "fp4":
+        quantized = torch.argmin(torch.abs(scaled.view(-1, 1) - _FP4_QUANT_TABLE), dim=-1, keepdim=True).to(
+            torch.uint8
+        )
+    else:
+        quantized = torch.argmin(torch.abs(scaled.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(
+            torch.uint8
+        )
+
+    # Pack two quantized values per byte
+    packed = quantized[::2] << 4 | quantized[1::2]
+
+    if quant_storage != torch.uint8:
+        packed = packed.squeeze().view(quant_storage).unsqueeze(1)
+
+    return packed, absmax.float()
+
+
+def dequantize_4bit(
+    A: torch.Tensor,
+    absmax: torch.Tensor,
+    blocksize: int,
+    quant_type: str,
+    shape: Sequence[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    torch._check_is_size(blocksize)
+    # torch._check(quant_type == "nf4", lambda: f"quant_type must be nf4 on XPU, got {quant_type}")
+    torch._check(
+        dtype in [torch.bfloat16, torch.float16, torch.float32],
+        lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+    )
+    # torch._check(
+    #     A.dtype == torch.uint8,
+    #     lambda: f"Blockwise 4bit dequantization on XPU only supports uint8 storage, got {A.dtype}",
+    # )
+    # Check if this is fine and fast
+    if A.dtype != torch.uint8:
+        A = A.squeeze().view(torch.uint8).unsqueeze(1)
+
+    out = torch.empty(shape, dtype=dtype, device=A.device)
+
+    triton_kernels._dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+    return out
+
+
+def dequantize_4bit_inplace(
+    A: torch.Tensor,
+    absmax: torch.Tensor,
+    blocksize: int,
+    quant_type: str,
+    shape: Sequence[int],
+    dtype: torch.dtype,
+    out: torch.Tensor,
+) -> None:
+    torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
+    torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+    triton_kernels._dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+
+
+def gemv_4bit(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    shapeB: Sequence[int],
+    absmax: torch.Tensor,
+    code: torch.Tensor,
+    blocksize: int,
+) -> torch.Tensor:
+    # TODO: We need to determine whether `code` is NF4, FP4, or other.
+    # Right now we assume NF4, as this is the only one supported on CPU.
+    quant_type = "fp4" if code[1] > 0 else "nf4"
+    B_dq = dequantize_4bit(B, absmax, blocksize, quant_type, shapeB, A.dtype)
+
+    # For some reason directly passing code causes errors in some cases like:
+    # tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-fc1-fp4-DQ_True-xpu]
+    #
+    # B_dq = torch.empty(shapeB, dtype=A.dtype, device=A.device)
+    # if B.dtype != torch.uint8:
+    #     B = B.squeeze().view(torch.uint8).unsqueeze(1)
+
+    # triton_kernels._dequantize_4bit_impl_passing_code(
+    #     B,
+    #     absmax,
+    #     blocksize,
+    #     code,
+    #     dtype=A.dtype,
+    #     out=B_dq,
+    # )
+
+    # User called gemv with B.t(), so we need to transpose it back.
+    # if B.shape[0] == 1:
+    #    B_dq = B_dq.t()
+
+    return torch.nn.functional.linear(
+        A,
+        B_dq,
+        bias=None,
+    )
+
+
+if triton_available:
+    register_kernel("bitsandbytes::quantize_blockwise", "xpu")(quantize_blockwise)
+    register_kernel("bitsandbytes::dequantize_blockwise", "xpu")(dequantize_blockwise)
+    register_kernel("bitsandbytes::dequantize_4bit.out", "xpu")(dequantize_4bit_inplace)
+    register_kernel("bitsandbytes::dequantize_4bit", "xpu")(dequantize_4bit)
+    register_kernel("bitsandbytes::gemv_4bit", "xpu")(gemv_4bit)
+else:
+    warnings.warn("XPU available, but trtion package is missing.")