From d607127a478fdedf59b854f5fa2f469cafce21fc Mon Sep 17 00:00:00 2001 From: electron271 <66094410+electron271@users.noreply.github.com> Date: Sat, 6 Sep 2025 00:28:39 -0500 Subject: [PATCH 1/6] add support for 64 block size on 32 warp size supported amd gpus --- bitsandbytes/backends/cuda/ops.py | 10 +++++----- bitsandbytes/cextension.py | 9 ++++++++- bitsandbytes/cuda_specs.py | 26 ++++++++++++++++++++++++++ bitsandbytes/functional.py | 14 +++++++------- bitsandbytes/nn/modules.py | 4 ++-- tests/test_functional.py | 10 +++++----- tests/test_linear4bit.py | 8 ++++---- tests/test_ops.py | 12 ++++++------ 8 files changed, 63 insertions(+), 30 deletions(-) diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py index 30cad3e34..abc1fd223 100644 --- a/bitsandbytes/backends/cuda/ops.py +++ b/bitsandbytes/backends/cuda/ops.py @@ -8,7 +8,7 @@ from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr from ..._ops import register_kernel -from ...cextension import HIP_ENVIRONMENT, lib +from ...cextension import ROCM_WARP_SIZE_64, lib @register_kernel("bitsandbytes::int8_linear_matmul", "cuda") @@ -211,7 +211,7 @@ def _get_col_absmax( def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]: torch._check_is_size(blocksize) - if HIP_ENVIRONMENT: + if ROCM_WARP_SIZE_64: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) else: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64]) @@ -269,7 +269,7 @@ def _( def _dequantize_blockwise_impl( A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor ) -> None: - if HIP_ENVIRONMENT: + if ROCM_WARP_SIZE_64: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) else: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64]) @@ -303,7 +303,7 @@ def _dequantize_blockwise_impl( def _( A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype ) -> tuple[torch.Tensor, torch.Tensor]: - if HIP_ENVIRONMENT: + if ROCM_WARP_SIZE_64: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) else: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64]) @@ -385,7 +385,7 @@ def _dequantize_4bit_impl( dtype: torch.dtype, out: torch.Tensor, ) -> None: - if HIP_ENVIRONMENT: + if ROCM_WARP_SIZE_64: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) else: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64]) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 899a83314..42fdd3164 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -9,7 +9,13 @@ import torch from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR -from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_cuda_version_tuple, get_rocm_gpu_arch +from bitsandbytes.cuda_specs import ( + CUDASpecs, + get_cuda_specs, + get_cuda_version_tuple, + get_rocm_gpu_arch, + get_rocm_warpsize, +) logger = logging.getLogger(__name__) @@ -298,6 +304,7 @@ def get_native_library() -> BNBNativeLibrary: ROCM_GPU_ARCH = get_rocm_gpu_arch() +ROCM_WARP_SIZE_64 = True if get_rocm_warpsize() == 64 else False try: # to support Intel CPU/GPU (XPU) backend diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py index 32563a159..71e7568a9 100644 --- a/bitsandbytes/cuda_specs.py +++ b/bitsandbytes/cuda_specs.py @@ -100,3 +100,29 @@ def get_rocm_gpu_arch() -> str: """, ) return "unknown" + + +def get_rocm_warpsize() -> int: 
+ """Get ROCm warp size.""" + logger = logging.getLogger(__name__) + try: + if torch.version.hip: + result = subprocess.run(["rocminfo"], capture_output=True, text=True) + match = re.search(r"Wavefront Size:\s+([0-9]{2})\(0x[0-9]{2}\)", result.stdout) + if match: + return int(match.group(1)) + else: + # default to 64 to be safe + return 64 + else: + # nvidia cards always use 32 warp size + return 32 + except Exception as e: + logger.error(f"Could not detect ROCm warp size: {e}. Defaulting to 64. (some 4-bit functions may not work!)") + if torch.cuda.is_available(): + logger.warning( + """ +ROCm warp size detection failed despite ROCm being available. + """, + ) + return 64 diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index c9f5ece60..f670dfe7c 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -15,7 +15,7 @@ from bitsandbytes.utils import _reverse_4bit_compress_format, pack_dict_to_tensor, unpack_tensor_to_dict -from .cextension import HIP_ENVIRONMENT, ipex_cpu, ipex_xpu, lib +from .cextension import ROCM_WARP_SIZE_64, ipex_cpu, ipex_xpu, lib name2qmap = {} @@ -804,7 +804,7 @@ def quantize_fp4( quant_storage=torch.uint8, ): if blocksize is None: - blocksize = 64 if not HIP_ENVIRONMENT else 128 + blocksize = 64 if not ROCM_WARP_SIZE_64 else 128 return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4", quant_storage) @@ -817,7 +817,7 @@ def quantize_nf4( quant_storage=torch.uint8, ): if blocksize is None: - blocksize = 64 if not HIP_ENVIRONMENT else 128 + blocksize = 64 if not ROCM_WARP_SIZE_64 else 128 return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4", quant_storage) @@ -855,7 +855,7 @@ def quantize_4bit( """ if blocksize is None: - blocksize = 64 if not HIP_ENVIRONMENT else 128 + blocksize = 64 if not ROCM_WARP_SIZE_64 else 128 input_shape = A.shape @@ -910,7 +910,7 @@ def dequantize_fp4( blocksize: Optional[int] = None, ) -> torch.Tensor: if blocksize is None: - blocksize = 64 if not HIP_ENVIRONMENT else 128 + blocksize = 64 if not ROCM_WARP_SIZE_64 else 128 return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4") @@ -922,7 +922,7 @@ def dequantize_nf4( blocksize: Optional[int] = None, ) -> torch.Tensor: if blocksize is None: - blocksize = 64 if not HIP_ENVIRONMENT else 128 + blocksize = 64 if not ROCM_WARP_SIZE_64 else 128 return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4") @@ -962,7 +962,7 @@ def dequantize_4bit( """ if blocksize is None: - blocksize = 64 if not HIP_ENVIRONMENT else 128 + blocksize = 64 if not ROCM_WARP_SIZE_64 else 128 if quant_state is None: assert absmax is not None and out is not None diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 1cef1f5e9..a43194e6a 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -11,7 +11,7 @@ import torch.nn.functional as F import bitsandbytes as bnb -from bitsandbytes.cextension import HIP_ENVIRONMENT +from bitsandbytes.cextension import ROCM_WARP_SIZE_64 from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu from bitsandbytes.optim import GlobalOptimManager from bitsandbytes.utils import ( @@ -225,7 +225,7 @@ def __new__( data = torch.empty(0) if blocksize is None: - blocksize = 64 if not HIP_ENVIRONMENT else 128 + blocksize = 64 if not ROCM_WARP_SIZE_64 else 128 self = torch.Tensor._make_subclass(cls, data, requires_grad) self.blocksize = blocksize diff --git a/tests/test_functional.py b/tests/test_functional.py index 
fc37cb4c3..e9bcbc267 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -9,7 +9,7 @@ import bitsandbytes as bnb from bitsandbytes import functional as F -from bitsandbytes.cextension import HIP_ENVIRONMENT, ROCM_GPU_ARCH +from bitsandbytes.cextension import HIP_ENVIRONMENT, ROCM_GPU_ARCH, ROCM_WARP_SIZE_64 from tests.helpers import ( BOOLEAN_TUPLES, TRUE_FALSE, @@ -95,7 +95,7 @@ class Test8BitBlockwiseQuantizeFunctional: @pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested")) @pytest.mark.parametrize( "blocksize", - [4096, 2048, 1024, 512, 256, 128, 64] if not HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128], + [4096, 2048, 1024, 512, 256, 128, 64] if not ROCM_WARP_SIZE_64 else [4096, 2048, 1024, 512, 256, 128], ) @pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed")) def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed): @@ -1107,7 +1107,7 @@ class TestQuantize4BitFunctional: @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) @pytest.mark.parametrize( "blocksize", - [64, 128, 256, 512, 1024, 2048, 4096] if not HIP_ENVIRONMENT else [128, 256, 512, 1024, 2048, 4096], + [64, 128, 256, 512, 1024, 2048, 4096] if not ROCM_WARP_SIZE_64 else [128, 256, 512, 1024, 2048, 4096], ) def test_4bit_quant(self, device, dtype, quant_type, blocksize): if device == "hpu" and not is_supported_on_hpu(quant_type, dtype): @@ -1174,7 +1174,7 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize): @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) - @pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128], ids=id_formatter("blocksize")) + @pytest.mark.parametrize("blocksize", [64, 128] if not ROCM_WARP_SIZE_64 else [128], ids=id_formatter("blocksize")) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype) def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype): if device == "hpu" and not is_supported_on_hpu(quant_type, dtype): @@ -1241,7 +1241,7 @@ def test_bench_4bit_dequant(self, quant_type): # print((time.time()-t0)/iters*1e6) @pytest.mark.skipif( - HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" + ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 1c5e77a32..398fb83d3 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -8,7 +8,7 @@ import torch import bitsandbytes as bnb -from bitsandbytes.cextension import HIP_ENVIRONMENT +from bitsandbytes.cextension import ROCM_WARP_SIZE_64 from tests.helpers import ( TRUE_FALSE, describe_dtype, @@ -192,7 +192,7 @@ def test_linear_serialization( @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) -@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128]) +@pytest.mark.parametrize("blocksize", [64, 128] if not ROCM_WARP_SIZE_64 else [128]) @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) def test_copy_param(device, quant_type, blocksize, compress_statistics): if device == "hpu" and not is_supported_on_hpu(quant_type): 
@@ -249,7 +249,7 @@ def test_params4bit_torch_chunk_split(device, quant_type): @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) -@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128]) +@pytest.mark.parametrize("blocksize", [64, 128] if not ROCM_WARP_SIZE_64 else [128]) @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) def test_deepcopy_param(device, quant_type, blocksize, compress_statistics): if device == "hpu" and not is_supported_on_hpu(quant_type): @@ -278,7 +278,7 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics): @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) -@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128]) +@pytest.mark.parametrize("blocksize", [64, 128] if not ROCM_WARP_SIZE_64 else [128]) @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics): if device == "hpu" and not is_supported_on_hpu(quant_type): diff --git a/tests/test_ops.py b/tests/test_ops.py index 8aa0560fd..17961fed1 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -4,7 +4,7 @@ import torch import bitsandbytes -from bitsandbytes.cextension import HIP_ENVIRONMENT +from bitsandbytes.cextension import ROCM_WARP_SIZE_64 from bitsandbytes.functional import ipex_xpu from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, is_supported_on_hpu @@ -103,7 +103,7 @@ def test_int8_scaled_mm(self, device, dtype, has_bias): class TestInt8BlockwiseQuantOps: @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) - @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512]) + @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [128, 256, 512]) def test_quantize_blockwise(self, device, dtype, blocksize): if device == "cpu": if dtype != torch.float32: @@ -127,7 +127,7 @@ def test_quantize_blockwise(self, device, dtype, blocksize): @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) - @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512]) + @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [128, 256, 512]) def test_dequantize_blockwise(self, device, dtype, blocksize): if device == "cpu" and dtype != torch.float32: pytest.skip("CPU implementation is only available for float32") @@ -157,7 +157,7 @@ class Test4bitBlockwiseQuantOps: @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype")) @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) - @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512]) + @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [128, 256, 512]) def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize): if device == "hpu" and not 
is_supported_on_hpu(quant_type, dtype, storage_dtype): pytest.skip("This configuration is not supported on HPU.") @@ -181,7 +181,7 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype")) @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) - @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512]) + @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [128, 256, 512]) def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize): if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype): pytest.skip("This configuration is not supported on HPU.") @@ -215,7 +215,7 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype")) @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) - @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512]) + @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [128, 256, 512]) def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize): if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype): pytest.skip("This configuration is not supported on HPU.") From f7b44305f64a1bf162802bcbf9a176499ee39958 Mon Sep 17 00:00:00 2001 From: electron271 <66094410+electron271@users.noreply.github.com> Date: Sat, 6 Sep 2025 14:23:21 -0500 Subject: [PATCH 2/6] uncomment 64 block size support in csrc --- csrc/kernels.hip | 18 +++++++++--------- csrc/ops.hip | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/csrc/kernels.hip b/csrc/kernels.hip index ec3f7f025..cf5164790 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -3044,7 +3044,7 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, General8bit) MAKE_kQuantizeBlockwise(half, 512, 2, 0, General8bit) MAKE_kQuantizeBlockwise(half, 256, 2, 0, General8bit) MAKE_kQuantizeBlockwise(half, 128, 2, 0, General8bit) -//MAKE_kQuantizeBlockwise(half, 64, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(half, 64, 2, 0, General8bit) MAKE_kQuantizeBlockwise(half, 4096, 4, 0, FP4) MAKE_kQuantizeBlockwise(half, 2048, 4, 0, FP4) @@ -3052,7 +3052,7 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, FP4) MAKE_kQuantizeBlockwise(half, 512, 2, 0, FP4) MAKE_kQuantizeBlockwise(half, 256, 2, 0, FP4) MAKE_kQuantizeBlockwise(half, 128, 2, 0, FP4) -//MAKE_kQuantizeBlockwise(half, 64, 2, 0, FP4) +MAKE_kQuantizeBlockwise(half, 64, 2, 0, FP4) MAKE_kQuantizeBlockwise(half, 4096, 4, 0, NF4) MAKE_kQuantizeBlockwise(half, 2048, 4, 0, NF4) @@ -3060,7 +3060,7 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, NF4) MAKE_kQuantizeBlockwise(half, 512, 2, 0, NF4) MAKE_kQuantizeBlockwise(half, 256, 2, 0, NF4) MAKE_kQuantizeBlockwise(half, 128, 2, 0, NF4) -//MAKE_kQuantizeBlockwise(half, 64, 2, 0, NF4) +MAKE_kQuantizeBlockwise(half, 64, 2, 0, NF4) MAKE_kQuantizeBlockwise(float, 4096, 4, 0, General8bit) MAKE_kQuantizeBlockwise(float, 4096, 4, 1, General8bit) @@ -3069,7 +3069,7 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit) 
MAKE_kQuantizeBlockwise(float, 512, 2, 0, General8bit) MAKE_kQuantizeBlockwise(float, 256, 2, 0, General8bit) MAKE_kQuantizeBlockwise(float, 128, 2, 0, General8bit) -//MAKE_kQuantizeBlockwise(float, 64, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(float, 64, 2, 0, General8bit) MAKE_kQuantizeBlockwise(float, 4096, 4, 0, FP4) MAKE_kQuantizeBlockwise(float, 2048, 4, 0, FP4) @@ -3077,7 +3077,7 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, FP4) MAKE_kQuantizeBlockwise(float, 512, 2, 0, FP4) MAKE_kQuantizeBlockwise(float, 256, 2, 0, FP4) MAKE_kQuantizeBlockwise(float, 128, 2, 0, FP4) -//MAKE_kQuantizeBlockwise(float, 64, 2, 0, FP4) +MAKE_kQuantizeBlockwise(float, 64, 2, 0, FP4) MAKE_kQuantizeBlockwise(float, 4096, 4, 0, NF4) MAKE_kQuantizeBlockwise(float, 2048, 4, 0, NF4) @@ -3085,7 +3085,7 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, NF4) MAKE_kQuantizeBlockwise(float, 512, 2, 0, NF4) MAKE_kQuantizeBlockwise(float, 256, 2, 0, NF4) MAKE_kQuantizeBlockwise(float, 128, 2, 0, NF4) -//MAKE_kQuantizeBlockwise(float, 64, 2, 0, NF4) +MAKE_kQuantizeBlockwise(float, 64, 2, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 1, General8bit) @@ -3094,7 +3094,7 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 512, 2, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 256, 2, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 128, 2, 0, General8bit) -//MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, General8bit) +MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 2048, 4, 0, FP4) @@ -3102,7 +3102,7 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 512, 2, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 256, 2, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 128, 2, 0, FP4) -//MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, FP4) +MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 2048, 4, 0, NF4) @@ -3110,7 +3110,7 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 512, 2, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 256, 2, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 128, 2, 0, NF4) -//MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, NF4) +MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, NF4) template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); diff --git a/csrc/ops.hip b/csrc/ops.hip index 260b74b30..6b429a382 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -57,8 +57,8 @@ template void quantizeBlockwise(floa hipLaunchKernelGGL(( kQuantizeBlockwise<T, 256, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(128), 0, 0, code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 128) hipLaunchKernelGGL(( kQuantizeBlockwise<T, 128, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(64), 0, 0, code, A, absmax, out, rand, rand_offset, n); - //else if(blocksize == 64) - // hipLaunchKernelGGL(( kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(32), 0, 0, code, A, absmax, out, rand, rand_offset, n); + else if(blocksize == 64) + hipLaunchKernelGGL(( kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(32), 0, 0, code, A, absmax, out, rand, rand_offset, n);
CUDA_CHECK_RETURN(hipPeekAtLastError()); From 6e2e4d239a1dd402fdd11e79061cbcda548f9a28 Mon Sep 17 00:00:00 2001 From: electron271 <66094410+electron271@users.noreply.github.com> Date: Sat, 6 Sep 2025 17:14:57 -0500 Subject: [PATCH 3/6] only enable 64 block size support on architectures with 32 warp size --- csrc/kernels.hip | 36 +++++++++++++++++++++++++++--------- csrc/ops.hip | 2 +- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/csrc/kernels.hip b/csrc/kernels.hip index cf5164790..6956ebac4 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -3044,7 +3044,9 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, General8bit) MAKE_kQuantizeBlockwise(half, 512, 2, 0, General8bit) MAKE_kQuantizeBlockwise(half, 256, 2, 0, General8bit) MAKE_kQuantizeBlockwise(half, 128, 2, 0, General8bit) -MAKE_kQuantizeBlockwise(half, 64, 2, 0, General8bit) +#if WARP_SIZE == 32 + MAKE_kQuantizeBlockwise(half, 64, 2, 0, General8bit) +#endif MAKE_kQuantizeBlockwise(half, 4096, 4, 0, FP4) MAKE_kQuantizeBlockwise(half, 2048, 4, 0, FP4) @@ -3052,7 +3054,9 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, FP4) MAKE_kQuantizeBlockwise(half, 512, 2, 0, FP4) MAKE_kQuantizeBlockwise(half, 256, 2, 0, FP4) MAKE_kQuantizeBlockwise(half, 128, 2, 0, FP4) -MAKE_kQuantizeBlockwise(half, 64, 2, 0, FP4) +#if WARP_SIZE == 32 + MAKE_kQuantizeBlockwise(half, 64, 2, 0, FP4) +#endif MAKE_kQuantizeBlockwise(half, 4096, 4, 0, NF4) MAKE_kQuantizeBlockwise(half, 2048, 4, 0, NF4) @@ -3060,7 +3064,9 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, NF4) MAKE_kQuantizeBlockwise(half, 512, 2, 0, NF4) MAKE_kQuantizeBlockwise(half, 256, 2, 0, NF4) MAKE_kQuantizeBlockwise(half, 128, 2, 0, NF4) -MAKE_kQuantizeBlockwise(half, 64, 2, 0, NF4) +#if WARP_SIZE == 32 + MAKE_kQuantizeBlockwise(half, 64, 2, 0, NF4) +#endif MAKE_kQuantizeBlockwise(float, 4096, 4, 0, General8bit) MAKE_kQuantizeBlockwise(float, 4096, 4, 1, General8bit) @@ -3069,7 +3075,9 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit) MAKE_kQuantizeBlockwise(float, 512, 2, 0, General8bit) MAKE_kQuantizeBlockwise(float, 256, 2, 0, General8bit) MAKE_kQuantizeBlockwise(float, 128, 2, 0, General8bit) -MAKE_kQuantizeBlockwise(float, 64, 2, 0, General8bit) +#if WARP_SIZE == 32 + MAKE_kQuantizeBlockwise(float, 64, 2, 0, General8bit) +#endif MAKE_kQuantizeBlockwise(float, 4096, 4, 0, FP4) MAKE_kQuantizeBlockwise(float, 2048, 4, 0, FP4) @@ -3077,7 +3085,9 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, FP4) MAKE_kQuantizeBlockwise(float, 512, 2, 0, FP4) MAKE_kQuantizeBlockwise(float, 256, 2, 0, FP4) MAKE_kQuantizeBlockwise(float, 128, 2, 0, FP4) -MAKE_kQuantizeBlockwise(float, 64, 2, 0, FP4) +#if WARP_SIZE == 32 + MAKE_kQuantizeBlockwise(float, 64, 2, 0, FP4) +#endif MAKE_kQuantizeBlockwise(float, 4096, 4, 0, NF4) MAKE_kQuantizeBlockwise(float, 2048, 4, 0, NF4) @@ -3085,7 +3095,9 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, NF4) MAKE_kQuantizeBlockwise(float, 512, 2, 0, NF4) MAKE_kQuantizeBlockwise(float, 256, 2, 0, NF4) MAKE_kQuantizeBlockwise(float, 128, 2, 0, NF4) -MAKE_kQuantizeBlockwise(float, 64, 2, 0, NF4) +#if WARP_SIZE == 32 + MAKE_kQuantizeBlockwise(float, 64, 2, 0, NF4) +#endif MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 1, General8bit) @@ -3094,7 +3106,9 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 512, 2, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 256, 2, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 128, 2, 0, General8bit) 
-MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, General8bit) +#if WARP_SIZE == 32 + MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, General8bit) +#endif MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 2048, 4, 0, FP4) @@ -3102,7 +3116,9 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 512, 2, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 256, 2, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 128, 2, 0, FP4) -MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, FP4) +#if WARP_SIZE == 32 + MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, FP4) +#endif MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 2048, 4, 0, NF4) @@ -3110,7 +3126,9 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 512, 2, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 256, 2, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 128, 2, 0, NF4) -MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, NF4) +#if WARP_SIZE == 32 + MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, NF4) +#endif template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); template __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n); diff --git a/csrc/ops.hip b/csrc/ops.hip index 6b429a382..d1cd91443 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -57,7 +57,7 @@ template void quantizeBlockwise(floa hipLaunchKernelGGL(( kQuantizeBlockwise<T, 256, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(128), 0, 0, code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 128) hipLaunchKernelGGL(( kQuantizeBlockwise<T, 128, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(64), 0, 0, code, A, absmax, out, rand, rand_offset, n); - else if(blocksize == 64) + else if(blocksize == 64 && warpSize == 32) hipLaunchKernelGGL(( kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(32), 0, 0, code, A, absmax, out, rand, rand_offset, n); From 8c24b4d95479666af07ef71f55375008263883ba Mon Sep 17 00:00:00 2001 From: electron271 <66094410+electron271@users.noreply.github.com> Date: Thu, 2 Oct 2025 15:11:43 -0500 Subject: [PATCH 4/6] use BNB_WARP_SIZE instead of warpSize in ops.hip --- csrc/ops.hip | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/ops.hip b/csrc/ops.hip index d1cd91443..4f7ce9abb 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -5,6 +5,7 @@ // This source code is licensed under the MIT license found in the // LICENSE file in the root directory of this source tree.
+#include <common_hip.cuh> #include #include #include @@ -57,7 +58,7 @@ template void quantizeBlockwise(floa hipLaunchKernelGGL(( kQuantizeBlockwise<T, 256, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(128), 0, 0, code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 128) hipLaunchKernelGGL(( kQuantizeBlockwise<T, 128, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(64), 0, 0, code, A, absmax, out, rand, rand_offset, n); - else if(blocksize == 64 && warpSize == 32) + else if(blocksize == 64 && BNB_WARP_SIZE == 32) hipLaunchKernelGGL(( kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(32), 0, 0, code, A, absmax, out, rand, rand_offset, n); From 7dd7b88db2782b3b943bd0184c7ad230c0d97f9e Mon Sep 17 00:00:00 2001 From: sstamenk Date: Fri, 17 Oct 2025 22:13:19 +0000 Subject: [PATCH 5/6] Reuse BNB_WARP_SIZE macro --- csrc/common_hip.cuh | 8 +++++-- csrc/kernels.hip | 51 ++++++++++++++++++++++----------------------- csrc/ops.hip | 2 +- 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/csrc/common_hip.cuh b/csrc/common_hip.cuh index 1d9d9afe0..3ea545e9a 100644 --- a/csrc/common_hip.cuh +++ b/csrc/common_hip.cuh @@ -1,7 +1,11 @@ #pragma once -#define BNB_WARP_SIZE warpSize +#ifdef __GFX9__ + #define BNB_WARP_SIZE 64 +#else + #define BNB_WARP_SIZE 32 +#endif // These are set based on current BNB support for CDNA 2 & RDNA 3. Update as needed for future archs -#define BNB_MAX_THREADS_PER_SM 2048 +#define BNB_MAX_THREADS_PER_CU 2048 #define BNB_BF16_AVAILABLE true diff --git a/csrc/kernels.hip b/csrc/kernels.hip index 6956ebac4..9b94b334d 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -1933,7 +1933,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char // rowStats [rows] // out [rows, cols] template -__launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) +__launch_bounds__(1024, BNB_MAX_THREADS_PER_CU / 1024) __global__ void kInt8VectorQuant(T * __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols) { // For sm50/sm52 and CUDA < 12.2 we need to do the reduction in fp32. @@ -1997,7 +1997,7 @@ __global__ void kInt8VectorQuant(T * __restrict__ A, int8_t* out, float* rowStat } template -__launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) +__launch_bounds__(1024, BNB_MAX_THREADS_PER_CU / 1024) __global__ void kgetRowStats(T * __restrict__ A, float *rowStats, float threshold, int rows, int cols) { using BlockReduceT = hipcub::BlockReduce; @@ -2109,7 +2109,6 @@ __global__ void kdequant_mm_int32_fp16( #define DENORM 1.0f/127.0f #define MAX_SPARSE_COUNT 32 #define SMEM_SIZE 8*256 -#define WARP_SIZE warpSize template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB) { @@ -2130,9 +2129,9 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o const int offset = local_max_idx == 0 ? 0 : offset_rowidx[local_max_idx-1]; const int local_row_idx = rowidx[offset]; - const int warp_id = threadIdx.x / WARP_SIZE; - const int warp_idx = threadIdx.x % WARP_SIZE; - const int warp_offset = (warp_id*WARP_SIZE)*SPMM_ITEMS; + const int warp_id = threadIdx.x / BNB_WARP_SIZE; + const int warp_idx = threadIdx.x % BNB_WARP_SIZE; + const int warp_offset = (warp_id*BNB_WARP_SIZE)*SPMM_ITEMS; const int num_items = BITS == 8 ?
8 : 8; int idx_col_B = warp_offset; int local_idx_col_B_offset = 0; @@ -2152,7 +2151,7 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o } // each thread processes SPMM_ITEMS=32 per iteration. We have 256 threads. 32*256=x192 - // we expect each warp to be SPMM_ITEMS*WARP_SIZE apart + // we expect each warp to be SPMM_ITEMS*BNB_WARP_SIZE apart // we have a total of 128 bytes for the bank with a bank size of 4 bytes // added 3 bytes = 6 values between warps should reduce bank conflicts __shared__ half smem_dequant_stats[SMEM_SIZE]; @@ -2705,16 +2704,16 @@ template __global__ void kgemm_4bit_inferenc { // per threadblock: - // load step-by-step in chunks of [warp_size,warps]: 1xwarp_size * [warp_size,warps] -> [1,warps] + // load step-by-step in chunks of [BNB_WARP_SIZE,warps]: 1xBNB_WARP_SIZE * [BNB_WARP_SIZE,warps] -> [1,warps] // 4 warps -> 4 loads per iter - // 1xwarp_size * warp_sizex4 -> 1x4 outputs per thread block - typedef hipcub::WarpReduce WarpReduce; - __shared__ typename WarpReduce::TempStorage temp_storage[THREADS/warpSize]; - - const int warp_idx = threadIdx.x / warpSize; - const int warp_lane = threadIdx.x % warpSize; - const int row_B = (THREADS/warpSize)*blockIdx.x + warp_idx; - const int offset_B = ldb*row_B; + // 1xBNB_WARP_SIZE * BNB_WARP_SIZEx4 -> 1x4 outputs per thread block + typedef hipcub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[THREADS/BNB_WARP_SIZE]; + + const int warp_idx = threadIdx.x / BNB_WARP_SIZE; + const int warp_lane = threadIdx.x % BNB_WARP_SIZE; + const int row_B = (THREADS/BNB_WARP_SIZE)*blockIdx.x + warp_idx; + const int offset_B = ldb * row_B; const int num_values_8bit = num_values_4bit/2; float local_C = 0.0f; @@ -2732,7 +2731,7 @@ template __global__ void kgemm_4bit_inferenc // A: [1, K] // B: [M, K] - for(int inner_idx = warp_lane*num_values_4bit; inner_idx < K; inner_idx += warpSize*num_values_4bit) + for(int inner_idx = warp_lane*num_values_4bit; inner_idx < K; inner_idx += BNB_WARP_SIZE*num_values_4bit) { const int inner_idx_halved = inner_idx/2; @@ -3044,7 +3043,7 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, General8bit) MAKE_kQuantizeBlockwise(half, 512, 2, 0, General8bit) MAKE_kQuantizeBlockwise(half, 256, 2, 0, General8bit) MAKE_kQuantizeBlockwise(half, 128, 2, 0, General8bit) -#if WARP_SIZE == 32 +#if BNB_WARP_SIZE == 32 MAKE_kQuantizeBlockwise(half, 64, 2, 0, General8bit) #endif @@ -3054,7 +3053,7 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, FP4) MAKE_kQuantizeBlockwise(half, 512, 2, 0, FP4) MAKE_kQuantizeBlockwise(half, 256, 2, 0, FP4) MAKE_kQuantizeBlockwise(half, 128, 2, 0, FP4) -#if WARP_SIZE == 32 +#if BNB_WARP_SIZE == 32 MAKE_kQuantizeBlockwise(half, 64, 2, 0, FP4) #endif @@ -3064,7 +3063,7 @@ MAKE_kQuantizeBlockwise(half, 1024, 4, 0, NF4) MAKE_kQuantizeBlockwise(half, 512, 2, 0, NF4) MAKE_kQuantizeBlockwise(half, 256, 2, 0, NF4) MAKE_kQuantizeBlockwise(half, 128, 2, 0, NF4) -#if WARP_SIZE == 32 +#if BNB_WARP_SIZE == 32 MAKE_kQuantizeBlockwise(half, 64, 2, 0, NF4) #endif @@ -3075,7 +3074,7 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit) MAKE_kQuantizeBlockwise(float, 512, 2, 0, General8bit) MAKE_kQuantizeBlockwise(float, 256, 2, 0, General8bit) MAKE_kQuantizeBlockwise(float, 128, 2, 0, General8bit) -#if WARP_SIZE == 32 +#if BNB_WARP_SIZE == 32 MAKE_kQuantizeBlockwise(float, 64, 2, 0, General8bit) #endif @@ -3085,7 +3084,7 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, FP4) MAKE_kQuantizeBlockwise(float, 512, 2, 0, FP4) MAKE_kQuantizeBlockwise(float, 
256, 2, 0, FP4) MAKE_kQuantizeBlockwise(float, 128, 2, 0, FP4) -#if WARP_SIZE == 32 +#if BNB_WARP_SIZE == 32 MAKE_kQuantizeBlockwise(float, 64, 2, 0, FP4) #endif @@ -3095,7 +3094,7 @@ MAKE_kQuantizeBlockwise(float, 1024, 4, 0, NF4) MAKE_kQuantizeBlockwise(float, 512, 2, 0, NF4) MAKE_kQuantizeBlockwise(float, 256, 2, 0, NF4) MAKE_kQuantizeBlockwise(float, 128, 2, 0, NF4) -#if WARP_SIZE == 32 +#if BNB_WARP_SIZE == 32 MAKE_kQuantizeBlockwise(float, 64, 2, 0, NF4) #endif @@ -3106,7 +3105,7 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 512, 2, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 256, 2, 0, General8bit) MAKE_kQuantizeBlockwise(hip_bfloat16, 128, 2, 0, General8bit) -#if WARP_SIZE == 32 +#if BNB_WARP_SIZE == 32 MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, General8bit) #endif @@ -3116,7 +3115,7 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 512, 2, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 256, 2, 0, FP4) MAKE_kQuantizeBlockwise(hip_bfloat16, 128, 2, 0, FP4) -#if WARP_SIZE == 32 +#if BNB_WARP_SIZE == 32 MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, FP4) #endif @@ -3126,7 +3125,7 @@ MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 512, 2, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 256, 2, 0, NF4) MAKE_kQuantizeBlockwise(hip_bfloat16, 128, 2, 0, NF4) -#if WARP_SIZE == 32 +#if BNB_WARP_SIZE == 32 MAKE_kQuantizeBlockwise(hip_bfloat16, 64, 2, 0, NF4) #endif diff --git a/csrc/ops.hip b/csrc/ops.hip index 4f7ce9abb..a7ab32fdc 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -693,7 +693,7 @@ template void gemm_4bit_inference_naive(int m, int n, int //warpsize - 32 int num_blocks = (m+3)/4; //warpsize - 64 - if (warpSize == 64) { + if (BNB_WARP_SIZE == 64) { num_blocks = (m+1)/2; } From e9f0af3114695989688f829dfac5d7f272bcb332 Mon Sep 17 00:00:00 2001 From: electron271 <66094410+electron271@users.noreply.github.com> Date: Thu, 23 Oct 2025 20:32:13 -0500 Subject: [PATCH 6/6] Remove unused WARP_SIZE definitions --- csrc/kernels.hip | 5 ----- csrc/ops.hip | 6 ------ 2 files changed, 11 deletions(-) diff --git a/csrc/kernels.hip b/csrc/kernels.hip index 4fe957b24..d7a5d21e4 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -2057,11 +2057,6 @@ __global__ void kdequant_mm_int32_fp16( #define DENORM 1.0f/127.0f #define MAX_SPARSE_COUNT 32 #define SMEM_SIZE 8*256 -#if defined(__GFX9__) - #define WARP_SIZE 64 -#else - #define WARP_SIZE 32 -#endif template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB) { diff --git a/csrc/ops.hip b/csrc/ops.hip index 4142e2b55..a7ab32fdc 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -21,12 +21,6 @@ #define ERR_NOT_IMPLEMENTED 100 -#if defined(__GFX9__) - #define WARP_SIZE 64 -#else - #define WARP_SIZE 32 -#endif - using namespace BinSearch; using std::cout; using std::endl;
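Taken together, the series makes the Python layer and the HIP kernels agree on one rule: the 64-value blocksize is available exactly where the hardware schedules 32-wide warps, because that blocksize maps to a 32-thread launch at two values per thread (dim3(32) in quantizeBlockwise above). Below is a minimal sketch of the resulting user-visible behavior, assuming a ROCm or CUDA build with this series applied; the tensor shape, dtype, and device string are illustrative only, not part of the patches.

    # Sketch (not from the series itself): ROCM_WARP_SIZE_64 and the nf4
    # helpers come from the diffs above; everything else here is an example.
    import torch

    from bitsandbytes import functional as F
    from bitsandbytes.cextension import ROCM_WARP_SIZE_64

    A = torch.randn(1024, 1024, dtype=torch.float16, device="cuda")

    # blocksize=None resolves to 128 only on warp-size-64 ROCm GPUs (CDNA/GFX9);
    # CUDA GPUs and warp-size-32 ROCm GPUs (e.g. RDNA3) now get the 64 default.
    quantized, state = F.quantize_nf4(A, blocksize=None)
    assert state.blocksize == (128 if ROCM_WARP_SIZE_64 else 64)

    # Round-trip through the 4-bit representation.
    restored = F.dequantize_nf4(quantized, state)
    print(state.blocksize, restored.shape, restored.dtype)

The flag is computed once at import time from rocminfo's "Wavefront Size" line, so a detection failure degrades safely: get_rocm_warpsize() falls back to 64, which merely keeps the blocksize-64 path disabled rather than launching kernels the hardware cannot service.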