
Commit db94e2b

update
1 parent de1fb4b commit db94e2b


2 files changed: +9 -12 lines changed


src/diffusers/quantizers/gguf/utils.py

Lines changed: 3 additions & 3 deletions
@@ -91,18 +91,18 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, qweight_type: in
     # y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])

     # If there is no available MMQ kernel, fallback to dequantize
-    elif qweight_type in DEQUANT_TYPES:
+    if qweight_type in DEQUANT_TYPES:
         block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
         shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
         weight = ops.ggml_dequantize(qweight, qweight_type, *shape)
-        y = x @ weight.T
+        y = x @ weight.to(x.dtype).T
     else:
         # Raise an error if the quantization type is not supported.
         # Might be useful if llama.cpp adds a new quantization type.
         # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type.
         qweight_type = gguf.GGMLQuantizationType(qweight_type)
         raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}")
-    return y
+    return y.as_tensor()


 # Copied from diffusers.quantizers.bitsandbytes.utils._create_accelerate_new_hook
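
The two functional changes here are the cast of the dequantized weight to the activation dtype before the matmul, and the .as_tensor() call on the return value, which appears to unwrap the GGUFParameter tensor subclass (the subclass propagates through operations on qweight) so callers receive a plain torch.Tensor. Below is a minimal sketch of why the cast matters; it uses plain illustrative tensors rather than ops.ggml_dequantize, which may hand back float32 data while the activations are float16/bfloat16.

import torch

x = torch.randn(2, 8, dtype=torch.float16)       # activations in the compute dtype
weight = torch.randn(4, 8, dtype=torch.float32)  # stand-in for a float32 dequantized weight

# y = x @ weight.T                  # RuntimeError: matmul requires matching dtypes
y = x @ weight.to(x.dtype).T        # cast the weight first, as the patched line does
print(y.dtype)                      # torch.float16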

tests/quantization/gguf/test_gguf.py

Lines changed: 6 additions & 9 deletions
@@ -73,15 +73,12 @@ def test_cuda_kernels_vs_native(self):

         for quant_type in test_quant_types:
             qtype = getattr(gguf.GGMLQuantizationType, quant_type)
-            block_size, type_size = gguf.GGML_QUANT_SIZES[qtype]
-
             in_features, out_features = 512, 512
-            total_elements = in_features * out_features
-            n_blocks = total_elements // block_size
-            weight_bytes = n_blocks * type_size

             torch.manual_seed(42)
-            weight_data = torch.randint(0, 256, (weight_bytes,), dtype=torch.uint8, device=torch_device)
+            float_weight = torch.randn(out_features, in_features, dtype=torch.float32)
+            quantized_data = gguf.quants.quantize(float_weight.numpy(), qtype)
+            weight_data = torch.from_numpy(quantized_data).to(device=torch_device)
             weight = GGUFParameter(weight_data, quant_type=qtype)

             x = torch.randn(test_shape, dtype=compute_dtype, device=torch_device)
@@ -95,9 +92,9 @@ def test_cuda_kernels_vs_native(self):
             output_native = linear.forward_native(x)
             output_cuda = linear.forward_cuda(x)

-            # Compare outputs
-            max_diff = torch.abs(output_cuda - output_native).max()
-            assert max_diff < 1e-4, "GGUF CUDA Kernel Output is different from Native Output"
+            assert torch.allclose(output_native, output_cuda, 1e-2), (
+                f"GGUF CUDA Kernel Output is different from Native Output for {quant_type}"
+            )


 @nightly
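
The test now derives its weights by actually quantizing a float tensor with the gguf Python package rather than filling a buffer with random bytes, so both forward_native and forward_cuda operate on well-formed quantization blocks, and the comparison moves to torch.allclose with 1e-2 as the third positional argument (rtol), a looser relative check than the previous 1e-4 absolute max-difference. Below is a rough, self-contained sketch of that weight-preparation step with an extra dequantize round trip as a sanity check; the Q4_K type and the 512x512 shape are only examples, not part of the test.

import gguf
import numpy as np
import torch

qtype = gguf.GGMLQuantizationType.Q4_K
out_features, in_features = 512, 512

torch.manual_seed(42)
float_weight = torch.randn(out_features, in_features, dtype=torch.float32)

# Pack the float weight into the GGUF block layout the kernels expect.
quantized = gguf.quants.quantize(float_weight.numpy(), qtype)  # packed uint8 blocks
weight_data = torch.from_numpy(quantized)                      # ready to wrap in GGUFParameter

# Round-trip back to float; the error should be on the order of the quantization step.
restored = gguf.quants.dequantize(quantized, qtype)
print(np.abs(restored - float_weight.numpy()).max())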
