Skip to content

Commit f9f644d

Browse files
committed
Revert "CUDA: fix crash with partial offloading of MoE (ggml-org#13439)"
This reverts commit 7474e00.
1 parent c2454e5 commit f9f644d

File tree

3 files changed

+6
-12
lines changed

3 files changed

+6
-12
lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1910,19 +1910,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
1910 1910
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1911 1911
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
1912 1912

1913-
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
1914-
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
1915-
// Therefore, in such cases use cuBLAS.
1916-
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
1917-
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
1918-
1919 1913
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
1920 1914
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1921 1915
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
1922-
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
1916+
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
1923 1917
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1924 1918
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
1925-
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
1919+
bool use_mul_mat_q = ggml_is_quantized(src0->type)
1926 1920
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
1927 1921

1928 1922
bool any_gpus_with_slow_fp16 = false;

ggml/src/ggml-cuda/mmq.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(
91 91

92 92
// If src0 is a temporary compute buffer, clear any potential padding.
93 93
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
94+
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
95+
GGML_ASSERT(!src0->view_src);
94 96
const size_t size_data = ggml_nbytes(src0);
95 97
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
96 98
if (size_alloc > size_data) {
97-
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
98-
GGML_ASSERT(!src0->view_src);
99 99
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
100 100
}
101 101
}

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(
515 515

516 516
// If src0 is a temporary compute buffer, clear any potential padding.
517 517
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
518+
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
519+
GGML_ASSERT(!src0->view_src);
518 520
const size_t size_data = ggml_nbytes(src0);
519 521
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
520 522
if (size_alloc > size_data) {
521-
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
522-
GGML_ASSERT(!src0->view_src);
523 523
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
524 524
}
525 525
}

0 commit comments

Comments (0)