Revert "Reapply "CUDA: fix bad asserts for partial offload (ggml-org#13337)""

Nexesenex · Nexesenex · commit 415a128440bf · 2025-07-17T23:21:32.000+02:00
This reverts commit b25ed015c47f647d6b3d7404217eb7e133d5144a for CUDA.

Revert applied to fattn-common.cuh
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -871,7 +871,6 @@ void launch_fattn(
     size_t nb23 = V ? V->nb[3] : nb13;
 
     if (need_f16_K && K->type != GGML_TYPE_F16) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(K));
         K_f16.alloc(ggml_nelements(K));
         to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
         to_fp16(K_data, K_f16.ptr, 1, ggml_nelements(K), main_stream);
@@ -887,7 +886,6 @@ void launch_fattn(
 
     if (V && need_f16_V && V->type != GGML_TYPE_F16) {
         // GGML_ASSERT(ggml_is_contiguous(V));
-        GGML_ASSERT(ggml_is_contiguously_allocated(V));
         V_f16.alloc(ggml_nelements(V));
         to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
         to_fp16(V_data, V_f16.ptr, 1, ggml_nelements(V), main_stream);