diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 88b17dd682c95..fbcb5e2ba530c 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3116,7 +3116,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_EXP:
-                    return ggml_is_contiguous(op->src[0]);
+                    return true;
                 default:
                     return false;
             }
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
index f9c7b83c40d1b..f78db57434c1c 100644
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -95,9 +95,51 @@ static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
 }
 
 template <float (*op)(float), typename T>
-static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
+static __global__ void unary_op_kernel_noncont(
+        const void * x, void * dst,
+        const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3,
+        const int64_t nb0_x, const int64_t nb1_x, const int64_t nb2_x, const int64_t nb3_x,
+        const int64_t nb0_d, const int64_t nb1_d, const int64_t nb2_d, const int64_t nb3_d,
+        const int64_t k) {
+
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t i3 = i / (ne2 * ne1 * ne0);
+    const int64_t i2 = (i / (ne1 * ne0)) % ne2;
+    const int64_t i1 = (i / ne0) % ne1;
+    const int64_t i0 = i % ne0;
+
+    const int64_t offset_x = i0*nb0_x + i1*nb1_x + i2*nb2_x + i3*nb3_x;
+    const int64_t offset_d = i0*nb0_d + i1*nb1_d + i2*nb2_d + i3*nb3_d;
+
+    const T * px = (const T *)((const char *)x + offset_x);
+    T * pd = (T *)((char *)dst + offset_d);
+
+    *pd = (T)op((float)*px);
+}
+
+template <float (*op)(float), typename T>
+static void unary_cuda(const T * x, T * dst, const int k,
+        const ggml_tensor * src, const ggml_tensor * dst_tensor,
+        cudaStream_t stream) {
     const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+
+    if (ggml_is_contiguous(src) && ggml_is_contiguous(dst_tensor)) {
+        unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    } else {
+        unary_op_kernel_noncont<op, T><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(
+            (const void *)x, (void *)dst,
+            src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+            src->nb[0], src->nb[1], src->nb[2], src->nb[3],
+            dst_tensor->nb[0], dst_tensor->nb[1],
+            dst_tensor->nb[2], dst_tensor->nb[3],
+            k
+        );
+    }
 }
 
 template <float (*op)(float)>
@@ -107,16 +149,16 @@ void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
     GGML_ASSERT(src0->type == dst->type);
 
     if (src0->type == GGML_TYPE_F16) {
-        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0),
+            src0, dst, stream);
     } else {
-        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0),
+            src0, dst, stream);
     }
 }
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 81fe90b99323d..0a967d12a7fd7 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5642,6 +5642,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 
     test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));
 
+    test_cases.emplace_back(new test_unary((ggml_unary_op) GGML_UNARY_OP_ABS, GGML_TYPE_F32, {256, 256, 3, 1}, 0));
+    test_cases.emplace_back(new test_unary((ggml_unary_op) GGML_UNARY_OP_ABS, GGML_TYPE_F32, {256, 256, 3, 1}, 1));
+
     return test_cases;
 }
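For readers skimming the diff: `unary_op_kernel_noncont` handles strided layouts by decomposing the flat element index `i` into per-dimension indices `(i0, i1, i2, i3)` against the element counts `ne[]`, then turning those into byte offsets with the byte strides `nb[]` of the source and destination separately, so permuted or otherwise non-contiguous views are addressed correctly. Below is a minimal host-side sketch of that same mapping, illustration only and not part of this PR; the helper name `element_byte_offset` and the example shape/strides are made up for the demonstration.

```cpp
// Illustration only -- not part of the PR. A host-side sketch of the flat-index
// to byte-offset mapping that unary_op_kernel_noncont performs per element.
// ne[] holds element counts per dimension and nb[] byte strides, following the
// ggml_tensor ne/nb convention. The shape and strides below are hypothetical.
#include <cstdint>
#include <cstdio>

static int64_t element_byte_offset(int64_t i, const int64_t ne[4], const int64_t nb[4]) {
    // Same decomposition as the kernel: peel off the fastest-varying dimension last.
    const int64_t i3 = i / (ne[2]*ne[1]*ne[0]);
    const int64_t i2 = (i / (ne[1]*ne[0])) % ne[2];
    const int64_t i1 = (i / ne[0]) % ne[1];
    const int64_t i0 = i % ne[0];
    return i0*nb[0] + i1*nb[1] + i2*nb[2] + i3*nb[3];
}

int main() {
    // A 4x3 f32 view obtained by transposing a contiguous 3x4 tensor:
    // nb[0] > nb[1], so the data is non-contiguous and a plain dst[i] = op(x[i])
    // kernel would read the wrong elements.
    const int64_t ne[4] = {4, 3, 1, 1};
    const int64_t nb[4] = {12, 4, 48, 48};

    for (int64_t i = 0; i < ne[0]*ne[1]*ne[2]*ne[3]; ++i) {
        std::printf("element %2lld -> byte offset %3lld\n",
                    (long long) i, (long long) element_byte_offset(i, ne, nb));
    }
    return 0;
}
```

If I read `test_unary` correctly, the two new perf cases exercise both launch paths: the final argument `0` keeps the tensor contiguous (fast path), while `1` runs the op on a view, which is what reaches the new non-contiguous kernel.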