diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index 87ce2393f6bf9..fcc47454b8237 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -1,10 +1,10 @@ ARG UBUNTU_VERSION=22.04 # This needs to generally match the container host's environment. -ARG MUSA_VERSION=rc4.0.1 +ARG MUSA_VERSION=4.2.0 # Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION} +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 -ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION} +ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 FROM ${BASE_MUSA_DEV_CONTAINER} AS build diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5bd988b7f7ce3..648e8acbd2ddf 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -515,7 +515,7 @@ jobs: ubuntu-22-cmake-musa: runs-on: ubuntu-22.04 - container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04 + container: mthreads/musa:4.2.0-devel-ubuntu22.04-amd64 steps: - name: Clone diff --git a/ci/README.md b/ci/README.md index 6e297f1a82788..092b4a5bddecb 100644 --- a/ci/README.md +++ b/ci/README.md @@ -54,7 +54,7 @@ docker run --privileged -it \ -v $HOME/llama.cpp/ci-cache:/ci-cache \ -v $HOME/llama.cpp/ci-results:/ci-results \ -v $PWD:/ws -w /ws \ - mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04 + mthreads/musa:4.2.0-devel-ubuntu22.04-amd64 ``` Inside the container, execute the following commands: diff --git a/docs/docker.md b/docs/docker.md index cbb333ee32c50..5433a0158e561 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment The defaults are: -- `MUSA_VERSION` set to `rc4.0.1` +- `MUSA_VERSION` set to `4.2.0` The resulting images, are essentially the same as the non-MUSA images: diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index de6d789c98a03..8ca1053cab320 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON) option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF) +option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF) +option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 1a2708ec9dff5..9435daf0b3f16 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -765,7 +765,7 @@ struct ggml_tensor_extra_gpu { }; -#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) +#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS) #define USE_CUDA_GRAPH #endif diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 2c55d2149b2d3..b9c40a2483783 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -1,8 +1,8 @@ #include "cpy.cuh" #include "dequantize.cuh" -#ifdef GGML_USE_MUSA +#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY) #include "ggml-musa/mudnn.cuh" -#endif // GGML_USE_MUSA +#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY typedef void (*cpy_kernel_t)(const char * cx, char * cdst); @@ -358,7 +358,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int // Copy destination pointers to GPU to be available when pointer indirection is in use void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) { -#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) +#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS) if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers CUDA_CHECK(cudaStreamSynchronize(stream)); if (cuda_graph->dest_ptrs_d != nullptr) { @@ -590,7 +590,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg char ** dest_ptrs_d = nullptr; int graph_cpynode_index = -1; -#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) +#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS) if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d; graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index; @@ -600,11 +600,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg #endif if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); -#ifdef GGML_USE_MUSA +#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY) if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) { CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0)); } else -#endif // GGML_USE_MUSA +#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY { CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream)); } @@ -647,7 +647,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); } -#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) +#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS) if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index; } diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 937779a90af6e..198963202443a 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -13,7 +13,7 @@ #define CUBLAS_OP_N MUBLAS_OP_N #define CUBLAS_OP_T MUBLAS_OP_T #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT +#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH #define CUDA_R_16F MUSA_R_16F #define CUDA_R_16BF MUSA_R_16BF #define CUDA_R_32F MUSA_R_32F @@ -29,7 +29,7 @@ #define cublasSgemm mublasSgemm #define cublasStatus_t mublasStatus_t #define cublasOperation_t mublasOperation_t -#define cublasGetStatusString mublasStatus_to_string +#define cublasGetStatusString mublasGetStatusString #define cudaDataType_t musaDataType_t #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt index 971314debc714..02904526ade04 100644 --- a/ggml/src/ggml-musa/CMakeLists.txt +++ b/ggml/src/ggml-musa/CMakeLists.txt @@ -34,8 +34,12 @@ if (MUSAToolkit_FOUND) list(APPEND GGML_SOURCES_MUSA ${SRCS}) file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu") list(APPEND GGML_SOURCES_MUSA ${SRCS}) - file(GLOB SRCS "../ggml-musa/*.cu") - list(APPEND GGML_SOURCES_MUSA ${SRCS}) + + if (GGML_MUSA_MUDNN_COPY) + file(GLOB SRCS "../ggml-musa/*.cu") + list(APPEND GGML_SOURCES_MUSA ${SRCS}) + add_compile_definitions(GGML_MUSA_MUDNN_COPY) + endif() if (GGML_CUDA_FA_ALL_QUANTS) file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu") @@ -72,6 +76,10 @@ if (MUSAToolkit_FOUND) add_compile_definitions(GGML_USE_MUSA) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + if (GGML_MUSA_GRAPHS) + add_compile_definitions(GGML_MUSA_GRAPHS) + endif() + if (GGML_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() @@ -97,10 +105,16 @@ if (MUSAToolkit_FOUND) endif() if (GGML_STATIC) - # TODO: mudnn has not provided static libraries yet target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static) + # TODO: mudnn has not provided static libraries yet + # if (GGML_MUSA_MUDNN_COPY) + # target_link_libraries(ggml-musa PRIVATE mudnn_static) + # endif() else() - target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn) + target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas) + if (GGML_MUSA_MUDNN_COPY) + target_link_libraries(ggml-musa PRIVATE mudnn) + endif() endif() if (GGML_CUDA_NO_VMM)