diff --git a/Makefile b/Makefile index 83498098..e2f5387d 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ flake_find: - cd ktransformers && flake8 | grep -Eo '[A-Z][0-9]{3}' | sort | uniq| paste -sd ',' - + cd ktransformers && flake8 | grep -Eo '[A-Z][0-9]{3}' | sort | uniq| paste -sd ',' - format: @cd ktransformers && black . @black setup.py @@ -14,7 +14,11 @@ dev_install: # install ktransformers echo "Installing python dependencies from requirements.txt" - pip install -r requirements-local_chat.txt + @if command -v mcc > /dev/null 2>&1; then \ + bash -c 'pip install -r <(grep -v -E "torch|numpy" requirements-local_chat.txt)'; \ + else \ + pip install -r requirements-local_chat.txt; \ + fi echo "Installing ktransformers" KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation diff --git a/install.sh b/install.sh index c5773ece..73d7c173 100644 --- a/install.sh +++ b/install.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -e +set -e # clear build dirs rm -rf build @@ -10,7 +10,11 @@ rm -rf ktransformers/ktransformers_ext/cuda/dist rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info echo "Installing python dependencies from requirements.txt" -pip install -r requirements-local_chat.txt +if command -v mcc > /dev/null 2>&1; then + bash -c 'pip install -r <(grep -v -E "torch|numpy" requirements-local_chat.txt)' +else + pip install -r requirements-local_chat.txt +fi echo "Installing ktransformers" KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation diff --git a/ktransformers/ktransformers_ext/cpu_backend/vendors/musa.h b/ktransformers/ktransformers_ext/cpu_backend/vendors/musa.h index 6cc1b69e..18922218 100644 --- a/ktransformers/ktransformers_ext/cpu_backend/vendors/musa.h +++ b/ktransformers/ktransformers_ext/cpu_backend/vendors/musa.h @@ -1,137 +1,9 @@ #pragma once #include -#include -#include #include -#include -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F -#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N MUBLAS_OP_N -#define CUBLAS_OP_T MUBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT -#define CUDA_R_16F MUSA_R_16F -#define CUDA_R_32F MUSA_R_32F -#define cublasComputeType_t cudaDataType_t -#define cublasCreate mublasCreate -#define cublasDestroy mublasDestroy -#define cublasGemmEx mublasGemmEx -#define cublasGemmBatchedEx mublasGemmBatchedEx -#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx -#define cublasHandle_t mublasHandle_t -#define cublasSetMathMode mublasSetMathMode -#define cublasSetStream mublasSetStream -#define cublasSgemm mublasSgemm -#define cublasStatus_t mublasStatus_t -#define cublasOperation_t mublasOperation_t -#define cublasGetStatusString mublasStatus_to_string -#define cudaDataType_t musaDataType_t -#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess -#define cudaDeviceProp musaDeviceProp -#define cudaDeviceSynchronize musaDeviceSynchronize -#define cudaError_t musaError_t -#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags musaEventCreateWithFlags -#define cudaEventDisableTiming musaEventDisableTiming -#define cudaEventRecord musaEventRecord -#define cudaEventSynchronize musaEventSynchronize -#define cudaEvent_t musaEvent_t -#define cudaEventDestroy musaEventDestroy -#define cudaFree musaFree -#define cudaFreeHost musaFreeHost -#define cudaGetDevice musaGetDevice -#define cudaGetDeviceCount musaGetDeviceCount -#define cudaGetDeviceProperties musaGetDeviceProperties -#define cudaGetErrorString musaGetErrorString -#define cudaGetLastError musaGetLastError -#define cudaHostRegister musaHostRegister -#define cudaHostRegisterPortable musaHostRegisterPortable -#define cudaHostRegisterReadOnly musaHostRegisterReadOnly -#define cudaHostUnregister musaHostUnregister + #define cudaLaunchHostFunc musaLaunchHostFunc -#define cudaMalloc musaMalloc -#define cudaMallocHost musaMallocHost -#define cudaMallocManaged musaMallocManaged -#define cudaMemcpy musaMemcpy -#define cudaMemcpyAsync musaMemcpyAsync -#define cudaMemcpyPeerAsync musaMemcpyPeerAsync -#define cudaMemcpy2DAsync musaMemcpy2DAsync -#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost -#define cudaMemcpyHostToDevice musaMemcpyHostToDevice -#define cudaMemcpyKind musaMemcpyKind -#define cudaMemset musaMemset -#define cudaMemsetAsync musaMemsetAsync -#define cudaMemGetInfo musaMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize -#define cudaSetDevice musaSetDevice -#define cudaStreamCreateWithFlags musaStreamCreateWithFlags -#define cudaStreamDestroy musaStreamDestroy -#define cudaStreamFireAndForget musaStreamFireAndForget -#define cudaStreamNonBlocking musaStreamNonBlocking -#define cudaStreamPerThread musaStreamPerThread -#define cudaStreamSynchronize musaStreamSynchronize -#define cudaStreamWaitEvent musaStreamWaitEvent #define cudaStream_t musaStream_t -#define cudaSuccess musaSuccess - -// Additional mappings for MUSA virtual memory pool -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE -#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED -#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED -#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE -#define CUdevice MUdevice -#define CUdeviceptr MUdeviceptr -#define CUmemAccessDesc MUmemAccessDesc -#define CUmemAllocationProp MUmemAllocationProp -#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle -#define cuDeviceGet muDeviceGet -#define cuDeviceGetAttribute muDeviceGetAttribute -#define cuMemAddressFree muMemAddressFree -#define cuMemAddressReserve muMemAddressReserve -#define cuMemCreate muMemCreate -#define cuMemGetAllocationGranularity muMemGetAllocationGranularity -#define cuMemMap muMemMap -#define cuMemRelease muMemRelease -#define cuMemSetAccess muMemSetAccess -#define cuMemUnmap muMemUnmap -#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize -#define cudaFuncSetAttribute musaFuncSetAttribute -#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms -#define make_cudaExtent make_musaExtent -#define make_cudaPitchedPtr make_musaPitchedPtr - -// Additional mappings for MUSA graphs -#define CUDA_SUCCESS MUSA_SUCCESS -#define CUresult MUresult -#define cuGetErrorString muGetErrorString -#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure -#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction -#define cudaGraphDestroy musaGraphDestroy -#define cudaGraphExecDestroy musaGraphExecDestroy -#define cudaGraphExec_t musaGraphExec_t -#define cudaGraphExecUpdate musaGraphExecUpdate -#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult -#define cudaGraphGetNodes musaGraphGetNodes -#define cudaGraphInstantiate musaGraphInstantiate -#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams -#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams -#define cudaGraphLaunch musaGraphLaunch -#define cudaGraphNodeGetType musaGraphNodeGetType -#define cudaGraphNode_t musaGraphNode_t -#define cudaGraphNodeType musaGraphNodeType -#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel -#define cudaGraph_t musaGraph_t -#define cudaKernelNodeParams musaKernelNodeParams -#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed -#define cudaStreamEndCapture musaStreamEndCapture - -typedef mt_bfloat16 nv_bfloat16; +#define cudaHostFn_t musaHostFn_t +#define nv_bfloat16 mt_bfloat16 \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/vendors/musa.h b/ktransformers/ktransformers_ext/vendors/musa.h index 6cc1b69e..18922218 100644 --- a/ktransformers/ktransformers_ext/vendors/musa.h +++ b/ktransformers/ktransformers_ext/vendors/musa.h @@ -1,137 +1,9 @@ #pragma once #include -#include -#include #include -#include -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F -#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N MUBLAS_OP_N -#define CUBLAS_OP_T MUBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT -#define CUDA_R_16F MUSA_R_16F -#define CUDA_R_32F MUSA_R_32F -#define cublasComputeType_t cudaDataType_t -#define cublasCreate mublasCreate -#define cublasDestroy mublasDestroy -#define cublasGemmEx mublasGemmEx -#define cublasGemmBatchedEx mublasGemmBatchedEx -#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx -#define cublasHandle_t mublasHandle_t -#define cublasSetMathMode mublasSetMathMode -#define cublasSetStream mublasSetStream -#define cublasSgemm mublasSgemm -#define cublasStatus_t mublasStatus_t -#define cublasOperation_t mublasOperation_t -#define cublasGetStatusString mublasStatus_to_string -#define cudaDataType_t musaDataType_t -#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess -#define cudaDeviceProp musaDeviceProp -#define cudaDeviceSynchronize musaDeviceSynchronize -#define cudaError_t musaError_t -#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags musaEventCreateWithFlags -#define cudaEventDisableTiming musaEventDisableTiming -#define cudaEventRecord musaEventRecord -#define cudaEventSynchronize musaEventSynchronize -#define cudaEvent_t musaEvent_t -#define cudaEventDestroy musaEventDestroy -#define cudaFree musaFree -#define cudaFreeHost musaFreeHost -#define cudaGetDevice musaGetDevice -#define cudaGetDeviceCount musaGetDeviceCount -#define cudaGetDeviceProperties musaGetDeviceProperties -#define cudaGetErrorString musaGetErrorString -#define cudaGetLastError musaGetLastError -#define cudaHostRegister musaHostRegister -#define cudaHostRegisterPortable musaHostRegisterPortable -#define cudaHostRegisterReadOnly musaHostRegisterReadOnly -#define cudaHostUnregister musaHostUnregister + #define cudaLaunchHostFunc musaLaunchHostFunc -#define cudaMalloc musaMalloc -#define cudaMallocHost musaMallocHost -#define cudaMallocManaged musaMallocManaged -#define cudaMemcpy musaMemcpy -#define cudaMemcpyAsync musaMemcpyAsync -#define cudaMemcpyPeerAsync musaMemcpyPeerAsync -#define cudaMemcpy2DAsync musaMemcpy2DAsync -#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost -#define cudaMemcpyHostToDevice musaMemcpyHostToDevice -#define cudaMemcpyKind musaMemcpyKind -#define cudaMemset musaMemset -#define cudaMemsetAsync musaMemsetAsync -#define cudaMemGetInfo musaMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize -#define cudaSetDevice musaSetDevice -#define cudaStreamCreateWithFlags musaStreamCreateWithFlags -#define cudaStreamDestroy musaStreamDestroy -#define cudaStreamFireAndForget musaStreamFireAndForget -#define cudaStreamNonBlocking musaStreamNonBlocking -#define cudaStreamPerThread musaStreamPerThread -#define cudaStreamSynchronize musaStreamSynchronize -#define cudaStreamWaitEvent musaStreamWaitEvent #define cudaStream_t musaStream_t -#define cudaSuccess musaSuccess - -// Additional mappings for MUSA virtual memory pool -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE -#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED -#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED -#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE -#define CUdevice MUdevice -#define CUdeviceptr MUdeviceptr -#define CUmemAccessDesc MUmemAccessDesc -#define CUmemAllocationProp MUmemAllocationProp -#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle -#define cuDeviceGet muDeviceGet -#define cuDeviceGetAttribute muDeviceGetAttribute -#define cuMemAddressFree muMemAddressFree -#define cuMemAddressReserve muMemAddressReserve -#define cuMemCreate muMemCreate -#define cuMemGetAllocationGranularity muMemGetAllocationGranularity -#define cuMemMap muMemMap -#define cuMemRelease muMemRelease -#define cuMemSetAccess muMemSetAccess -#define cuMemUnmap muMemUnmap -#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize -#define cudaFuncSetAttribute musaFuncSetAttribute -#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms -#define make_cudaExtent make_musaExtent -#define make_cudaPitchedPtr make_musaPitchedPtr - -// Additional mappings for MUSA graphs -#define CUDA_SUCCESS MUSA_SUCCESS -#define CUresult MUresult -#define cuGetErrorString muGetErrorString -#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure -#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction -#define cudaGraphDestroy musaGraphDestroy -#define cudaGraphExecDestroy musaGraphExecDestroy -#define cudaGraphExec_t musaGraphExec_t -#define cudaGraphExecUpdate musaGraphExecUpdate -#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult -#define cudaGraphGetNodes musaGraphGetNodes -#define cudaGraphInstantiate musaGraphInstantiate -#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams -#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams -#define cudaGraphLaunch musaGraphLaunch -#define cudaGraphNodeGetType musaGraphNodeGetType -#define cudaGraphNode_t musaGraphNode_t -#define cudaGraphNodeType musaGraphNodeType -#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel -#define cudaGraph_t musaGraph_t -#define cudaKernelNodeParams musaKernelNodeParams -#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed -#define cudaStreamEndCapture musaStreamEndCapture - -typedef mt_bfloat16 nv_bfloat16; +#define cudaHostFn_t musaHostFn_t +#define nv_bfloat16 mt_bfloat16 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 028c6a39..796a07ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] requires = [ "setuptools", - "torch >= 2.3.0", + "torch >= 2.3.0", "ninja", "packaging", "cpufeature" @@ -12,25 +12,7 @@ build-backend = "setuptools.build_meta" name = "ktransformers" -dynamic = ["version"] - -dependencies = [ - "torch >= 2.3.0", - "transformers == 4.43.2", - "fastapi >= 0.111.0", - "uvicorn >= 0.30.1", - "langchain >= 0.2.0", - "blessed >= 1.20.0", - "accelerate >= 0.31.0", - "sentencepiece >= 0.1.97", - "setuptools", - "ninja", - "wheel", - "colorlog", - "build", - "fire", - "protobuf" -] +dynamic = ["version", "dependencies"] requires-python = ">=3.10" diff --git a/setup.py b/setup.py index 5c29b8f5..67cf3048 100644 --- a/setup.py +++ b/setup.py @@ -67,17 +67,17 @@ def get_musa_bare_metal_version(self, musa_dir): def get_rocm_bare_metal_version(self, rocm_dir): """ Get the ROCm version from the ROCm installation directory. - + Args: rocm_dir: Path to the ROCm installation directory - + Returns: A string representation of the ROCm version (e.g., "63" for ROCm 6.3) """ try: # Try using rocm_agent_enumerator to get version info raw_output = subprocess.check_output( - [rocm_dir + "/bin/rocminfo", "--version"], + [rocm_dir + "/bin/rocminfo", "--version"], universal_newlines=True, stderr=subprocess.STDOUT) # Extract version number from output @@ -90,7 +90,7 @@ def get_rocm_bare_metal_version(self, rocm_dir): except (subprocess.CalledProcessError, FileNotFoundError): # If rocminfo --version fails, try alternative methods pass - + try: # Try reading version from release file with open(os.path.join(rocm_dir, "share/doc/hip/version.txt"), "r") as f: @@ -100,7 +100,7 @@ def get_rocm_bare_metal_version(self, rocm_dir): return rocm_version except (FileNotFoundError, IOError): pass - + # If all else fails, try to extract from directory name dir_name = os.path.basename(os.path.normpath(rocm_dir)) match = re.search(r'rocm-(\d+\.\d+)', dir_name) @@ -109,7 +109,7 @@ def get_rocm_bare_metal_version(self, rocm_dir): version = parse(version_str) rocm_version = f"{version.major}{version.minor}" return rocm_version - + # Fallback to extracting from hipcc version try: raw_output = subprocess.check_output( @@ -124,7 +124,7 @@ def get_rocm_bare_metal_version(self, rocm_dir): return rocm_version except (subprocess.CalledProcessError, FileNotFoundError): pass - + # If we still can't determine the version, raise an error raise ValueError(f"Could not determine ROCm version from directory: {rocm_dir}") @@ -319,7 +319,7 @@ def build_extension(self, ext) -> None: raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.") # log cmake_args print("CMake args:", cmake_args) - + build_args = [] if "CMAKE_ARGS" in os.environ: cmake_args += [ @@ -398,6 +398,23 @@ def build_extension(self, ext) -> None: ["cmake", "--build", ".", "--verbose", *build_args], cwd=build_temp, check=True ) +dependencies = [ + "torch >= 2.3.0", + "transformers == 4.43.2", + "fastapi >= 0.111.0", + "uvicorn >= 0.30.1", + "langchain >= 0.2.0", + "blessed >= 1.20.0", + "accelerate >= 0.31.0", + "sentencepiece >= 0.1.97", + "setuptools", + "ninja", + "wheel", + "colorlog", + "build", + "fire", + "protobuf" +] if CUDA_HOME is not None or ROCM_HOME is not None: ops_module = CUDAExtension('KTransformersOps', [ 'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu', @@ -415,6 +432,7 @@ def build_extension(self, ext) -> None: } ) elif MUSA_HOME is not None: + dependencies.remove("torch >= 2.3.0") SimplePorting(cuda_dir_path="ktransformers/ktransformers_ext/cuda", mapping_rule={ # Common rules "at::cuda": "at::musa", @@ -443,6 +461,7 @@ def build_extension(self, ext) -> None: setup( name=VersionInfo.PACKAGE_NAME, version=VersionInfo().get_package_version(), + install_requires=dependencies, cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild}, ext_modules=[ CMakeExtension("cpuinfer_ext"),