Revert "infra: move nvrtc_wrapper to conan (#3282)" (#3573)

kaiyux · web-flow · commit 258ae9c58ce1 · 2025-04-15T22:45:13.000+08:00
This reverts commit c0dd6cb. Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
diff --git a/.gitignore b/.gitignore
@@ -32,7 +32,6 @@ config.json
 /*.svg
 cpp/cmake-build-*
 cpp/.ccache
-cpp/.conan
 tensorrt_llm/bin
 tensorrt_llm/include
 tensorrt_llm/libs
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -358,8 +358,6 @@ if(ENABLE_MULTI_DEVICE)
   find_library(NCCL_LIB nccl HINTS ${NCCL_LIB_DIR})
 endif()
 
-find_package(tensorrt_llm_nvrtc_wrapper REQUIRED)
-
 get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
 
 set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
@@ -371,7 +369,6 @@ include_directories(
   ${CUDAToolkit_INCLUDE_DIRS}
   ${CUDNN_ROOT_DIR}/include
   ${NCCL_INCLUDE_DIR}
-  ${tensorrt_llm_nvrtc_wrapper_INCLUDE_DIRS}
   ${3RDPARTY_DIR}/cutlass/include
   ${3RDPARTY_DIR}/cutlass/tools/util/include
   ${3RDPARTY_DIR}/NVTX/include
diff --git a/cpp/conandata.yml b/cpp/conandata.yml
diff --git a/cpp/conanfile.py b/cpp/conanfile.py
diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt
@@ -139,13 +139,55 @@ find_package(Threads REQUIRED)
 target_link_libraries(${BATCH_MANAGER_TARGET} INTERFACE Threads::Threads)
 target_link_libraries(${EXECUTOR_TARGET} INTERFACE Threads::Threads)
 
-# NVRTC_WRAPPER_LIB_SOURCE_REL_LOC is defined in cpp/conanfile.py
-set(NVRTC_WRAPPER_LIB_BINARY_REL_LOC
-    "kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.so"
-)
-# Copy the .so to build directory, which is needed in build_wheel.py.
-configure_file(${NVRTC_WRAPPER_LIB_SOURCE_REL_LOC}
-               ${NVRTC_WRAPPER_LIB_BINARY_REL_LOC} COPYONLY)
+# Name of the NVRTC wrapper library target that ${SHARED_TARGET} links against,
+# and the architecture sub-directory used to pick the matching prebuilt tarball.
+set(NVRTC_WRAPPER_TARGET tensorrt_llm_nvrtc_wrapper)
+set(NVRTC_WRAPPER_TARGET_ARCH ${TARGET_ARCH})
+
+if(BUILD_NVRTC_WRAPPER)
+  # Build the wrapper from source. NOTE(review): the subdirectory is presumably
+  # what defines the ${NVRTC_WRAPPER_TARGET} target in this mode - confirm.
+  add_subdirectory(
+    kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper)
+else()
+  # Consume the prebuilt wrapper: it is stored in the source tree as a
+  # per-architecture tarball and unpacked into the build tree at build time.
+  add_library(${NVRTC_WRAPPER_TARGET} SHARED IMPORTED)
+  set(NVRTC_WRAPPER_LIB_TARBALL
+      "${CMAKE_CURRENT_SOURCE_DIR}/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${NVRTC_WRAPPER_TARGET_ARCH}/${NVRTC_WRAPPER_TARGET}.tar.xz"
+  )
+  set(NVRTC_WRAPPER_LIB_BINARY_DIR
+      "${CMAKE_CURRENT_BINARY_DIR}/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper"
+  )
+
+  # Fail at configure time when the tarball is implausibly small (e.g. a Git
+  # LFS pointer file instead of the real archive). The size that is checked
+  # must be the size of the nvrtc wrapper tarball itself.
+  file(SIZE "${NVRTC_WRAPPER_LIB_TARBALL}" NVRTC_WRAPPER_LIB_SIZE)
+  if(NVRTC_WRAPPER_LIB_SIZE LESS 1024)
+    message(
+      FATAL_ERROR
+        "The nvrtc wrapper library is truncated or incomplete. This is usually caused by using Git LFS (Large File Storage) incorrectly. Please try running command `git lfs install && git lfs pull`."
+    )
+  endif()
+
+  if(NOT WIN32) # Linux
+    set(NVRTC_WRAPPER_LIB_NAME "lib${NVRTC_WRAPPER_TARGET}.so")
+  else() # Windows
+    set(NVRTC_WRAPPER_LIB_NAME "${NVRTC_WRAPPER_TARGET}.lib")
+    set(NVRTC_WRAPPER_DLL_NAME "${NVRTC_WRAPPER_TARGET}.dll")
+    set(NVRTC_WRAPPER_DLL_PATH
+        "${NVRTC_WRAPPER_LIB_BINARY_DIR}/${NVRTC_WRAPPER_DLL_NAME}")
+  endif()
+  set(NVRTC_WRAPPER_LIB_PATH
+      "${NVRTC_WRAPPER_LIB_BINARY_DIR}/${NVRTC_WRAPPER_LIB_NAME}")
+
+  # Unpack the tarball into the build tree. NVRTC_WRAPPER_DLL_PATH is only set
+  # on Windows; it is deliberately left unquoted in OUTPUT/DEPENDS so that it
+  # expands to no argument at all on other platforms.
+  add_custom_command(
+    OUTPUT "${NVRTC_WRAPPER_LIB_PATH}" ${NVRTC_WRAPPER_DLL_PATH}
+    COMMAND ${CMAKE_COMMAND} -E make_directory
+            "${NVRTC_WRAPPER_LIB_BINARY_DIR}"
+    COMMAND ${CMAKE_COMMAND} -E chdir "${NVRTC_WRAPPER_LIB_BINARY_DIR}"
+            ${CMAKE_COMMAND} -E tar xf "${NVRTC_WRAPPER_LIB_TARBALL}"
+    DEPENDS "${NVRTC_WRAPPER_LIB_TARBALL}"
+    VERBATIM)
+  # An imported target has no build rule of its own, so a helper target drives
+  # the extraction and the imported target is made to depend on it.
+  add_custom_target(
+    ${NVRTC_WRAPPER_TARGET}_helper DEPENDS "${NVRTC_WRAPPER_LIB_PATH}"
+                                           ${NVRTC_WRAPPER_DLL_PATH})
+  add_dependencies(${NVRTC_WRAPPER_TARGET} ${NVRTC_WRAPPER_TARGET}_helper)
+
+  if(WIN32)
+    # For an imported DLL, IMPORTED_LOCATION is the .dll itself and
+    # IMPORTED_IMPLIB is the import library (.lib) handed to the linker.
+    set_target_properties(
+      ${NVRTC_WRAPPER_TARGET}
+      PROPERTIES IMPORTED_LOCATION "${NVRTC_WRAPPER_DLL_PATH}"
+                 IMPORTED_IMPLIB "${NVRTC_WRAPPER_LIB_PATH}")
+  else()
+    set_property(TARGET ${NVRTC_WRAPPER_TARGET}
+                 PROPERTY IMPORTED_LOCATION "${NVRTC_WRAPPER_LIB_PATH}")
+  endif()
+endif()
 
 set(TRTLLM_LINK_LIBS
     ${CUDA_DRV_LIB}
@@ -231,9 +273,7 @@ if(NOT WIN32)
                                                     "-Wl,-rpath='$ORIGIN'")
 endif()
 
-target_link_libraries(
-  ${SHARED_TARGET}
-  PUBLIC tensorrt_llm_nvrtc_wrapper::tensorrt_llm_nvrtc_wrapper)
+target_link_libraries(${SHARED_TARGET} PUBLIC ${NVRTC_WRAPPER_TARGET})
 
 if(BUILD_PYT)
   add_subdirectory(thop)
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/CMakeLists.txt
@@ -20,6 +20,9 @@ file(GLOB_RECURSE SRC_CPP *.cpp)
 set(SRC_CU)
 set(SRC_CU_EXTRA)
 
+# SRC_CPP comes from a recursive glob, so remove every path that contains
+# nvrtcWrapper/src from this directory's source list.
+list(FILTER SRC_CPP EXCLUDE REGEX ".*nvrtcWrapper/src.*")
+
 filter_cuda_archs("80" SRC_CPP)
 filter_cuda_archs("86" SRC_CPP)
 filter_cuda_archs("89" SRC_CPP)
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp
@@ -16,12 +16,12 @@
 #include "compileEngine.h"
 
 #include "cubinObj.h"
+#include "nvrtcWrapper/include/nvrtcWrapper.h"
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/stringUtils.h"
 #include "tensorrt_llm/common/tllmException.h"
 #include "tensorrt_llm/common/utils.h"
 #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h"
-#include <nvrtcWrapper.h>
 #include <string>
 #include <vector>
 
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt
@@ -0,0 +1,2 @@
+5ad6be58302fad71488246c4dea6f96d710143988a195d67b304ea251bd0aa89  libtensorrt_llm_nvrtc_wrapper.so
+commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This file is NOT thread safe.
+ */
+#pragma once
+#include <stddef.h>
+#ifndef __cplusplus
+// The structs below use `bool`; plain C needs <stdbool.h> for it.
+#include <stdbool.h>
+#endif
+
+// DLLEXPORT marks the public entry points: on Windows it exports them while the
+// DLL itself is being compiled (COMPILING_DLL non-zero) and imports them
+// otherwise; on other platforms it expands to nothing.
+#ifdef _WIN32
+
+#if COMPILING_DLL
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT __declspec(dllimport)
+#endif
+
+#else             // _WIN32
+#define DLLEXPORT // Nothing.
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+    // Kernel flavour to JIT-compile.
+    typedef enum
+    {
+        // sm >= 80
+        TLLM_XQA_JIT_HMMA = 0,
+        // sm == 90
+        TLLM_XQA_JIT_QGMMA = 1
+    } tllmXqaJitKernelType;
+
+    // Rotary position embedding style (see tllmXqaJitContext::rope_style).
+    typedef enum
+    {
+        TLLM_XQA_JIT_ROPE_NONE = 0,
+        TLLM_XQA_JIT_ROPE_NEOX = 1,
+        TLLM_XQA_JIT_ROPE_GPTJ = 2
+    } tllmXqaJitRopeStyle;
+
+    // Parameters describing the kernel to be compiled; passed to
+    // tllmXqaJitCreateAndCompileProgram().
+    typedef struct
+    {
+        // Compute capability, e.g. 89.
+        int sm;
+
+        unsigned int head_size;
+        unsigned int num_q_heads;
+        unsigned int num_kv_heads;
+        unsigned int beam_width;
+        unsigned int tokens_per_block;
+        bool multi_query_tokens;
+        unsigned int q_seq_len;
+        bool paged_kv_cache;
+
+        // Actual type: tensorrt_llm::kernels::Data_type
+        int data_type;
+        int kv_cache_data_type;
+
+        tllmXqaJitKernelType kernel_type;
+
+        bool fp8_output;
+        bool use_input_kv;
+        tllmXqaJitRopeStyle rope_style; // useful only when use_input_kv is true.
+    } tllmXqaJitContext;
+
+    // tllmXqaJitProgram is an opaque handle for a program.
+    typedef struct _tllmXqaJitProgram* tllmXqaJitProgram;
+
+    // Status code returned by the tllmXqaJit* functions below.
+    typedef enum
+    {
+        TLLM_XQA_JIT_SUCCESS = 0,
+        TLLM_XQA_JIT_INVALID_INPUT = 1,
+        TLLM_XQA_JIT_INTERNAL_ERROR = 2,
+    } tllmXqaJitStatus;
+
+    // Creates a program for the given context and compiles it; the handle is written to *prog.
+    // context must outlive prog.
+    DLLEXPORT tllmXqaJitStatus tllmXqaJitCreateAndCompileProgram(
+        tllmXqaJitProgram* prog, tllmXqaJitContext const* context);
+    // Writes the size of the compiled CUBIN to *cubinSizeRet.
+    DLLEXPORT tllmXqaJitStatus tllmXqaJitGetCUBINSize(tllmXqaJitProgram prog, size_t* cubinSizeRet);
+    // Copies the compiled CUBIN into cubin.
+    // NOTE(review): presumably cubin must hold at least the size reported by tllmXqaJitGetCUBINSize() - confirm.
+    DLLEXPORT tllmXqaJitStatus tllmXqaJitGetCUBIN(tllmXqaJitProgram prog, char* cubin);
+    // Destroys the program referred to by *prog.
+    DLLEXPORT tllmXqaJitStatus tllmXqaJitDestroyProgram(tllmXqaJitProgram* prog);
+
+    // Returns the size of the error string associated with the last non-success tllmXqaJit function call (including the
+    // trailing \0). Returns 0 if there is no such non-success function call.
+    DLLEXPORT size_t tllmXqaJitGetLastErrorStringSize(void);
+    // Returns the error string.
+    // Output can be nullptr if the returned value of tllmXqaJitGetLastErrorStringSize() is 0.
+    DLLEXPORT void tllmXqaJitGetLastErrorString(char* output);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt
@@ -0,0 +1,2 @@
+9d1104bbe6b4f258482549ec71c9d1aed0de912b5824dced5cf7829bff66ba0d  libtensorrt_llm_nvrtc_wrapper.so
+commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a
diff --git a/docker/Dockerfile.user b/docker/Dockerfile.user
@@ -18,6 +18,4 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-ENV PATH="/home/${USER_NAME}/.local/bin:${PATH}"
-
 USER ${USER_NAME}
diff --git a/docker/Makefile b/docker/Makefile
@@ -107,21 +107,13 @@ DOCKER_RUN_OPTS   ?= --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=6710
 DOCKER_RUN_ARGS   ?=
 GPU_OPTS          ?= --gpus=all
 SOURCE_DIR        ?= $(shell readlink -f ..)
-NVRTC_WRAPPER_DIR ?=
 CODE_DIR          ?= /code/tensorrt_llm
 CCACHE_DIR        ?= ${CODE_DIR}/cpp/.ccache
-CONAN_DIR         ?= ${CODE_DIR}/cpp/.conan
 RUN_CMD           ?=
 CONTAINER_NAME    ?= tensorrt_llm
 WORK_DIR          ?= $(CODE_DIR)
 DOCKER_PULL       ?= 0
 
-ifneq ($(NVRTC_WRAPPER_DIR), )
-NVRTC_WRAPPER_MOUNT := --volume $(NVRTC_WRAPPER_DIR):/mnt/src/tensorrt_llm_nvrtc_wrapper
-else
-NVRTC_WRAPPER_MOUNT :=
-endif
-
 %_run:
 ifeq ($(DOCKER_PULL),1)
 	@$(MAKE) --no-print-directory $*_pull
@@ -132,10 +124,8 @@ endif
 	docker run $(DOCKER_RUN_OPTS) $(DOCKER_RUN_ARGS) \
     		$(GPU_OPTS) \
     		--volume $(SOURCE_DIR):$(CODE_DIR) \
-    		$(NVRTC_WRAPPER_MOUNT) \
     		--env "CCACHE_DIR=${CCACHE_DIR}" \
     		--env "CCACHE_BASEDIR=${CODE_DIR}" \
-    	    --env "CONAN_HOME=${CONAN_DIR}" \
     		--workdir $(WORK_DIR) \
     		--hostname $(shell hostname)-$* \
     		--name $(CONTAINER_NAME)-$*-$(USER_NAME) \
diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+5ad6be58302fad71488246c4dea6f96d710143988a195d67b304ea251bd0aa89 libtensorrt_llm_nvrtc_wrapper.so`
	`2`	`+commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+9d1104bbe6b4f258482549ec71c9d1aed0de912b5824dced5cf7829bff66ba0d libtensorrt_llm_nvrtc_wrapper.so`
	`2`	`+commit 9c24486cb2cd9dd9582b311b84e1b428d29a735a`