fix: fix the L0_infer tests for expected num tests #8221

Closed · wants to merge 4 commits
32 changes: 18 additions & 14 deletions build.py
@@ -622,7 +622,7 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
 
     cargs += cmake_backend_extra_args(be)
     if be == "tensorrtllm":
-        cargs.append("-S ../inflight_batcher_llm -B .")
+        cargs.append("-S ../triton_backend/inflight_batcher_llm -B .")
 
     else:
        cargs.append("..")
@@ -1481,12 +1481,12 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
     if "vllm" in backends:
         df += f"""
 # Install required packages for vLLM models
 ARG BUILD_PUBLIC_VLLM="true"
-ARG VLLM_INDEX_URL
-ARG PYTORCH_TRITON_URL
-ARG NVPL_SLIM_URL
 
 RUN --mount=type=secret,id=req,target=/run/secrets/requirements \\
+    --mount=type=secret,id=VLLM_INDEX_URL,env=VLLM_INDEX_URL \\
+    --mount=type=secret,id=PYTORCH_TRITON_URL,env=PYTORCH_TRITON_URL \\
+    --mount=type=secret,id=NVPL_SLIM_URL,env=NVPL_SLIM_URL \\
     if [ "$BUILD_PUBLIC_VLLM" = "false" ]; then \\
         if [ "$(uname -m)" = "x86_64" ]; then \\
             pip3 install --no-cache-dir \\
@@ -1900,10 +1900,10 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
     if secrets:
         finalargs += [
             f"--secret id=req,src={requirements}",
-            f"--build-arg VLLM_INDEX_URL={vllm_index_url}",
-            f"--build-arg PYTORCH_TRITON_URL={pytorch_triton_url}",
+            f"--secret id=VLLM_INDEX_URL",
+            f"--secret id=PYTORCH_TRITON_URL",
+            f"--secret id=NVPL_SLIM_URL",
             f"--build-arg BUILD_PUBLIC_VLLM={build_public_vllm}",
-            f"--build-arg NVPL_SLIM_URL={nvpl_slim_url}",
         ]
     finalargs += [
         "-t",
@@ -2081,7 +2081,16 @@ def backend_build(
     cmake_script.comment()
     cmake_script.mkdir(build_dir)
     cmake_script.cwd(build_dir)
-    cmake_script.gitclone(backend_repo(be), tag, be, github_organization)
+    if be == "tensorrtllm":
+        github_organization = (
+            "https://github.com/NVIDIA"
+            if "triton-inference-server" in FLAGS.github_organization
+            else FLAGS.github_organization
+        )
+        repository_name = "TensorRT-LLM"
+        cmake_script.gitclone(repository_name, tag, be, github_organization)
+    else:
+        cmake_script.gitclone(backend_repo(be), tag, be, github_organization)
 
     if be == "tensorrtllm":
         tensorrtllm_prebuild(cmake_script)
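Aside: the new clone logic above amounts to the following condensed restatement. This is a hypothetical sketch for clarity, not code from the PR; `resolve_tensorrtllm_origin` is an invented helper name.

```python
def resolve_tensorrtllm_origin(github_organization: str) -> tuple[str, str]:
    """Pick where tensorrtllm backend sources are cloned from.

    With the default triton-inference-server organization, the backend now
    clones the upstream NVIDIA/TensorRT-LLM repository; an explicit
    --github-organization override is respected as-is.
    """
    if "triton-inference-server" in github_organization:
        return "https://github.com/NVIDIA", "TensorRT-LLM"
    return github_organization, "TensorRT-LLM"
```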
@@ -2769,8 +2778,6 @@ def enable_all():
         metavar=("key", "value"),
         help="Add build secrets in the form of <key> <value>. These secrets are used during the build process for vllm. The secrets are passed to the Docker build step as `--secret id=<key>`. The following keys are expected and their purposes are described below:\n\n"
         " - 'req': A file containing a list of dependencies for pip (e.g., requirements.txt).\n"
-        " - 'vllm_index_url': The index URL for the pip install.\n"
-        " - 'pytorch_triton_url': The location of the PyTorch wheel to download.\n"
         " - 'build_public_vllm': A flag (default is 'true') indicating whether to build the public VLLM version.\n\n"
         "Ensure that the required environment variables for these secrets are set before running the build.",
     )
@@ -2892,9 +2899,6 @@ def enable_all():
     secrets = dict(getattr(FLAGS, "build_secret", []))
     if secrets:
         requirements = secrets.get("req", "")
-        vllm_index_url = secrets.get("vllm_index_url", "")
-        pytorch_triton_url = secrets.get("pytorch_triton_url", "")
-        nvpl_slim_url = secrets.get("nvpl_slim_url", "")
         build_public_vllm = secrets.get("build_public_vllm", "true")
         log('Build Arg for BUILD_PUBLIC_VLLM: "{}"'.format(build_public_vllm))
 
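Taken together, the build.py changes swap plaintext `--build-arg` values for BuildKit secrets. A minimal sketch of the resulting `docker build` invocation, assuming the VLLM_INDEX_URL, PYTORCH_TRITON_URL, and NVPL_SLIM_URL environment variables are exported by the caller (the helper name below is hypothetical):

```python
import subprocess

def docker_build_with_secrets(image_tag, requirements_file, build_public_vllm="true"):
    """Hedged sketch of the secret-based build command assembled by
    create_docker_build_script(); not the PR's exact code."""
    cmd = [
        "docker", "build",
        # BuildKit resolves a bare id=<NAME> secret from the environment
        # variable of the same name, so the URLs stay out of image history,
        # unlike --build-arg values, which persist in `docker history`.
        "--secret", f"id=req,src={requirements_file}",
        "--secret", "id=VLLM_INDEX_URL",
        "--secret", "id=PYTORCH_TRITON_URL",
        "--secret", "id=NVPL_SLIM_URL",
        "--build-arg", f"BUILD_PUBLIC_VLLM={build_public_vllm}",
        "-t", image_tag,
        ".",
    ]
    subprocess.run(cmd, check=True)
```

The Dockerfile side consumes each secret with `--mount=type=secret,id=<NAME>,env=<NAME>`, so the value is visible only for the duration of that single RUN instruction.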
8 changes: 6 additions & 2 deletions qa/L0_backend_python/common.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -85,7 +85,11 @@ create_conda_env_with_specified_path() {
 create_python_backend_stub() {
   rm -rf python_backend
   git clone ${TRITON_REPO_ORGANIZATION}/python_backend -b $PYTHON_BACKEND_REPO_TAG
+  CUDA_PATH=$(readlink -f /usr/local/cuda)
   (cd python_backend/ && mkdir builddir && cd builddir && \
-  cmake -DTRITON_ENABLE_GPU=ON -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} -DTRITON_BACKEND_REPO_TAG=$TRITON_BACKEND_REPO_TAG -DTRITON_COMMON_REPO_TAG=$TRITON_COMMON_REPO_TAG -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG -DPYBIND11_PYTHON_VERSION=$PY_VERSION ../ && \
+  cmake -DTRITON_ENABLE_GPU=ON -DCMAKE_CUDA_COMPILER=$CUDA_PATH/bin/nvcc \
+      -DCUDAToolkit_ROOT=$CUDA_PATH -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \
+      -DTRITON_BACKEND_REPO_TAG=$TRITON_BACKEND_REPO_TAG -DTRITON_COMMON_REPO_TAG=$TRITON_COMMON_REPO_TAG \
+      -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG -DPYBIND11_PYTHON_VERSION=$PY_VERSION ../ && \
   make -j18 triton-python-backend-stub)
 }
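The new CUDA_PATH line pins the stub build to a concrete CUDA toolkit rather than whatever CMake autodetects. A rough Python equivalent of what `readlink -f /usr/local/cuda` resolves, for illustration only:

```python
import os

# /usr/local/cuda is conventionally a symlink to a versioned toolkit
# directory; resolving it gives CMake stable CMAKE_CUDA_COMPILER and
# CUDAToolkit_ROOT values (e.g. /usr/local/cuda-12.x/bin/nvcc).
cuda_path = os.path.realpath("/usr/local/cuda")
nvcc = os.path.join(cuda_path, "bin", "nvcc")
print(cuda_path, nvcc)
```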
18 changes: 6 additions & 12 deletions qa/L0_backend_python/env/test.sh
@@ -53,8 +53,8 @@ conda install numpy=1.26.4 -y
 if [ $TRITON_RHEL -eq 1 ]; then
     TORCH_VERISON="2.17.0"
 fi
-conda install torch=${TORCH_VERSION} -y
-PY312_VERSION_STRING="Python version is 3.12, NumPy version is 1.26.4, and PyTorch version is ${TORCH_VERISON}"
+conda install pytorch=${TORCH_VERSION} -y
+PY312_VERSION_STRING="Python version is 3.12, NumPy version is 1.26.4, and PyTorch version is ${TORCH_VERSION}"
 conda pack -o python3.12.tar.gz
 mkdir -p models/python_3_12/1/
 cp ../../python_models/python_version/config.pbtxt ./models/python_3_12
@@ -122,7 +122,7 @@ fi
 kill_server
 
 set +e
-grep "Locale is ('en_US', 'UTF-8')" $SERVER_LOG
+grep "Locale is ('C', 'UTF-8')" $SERVER_LOG
 if [ $? -ne 0 ]; then
     cat $SERVER_LOG
     echo -e "\n***\n*** Locale UTF-8 was not found in Triton logs. \n***"
@@ -182,10 +182,6 @@ aws s3 mb "${BUCKET_URL}"
 BUCKET_URL=${BUCKET_URL%/}
 BUCKET_URL_SLASH="${BUCKET_URL}/"
 
-# Remove Python 3.7 model because it contains absolute paths and cannot be used
-# with S3.
-rm -rf models/python_3_7
-
 # Test with the bucket url as model repository
 aws s3 cp models/ "${BUCKET_URL_SLASH}" --recursive --include "*"
 
@@ -205,10 +201,10 @@ fi
 kill_server
 
 set +e
-grep "$PY36_VERSION_STRING" $SERVER_LOG
+grep "$PY312_VERSION_STRING" $SERVER_LOG
 if [ $? -ne 0 ]; then
     cat $SERVER_LOG
-    echo -e "\n***\n*** $PY36_VERSION_STRING was not found in Triton logs. \n***"
+    echo -e "\n***\n*** $PY312_VERSION_STRING was not found in Triton logs. \n***"
     RET=1
 fi
 set -e
@@ -217,8 +213,6 @@ set -e
 aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*"
 
 # Test with EXECUTION_ENV_PATH outside the model directory
-sed -i "s/TRITON_MODEL_DIRECTORY\/python_3_6_environment/TRITON_MODEL_DIRECTORY\/..\/python_3_6_environment/" models/python_3_6/config.pbtxt
-mv models/python_3_6/python_3_6_environment.tar.gz models
 sed -i "s/\$\$TRITON_MODEL_DIRECTORY\/python_3_12_environment/s3:\/\/triton-bucket-${CI_JOB_ID}\/python_3_12_environment/" models/python_3_12/config.pbtxt
 mv models/python_3_12/python_3_12_environment.tar.gz models
 
@@ -238,7 +232,7 @@ fi
 kill_server
 
 set +e
-for EXPECTED_VERSION_STRING in "$PY36_VERSION_STRING" "$PY312_VERSION_STRING"; do
+for EXPECTED_VERSION_STRING in "$PY312_VERSION_STRING"; do
     grep "$EXPECTED_VERSION_STRING" $SERVER_LOG
     if [ $? -ne 0 ]; then
         cat $SERVER_LOG
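The grep targets above are exact-match sentences emitted by the python_version test model. A hedged sketch of how such a string is presumably assembled (the real model lives under qa/python_models/python_version and may differ in detail):

```python
import sys

import numpy
import torch

# Must match PY312_VERSION_STRING byte-for-byte, which is why both the
# conda package rename (torch -> pytorch) and the TORCH_VERSION variable
# spelling in the diff above matter for the log check.
print(
    f"Python version is {sys.version_info.major}.{sys.version_info.minor}, "
    f"NumPy version is {numpy.__version__}, "
    f"and PyTorch version is {torch.__version__}",
    flush=True,
)
```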
4 changes: 2 additions & 2 deletions qa/L0_backend_python/examples/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -69,7 +69,7 @@ pip3 install validators
 # Install JAX
 # Jax has dropped the support for Python 3.8. See https://jax.readthedocs.io/en/latest/changelog.html
 if [ "$TEST_JETSON" == "0" ] && [ ${PYTHON_ENV_VERSION} != "8" ]; then
-    pip3 install --upgrade "jax[cuda12_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+    pip install -U "jax[cuda12]"
 fi
 
 git clone ${TRITON_REPO_ORGANIZATION}/python_backend -b $PYTHON_BACKEND_REPO_TAG
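The switch from `jax[cuda12_local]` plus a find-links URL to plain `jax[cuda12]` relies on JAX's pip-distributed CUDA wheels, so no local CUDA find-links index is needed. A quick sanity check one could run afterwards (not part of the test itself):

```python
import jax

# On a GPU machine this should list CudaDevice entries; if JAX fell back
# to the CPU-only wheels, the list will contain only CpuDevice.
print(jax.devices())
```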
2 changes: 1 addition & 1 deletion qa/L0_backend_python/setup_python_enviroment.sh
@@ -56,7 +56,7 @@ conda update -n base -c defaults conda -y
 # been setup correctly.
 if [ ${PYTHON_ENV_VERSION} = "11" ]; then
     create_conda_env "3.11" "python-3-11"
-    conda install torch=2.6.0 -y
+    conda install pytorch=2.6.0 -y
     conda install -c conda-forge libstdcxx-ng=14 -y
     conda install numpy=1.23.5 -y
     EXPECTED_VERSION_STRING="Python version is 3.11, NumPy version is 1.23.5, and PyTorch version is 2.6.0"
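The `torch` to `pytorch` rename matters because the conda package is published as `pytorch`, while the import (and pip) name is `torch`. A quick post-install check, assuming the 2.6.0 pin above:

```python
import torch

# The conda package is "pytorch" but the module is still imported as
# "torch"; verify the pinned version actually landed in the environment.
assert torch.__version__.startswith("2.6"), torch.__version__
print("PyTorch", torch.__version__, "OK")
```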
2 changes: 1 addition & 1 deletion qa/L0_infer/test.sh
@@ -73,7 +73,7 @@ fi
 if [ "$TEST_SYSTEM_SHARED_MEMORY" -eq 1 ] || [ "$TEST_CUDA_SHARED_MEMORY" -eq 1 ]; then
     EXPECTED_NUM_TESTS=${EXPECTED_NUM_TESTS:="33"}
 else
-    EXPECTED_NUM_TESTS=${EXPECTED_NUM_TESTS:="44"}
+    EXPECTED_NUM_TESTS=${EXPECTED_NUM_TESTS:="46"}
 fi
 
 TEST_JETSON=${TEST_JETSON:=0}
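The `${EXPECTED_NUM_TESTS:="46"}` idiom keeps any value exported by the caller and only falls back to the new default of 46 (up from 44, presumably reflecting two newly added test cases). In Python terms, roughly:

```python
import os

# Caller-provided overrides win; otherwise use the new non-shared-memory
# default of 46. (The shell form also replaces an empty value, which
# os.environ.get does not; close enough for illustration.)
expected_num_tests = int(os.environ.get("EXPECTED_NUM_TESTS", "46"))
print(expected_num_tests)
```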