
Commit db9655e

Update README and versions for 2.53.0 / 24.12 (#7864)
1 parent 83d0e30 commit db9655e

27 files changed, +72 -298 lines changed

Dockerfile.sdk

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@
 #

 # Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.11-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.12-py3-min

 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
 ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
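
The SDK image now defaults to the 24.12 minimum Triton container. As a rough sketch (the `tritonserver-sdk` tag is only an illustrative name; the build arg and base image tag come from the Dockerfile above), the updated default can be used or overridden like this:

```bash
# Build the SDK container against the new default base image.
docker build -f Dockerfile.sdk -t tritonserver-sdk:24.12 .

# Or pin the minimum base image explicitly via the build arg shown above.
docker build -f Dockerfile.sdk \
    --build-arg BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.12-py3-min \
    -t tritonserver-sdk:24.12 .
```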

Dockerfile.win10.min

Lines changed: 18 additions & 14 deletions
@@ -37,9 +37,9 @@ RUN choco install unzip -y
 #
 # Installing TensorRT
 #
-ARG TENSORRT_VERSION=10.4.0.26
+ARG TENSORRT_VERSION=10.7.0.23
 ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip"
-ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip
+ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/zip/TensorRT-10.7.0.23.Windows.win10.cuda-12.6.zip
 # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP}
 ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP}
 RUN unzip /tmp/%TENSORRT_ZIP%
@@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"
 #
 # Installing cuDNN
 #
-ARG CUDNN_VERSION=9.4.0.58
+ARG CUDNN_VERSION=9.6.0.74
 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip
-ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip
+ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.6.0.74_cuda12-archive.zip
 ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP}
 RUN unzip /tmp/%CUDNN_ZIP%
 RUN move cudnn-* cudnn
@@ -75,20 +75,19 @@ RUN choco install git docker unzip -y
 #
 # Installing python
 #
-ARG PYTHON_VERSION=3.10.11
+ARG PYTHON_VERSION=3.12.3
 ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe
 ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe
 RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%"
 RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe"
 RUN pip install --upgrade wheel setuptools docker
-RUN pip install grpcio-tools psutil

 LABEL PYTHON_VERSION=${PYTHON_VERSION}

 #
 # Installing CMake
 #
-ARG CMAKE_VERSION=3.30.0
+ARG CMAKE_VERSION=3.30.5
 RUN pip install cmake==%CMAKE_VERSION%

 ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake
@@ -101,14 +100,16 @@ LABEL CMAKE_VERSION=${CMAKE_VERSION}
 #
 # Installing Visual Studio BuildTools: VS17 2022
 #
-ARG BUILDTOOLS_VERSION=17.10.35201.131
 # Download collect.exe in case of an install failure.
 ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe"

 # Use the latest release channel. For more control, specify the location of an internal layout.
 # Download the Build Tools bootstrapper.
 # ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe
-ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/28626b4b-f88f-4b55-a0cf-f3eaa2c643fb/e6c43d4dfb36338d954cdb3ad9010ab2a479e712088f4f6b016eadcc721bab28/vs_BuildTools.exe
+
+ARG BUILDTOOLS_VERSION=17.12.35506.116
+ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/5536698c-711c-4834-876f-2817d31a2ef2/58894fc272e86d3c3a6d85bf3a1df1e5a0685be8b9ab65d9f3cc5c2a8c6921cc/vs_BuildTools.exe
+
 ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe
 # Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended.
 ARG VS_INSTALL_PATH_WP="C:\BuildTools"
@@ -149,12 +150,13 @@ WORKDIR /
 # Installing CUDA
 #
 ARG CUDA_MAJOR=12
-ARG CUDA_MINOR=5
-ARG CUDA_PATCH=1
+ARG CUDA_MINOR=6
+ARG CUDA_PATCH=3
 ARG CUDA_VERSION=${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}
 ARG CUDA_PACKAGES="nvcc_${CUDA_MAJOR}.${CUDA_MINOR} \
 cudart_${CUDA_MAJOR}.${CUDA_MINOR} \
 nvml_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
+nvrtc_${CUDA_MAJOR}.${CUDA_MINOR} nvrtc_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
 cublas_${CUDA_MAJOR}.${CUDA_MINOR} cublas_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
 cufft_${CUDA_MAJOR}.${CUDA_MINOR} cufft_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
 curand_${CUDA_MAJOR}.${CUDA_MINOR} curand_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
@@ -175,21 +177,23 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi

 RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%"

-ARG CUDNN_VERSION=9.4.0.58
+ENV CUDA_VERSION=${CUDA_VERSION}
+LABEL CUDA_VERSION="${CUDA_VERSION}"
+
+ARG CUDNN_VERSION=9.6.0.74
 ENV CUDNN_VERSION ${CUDNN_VERSION}
 COPY --from=dependency_base /cudnn /cudnn
 RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\."
 RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\."
 RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\."
 LABEL CUDNN_VERSION="${CUDNN_VERSION}"

-ARG TENSORRT_VERSION=10.4.0.26
+ARG TENSORRT_VERSION=10.7.0.23
 ENV TRT_VERSION ${TENSORRT_VERSION}
 COPY --from=dependency_base /TensorRT /TensorRT
 RUN setx PATH "c:\TensorRT\lib;%PATH%"
 LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"

-LABEL CUDA_VERSION="${CUDA_VERSION}"
 # It is important that the entrypoint initialize VisualStudio
 # environment otherwise the build will fail. Also set
 # CMAKE_TOOLCHAIN_FILE and VCPKG_TARGET_TRIPLET so
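
Taken together, these changes move the Windows min image to TensorRT 10.7.0.23, cuDNN 9.6.0.74, Python 3.12.3, CMake 3.30.5, CUDA 12.6.3, and the 17.12 Visual Studio Build Tools. A minimal sketch of building the image on a Windows host with Windows containers enabled; the `win10-py3-min` tag is only a conventional name, and the build args shown simply restate the new defaults:

```bash
# Build the Windows "min" image with the updated defaults.
docker build -t win10-py3-min -f Dockerfile.win10.min .

# The bumped versions can also be pinned explicitly if needed.
docker build -t win10-py3-min -f Dockerfile.win10.min \
    --build-arg TENSORRT_VERSION=10.7.0.23 \
    --build-arg CUDNN_VERSION=9.6.0.74 .
```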

README.md

Lines changed: 2 additions & 232 deletions
@@ -30,235 +30,5 @@

 [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)

->[!WARNING]
->You are currently on the `main` branch which tracks under-development progress
->towards the next release. The current release is version [2.52.0](https://github.com/triton-inference-server/server/releases/latest)
->and corresponds to the 24.11 container release on NVIDIA GPU Cloud (NGC).
-
-Triton Inference Server is an open source inference serving software that
-streamlines AI inferencing. Triton enables teams to deploy any AI model from
-multiple deep learning and machine learning frameworks, including TensorRT,
-TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton
-Inference Server supports inference across cloud, data center, edge and embedded
-devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference
-Server delivers optimized performance for many query types, including real time,
-batched, ensembles and audio/video streaming. Triton inference Server is part of
-[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/),
-a software platform that accelerates the data science pipeline and streamlines
-the development and deployment of production AI.
-
-Major features include:
-
-- [Supports multiple deep learning
-frameworks](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton)
-- [Supports multiple machine learning
-frameworks](https://github.com/triton-inference-server/fil_backend)
-- [Concurrent model
-execution](docs/user_guide/architecture.md#concurrent-model-execution)
-- [Dynamic batching](docs/user_guide/model_configuration.md#dynamic-batcher)
-- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and
-[implicit state management](docs/user_guide/architecture.md#implicit-state-management)
-for stateful models
-- Provides [Backend API](https://github.com/triton-inference-server/backend) that
-allows adding custom backends and pre/post processing operations
-- Supports writing custom backends in python, a.k.a.
-[Python-based backends.](https://github.com/triton-inference-server/backend/blob/main/docs/python_based_backends.md#python-based-backends)
-- Model pipelines using
-[Ensembling](docs/user_guide/architecture.md#ensemble-models) or [Business
-Logic Scripting
-(BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting)
-- [HTTP/REST and GRPC inference
-protocols](docs/customization_guide/inference_protocols.md) based on the community
-developed [KServe
-protocol](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2)
-- A [C API](docs/customization_guide/inference_protocols.md#in-process-triton-server-api) and
-[Java API](docs/customization_guide/inference_protocols.md#java-bindings-for-in-process-triton-server-api)
-allow Triton to link directly into your application for edge and other in-process use cases
-- [Metrics](docs/user_guide/metrics.md) indicating GPU utilization, server
-throughput, server latency, and more
-
-**New to Triton Inference Server?** Make use of
-[these tutorials](https://github.com/triton-inference-server/tutorials)
-to begin your Triton journey!
-
-Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and
-stay current on the latest product updates, bug fixes, content, best practices,
-and more. Need enterprise support? NVIDIA global support is available for Triton
-Inference Server with the
-[NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/).
-
-## Serve a Model in 3 Easy Steps
-
-```bash
-# Step 1: Create the example model repository
-git clone -b r24.11 https://github.com/triton-inference-server/server.git
-cd server/docs/examples
-./fetch_models.sh
-
-# Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.11-py3 tritonserver --model-repository=/models
-
-# Step 3: Sending an Inference Request
-# In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.11-py3-sdk
-/workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
-
-# Inference should return the following
-Image '/workspace/images/mug.jpg':
-15.346230 (504) = COFFEE MUG
-13.224326 (968) = CUP
-10.422965 (505) = COFFEEPOT
-```
-Please read the [QuickStart](docs/getting_started/quickstart.md) guide for additional information
-regarding this example. The quickstart guide also contains an example of how to launch Triton on [CPU-only systems](docs/getting_started/quickstart.md#run-on-cpu-only-system). New to Triton and wondering where to get started? Watch the [Getting Started video](https://youtu.be/NQDtfSi5QF4).
-
-## Examples and Tutorials
-
-Check out [NVIDIA LaunchPad](https://www.nvidia.com/en-us/data-center/products/ai-enterprise-suite/trial/)
-for free access to a set of hands-on labs with Triton Inference Server hosted on
-NVIDIA infrastructure.
-
-Specific end-to-end examples for popular models, such as ResNet, BERT, and DLRM
-are located in the
-[NVIDIA Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples)
-page on GitHub. The
-[NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-triton-inference-server)
-contains additional documentation, presentations, and examples.
-
-## Documentation
-
-### Build and Deploy
-
-The recommended way to build and use Triton Inference Server is with Docker
-images.
-
-- [Install Triton Inference Server with Docker containers](docs/customization_guide/build.md#building-with-docker) (*Recommended*)
-- [Install Triton Inference Server without Docker containers](docs/customization_guide/build.md#building-without-docker)
-- [Build a custom Triton Inference Server Docker container](docs/customization_guide/compose.md)
-- [Build Triton Inference Server from source](docs/customization_guide/build.md#building-on-unsupported-platforms)
-- [Build Triton Inference Server for Windows 10](docs/customization_guide/build.md#building-for-windows-10)
-- Examples for deploying Triton Inference Server with Kubernetes and Helm on [GCP](deploy/gcp/README.md),
-[AWS](deploy/aws/README.md), and [NVIDIA FleetCommand](deploy/fleetcommand/README.md)
-- [Secure Deployment Considerations](docs/customization_guide/deploy.md)
-
-### Using Triton
-
-#### Preparing Models for Triton Inference Server
-
-The first step in using Triton to serve your models is to place one or
-more models into a [model repository](docs/user_guide/model_repository.md). Depending on
-the type of the model and on what Triton capabilities you want to enable for
-the model, you may need to create a [model
-configuration](docs/user_guide/model_configuration.md) for the model.
-
-- [Add custom operations to Triton if needed by your model](docs/user_guide/custom_operations.md)
-- Enable model pipelining with [Model Ensemble](docs/user_guide/architecture.md#ensemble-models)
-and [Business Logic Scripting (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting)
-- Optimize your models setting [scheduling and batching](docs/user_guide/architecture.md#models-and-schedulers)
-parameters and [model instances](docs/user_guide/model_configuration.md#instance-groups).
-- Use the [Model Analyzer tool](https://github.com/triton-inference-server/model_analyzer)
-to help optimize your model configuration with profiling
-- Learn how to [explicitly manage what models are available by loading and
-unloading models](docs/user_guide/model_management.md)
-
-#### Configure and Use Triton Inference Server
-
-- Read the [Quick Start Guide](docs/getting_started/quickstart.md) to run Triton Inference
-Server on both GPU and CPU
-- Triton supports multiple execution engines, called
-[backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including
-[TensorRT](https://github.com/triton-inference-server/tensorrt_backend),
-[TensorFlow](https://github.com/triton-inference-server/tensorflow_backend),
-[PyTorch](https://github.com/triton-inference-server/pytorch_backend),
-[ONNX](https://github.com/triton-inference-server/onnxruntime_backend),
-[OpenVINO](https://github.com/triton-inference-server/openvino_backend),
-[Python](https://github.com/triton-inference-server/python_backend), and more
-- Not all the above backends are supported on every platform supported by Triton.
-Look at the
-[Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md)
-to learn which backends are supported on your target platform.
-- Learn how to [optimize performance](docs/user_guide/optimization.md) using the
-[Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md)
-and
-[Model Analyzer](https://github.com/triton-inference-server/model_analyzer)
-- Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in
-Triton
-- Send requests directly to Triton with the [HTTP/REST JSON-based
-or gRPC protocols](docs/customization_guide/inference_protocols.md#httprest-and-grpc-protocols)
-
-#### Client Support and Examples
-
-A Triton *client* application sends inference and other requests to Triton. The
-[Python and C++ client libraries](https://github.com/triton-inference-server/client)
-provide APIs to simplify this communication.
-
-- Review client examples for [C++](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/examples),
-[Python](https://github.com/triton-inference-server/client/blob/main/src/python/examples),
-and [Java](https://github.com/triton-inference-server/client/blob/main/src/java/src/main/java/triton/client/examples)
-- Configure [HTTP](https://github.com/triton-inference-server/client#http-options)
-and [gRPC](https://github.com/triton-inference-server/client#grpc-options)
-client options
-- Send input data (e.g. a jpeg image) directly to Triton in the [body of an HTTP
-request without any additional metadata](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#raw-binary-request)
-
-### Extend Triton
-
-[Triton Inference Server's architecture](docs/user_guide/architecture.md) is specifically
-designed for modularity and flexibility
-
-- [Customize Triton Inference Server container](docs/customization_guide/compose.md) for your use case
-- [Create custom backends](https://github.com/triton-inference-server/backend)
-in either [C/C++](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api)
-or [Python](https://github.com/triton-inference-server/python_backend)
-- Create [decoupled backends and models](docs/user_guide/decoupled_models.md) that can send
-multiple responses for a request or not send any responses for a request
-- Use a [Triton repository agent](docs/customization_guide/repository_agents.md) to add functionality
-that operates when a model is loaded and unloaded, such as authentication,
-decryption, or conversion
-- Deploy Triton on [Jetson and JetPack](docs/user_guide/jetson.md)
-- [Use Triton on AWS
-Inferentia](https://github.com/triton-inference-server/python_backend/tree/main/inferentia)
-
-### Additional Documentation
-
-- [FAQ](docs/user_guide/faq.md)
-- [User Guide](docs/README.md#user-guide)
-- [Customization Guide](docs/README.md#customization-guide)
-- [Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html)
-- [GPU, Driver, and CUDA Support
-Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html)
-
-## Contributing
-
-Contributions to Triton Inference Server are more than welcome. To
-contribute please review the [contribution
-guidelines](CONTRIBUTING.md). If you have a backend, client,
-example or similar contribution that is not modifying the core of
-Triton, then you should file a PR in the [contrib
-repo](https://github.com/triton-inference-server/contrib).
-
-## Reporting problems, asking questions
-
-We appreciate any feedback, questions or bug reporting regarding this project.
-When posting [issues in GitHub](https://github.com/triton-inference-server/server/issues),
-follow the process outlined in the [Stack Overflow document](https://stackoverflow.com/help/mcve).
-Ensure posted examples are:
-- minimal – use as little code as possible that still produces the
-same problem
-- complete – provide all parts needed to reproduce the problem. Check
-if you can strip external dependencies and still show the problem. The
-less time we spend on reproducing problems the more time we have to
-fix it
-- verifiable – test the code you're about to provide to make sure it
-reproduces the problem. Remove all other problems that are not
-related to your request/question.
-
-For issues, please use the provided bug report and feature request templates.
-
-For questions, we recommend posting in our community
-[GitHub Discussions.](https://github.com/triton-inference-server/server/discussions)
-
-## For more information
-
-Please refer to the [NVIDIA Developer Triton page](https://developer.nvidia.com/nvidia-triton-inference-server)
-for more information.
+> [!WARNING]
+> You are currently on the `24.12` branch which tracks under-development and unreleased features.
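
For reference, the quickstart that was removed from the README maps onto this release roughly as follows. This is a sketch that assumes the 24.12 NGC image and an `r24.12` release branch follow the same naming as previous releases:

```bash
# Fetch the example models (assumes the r24.12 branch is published).
git clone -b r24.12 https://github.com/triton-inference-server/server.git
cd server/docs/examples
./fetch_models.sh

# Launch Triton from the 24.12 NGC container.
docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models \
    nvcr.io/nvidia/tritonserver:24.12-py3 tritonserver --model-repository=/models
```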

TRITON_VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.53.0dev
+2.53.0

build.py

Lines changed: 4 additions & 4 deletions
@@ -71,10 +71,10 @@
 #

 DEFAULT_TRITON_VERSION_MAP = {
-    "release_version": "2.53.0dev",
-    "triton_container_version": "24.12dev",
-    "upstream_container_version": "24.11",
-    "ort_version": "1.19.2",
+    "release_version": "2.53.0",
+    "triton_container_version": "24.12",
+    "upstream_container_version": "24.12",
+    "ort_version": "1.20.1",
     "ort_openvino_version": "2024.4.0",
     "standalone_openvino_version": "2024.4.0",
     "dcgm_version": "3.3.6",

deploy/aws/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 replicaCount: 1

 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.11-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.12-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: s3://triton-inference-server-repository/model_repository
   numGpus: 1
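
The same image can also be set without editing the chart by overriding the value at install time. A minimal sketch, assuming a release name of `triton` and that the chart is installed from this repository's `deploy/aws` directory:

```bash
# Install or upgrade the AWS chart, overriding the server image.
helm upgrade --install triton ./deploy/aws \
    --set image.imageName=nvcr.io/nvidia/tritonserver:24.12-py3
```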

deploy/fleetcommand/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@

 apiVersion: v1
 # appVersion is the Triton version; update when changing release
-appVersion: "2.51.0"
+appVersion: "2.53.0"
 description: Triton Inference Server (Fleet Command)
 name: triton-inference-server
 # version is the Chart version; update when changing anything in the chart
