
Commit 1f6aed2

mc-nv, nv-kmcgill53, ziqifan617, and yinggeh authored
TPRD-1326: merge kmcgill-remove-tf (#8113)
Co-authored-by: Kyle McGill <kmcgill@nvidia.com>
Co-authored-by: Ziqi Fan <ziqif@nvidia.com>
Co-authored-by: Yingge He <yinggeh@nvidia.com>
Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com>
1 parent 3bca828 commit 1f6aed2

File tree: 231 files changed, +1196 additions, −7972 deletions


Dockerfile.QA

Lines changed: 19 additions & 12 deletions
@@ -61,6 +61,7 @@ RUN apt-get update && \
             python3-pip \
             python3-wheel \
             python3-setuptools \
+            python3-venv \
             rapidjson-dev \
             software-properties-common && \
     rm -rf /var/lib/apt/lists/*
@@ -74,12 +75,19 @@ RUN apt update -q=2 \
     && apt-get install -y --no-install-recommends cmake=3.28.3* cmake-data=3.28.3*
 
 # Add inception_graphdef model to example repo
+# FIXME: This should be changed to using the fetch_models.sh script
+# in order to ensure the public facing docs are up-to-date.
 WORKDIR /workspace/docs/examples/model_repository
-RUN mkdir -p inception_graphdef/1 && \
-    wget -O ${TRITONTMP_DIR}/inception_v3_2016_08_28_frozen.pb.tar.gz \
-        https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz && \
-    (cd ${TRITONTMP_DIR} && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz) && \
-    mv ${TRITONTMP_DIR}/inception_v3_2016_08_28_frozen.pb inception_graphdef/1/model.graphdef
+RUN mkdir -p model_repository/inception_onnx/1 && \
+    wget -O /tmp/inception_v3_2016_08_28_frozen.pb.tar.gz \
+        https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz && \
+    (cd /tmp && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz) && \
+    python3 -m venv tf2onnx && \
+    source ./tf2onnx/bin/activate && \
+    pip3 install "numpy<2" tensorflow tf2onnx && \
+    python3 -m tf2onnx.convert --graphdef /tmp/inception_v3_2016_08_28_frozen.pb --output inception_v3_onnx.model.onnx --inputs input:0 --outputs InceptionV3/Predictions/Softmax:0 && \
+    deactivate && \
+    mv inception_v3_onnx.model.onnx model_repository/inception_onnx/1/model.onnx
 
 # Update the qa/ directory with test executables, models, etc.
 WORKDIR /workspace
@@ -109,7 +117,7 @@ RUN mkdir -p qa/common && \
     cp -r docs/examples/model_repository/simple_identity qa/L0_grpc/models && \
     cp -r docs/examples/model_repository/simple_sequence qa/L0_grpc/models && \
     cp -r docs/examples/model_repository/simple_string qa/L0_grpc/models && \
-    cp -r docs/examples/model_repository/inception_graphdef qa/L0_grpc/models && \
+    cp -r docs/examples/model_repository/inception_onnx qa/L0_grpc/models && \
     mkdir qa/L0_grpc_state_cleanup/models && \
     cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_state_cleanup/models/ && \
     mkdir qa/L0_http/models && \
@@ -118,7 +126,7 @@ RUN mkdir -p qa/common && \
     cp -r docs/examples/model_repository/simple_identity qa/L0_http/models && \
     cp -r docs/examples/model_repository/simple_sequence qa/L0_http/models && \
     cp -r docs/examples/model_repository/simple_string qa/L0_http/models && \
-    cp -r docs/examples/model_repository/inception_graphdef qa/L0_http/models && \
+    cp -r docs/examples/model_repository/inception_onnx qa/L0_grpc/models && \
     mkdir qa/L0_https/models && \
     cp -r docs/examples/model_repository/simple qa/L0_https/models/. && \
     mkdir qa/L0_secure_grpc/models && \
@@ -149,21 +157,20 @@ RUN mkdir -p qa/common && \
     cp bin/triton_json_test qa/L0_json/. && \
     cp bin/backend_output_detail_test qa/L0_backend_output_detail/. && \
     cp -r deploy/mlflow-triton-plugin qa/L0_mlflow/. && \
-    cp bin/input_byte_size_test qa/L0_input_validation/. && \
-    cp -r docs/examples/model_repository/simple_identity qa/L0_input_validation/models
+    cp bin/input_byte_size_test qa/L0_input_validation/.
 
 RUN mkdir -p qa/pkgs && \
     cp python/triton*.whl qa/pkgs/. && \
     cp -rf python/test/. qa/L0_python_api/.
 
 RUN mkdir -p qa/L0_simple_ensemble/models/simple/1 && \
-    cp docs/examples/model_repository/simple/1/model.graphdef \
+    cp docs/examples/model_repository/simple/1/model.onnx \
        qa/L0_simple_ensemble/models/simple/1/. && \
     mkdir -p qa/L0_simple_ensemble/models/simple/2 && \
-    cp docs/examples/model_repository/simple/1/model.graphdef \
+    cp docs/examples/model_repository/simple/1/model.onnx \
       qa/L0_simple_ensemble/models/simple/2/. && \
     mkdir -p qa/L0_socket/models/simple/1 && \
-    cp docs/examples/model_repository/simple/1/model.graphdef \
+    cp docs/examples/model_repository/simple/1/model.onnx \
       qa/L0_socket/models/simple/1/.
 
 RUN mkdir -p qa/L0_backend_identity/models && \
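The new RUN step above performs the GraphDef-to-ONNX conversion with the tf2onnx CLI inside a throwaway venv. For reference, a minimal sketch of the same conversion through tf2onnx's Python API is below; it is illustrative only (not part of the commit) and assumes `tensorflow` and `tf2onnx` are installed, with the tensor names taken from the `--inputs`/`--outputs` flags in the RUN step.

```python
# Illustrative sketch: convert the frozen Inception v3 GraphDef to ONNX
# via the tf2onnx Python API instead of `python3 -m tf2onnx.convert`.
import tensorflow as tf
import tf2onnx

graph_def = tf.compat.v1.GraphDef()
with open("/tmp/inception_v3_2016_08_28_frozen.pb", "rb") as f:
    graph_def.ParseFromString(f.read())

# Input/output names mirror the --inputs/--outputs flags used above.
tf2onnx.convert.from_graph_def(
    graph_def,
    input_names=["input:0"],
    output_names=["InceptionV3/Predictions/Softmax:0"],
    output_path="model_repository/inception_onnx/1/model.onnx",
)
```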

build.py

Lines changed: 9 additions & 41 deletions
@@ -562,8 +562,6 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
         args = onnxruntime_cmake_args(images, library_paths)
     elif be == "openvino":
         args = openvino_cmake_args()
-    elif be == "tensorflow":
-        args = tensorflow_cmake_args(images, library_paths)
     elif be == "python":
         args = python_cmake_args()
     elif be == "dali":
@@ -795,23 +793,6 @@ def tensorrt_cmake_args():
     return cargs
 
 
-def tensorflow_cmake_args(images, library_paths):
-    backend_name = "tensorflow"
-    extra_args = []
-
-    # If a specific TF image is specified use it, otherwise pull from NGC.
-    if backend_name in images:
-        image = images[backend_name]
-    else:
-        image = "nvcr.io/nvidia/tensorflow:{}-tf2-py3".format(
-            FLAGS.upstream_container_version
-        )
-    extra_args = [
-        cmake_backend_arg(backend_name, "TRITON_TENSORFLOW_DOCKER_IMAGE", None, image)
-    ]
-    return extra_args
-
-
 def dali_cmake_args():
     return [
         cmake_backend_enable("dali", "TRITON_DALI_SKIP_DOWNLOAD", False),
@@ -1233,10 +1214,10 @@ def create_dockerfile_linux(
         argmap["BASE_IMAGE"],
     )
 
-    # PyTorch and TensorFlow backends need extra CUDA and other
+    # PyTorch backends need extra CUDA and other
     # dependencies during runtime that are missing in the CPU-only base container.
     # These dependencies must be copied from the Triton Min image.
-    if not FLAGS.enable_gpu and (("pytorch" in backends) or ("tensorflow" in backends)):
+    if not FLAGS.enable_gpu and ("pytorch" in backends):
         df += """
 ############################################################################
 ## Triton Min image
@@ -1602,10 +1583,10 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
         cuda_arch=cuda_arch, libs_arch=libs_arch
     )
 
-    if ("pytorch" in backends) or ("tensorflow" in backends):
-        # Add NCCL dependency for tensorflow/pytorch backend.
+    if "pytorch" in backends:
+        # Add NCCL dependency for pytorch backend.
         # Note: Even though the build is CPU-only, the version of
-        # tensorflow/pytorch we are using depends upon the NCCL library.
+        # pytorch we are using depends upon the NCCL library.
         # Since this dependency is not present in the ubuntu base image,
         # we must copy it from the Triton min container ourselves.
         df += """
@@ -1720,11 +1701,10 @@ def create_build_dockerfiles(
     }
 
     # For CPU-only image we need to copy some cuda libraries and dependencies
-    # since we are using PyTorch and TensorFlow containers that
-    # are not CPU-only.
+    # since we are using PyTorch containers that are not CPU-only.
     if (
         not FLAGS.enable_gpu
-        and (("pytorch" in backends) or ("tensorflow" in backends))
+        and ("pytorch" in backends)
         and (target_platform() != "windows")
     ):
         if "gpu-base" in images:
@@ -2351,7 +2331,6 @@ def enable_all():
         "identity",
         "square",
         "repeat",
-        "tensorflow",
         "onnxruntime",
         "python",
         "dali",
@@ -2586,7 +2565,7 @@ def enable_all():
         "--image",
         action="append",
         required=False,
-        help='Use specified Docker image in build as <image-name>,<full-image-name>. <image-name> can be "base", "gpu-base", "tensorflow", or "pytorch".',
+        help='Use specified Docker image in build as <image-name>,<full-image-name>. <image-name> can be "base", "gpu-base", or "pytorch".',
     )
 
     parser.add_argument(
@@ -2887,12 +2866,6 @@ def enable_all():
         parts = be.split(":")
         if len(parts) == 1:
             parts.append(default_repo_tag)
-        if parts[0] == "tensorflow1":
-            fail(
-                "Starting from Triton version 23.04, support for TensorFlow 1 has been discontinued. Please switch to Tensorflow 2."
-            )
-        if parts[0] == "tensorflow2":
-            parts[0] = "tensorflow"
         log('backend "{}" at tag/branch "{}"'.format(parts[0], parts[1]))
         backends[parts[0]] = parts[1]
 
@@ -2939,13 +2912,10 @@ def enable_all():
             len(parts) != 2, "--image must specify <image-name>,<full-image-registry>"
         )
         fail_if(
-            parts[0]
-            not in ["base", "gpu-base", "pytorch", "tensorflow", "tensorflow2"],
+            parts[0] not in ["base", "gpu-base", "pytorch"],
             "unsupported value for --image",
         )
         log('image "{}": "{}"'.format(parts[0], parts[1]))
-        if parts[0] == "tensorflow2":
-            parts[0] = "tensorflow"
         images[parts[0]] = parts[1]
 
     # Initialize map of library paths for each backend.
@@ -2954,8 +2924,6 @@ def enable_all():
         parts = lpath.split(":")
         if len(parts) == 2:
             log('backend "{}" library path "{}"'.format(parts[0], parts[1]))
-            if parts[0] == "tensorflow2":
-                parts[0] = "tensorflow"
            library_paths[parts[0]] = parts[1]
 
     # Parse any explicitly specified cmake arguments
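With the tensorflow1/tensorflow2 aliases gone, build.py's backend parsing reduces to a plain name:tag split. A condensed, hypothetical standalone rendering of the loop that remains (the default tag value here is illustrative):

```python
# Hypothetical standalone form of build.py's remaining backend parsing:
# any "tensorflow*" entry now falls through as an ordinary backend name
# instead of being aliased to "tensorflow" or rejected with a TF message.
def parse_backends(backend_flags, default_repo_tag="main"):
    backends = {}
    for be in backend_flags:
        parts = be.split(":")
        if len(parts) == 1:
            parts.append(default_repo_tag)  # bare names get the default tag
        backends[parts[0]] = parts[1]
    return backends

print(parse_backends(["onnxruntime", "python:r25.02"]))
# -> {'onnxruntime': 'main', 'python': 'r25.02'}
```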

compose.py

Lines changed: 5 additions & 13 deletions
@@ -71,14 +71,10 @@ def start_dockerfile(ddir, images, argmap, dockerfile_name, backends):
         argmap["TRITON_VERSION"], argmap["TRITON_CONTAINER_VERSION"], images["full"]
     )
 
-    # PyTorch, TensorFlow backends need extra CUDA and other
+    # PyTorch backends need extra CUDA and other
     # dependencies during runtime that are missing in the CPU-only base container.
     # These dependencies must be copied from the Triton Min image.
-    if not FLAGS.enable_gpu and (
-        ("pytorch" in backends)
-        or ("tensorflow" in backends)
-        or ("tensorflow2" in backends)
-    ):
+    if not FLAGS.enable_gpu and "pytorch" in backends:
         df += """
 FROM {} AS min_container
 
@@ -406,7 +402,7 @@ def create_argmap(images, skip_pull):
         '<image-name>,<full-image-name>. <image-name> can be "min", "gpu-min" '
         'or "full". Both "min" and "full" need to be specified at the same time.'
         'This will override "--container-version". "gpu-min" is needed for '
-        "CPU-only container to copy TensorFlow and PyTorch deps.",
+        "CPU-only container to copy PyTorch deps.",
     )
     parser.add_argument(
         "--enable-gpu",
@@ -504,13 +500,9 @@ def create_argmap(images, skip_pull):
     fail_if(len(images) < 2, "Need to specify both 'full' and 'min' images if at all")
 
     # For CPU-only image we need to copy some cuda libraries and dependencies
-    # since we are using PyTorch, TensorFlow 1, TensorFlow 2 containers that
+    # since we are using PyTorch containers that
     # are not CPU-only.
-    if (
-        ("pytorch" in FLAGS.backend)
-        or ("tensorflow" in FLAGS.backend)
-        or ("tensorflow2" in FLAGS.backend)
-    ) and ("gpu-min" not in images):
+    if ("pytorch" in FLAGS.backend) and ("gpu-min" not in images):
         images["gpu-min"] = "nvcr.io/nvidia/tritonserver:{}-py3-min".format(
             FLAGS.container_version
         )
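The surviving gpu-min defaulting in compose.py is now a single PyTorch check. A hypothetical standalone rendering, with an illustrative container version, not part of the commit:

```python
# Hypothetical standalone form of compose.py's simplified logic: only the
# pytorch backend still triggers pulling the GPU min image for CPU-only builds.
def default_gpu_min(backend_flags, images, container_version):
    if ("pytorch" in backend_flags) and ("gpu-min" not in images):
        images["gpu-min"] = "nvcr.io/nvidia/tritonserver:{}-py3-min".format(
            container_version
        )
    return images

print(default_gpu_min(["pytorch"], {}, "25.02"))
# -> {'gpu-min': 'nvcr.io/nvidia/tritonserver:25.02-py3-min'}
```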

deploy/alibaba-cloud/README.md

Lines changed: 5 additions & 5 deletions
@@ -1,5 +1,5 @@
 <!--
-# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -39,7 +39,7 @@ This repository contains information about how to deploy NVIDIA Triton Inference
 - EAS provides a simple way for deep learning developers to deploy their models in Alibaba Cloud.
 - Using **Triton Processor** is the recommended way on EAS to deploy Triton Inference Server. Users can simply deploy a Triton Server by preparing models and creating a EAS service by setting processor type to `triton`.
 - Models should be uploaded to Alibaba Cloud's OSS(Object Storage Service). User's model repository in OSS will be mounted onto local path visible to Triton Server.
-- This documentation uses Triton's own example models for demo. The tensorflow inception model can be downloaded by the `fetch_models.sh` script.
+- This documentation uses Triton's own example models for demo. The ONNX inception v3 model can be obtained by the `fetch_models.sh` script.
 
 # Prerequisites
 - You should register an Alibaba Cloud Account, and being able to use EAS by [eascmd](https://help.aliyun.com/document_detail/111031.html?spm=a2c4g.11186623.6.752.42356f46FN5fU1), which is a command line tool to create stop or scale services on EAS.
@@ -48,10 +48,10 @@ This repository contains information about how to deploy NVIDIA Triton Inference
 
 # Demo Instruction
 ## Prepare a model repo directory in OSS
-Download the tensorflow inception model via [fetch_model.sh](https://github.com/triton-inference-server/server/blob/main/docs/examples/fetch_models.sh). Then using [ossutil](https://help.aliyun.com/document_detail/50452.html?spm=a2c4g.11186623.6.833.26d66d51dPEytI) , which is a command line tool to use OSS, to upload the model to a certain OSS dir as you want.
+Download the ONNX inception v3 model via [fetch_model.sh](https://github.com/triton-inference-server/server/blob/main/docs/examples/fetch_models.sh). Then using [ossutil](https://help.aliyun.com/document_detail/50452.html?spm=a2c4g.11186623.6.833.26d66d51dPEytI) , which is a command line tool to use OSS, to upload the model to a certain OSS dir as you want.
 
 ```
-./ossutil cp inception_graphdef/ oss://triton-model-repo/models
+./ossutil cp inception_v3_onnx/ oss://triton-model-repo/models
 ```
 ## Create Triton Service with json config by eascmd
 The following is the json we use when creating a Triton Server on EAS.
@@ -125,7 +125,7 @@ triton_client = httpclient.InferenceServerClient(url=URL, verbose=False)
 start = time.time()
 for i in range(10):
     results = triton_client.infer(
-        "inception_graphdef", inputs=[input_img], outputs=[output], headers=HEADERS
+        "inception_v3_onnx", inputs=[input_img], outputs=[output], headers=HEADERS
     )
     res_body = results.get_response()
     elapsed_ms = (time.time() - start) * 1000
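For context, a self-contained sketch of a client call against the renamed model follows. This is not from the README; the tensor names and shape ("input:0", "InceptionV3/Predictions/Softmax:0", 1x299x299x3) are assumptions carried over from the tf2onnx conversion flags elsewhere in this commit.

```python
# Hedged sketch: infer against the renamed inception_v3_onnx model over HTTP.
# Assumes a local Triton server and that tf2onnx preserved the TF tensor names.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# Inception v3's frozen graph takes a 1x299x299x3 float image (assumed here);
# a random tensor stands in for a real preprocessed image.
inp = httpclient.InferInput("input:0", [1, 299, 299, 3], "FP32")
inp.set_data_from_numpy(np.random.rand(1, 299, 299, 3).astype(np.float32))
out = httpclient.InferRequestedOutput("InceptionV3/Predictions/Softmax:0")

results = client.infer("inception_v3_onnx", inputs=[inp], outputs=[out])
print(results.as_numpy("InceptionV3/Predictions/Softmax:0").shape)
```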

deploy/aws/README.md

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 <!--
-# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -224,7 +224,7 @@ using image classification models being served by the inference
 server. For example,
 
 ```
-$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg
+$ image_client -u 34.83.9.133:8000 -m inception_v3_onnx -s INCEPTION -c3 mug.jpg
 Request 0, batch size 1
 Image 'images/mug.jpg':
     504 (COFFEE MUG) = 0.723992

deploy/gcp/README.md

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 <!--
-# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -262,7 +262,7 @@ using image classification models being served by the inference
 server. For example,
 
 ```
-$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg
+$ image_client -u 34.83.9.133:8000 -m inception_v3_onnx -s INCEPTION -c3 mug.jpg
 Request 0, batch size 1
 Image 'images/mug.jpg':
     504 (COFFEE MUG) = 0.723992

deploy/k8s-onprem/README.md

Lines changed: 1 addition & 1 deletion
@@ -303,7 +303,7 @@ using image classification models on the inference
 server. For example,
 
 ```
-$ image_client -u $cluster_ip:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg
+$ image_client -u $cluster_ip:8000 -m inception_v3_onnx -s INCEPTION -c3 mug.jpg
 Request 0, batch size 1
 Image 'images/mug.jpg':
     504 (COFFEE MUG) = 0.723992

deploy/oci/README.md

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 <!--
-# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -268,7 +268,7 @@ using image classification models being served by the inference
 server. For example,
 
 ```
-$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg
+$ image_client -u 34.83.9.133:8000 -m inception_v3_onnx -s INCEPTION -c3 mug.jpg
 Request 0, batch size 1
 Image 'images/mug.jpg':
     504 (COFFEE MUG) = 0.723992

docs/examples/fetch_models.sh

Lines changed: 12 additions & 4 deletions
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+
+# Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -27,12 +28,19 @@
 
 set -ex
 
-# TensorFlow inception
-mkdir -p model_repository/inception_graphdef/1
+# Convert Tensorflow inception V3 module to ONNX
+# Pre-requisite: Python3, venv, and Pip3 are installed on the system
+mkdir -p model_repository/inception_onnx/1
 wget -O /tmp/inception_v3_2016_08_28_frozen.pb.tar.gz \
     https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz
 (cd /tmp && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz)
-mv /tmp/inception_v3_2016_08_28_frozen.pb model_repository/inception_graphdef/1/model.graphdef
+python3 -m venv tf2onnx
+source ./tf2onnx/bin/activate
+pip3 install "numpy<2" tensorflow tf2onnx
+python3 -m tf2onnx.convert --graphdef /tmp/inception_v3_2016_08_28_frozen.pb --output inception_v3_onnx.model.onnx --inputs input:0 --outputs InceptionV3/Predictions/Softmax:0
+deactivate
+mv inception_v3_onnx.model.onnx model_repository/inception_onnx/1/model.onnx
+
 
 # ONNX densenet
 mkdir -p model_repository/densenet_onnx/1
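After fetch_models.sh runs, the converted artifact can be sanity-checked with the onnx package. A hedged sketch, not part of the script, assuming `onnx` is installed:

```python
# Hedged sketch: verify the converted Inception v3 ONNX file is well-formed.
# The path matches the mv target in fetch_models.sh.
import onnx

model = onnx.load("model_repository/inception_onnx/1/model.onnx")
onnx.checker.check_model(model)  # raises if the graph is structurally invalid
print("inputs:", [i.name for i in model.graph.input])
print("outputs:", [o.name for o in model.graph.output])
```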

docs/examples/model_repository/inception_graphdef/config.pbtxt

Lines changed: 0 additions & 19 deletions
This file was deleted.
