triton-inference-server
diff --git a/‎docs/user_guide/metrics.md
Lines changed: 37 additions & 0 deletions b/‎docs/user_guide/metrics.md
Lines changed: 37 additions & 0 deletions
diff --git a/‎qa/L0_metrics/ensemble_decoupled/async_execute_decouple/1/model.py
Lines changed: 59 additions & 0 deletions b/‎qa/L0_metrics/ensemble_decoupled/async_execute_decouple/1/model.py
Lines changed: 59 additions & 0 deletions
diff --git a/‎qa/L0_metrics/ensemble_decoupled/async_execute_decouple/config.pbtxt
Lines changed: 54 additions & 0 deletions b/‎qa/L0_metrics/ensemble_decoupled/async_execute_decouple/config.pbtxt
Lines changed: 54 additions & 0 deletions
diff --git a/‎qa/L0_metrics/ensemble_decoupled/ensemble/config.pbtxt
Lines changed: 89 additions & 0 deletions b/‎qa/L0_metrics/ensemble_decoupled/ensemble/config.pbtxt
Lines changed: 89 additions & 0 deletions
@@ -204,6 +204,43 @@ metrics are used for latencies:
 
 To disable these metrics specifically, you can set `--metrics-config counter_latencies=false`
 
+#### Histograms
+
+> **Note**
+>
+> The following Histogram feature is experimental for the time being and may be
+> subject to change based on user feedback.
+
+The following
+[Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram)
+metrics are available for latencies once enabled:
+
+|Category      |Metric          |Metric Name |Description                |Granularity|Frequency    |Model Type   |
+|--------------|----------------|------------|---------------------------|-----------|-------------|-------------|
+|Latency       |Request to First Response Time    |`nv_inference_first_response_histogram_ms` |Histogram of end-to-end inference request to the first response time |Per model  |Per request  | Decoupled |
+
+To enable these metrics specifically, you can set `--metrics-config histogram_latencies=true`
+
+Each histogram above is composed of several sub-metrics. For each histogram
+metric, there is a set of `le` (less than or equal to) thresholds tracking
+the counter for each bucket. Additionally, there are `_count` and `_sum`
+metrics that aggregate the count and observed values for each. For example,
+see the following information exposed by the "Request to First Response Time"
+histogram metric:
+```
+# HELP nv_inference_first_response_histogram_ms Duration from request to first response in milliseconds
+# TYPE nv_inference_first_response_histogram_ms histogram
+nv_inference_first_response_histogram_ms_count{model="my_model",version="1"} 37
+nv_inference_first_response_histogram_ms_sum{model="my_model",version="1"} 10771
+nv_inference_first_response_histogram_ms{model="my_model",version="1", le="100"} 8
+nv_inference_first_response_histogram_ms{model="my_model",version="1", le="500"} 30
+nv_inference_first_response_histogram_ms{model="my_model",version="1", le="2000"} 36
+nv_inference_first_response_histogram_ms{model="my_model",version="1", le="5000"} 37
+nv_inference_first_response_histogram_ms{model="my_model",version="1", le="+Inf"} 37
+```
+
+Triton initializes histograms with default buckets for each, as shown above. Customization of buckets per metric is currently unsupported.
+
 #### Summaries
 
 > **Note**
 
@@ -0,0 +1,59 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import time
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    async def execute(self, requests):
+        request = requests[0]
+        wait_secs = pb_utils.get_input_tensor_by_name(
+            request, "WAIT_SECONDS"
+        ).as_numpy()[0]
+        response_num = pb_utils.get_input_tensor_by_name(
+            request, "RESPONSE_NUM"
+        ).as_numpy()[0]
+        output_tensors = [
+            pb_utils.Tensor("WAIT_SECONDS", np.array([wait_secs], np.float32)),
+            pb_utils.Tensor("RESPONSE_NUM", np.array([1], np.uint8)),
+        ]
+
+        # Wait
+        time.sleep(wait_secs.item())
+        response_sender = request.get_response_sender()
+        for i in range(response_num):
+            response = pb_utils.InferenceResponse(output_tensors)
+            if i != response_num - 1:
+                response_sender.send(response)
+            else:
+                response_sender.send(
+                    response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+                )
+
+        return None
@@ -0,0 +1,54 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+backend: "python"
+input [
+  {
+    name: "WAIT_SECONDS"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+  },
+  {
+    name: "RESPONSE_NUM"
+    data_type: TYPE_UINT8
+    dims: [ 1 ]
+  }
+]
+output [
+  {
+    name: "WAIT_SECONDS"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+  },
+  {
+    name: "RESPONSE_NUM"
+    data_type: TYPE_UINT8
+    dims: [ 1 ]
+  }
+]
+
+instance_group [{ kind: KIND_CPU }]
+model_transaction_policy { decoupled: True }
@@ -0,0 +1,89 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "ensemble"
+platform: "ensemble"
+input [
+ {
+  name: "INPUT0"
+  data_type: TYPE_FP32
+  dims: [ 1 ]
+ },
+ {
+  name: "INPUT1"
+  data_type: TYPE_UINT8
+  dims: [ 1 ]
+ }
+]
+output [
+ {
+  name: "OUTPUT"
+  data_type: TYPE_FP32
+  dims: [ 1 ]
+ }
+]
+ensemble_scheduling {
+ step [
+  {
+   # decoupled model
+   model_name: "async_execute_decouple"
+   model_version: 1
+   input_map {
+    key: "WAIT_SECONDS"
+    value: "INPUT0"
+   }
+   input_map {
+    key: "RESPONSE_NUM"
+    value: "INPUT1"
+   }
+   output_map {
+    key: "WAIT_SECONDS"
+    value: "temp_output0"
+   }
+   output_map {
+    key: "RESPONSE_NUM"
+    value: "temp_output1"
+   }
+  },
+  {
+   # non-decoupled model
+   model_name: "async_execute"
+   model_version: 1
+   input_map {
+    key: "WAIT_SECONDS"
+    value: "temp_output0"
+   }
+   input_map {
+    key: "RESPONSE_NUM"
+    value: "temp_output1"
+   }
+   output_map {
+    key: "WAIT_SECONDS"
+    value: "OUTPUT"
+   }
+  }
+ ]
+}