Commit 854bdcb

Add response statistics (#325)
* Add response statistics
* Backend api version 1.19
* Fix see comment
* Rename ResponseStatsIndex() to GetAndIncrementResponseIndex()
* Clarify error object ownership
* Move API parameters into a struct
* Fix typo
* Make API parameters struct opaque
* Update comment
* [Continue] Update comment
1 parent b177852 commit 854bdcb

7 files changed: +576 -14 lines changed
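
The minor version bump to 1.19 (visible in the first file below) is what gates the new response-statistics entry points, so a backend that wants to call them can verify the running core's API version up front. A minimal sketch of that guard, following the version-check pattern already documented in tritonbackend.h; the TRITONBACKEND_Initialize body and the error message text are illustrative:

```cpp
#include "triton/core/tritonbackend.h"

// Hedged sketch: refuse to load if the Triton core is older than the backend
// API this commit introduces (1.19). The check mirrors the example in the
// tritonbackend.h doc comments.
extern "C" TRITONSERVER_Error*
TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
{
  uint32_t api_major = 0, api_minor = 0;
  TRITONSERVER_Error* err = TRITONBACKEND_ApiVersion(&api_major, &api_minor);
  if (err != nullptr) {
    return err;
  }
  // Response statistics first appear at backend API version 1.19.
  if ((api_major != TRITONBACKEND_API_VERSION_MAJOR) ||
      (api_minor < TRITONBACKEND_API_VERSION_MINOR)) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_UNSUPPORTED,
        "backend requires TRITONBACKEND API version 1.19 or later");
  }
  return nullptr;  // success
}
```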

include/triton/core/tritonbackend.h

Lines changed: 160 additions & 8 deletions
```diff
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -65,6 +65,7 @@ struct TRITONBACKEND_Response;
 struct TRITONBACKEND_Backend;
 struct TRITONBACKEND_Model;
 struct TRITONBACKEND_ModelInstance;
+struct TRITONBACKEND_ModelInstanceResponseStatistics;
 struct TRITONBACKEND_BackendAttribute;
 struct TRITONBACKEND_Batcher;
 
@@ -94,7 +95,7 @@ struct TRITONBACKEND_Batcher;
 /// }
 ///
 #define TRITONBACKEND_API_VERSION_MAJOR 1
-#define TRITONBACKEND_API_VERSION_MINOR 18
+#define TRITONBACKEND_API_VERSION_MINOR 19
 
 /// Get the TRITONBACKEND API version supported by Triton. This value
 /// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
@@ -761,8 +762,9 @@ TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseOutput(
 /// \param send_flags Flags associated with the response. \see
 /// TRITONSERVER_ResponseCompleteFlag. \see
 /// TRITONSERVER_InferenceResponseCompleteFn_t.
-/// \param error The TRITONSERVER_Error to send if the response is an
-/// error, or nullptr if the response is successful.
+/// \param error The TRITONSERVER_Error to send if the response is an error, or
+/// nullptr if the response is successful. The caller retains ownership to the
+/// error object and must free it with TRITONSERVER_ErrorDelete.
 /// \return a TRITONSERVER_Error indicating success or failure.
 TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseSend(
     TRITONBACKEND_Response* response, const uint32_t send_flags,
@@ -1319,16 +1321,16 @@ TRITONBACKEND_ModelInstanceReportMemoryUsage(
 /// TRITONBACKEND_ModelInstanceExecute.
 ///
 ///   TRITONBACKEND_ModelInstanceExecute()
-///     CAPTURE TIMESPACE (exec_start_ns)
+///     CAPTURE TIMESTAMP (exec_start_ns)
 ///     < process input tensors to prepare them for inference
 ///       execution, including copying the tensors to/from GPU if
 ///       necessary>
-///     CAPTURE TIMESPACE (compute_start_ns)
+///     CAPTURE TIMESTAMP (compute_start_ns)
 ///     < perform inference computations to produce outputs >
-///     CAPTURE TIMESPACE (compute_end_ns)
+///     CAPTURE TIMESTAMP (compute_end_ns)
 ///     < allocate output buffers and extract output tensors, including
 ///       copying the tensors to/from GPU if necessary>
-///     CAPTURE TIMESPACE (exec_end_ns)
+///     CAPTURE TIMESTAMP (exec_end_ns)
 ///     return
 ///
 /// Note that these statistics are associated with a valid
@@ -1356,6 +1358,156 @@ TRITONBACKEND_ModelInstanceReportStatistics(
     const uint64_t compute_start_ns, const uint64_t compute_end_ns,
     const uint64_t exec_end_ns);
 
+/// Create a new inference response statistics object.
+///
+/// \param response_statistics The new response statistics object to be created.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsNew(
+    TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics);
+
+/// Delete an inference response statistics object.
+///
+/// The caller retains ownership to the objects set on the deleted response
+/// statistics object and must free them separately.
+///
+/// \param response_statistics The response statistics object to be deleted.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsDelete(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics);
+
+/// Set model instance to an inference response statistics object.
+///
+/// \param response_statistics The response statistics object.
+/// \param model_instance The model instance.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONBACKEND_ModelInstance* model_instance);
+
+/// Set response factory to an inference response statistics object.
+///
+/// \param response_statistics The response statistics object.
+/// \param response_factory The response factory.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONBACKEND_ResponseFactory* response_factory);
+
+/// Set response start time to an inference response statistics object.
+///
+/// All timestamps should be reported in nanonseconds and collected using
+/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
+///
+/// For consistency of measurement across different backends, the timestamps
+/// should be collected at the following points during
+/// TRITONBACKEND_ModelInstanceExecute.
+///
+///   TRITONBACKEND_ModelInstanceExecute()
+///     < start of this response >
+///     CAPTURE TIMESTAMP (response_start)
+///     < generate this response >
+///     CAPTURE TIMESTAMP (compute_output_start)
+///     < allocate output buffers and extract output tensors, including copying
+///       the tensors to/from GPU if necessary >
+///     CAPTURE TIMESTAMP (response_end)
+///     < end of this response >
+///     return
+///
+/// \param response_statistics The response statistics object.
+/// \param response_start The response start time.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t response_start);
+
+/// Set compute output start time to an inference response statistics object.
+///
+/// Do NOT set this compute output start time (or set it to 0), if reporting an
+/// empty response.
+///
+/// All timestamps should be reported in nanonseconds and collected using
+/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
+///
+/// For consistency of measurement across different backends, the timestamps
+/// should be collected at the following points during
+/// TRITONBACKEND_ModelInstanceExecute.
+///
+///   TRITONBACKEND_ModelInstanceExecute()
+///     < start of this response >
+///     CAPTURE TIMESTAMP (response_start)
+///     < generate this response >
+///     CAPTURE TIMESTAMP (compute_output_start)
+///     < allocate output buffers and extract output tensors, including copying
+///       the tensors to/from GPU if necessary >
+///     CAPTURE TIMESTAMP (response_end)
+///     < end of this response >
+///     return
+///
+/// \param response_statistics The response statistics object.
+/// \param compute_output_start The compute output start time.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t compute_output_start);
+
+/// Set response end time to an inference response statistics object.
+///
+/// All timestamps should be reported in nanonseconds and collected using
+/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
+///
+/// For consistency of measurement across different backends, the timestamps
+/// should be collected at the following points during
+/// TRITONBACKEND_ModelInstanceExecute.
+///
+///   TRITONBACKEND_ModelInstanceExecute()
+///     < start of this response >
+///     CAPTURE TIMESTAMP (response_start)
+///     < generate this response >
+///     CAPTURE TIMESTAMP (compute_output_start)
+///     < allocate output buffers and extract output tensors, including copying
+///       the tensors to/from GPU if necessary >
+///     CAPTURE TIMESTAMP (response_end)
+///     < end of this response >
+///     return
+///
+/// \param response_statistics The response statistics object.
+/// \param response_end The response end time.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t response_end);
+
+/// Set error to an inference response statistics object.
+///
+/// Use the same error object passed to the TRITONBACKEND_ResponseSend. \see
+/// TRITONBACKEND_ResponseSend.
+///
+/// \param response_statistics The response statistics object.
+/// \param error The error object.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetError(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONSERVER_Error* error);
+
+/// Record statistics for an inference response.
+///
+/// The caller retains ownership to the response statistics and must free it
+/// after this function returns.
+///
+/// \param response_statistics The statistics to be recorded.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceReportResponseStatistics(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics);
+
 /// Record statistics for the execution of an entire batch of
 /// inference requests.
 ///
```
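
Taken together, the declarations above define a create/set/report/delete lifecycle. Below is a minimal sketch (not part of this commit) of how a decoupled backend might drive it from TRITONBACKEND_ModelInstanceExecute. The helper name ReportOneResponse, the RETURN_IF_ERR macro, and the NowNs helper are hypothetical; the TRITONBACKEND_* calls are exactly the ones declared above. It assumes `factory` was created earlier with TRITONBACKEND_ResponseFactoryNew and `send_error` is the same error object passed to TRITONBACKEND_ResponseSend (or nullptr on success):

```cpp
#include <chrono>
#include <cstdint>

#include "triton/core/tritonbackend.h"

// Hypothetical convenience macro; real backends typically have their own.
#define RETURN_IF_ERR(X)                 \
  do {                                   \
    TRITONSERVER_Error* err__ = (X);     \
    if (err__ != nullptr) return err__;  \
  } while (false)

// Nanosecond timestamps from std::chrono::steady_clock, as the doc comments
// above recommend.
static uint64_t
NowNs()
{
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}

static TRITONSERVER_Error*
ReportOneResponse(
    TRITONBACKEND_ModelInstance* instance,
    TRITONBACKEND_ResponseFactory* factory, uint64_t response_start,
    uint64_t compute_output_start, uint64_t response_end,
    TRITONSERVER_Error* send_error)
{
  TRITONBACKEND_ModelInstanceResponseStatistics* stats = nullptr;
  RETURN_IF_ERR(TRITONBACKEND_ModelInstanceResponseStatisticsNew(&stats));

  RETURN_IF_ERR(TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
      stats, instance));
  RETURN_IF_ERR(
      TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
          stats, factory));
  RETURN_IF_ERR(TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
      stats, response_start));
  // For an empty response, leave compute_output_start unset (0), per the doc
  // comment above; a zero value routes the report to the "empty" bucket.
  if (compute_output_start > 0) {
    RETURN_IF_ERR(
        TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
            stats, compute_output_start));
  }
  RETURN_IF_ERR(TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
      stats, response_end));
  if (send_error != nullptr) {
    // Same error object handed to TRITONBACKEND_ResponseSend; per the updated
    // doc comment, the backend still owns it and must eventually call
    // TRITONSERVER_ErrorDelete on it.
    RETURN_IF_ERR(TRITONBACKEND_ModelInstanceResponseStatisticsSetError(
        stats, send_error));
  }

  RETURN_IF_ERR(TRITONBACKEND_ModelInstanceReportResponseStatistics(stats));

  // The backend retains ownership of the statistics object and must free it.
  return TRITONBACKEND_ModelInstanceResponseStatisticsDelete(stats);
}
```

At the call site, the three timestamps would be captured with NowNs() at the points the timelines above prescribe: response_start before generating the response, compute_output_start before output buffer allocation and extraction, and response_end after it.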

src/backend_model_instance.cc

Lines changed: 160 additions & 1 deletion
```diff
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -795,6 +795,23 @@ TritonModelInstance::TritonBackendThread::BackendThread()
   LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
 }
 
+// Opaque object for the response statistics C-API
+struct ModelInstanceResponseStatistics {
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics()
+      : model_instance(nullptr), response_factory(nullptr), response_start(0),
+        compute_output_start(0), response_end(0), error(nullptr)
+  {
+  }
+  TritonModelInstance* model_instance;
+  std::shared_ptr<InferenceResponseFactory>* response_factory;
+  uint64_t response_start;
+  uint64_t compute_output_start;
+  uint64_t response_end;
+  TRITONSERVER_Error* error;
+#endif  // TRITON_ENABLE_STATS
+};
+
 extern "C" {
 
 TRITONAPI_DECLSPEC TRITONSERVER_Error*
@@ -953,6 +970,148 @@ TRITONBACKEND_ModelInstanceReportStatistics(
   return nullptr;  // success
 }
 
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsNew(
+    TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics)
+{
+#ifdef TRITON_ENABLE_STATS
+  *response_statistics =
+      reinterpret_cast<TRITONBACKEND_ModelInstanceResponseStatistics*>(
+          new ModelInstanceResponseStatistics());
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsDelete(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics)
+{
+#ifdef TRITON_ENABLE_STATS
+  delete reinterpret_cast<ModelInstanceResponseStatistics*>(
+      response_statistics);
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONBACKEND_ModelInstance* model_instance)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->model_instance = reinterpret_cast<TritonModelInstance*>(model_instance);
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONBACKEND_ResponseFactory* response_factory)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->response_factory =
+      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(
+          response_factory);
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t response_start)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->response_start = response_start;
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t compute_output_start)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->compute_output_start = compute_output_start;
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t response_end)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->response_end = response_end;
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetError(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONSERVER_Error* error)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->error = error;
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceReportResponseStatistics(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+
+  InferenceStatsAggregator* sa =
+      rs->model_instance->Model()->MutableStatsAggregator();
+  std::string key =
+      std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex());
+
+  if (rs->error == nullptr) {
+    if (rs->compute_output_start > 0) {
+      RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseSuccess(
+          key, rs->response_start, rs->compute_output_start,
+          rs->response_end));
+    } else {
+      RETURN_TRITONSERVER_ERROR_IF_ERROR(
+          sa->UpdateResponseEmpty(key, rs->response_start, rs->response_end));
+    }
+  } else {
+    RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseFail(
+        key, rs->response_start, rs->compute_output_start, rs->response_end));
+  }
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
 TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ModelInstanceReportBatchStatistics(
     TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
```
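
The implementation above is also a compact illustration of the commit-message item "Make API parameters struct opaque": the public header only forward-declares TRITONBACKEND_ModelInstanceResponseStatistics, and the C functions reinterpret_cast the handle to an internal C++ struct, so fields can be added later without breaking the ABI. A reduced sketch of the same pattern with hypothetical names (TRITONX_WidgetStats, WidgetStats); only the technique mirrors the commit:

```cpp
#include <cstdint>

// Public header side: callers only ever see a forward declaration, so the
// layout can change without breaking the ABI.
struct TRITONX_WidgetStats;  // opaque to API users

// Implementation side: the real struct lives in one .cc file.
struct WidgetStats {
  uint64_t start_ns = 0;
  uint64_t end_ns = 0;
};

extern "C" TRITONX_WidgetStats*
TRITONX_WidgetStatsNew()
{
  // reinterpret_cast bridges the public opaque type and the private struct,
  // just as TRITONBACKEND_ModelInstanceResponseStatisticsNew does above.
  return reinterpret_cast<TRITONX_WidgetStats*>(new WidgetStats());
}

extern "C" void
TRITONX_WidgetStatsDelete(TRITONX_WidgetStats* ws)
{
  delete reinterpret_cast<WidgetStats*>(ws);
}
```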
