Skip to content

Commit 9024918

Browse files
authored
Merge pull request #1912 from igchor/latency_tracker_histogram_hdr
[common] Histogram-based latency tracker
2 parents 2233030 + b93ecbb commit 9024918

File tree

7 files changed

+264
-0
lines changed

7 files changed

+264
-0
lines changed

.github/workflows/cmake.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ jobs:
2020
compiler: [{c: gcc, cxx: g++}]
2121
libbacktrace: ['-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF']
2222
pool_tracking: ['-DUMF_ENABLE_POOL_TRACKING=ON', '-DUMF_ENABLE_POOL_TRACKING=OFF']
23+
latency_tracking: ['-DUMF_ENABLE_LATENCY_TRACKING=OFF']
2324
include:
2425
- os: 'ubuntu-22.04'
2526
build_type: Release
@@ -36,6 +37,10 @@ jobs:
3637
- os: 'ubuntu-20.04'
3738
build_type: Release
3839
compiler: {c: gcc-7, cxx: g++-7}
40+
- os: 'ubuntu-22.04'
41+
build_type: Release
42+
compiler: {c: clang, cxx: clang++}
43+
latency_tracking: '-DUMF_ENABLE_LATENCY_TRACKING=ON'
3944
runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }}
4045

4146
steps:
@@ -92,6 +97,7 @@ jobs:
9297
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
9398
${{matrix.libbacktrace}}
9499
${{matrix.pool_tracking}}
100+
${{matrix.latency_tracking}}
95101
96102
- name: Configure CMake
97103
if: matrix.os == 'ubuntu-20.04'
@@ -106,6 +112,7 @@ jobs:
106112
-DUR_FORMAT_CPP_STYLE=ON
107113
${{matrix.libbacktrace}}
108114
${{matrix.pool_tracking}}
115+
${{matrix.latency_tracking}}
109116
110117
- name: Generate source from spec, check for uncommitted diff
111118
if: matrix.os == 'ubuntu-22.04'

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)
5757
option(UR_BUILD_XPTI_LIBS "Build the XPTI libraries when tracing is enabled" ON)
5858
option(UR_STATIC_LOADER "Build loader as a static library" OFF)
5959
option(UR_FORCE_LIBSTDCXX "Force use of libstdc++ in a build using libc++ on Linux" OFF)
60+
option(UR_ENABLE_LATENCY_HISTOGRAM "Enable latncy histogram" OFF)
6061
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
6162
set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs")
6263
set(UR_SYCL_LIBRARY_DIR "" CACHE PATH

source/adapters/level_zero/v2/command_list_cache.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ command_list_cache_t::getImmediateCommandList(
8181
ze_device_handle_t ZeDevice, bool IsInOrder, uint32_t Ordinal,
8282
ze_command_queue_mode_t Mode, ze_command_queue_priority_t Priority,
8383
std::optional<uint32_t> Index) {
84+
TRACK_SCOPE_LATENCY("command_list_cache_t::getImmediateCommandList");
85+
8486
immediate_command_list_descriptor_t Desc;
8587
Desc.ZeDevice = ZeDevice;
8688
Desc.Ordinal = Ordinal;
@@ -100,12 +102,15 @@ command_list_cache_t::getImmediateCommandList(
100102
raii::cache_borrowed_command_list_t
101103
command_list_cache_t::getRegularCommandList(ze_device_handle_t ZeDevice,
102104
bool IsInOrder, uint32_t Ordinal) {
105+
TRACK_SCOPE_LATENCY("command_list_cache_t::getRegularCommandList");
106+
103107
regular_command_list_descriptor_t Desc;
104108
Desc.ZeDevice = ZeDevice;
105109
Desc.IsInOrder = IsInOrder;
106110
Desc.Ordinal = Ordinal;
107111

108112
auto CommandList = getCommandList(Desc).release();
113+
109114
return raii::cache_borrowed_command_list_t(
110115
CommandList, [Cache = this, Desc](ze_command_list_handle_t CmdList) {
111116
Cache->addCommandList(

source/adapters/level_zero/v2/command_list_cache.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <functional>
1313
#include <stack>
1414

15+
#include "latency_tracker.hpp"
1516
#include <ur/ur.hpp>
1617
#include <ur_api.h>
1718
#include <ze_api.h>

source/adapters/level_zero/v2/event_provider_normal.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "../common.hpp"
1212
#include "../context.hpp"
1313
#include "event_provider.hpp"
14+
#include "latency_tracker.hpp"
1415
#include "ur_api.h"
1516
#include "ze_api.h"
1617
#include <memory>
@@ -79,7 +80,10 @@ std::unique_ptr<provider_pool> provider_normal::createProviderPool() {
7980
}
8081

8182
event_allocation provider_normal::allocate() {
83+
TRACK_SCOPE_LATENCY("provider_normal::allocate");
84+
8285
if (pools.empty()) {
86+
TRACK_SCOPE_LATENCY("provider_normal::allocate#createProviderPool");
8387
pools.emplace_back(createProviderPool());
8488
}
8589

@@ -91,6 +95,7 @@ event_allocation provider_normal::allocate() {
9195
}
9296
}
9397

98+
TRACK_SCOPE_LATENCY("provider_normal::allocate#slowPath");
9499
std::sort(pools.begin(), pools.end(), [](auto &a, auto &b) {
95100
return a->nfree() < b->nfree(); // asceding
96101
});

source/common/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
add_ur_library(ur_common STATIC
77
ur_util.cpp
88
ur_util.hpp
9+
latency_tracker.hpp
910
$<$<PLATFORM_ID:Windows>:windows/ur_lib_loader.cpp>
1011
$<$<PLATFORM_ID:Linux,Darwin>:linux/ur_lib_loader.cpp>
1112
)
@@ -45,6 +46,24 @@ set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool")
4546
FetchContent_MakeAvailable(unified-memory-framework)
4647
FetchContent_GetProperties(unified-memory-framework)
4748

49+
if(UR_ENABLE_LATENCY_HISTOGRAM)
50+
set(HDR_HISTOGRAM_BUILD_STATIC CACHE INTERNAL ON "")
51+
set(HDR_HISTOGRAM_BUILD_SHARED CACHE INTERNAL OFF "")
52+
53+
include(FetchContent)
54+
FetchContent_Declare(hdr_histogram
55+
GIT_REPOSITORY https://github.com/HdrHistogram/HdrHistogram_c.git
56+
GIT_TAG 0.11.8
57+
)
58+
59+
FetchContent_MakeAvailable(hdr_histogram)
60+
FetchContent_GetProperties(hdr_histogram)
61+
62+
target_link_libraries(ur_common PUBLIC hdr_histogram_static)
63+
target_include_directories(ur_common PUBLIC $<BUILD_INTERFACE:${hdr_histogram_SOURCE_DIR}/include>)
64+
target_compile_options(ur_common PUBLIC -DUR_ENABLE_LATENCY_HISTOGRAM=1)
65+
endif()
66+
4867
target_link_libraries(ur_common PUBLIC
4968
${CMAKE_DL_LIBS}
5069
${PROJECT_NAME}::headers

source/common/latency_tracker.hpp

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
//===--------- latency_tracker.cpp - common ------------------------------===//
2+
//
3+
// Copyright (C) 2024 Intel Corporation
4+
//
5+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
6+
// Exceptions. See LICENSE.TXT
7+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#pragma once
12+
13+
#include <chrono>
14+
#include <cstdint>
15+
#include <limits>
16+
#include <memory>
17+
18+
#include "logger/ur_logger.hpp"
19+
20+
#if defined(UR_ENABLE_LATENCY_HISTOGRAM)
21+
22+
#include <hdr/hdr_histogram.h>
23+
24+
static inline bool trackLatency = []() {
25+
try {
26+
auto map = getenv_to_map("UR_LOG_LATENCY");
27+
28+
if (!map.has_value()) {
29+
return false;
30+
}
31+
32+
auto it = map->find("level");
33+
return it != map->end() &&
34+
logger::str_to_level(it->second.front()) != logger::Level::QUIET;
35+
} catch (...) {
36+
return false;
37+
}
38+
}();
39+
40+
static constexpr size_t numPercentiles = 7;
41+
static constexpr double percentiles[numPercentiles] = {
42+
50.0, 90.0, 99.0, 99.9, 99.99, 99.999, 99.9999};
43+
44+
struct latencyValues {
45+
int64_t min;
46+
int64_t max;
47+
int64_t mean;
48+
int64_t stddev;
49+
int64_t percentileValues[numPercentiles];
50+
};
51+
52+
using histogram_ptr =
53+
std::unique_ptr<struct hdr_histogram, decltype(&hdr_close)>;
54+
55+
static inline latencyValues getValues(const struct hdr_histogram *histogram) {
56+
latencyValues values;
57+
values.max = hdr_max(histogram);
58+
values.min = hdr_min(histogram);
59+
values.mean = static_cast<int64_t>(hdr_mean(histogram));
60+
values.stddev = static_cast<int64_t>(hdr_stddev(histogram));
61+
62+
auto ret = hdr_value_at_percentiles(
63+
histogram, percentiles, values.percentileValues, numPercentiles);
64+
if (ret != 0) {
65+
logger::error("Failed to get percentiles from latency histogram");
66+
}
67+
68+
return values;
69+
}
70+
71+
class latency_printer {
72+
public:
73+
latency_printer() : logger(logger::create_logger("latency", true, false)) {}
74+
75+
inline void publishLatency(const std::string &name,
76+
histogram_ptr histogram) {
77+
auto [it, inserted] = values.try_emplace(name, std::move(histogram));
78+
if (!inserted) {
79+
// combine histograms
80+
hdr_add(it->second.get(), histogram.get());
81+
}
82+
}
83+
84+
inline ~latency_printer() {
85+
if (trackLatency) {
86+
print();
87+
}
88+
}
89+
90+
inline void print() {
91+
printHeader();
92+
93+
for (auto &[name, histogram] : values) {
94+
auto value = getValues(histogram.get());
95+
logger.log(logger::Level::INFO,
96+
"{},{},{},{},{},{},{},{},{},{},{},{},ns", name,
97+
value.min, value.max, value.mean, value.stddev,
98+
value.percentileValues[0], value.percentileValues[1],
99+
value.percentileValues[2], value.percentileValues[3],
100+
value.percentileValues[4], value.percentileValues[5],
101+
value.percentileValues[6]);
102+
}
103+
}
104+
105+
private:
106+
inline void printHeader() {
107+
logger.log(logger::Level::INFO, "Latency histogram:");
108+
logger.log(logger::Level::INFO,
109+
"name,min,max,mean,stdev,p{},p{},p{},p{},p{},p{},p{},unit",
110+
percentiles[0], percentiles[1], percentiles[2],
111+
percentiles[3], percentiles[4], percentiles[5],
112+
percentiles[6]);
113+
}
114+
115+
std::map<std::string, histogram_ptr> values;
116+
logger::Logger logger;
117+
};
118+
119+
inline latency_printer &globalLatencyPrinter() {
120+
static latency_printer printer;
121+
return printer;
122+
}
123+
124+
class latency_histogram {
125+
public:
126+
inline latency_histogram(const char *name,
127+
latency_printer &printer = globalLatencyPrinter(),
128+
int64_t lowestDiscernibleValue = 1,
129+
int64_t highestTrackableValue = 100'000'000'000,
130+
int significantFigures = 3)
131+
: name(name), histogram(nullptr, nullptr), printer(printer) {
132+
if (trackLatency) {
133+
struct hdr_histogram *cHistogram;
134+
auto ret = hdr_init(lowestDiscernibleValue, highestTrackableValue,
135+
significantFigures, &cHistogram);
136+
if (ret != 0) {
137+
logger::error("Failed to initialize latency histogram");
138+
}
139+
histogram =
140+
std::unique_ptr<struct hdr_histogram, decltype(&hdr_close)>(
141+
cHistogram, &hdr_close);
142+
}
143+
}
144+
145+
latency_histogram(const latency_histogram &) = delete;
146+
latency_histogram(latency_histogram &&) = delete;
147+
148+
inline ~latency_histogram() {
149+
if (!trackLatency || !histogram) {
150+
return;
151+
}
152+
153+
if (hdr_min(histogram.get()) == std::numeric_limits<int64_t>::max()) {
154+
logger::info("[{}] latency: no data", name);
155+
return;
156+
}
157+
158+
printer.publishLatency(name, std::move(histogram));
159+
}
160+
161+
inline void trackValue(int64_t value) {
162+
hdr_record_value(histogram.get(), value);
163+
}
164+
165+
private:
166+
const char *name;
167+
histogram_ptr histogram;
168+
latency_printer &printer;
169+
};
170+
171+
class latency_tracker {
172+
public:
173+
inline explicit latency_tracker(latency_histogram &stats)
174+
: stats(trackLatency ? &stats : nullptr), begin() {
175+
if (trackLatency) {
176+
begin = std::chrono::steady_clock::now();
177+
}
178+
}
179+
inline latency_tracker() {}
180+
inline ~latency_tracker() {
181+
if (stats) {
182+
auto tp = std::chrono::steady_clock::now();
183+
auto diffNanos =
184+
std::chrono::duration_cast<std::chrono::nanoseconds>(tp - begin)
185+
.count();
186+
stats->trackValue(static_cast<int64_t>(diffNanos));
187+
}
188+
}
189+
190+
latency_tracker(const latency_tracker &) = delete;
191+
latency_tracker &operator=(const latency_tracker &) = delete;
192+
193+
inline latency_tracker(latency_tracker &&rhs) noexcept
194+
: stats(rhs.stats), begin(rhs.begin) {
195+
rhs.stats = nullptr;
196+
}
197+
198+
inline latency_tracker &operator=(latency_tracker &&rhs) noexcept {
199+
if (this != &rhs) {
200+
this->~latency_tracker();
201+
new (this) latency_tracker(std::move(rhs));
202+
}
203+
return *this;
204+
}
205+
206+
private:
207+
latency_histogram *stats{nullptr};
208+
std::chrono::time_point<std::chrono::steady_clock> begin;
209+
};
210+
211+
// To resolve __COUNTER__
212+
#define CONCAT(a, b) a##b
213+
214+
// Each tracker has it's own thread-local histogram.
215+
// At program exit, all histograms for the same scope are
216+
// aggregated.
217+
#define TRACK_SCOPE_LATENCY_CNT(name, cnt) \
218+
static thread_local latency_histogram CONCAT(histogram, cnt)(name); \
219+
latency_tracker CONCAT(tracker, cnt)(CONCAT(histogram, cnt));
220+
#define TRACK_SCOPE_LATENCY(name) TRACK_SCOPE_LATENCY_CNT(name, __COUNTER__)
221+
222+
#else // UR_ENABLE_LATENCY_HISTOGRAM
223+
224+
#define TRACK_SCOPE_LATENCY(name)
225+
226+
#endif // UR_ENABLE_LATENCY_HISTOGRAM

0 commit comments

Comments
 (0)