Skip to content

Commit 836dd2d

Browse files
authored
[SYCL][UR] Fix the accuracy of command submission timestamp (#18735)
Current "submission time" calculation is inaccurate because we don't use both synchronized timestamps returned by zeDeviceGetGlobalTimestamps but using only device timestamp from that call and use std::chrono "close" to that call to record the host time. This estimation becomes inaccurate pretty quickly. This PR fixes this problem using the known fact that L0 runtime implementation uses CLOCK_MONOTONIC_RAW on Linux and QueryPerformanceCounter on Windows. So, with this fix, at the first call, we use both device and host timestamps from zeDeviceGetGlobalTimestamps, subsequent calls (when device timestamp is not requested) will return corresponding host timestamp only, without making the `zeDeviceGetGlobalTimestamps` call which has high latency. Even though this approach improves accuracy and submit time doesn't become "invalid" (submit time > start_time) fast, it still doesn't guarantee that it will not happen. So, there will be additional fix done in #18717 to fix that. Test is also updated there to check larger number of iterations. Also apply the same fix as 138bef7 for cuda adapter (i.e. record host time after cuEventSynchronize for more precise measurement)
1 parent b51f875 commit 836dd2d

File tree

3 files changed

+74
-45
lines changed

3 files changed

+74
-45
lines changed

sycl/source/detail/device_impl.cpp

Lines changed: 27 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -324,19 +324,32 @@ ur_native_handle_t device_impl::getNative() const {
324324
// clock drift between host and device.
325325
//
326326
uint64_t device_impl::getCurrentDeviceTime() {
327-
using namespace std::chrono;
328-
uint64_t HostTime =
329-
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
330-
.count();
327+
auto GetGlobalTimestamps = [this](ur_device_handle_t Device,
328+
uint64_t *DeviceTime, uint64_t *HostTime) {
329+
auto Result =
330+
getAdapter()->call_nocheck<UrApiKind::urDeviceGetGlobalTimestamps>(
331+
Device, DeviceTime, HostTime);
332+
if (Result == UR_RESULT_ERROR_INVALID_OPERATION) {
333+
// NOTE(UR port): Removed the call to GetLastError because we shouldn't
334+
// be calling it after ERROR_INVALID_OPERATION: there is no
335+
// adapter-specific error.
336+
throw detail::set_ur_error(
337+
sycl::exception(
338+
make_error_code(errc::feature_not_supported),
339+
"Device and/or backend does not support querying timestamp."),
340+
UR_RESULT_ERROR_INVALID_OPERATION);
341+
} else {
342+
getAdapter()->checkUrResult<errc::feature_not_supported>(Result);
343+
}
344+
};
331345

346+
uint64_t HostTime = 0;
347+
uint64_t Diff = 0;
332348
// To account for potential clock drift between host clock and device clock.
333349
// The value set is arbitrary: 200 seconds
334-
std::shared_lock<std::shared_mutex> ReadLock(MDeviceHostBaseTimeMutex);
335350
constexpr uint64_t TimeTillRefresh = 200e9;
336-
assert(HostTime >= MDeviceHostBaseTime.second);
337-
uint64_t Diff = HostTime - MDeviceHostBaseTime.second;
338-
339351
// If getCurrentDeviceTime is called for the first time or we have to refresh.
352+
std::shared_lock<std::shared_mutex> ReadLock(MDeviceHostBaseTimeMutex);
340353
if (!MDeviceHostBaseTime.second || Diff > TimeTillRefresh) {
341354
ReadLock.unlock();
342355
std::unique_lock<std::shared_mutex> WriteLock(MDeviceHostBaseTimeMutex);
@@ -346,38 +359,12 @@ uint64_t device_impl::getCurrentDeviceTime() {
346359
// MDeviceHostBaseTime, so we can just return the current device time.
347360
return MDeviceHostBaseTime.first + Diff;
348361
}
349-
const auto &Adapter = getAdapter();
350-
auto Result = Adapter->call_nocheck<UrApiKind::urDeviceGetGlobalTimestamps>(
351-
MDevice, &MDeviceHostBaseTime.first, &MDeviceHostBaseTime.second);
352-
// We have to remember base host timestamp right after UR call and it is
353-
// going to be used for calculation of the device timestamp at the next
354-
// getCurrentDeviceTime() call. We need to do it here because getAdapter()
355-
// and urDeviceGetGlobalTimestamps calls may take significant amount of
356-
// time, for example on the first call to getAdapter adapters may need to be
357-
// initialized. If we use timestamp from the beginning of the function then
358-
// the difference between host timestamps of the current
359-
// getCurrentDeviceTime and the next getCurrentDeviceTime will be incorrect
360-
// because it will include execution time of the code before we get device
361-
// timestamp from urDeviceGetGlobalTimestamps.
362-
HostTime =
363-
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
364-
.count();
365-
if (Result == UR_RESULT_ERROR_INVALID_OPERATION) {
366-
// NOTE(UR port): Removed the call to GetLastError because we shouldn't
367-
// be calling it after ERROR_INVALID_OPERATION: there is no
368-
// adapter-specific error.
369-
throw detail::set_ur_error(
370-
sycl::exception(
371-
make_error_code(errc::feature_not_supported),
372-
"Device and/or backend does not support querying timestamp."),
373-
UR_RESULT_ERROR_INVALID_OPERATION);
374-
} else {
375-
Adapter->checkUrResult<errc::feature_not_supported>(Result);
376-
}
377-
// Until next sync we will compute device time based on the host time
378-
// returned in HostTime, so make this our base host time.
379-
MDeviceHostBaseTime.second = HostTime;
380-
Diff = 0;
362+
GetGlobalTimestamps(MDevice, &MDeviceHostBaseTime.first,
363+
&MDeviceHostBaseTime.second);
364+
} else {
365+
GetGlobalTimestamps(MDevice, nullptr, &HostTime);
366+
assert(HostTime >= MDeviceHostBaseTime.second);
367+
Diff = HostTime - MDeviceHostBaseTime.second;
381368
}
382369
return MDeviceHostBaseTime.first + Diff;
383370
}

unified-runtime/source/adapters/cuda/device.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1320,6 +1320,14 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
13201320
UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT));
13211321
UR_CHECK_ERROR(cuEventRecord(Event, 0));
13221322
}
1323+
1324+
if (pDeviceTimestamp) {
1325+
UR_CHECK_ERROR(cuEventSynchronize(Event));
1326+
*pDeviceTimestamp = hDevice->getElapsedTime(Event);
1327+
}
1328+
1329+
// Record the host timestamp after the cuEventSynchronize() call for more
1330+
// precise measurement.
13231331
if (pHostTimestamp) {
13241332

13251333
using namespace std::chrono;
@@ -1328,11 +1336,6 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
13281336
.count();
13291337
}
13301338

1331-
if (pDeviceTimestamp) {
1332-
UR_CHECK_ERROR(cuEventSynchronize(Event));
1333-
*pDeviceTimestamp = hDevice->getElapsedTime(Event);
1334-
}
1335-
13361339
return UR_RESULT_SUCCESS;
13371340
}
13381341

unified-runtime/source/adapters/level_zero/device.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
#include "ur_util.hpp"
1616
#include <algorithm>
1717
#include <climits>
18+
#if defined(__linux__)
19+
#include <ctime>
20+
#elif defined(_WIN32)
21+
#include <windows.h>
22+
#endif
1823
#include <optional>
1924
#include <vector>
2025

@@ -1569,6 +1574,40 @@ ur_result_t urDeviceGetGlobalTimestamps(
15691574
/// [out][optional] pointer to the Host's global timestamp that correlates
15701575
/// with the Device's global timestamp value
15711576
uint64_t *HostTimestamp) {
1577+
if (!DeviceTimestamp && HostTimestamp) {
1578+
// If only HostTimestamp is requested, we need to avoid making a call to
1579+
// zeDeviceGetGlobalTimestamps which has higher latency. This is a
1580+
// workaround for the fact that Level Zero does not provide a way to get the
1581+
// host timestamp directly. It is known that current implementation of L0
1582+
// runtime uses CLOCK_MONOTONIC_RAW on Linux and QueryPerformanceCounter on
1583+
// Windows.
1584+
#if defined(__linux__)
1585+
timespec Monotonic;
1586+
if (clock_gettime(CLOCK_MONOTONIC_RAW, &Monotonic) != 0) {
1587+
UR_LOG(ERR, "Failed to get CLOCK_MONOTONIC time");
1588+
return UR_RESULT_ERROR_UNINITIALIZED;
1589+
}
1590+
*HostTimestamp = static_cast<uint64_t>(Monotonic.tv_sec) * 1'000'000'000 +
1591+
static_cast<uint64_t>(Monotonic.tv_nsec);
1592+
return UR_RESULT_SUCCESS;
1593+
#elif defined(_WIN32)
1594+
// Use QueryPerformanceCounter on Windows
1595+
uint64_t Counter;
1596+
if (!QueryPerformanceCounter((LARGE_INTEGER *)&Counter)) {
1597+
UR_LOG(ERR, "Failed to get performance counter");
1598+
return UR_RESULT_ERROR_UNINITIALIZED;
1599+
}
1600+
LARGE_INTEGER Frequency;
1601+
if (!QueryPerformanceFrequency(&Frequency)) {
1602+
UR_LOG(ERR, "Failed to get performance frequency");
1603+
return UR_RESULT_ERROR_UNINITIALIZED;
1604+
}
1605+
*HostTimestamp = static_cast<uint64_t>(
1606+
(static_cast<double>(Counter) * 1'000'000'000 / Frequency.QuadPart));
1607+
return UR_RESULT_SUCCESS;
1608+
#endif
1609+
}
1610+
15721611
const uint64_t &ZeTimerResolution =
15731612
Device->ZeDeviceProperties->timerResolution;
15741613
const uint64_t TimestampMaxCount = Device->getTimestampMask();

0 commit comments

Comments
 (0)