Skip to content

Commit cdf4060

Browse files
committed
Merge remote-tracking branch 'origin/sycl' into llvmspirv_pulldown
2 parents 1631cf7 + 6bcb265 commit cdf4060

File tree

21 files changed

+500
-122
lines changed

21 files changed

+500
-122
lines changed

devops/containers/ubuntu2404_base.Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ ENV DEBIAN_FRONTEND=noninteractive
44

55
USER root
66

7+
# Configure LLVM nightly repo
8+
RUN apt-get update -qq && apt-get install --no-install-recommends -yqq curl ca-certificates
9+
RUN curl -sSL https://apt.llvm.org/llvm-snapshot.gpg.key -o /etc/apt/trusted.gpg.d/apt.llvm.org.asc
10+
RUN echo 'deb http://apt.llvm.org/noble/ llvm-toolchain-noble main' > /etc/apt/sources.list.d/llvm.list
11+
712
# Install SYCL prerequisites
813
COPY scripts/install_build_tools.sh /install.sh
914
RUN /install.sh

sycl/doc/extensions/proposed/sycl_ext_intel_event_mode.asciidoc renamed to sycl/doc/extensions/experimental/sycl_ext_intel_event_mode.asciidoc

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,12 @@ This extension also depends on the following other SYCL extensions:
5252

5353
== Status
5454

55-
This is a proposed extension specification, intended to gather community
56-
feedback. Interfaces defined in this specification may not be implemented yet
57-
or may be in a preliminary state. The specification itself may also change in
58-
incompatible ways before it is finalized. *Shipping software products should
59-
not rely on APIs defined in this specification.*
55+
This is an experimental extension specification, intended to provide early
56+
access to features and gather community feedback. Interfaces defined in this
57+
specification are implemented in {dpcpp}, but they are not finalized and may
58+
change incompatibly in future versions of {dpcpp} without prior notice.
59+
*Shipping software products should not rely on APIs defined in this
60+
specification.*
6061

6162

6263
== Overview

sycl/source/detail/kernel_name_based_cache_t.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ struct FastKernelSubcacheT {
7474
struct KernelNameBasedCacheT {
7575
FastKernelSubcacheT FastKernelSubcache;
7676
std::optional<bool> UsesAssert;
77+
// Implicit local argument position is represented by an optional int, this
78+
// uses another optional on top of that to represent lazy initialization of
79+
// the cached value.
80+
std::optional<std::optional<int>> ImplicitLocalArgPos;
7781
};
7882

7983
} // namespace detail

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1841,12 +1841,24 @@ void ProgramManager::cacheKernelImplicitLocalArg(RTDeviceBinaryImage &Img) {
18411841
}
18421842
}
18431843

1844-
std::optional<int>
1845-
ProgramManager::kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const {
1846-
auto it = m_KernelImplicitLocalArgPos.find(KernelName);
1847-
if (it != m_KernelImplicitLocalArgPos.end())
1848-
return it->second;
1849-
return {};
1844+
std::optional<int> ProgramManager::kernelImplicitLocalArgPos(
1845+
KernelNameStrRefT KernelName,
1846+
KernelNameBasedCacheT *KernelNameBasedCachePtr) const {
1847+
auto getLocalArgPos = [&]() -> std::optional<int> {
1848+
auto it = m_KernelImplicitLocalArgPos.find(KernelName);
1849+
if (it != m_KernelImplicitLocalArgPos.end())
1850+
return it->second;
1851+
return {};
1852+
};
1853+
1854+
if (!KernelNameBasedCachePtr)
1855+
return getLocalArgPos();
1856+
std::optional<std::optional<int>> &ImplicitLocalArgPos =
1857+
KernelNameBasedCachePtr->ImplicitLocalArgPos;
1858+
if (!ImplicitLocalArgPos.has_value()) {
1859+
ImplicitLocalArgPos = getLocalArgPos();
1860+
}
1861+
return ImplicitLocalArgPos.value();
18501862
}
18511863

18521864
static bool isBfloat16DeviceLibImage(sycl_device_binary RawImg,

sycl/source/detail/program_manager/program_manager.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,8 +373,9 @@ class ProgramManager {
373373

374374
SanitizerType kernelUsesSanitizer() const { return m_SanitizerFoundInImage; }
375375

376-
std::optional<int>
377-
kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const;
376+
std::optional<int> kernelImplicitLocalArgPos(
377+
KernelNameStrRefT KernelName,
378+
KernelNameBasedCacheT *KernelNameBasedCachePtr) const;
378379

379380
std::set<RTDeviceBinaryImage *>
380381
getRawDeviceImages(const std::vector<kernel_id> &KernelIDs);

sycl/source/detail/scheduler/commands.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2389,8 +2389,9 @@ static ur_result_t SetKernelParamsAndLaunch(
23892389
const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
23902390
bool IsCooperative, bool KernelUsesClusterLaunch,
23912391
uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage,
2392-
KernelNameStrRefT KernelName, void *KernelFuncPtr = nullptr,
2393-
int KernelNumArgs = 0,
2392+
KernelNameStrRefT KernelName,
2393+
KernelNameBasedCacheT *KernelNameBasedCachePtr,
2394+
void *KernelFuncPtr = nullptr, int KernelNumArgs = 0,
23942395
detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr,
23952396
bool KernelHasSpecialCaptures = true) {
23962397
const AdapterPtr &Adapter = Queue.getAdapter();
@@ -2437,7 +2438,8 @@ static ur_result_t SetKernelParamsAndLaunch(
24372438
}
24382439

24392440
std::optional<int> ImplicitLocalArg =
2440-
ProgramManager::getInstance().kernelImplicitLocalArgPos(KernelName);
2441+
ProgramManager::getInstance().kernelImplicitLocalArgPos(
2442+
KernelName, KernelNameBasedCachePtr);
24412443
// Set the implicit local memory buffer to support
24422444
// get_work_group_scratch_memory. This is for backend not supporting
24432445
// CUDA-style local memory setting. Note that we may have -1 as a position,
@@ -2752,8 +2754,8 @@ void enqueueImpKernel(
27522754
*Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
27532755
OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
27542756
KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize,
2755-
BinImage, KernelName, KernelFuncPtr, KernelNumArgs,
2756-
KernelParamDescGetter, KernelHasSpecialCaptures);
2757+
BinImage, KernelName, KernelNameBasedCachePtr, KernelFuncPtr,
2758+
KernelNumArgs, KernelParamDescGetter, KernelHasSpecialCaptures);
27572759
}
27582760
if (UR_RESULT_SUCCESS != Error) {
27592761
// If we have got non-success error code, let's analyze it to emit nice

sycl/source/feature_test.hpp.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ inline namespace _V1 {
120120
// In progress yet
121121
#define SYCL_EXT_ONEAPI_ATOMIC16 0
122122
#define SYCL_KHR_DEFAULT_CONTEXT 1
123+
#define SYCL_EXT_INTEL_EVENT_MODE 1
123124

124125
#ifndef __has_include
125126
#define __has_include(x) 0
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include <sycl/detail/core.hpp>
5+
#include <sycl/usm.hpp>
6+
7+
#include <iostream>
8+
#include <vector>
9+
10+
std::vector<sycl::event> submit_dependencies(sycl::queue q1, sycl::queue q2,
11+
int *mem1, int *mem2) {
12+
int delay_ops = 1024 * 1024;
13+
auto delay = [=] {
14+
volatile int value = delay_ops;
15+
while (--value)
16+
;
17+
};
18+
19+
auto ev1 =
20+
q1.parallel_for(sycl::range<1>(1024), [=]([[maybe_unused]] auto u) {
21+
delay();
22+
mem1[u.get_id()] = 1;
23+
});
24+
auto ev2 =
25+
q2.parallel_for(sycl::range<1>(1024), [=]([[maybe_unused]] auto u) {
26+
delay();
27+
mem2[u.get_id()] = 2;
28+
});
29+
30+
return {ev1, ev2};
31+
}
32+
33+
void test_host_task() {
34+
sycl::context c1{};
35+
sycl::context c2{};
36+
37+
sycl::queue q1(c1, sycl::default_selector_v);
38+
sycl::queue q2(c2, sycl::default_selector_v);
39+
40+
auto mem1 = sycl::malloc_host<int>(1024, q1);
41+
auto mem2 = sycl::malloc_host<int>(1024, q2);
42+
43+
auto events = submit_dependencies(q1, q2, mem1, mem2);
44+
45+
q2.submit([&](sycl::handler &cgh) {
46+
cgh.depends_on(events[0]);
47+
cgh.depends_on(events[1]);
48+
cgh.host_task([=]() {
49+
for (int i = 0; i < 1024; i++) {
50+
assert(mem1[i] == 1);
51+
assert(mem2[i] == 2);
52+
}
53+
});
54+
});
55+
56+
q2.wait();
57+
58+
sycl::free(mem1, c1);
59+
sycl::free(mem2, c2);
60+
}
61+
62+
void test_kernel() {
63+
sycl::context c1{};
64+
sycl::context c2{};
65+
66+
sycl::queue q1(c1, sycl::default_selector_v);
67+
sycl::queue q2(c2, sycl::default_selector_v);
68+
69+
auto mem1 = sycl::malloc_device<int>(1024, q1);
70+
auto mem2 = sycl::malloc_device<int>(1024, q2);
71+
72+
auto events = submit_dependencies(q1, q2, mem1, mem2);
73+
74+
q1.submit([&](sycl::handler &cgh) {
75+
cgh.depends_on(events[0]);
76+
cgh.depends_on(events[1]);
77+
cgh.parallel_for(sycl::range<1>(1024),
78+
[=](auto item) { assert(mem1[item.get_id()] == 1); });
79+
});
80+
81+
q2.submit([&](sycl::handler &cgh) {
82+
cgh.depends_on(events[0]);
83+
cgh.depends_on(events[1]);
84+
cgh.parallel_for(sycl::range<1>(1024),
85+
[=](auto item) { assert(mem2[item.get_id()] == 2); });
86+
});
87+
88+
q1.wait();
89+
q2.wait();
90+
91+
sycl::free(mem1, c1);
92+
sycl::free(mem2, c2);
93+
}
94+
95+
int main() {
96+
test_host_task();
97+
test_kernel();
98+
99+
return 0;
100+
}

unified-runtime/source/adapters/level_zero/v2/memory.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ struct ur_mem_image_t : ur_object {
247247
ZeStruct<ze_image_desc_t> zeImageDesc;
248248
};
249249

250-
struct ur_mem_handle_t_ {
250+
struct ur_mem_handle_t_ : ur::handle_base<ur::level_zero::ddi_getter> {
251251
template <typename T, typename... Args>
252252
static ur_mem_handle_t_ *create(Args &&...args) {
253253
return new ur_mem_handle_t_(std::in_place_type<T>,
@@ -293,7 +293,8 @@ struct ur_mem_handle_t_ {
293293
private:
294294
template <typename T, typename... Args>
295295
ur_mem_handle_t_(std::in_place_type_t<T>, Args &&...args)
296-
: mem(std::in_place_type<T>, std::forward<Args>(args)...) {}
296+
: ur::handle_base<ur::level_zero::ddi_getter>(),
297+
mem(std::in_place_type<T>, std::forward<Args>(args)...) {}
297298

298299
std::variant<ur_usm_handle_t, ur_integrated_buffer_handle_t,
299300
ur_discrete_buffer_handle_t, ur_shared_buffer_handle_t,

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1026,9 +1026,14 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBufferExp(
10261026
ur_event_handle_t executionEvent =
10271027
hCommandBuffer->getExecutionEventUnlocked();
10281028

1029+
if (executionEvent != nullptr) {
1030+
ZE2UR_CALL(zeEventHostSynchronize,
1031+
(executionEvent->getZeEvent(), UINT64_MAX));
1032+
}
1033+
10291034
UR_CALL(enqueueGenericCommandListsExp(
10301035
1, &commandBufferCommandList, phEvent, numEventsInWaitList,
1031-
phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, executionEvent));
1036+
phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, nullptr));
10321037
UR_CALL(hCommandBuffer->registerExecutionEventUnlocked(*phEvent));
10331038
if (internalEvent != nullptr) {
10341039
internalEvent->release();

0 commit comments

Comments
 (0)