Skip to content

Commit 230aead

Browse files
committed
[dl-cifar][SYCL] Add CMake option to use in-order queue
Add the queue properties in_order and, if available, discard_events. The addition is steered by a CMake build option IN_ORDER_QUEUE. Set the default value to ON for NVIDIA and AMD backends and keep as OFF for other backends.
1 parent 890d3f0 commit 230aead

File tree

6 files changed

+55
-12
lines changed

6 files changed

+55
-12
lines changed

dl-cifar/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ CC=clang CXX=clang++ **cmake** -DUSE_AMD_BACKEND=YES -DUSE_AMD_ARCH=gfx90a .. \
9696
**ONEAPI_DEVICE_SELECTOR=hip:gpu ./dl-cifar_sycl**
9797

9898
---------------------------------------------------------------------------------------------------------
99+
## In-order queue
100+
The CMake option `-DIN_ORDER_QUEUE` adds the `in_order` property to the SYCL
101+
queue, as well as `discard_events` if available. The default value of this
102+
option is `ON` for NVIDIA and AMD backends, and `OFF` otherwise.
103+
99104
## Workload logging/tracing
100105

101106
**DL-CIFAR provides function tracing:**

dl-cifar/SYCL/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ option(USE_NVIDIA_BACKEND "Build for NVIDIA backend" OFF)
3434
option(USE_AMDHIP_BACKEND "Build for AMD HIP backend" OFF)
3535
option(DEVICE_TIMER "Build using Device Timer" OFF)
3636

37+
set(IN_ORDER_QUEUE_DEFAULT OFF)
38+
if (${USE_NVIDIA_BACKEND} OR ${USE_AMDHIP_BACKEND})
39+
set(IN_ORDER_QUEUE_DEFAULT ON)
40+
endif()
41+
option(IN_ORDER_QUEUE "Use in-order SYCL queue" ${IN_ORDER_QUEUE_DEFAULT})
42+
3743
set(DEF_INTEL_WL_CXX_FLAGS " -DMKL_ILP64 ")
3844
set(DEF_NVIDIA_WL_CXX_FLAGS " -DUSE_CUBLAS ")
3945
set(DEF_AMD_WL_CXX_FLAGS " -DUSE_ROCBLAS -D__HIP_PLATFORM_AMD__ ")
@@ -42,6 +48,11 @@ set(DEF_INTEL_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
4248
set(DEF_NVIDIA_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
4349
set(DEF_AMD_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
4450

51+
if (${IN_ORDER_QUEUE})
52+
string(APPEND DEF_INTEL_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ")
53+
string(APPEND DEF_NVIDIA_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ")
54+
string(APPEND DEF_AMD_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ")
55+
endif()
4556

4657
# -DCMAKE_CXX_FLAGS=" -blah -blah " overrides the default flags (BOTH general and WL specific)
4758
# -DOVERRIDE_GENERAL_CXX_FLAGS=" -blah -blah " overrides the general flags only (and not the workload specific flags)

dl-cifar/SYCL/basic-dl/lnorm_layer.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ void LNormLayer::doFw() {
8080
}
8181
}
8282
});
83-
}).wait();
83+
});
84+
langHandle_->getSyclQueue()->wait();
8485

8586

8687
Tracer::func_end("LNormLayer::doFw");
@@ -157,7 +158,8 @@ void LNormLayer::doBw() {
157158

158159
}
159160
});
160-
}).wait();
161+
});
162+
langHandle_->getSyclQueue()->wait();
161163

162164
Tracer::func_end("LNormLayer::doBw");
163165
}

dl-cifar/SYCL/basic-dl/lnorm_layer.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@ class LNormLayerController {
8888
float *d_input, *d_d_input;
8989
d_input = (float *)sycl::malloc_device(inputSize*sizeof(float), sycl_queue);
9090
d_d_input = (float *)sycl::malloc_device(inputSize*sizeof(float), sycl_queue);
91-
sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize).wait();
91+
sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize);
92+
sycl_queue.wait();
9293

9394
int outputSize = inputSize;
9495
float *h_d_output = (float*)calloc(outputSize, sizeof(float));
@@ -104,12 +105,14 @@ class LNormLayerController {
104105
for(int i=0; i<iterCount; i++) {
105106
// for some reason the compiler is not liking calls to ImageProcessor::initImage() from here
106107
//ImageProcessor::initImage(h_input, inputSize);
107-
sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize).wait();
108+
sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize);
109+
sycl_queue.wait();
108110
lNormLayer->doFw();
109111

110112
// for some reason the compiler is not liking calls to ImageProcessor::initImage() from here
111113
//ImageProcessor::initImage(h_d_output, outputSize);
112-
sycl_queue.memcpy(d_d_output, h_d_output, sizeof(float) * outputSize).wait();
114+
sycl_queue.memcpy(d_d_output, h_d_output, sizeof(float) * outputSize);
115+
sycl_queue.wait();
113116
lNormLayer->doBw();
114117
}
115118

dl-cifar/SYCL/handle.h

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,18 @@ class LangHandle {
104104
#ifdef DEVICE_TIMER
105105
timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_SYCL_CONTEXT");
106106
#endif
107-
//auto propList = sycl::property_list{sycl::property::queue::in_order()};
107+
auto propList = sycl::property_list{
108+
#ifdef IN_ORDER_QUEUE
109+
sycl::property::queue::in_order{},
110+
#ifdef SYCL_EXT_ONEAPI_DISCARD_QUEUE_EVENTS
111+
sycl::ext::oneapi::property::queue::discard_events{}
112+
#endif
113+
#endif
114+
};
108115
#ifdef DEVICE_TIMER
109116
start = get_time_now();
110117
#endif
111-
sycl_queue_ = new sycl::queue(*context_, *dht_);
118+
sycl_queue_ = new sycl::queue(*context_, *dht_, propList);
112119
#ifdef DEVICE_TIMER
113120
timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_SYCL_QUEUE");
114121
#endif
@@ -246,11 +253,18 @@ class LangHandle {
246253
#ifdef DEVICE_TIMER
247254
timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_SYCL_CONTEXT");
248255
#endif
249-
//auto propList = sycl::property_list{sycl::property::queue::in_order()};
256+
auto propList = sycl::property_list{
257+
#ifdef IN_ORDER_QUEUE
258+
sycl::property::queue::in_order{},
259+
#ifdef SYCL_EXT_ONEAPI_DISCARD_QUEUE_EVENTS
260+
sycl::ext::oneapi::property::queue::discard_events{}
261+
#endif
262+
#endif
263+
};
250264
#ifdef DEVICE_TIMER
251265
start = get_time_now();
252266
#endif
253-
sycl_queue_ = new sycl::queue(*context_, *dht_);
267+
sycl_queue_ = new sycl::queue(*context_, *dht_, propList);
254268
#ifdef DEVICE_TIMER
255269
timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_SYCL_QUEUE");
256270
#endif
@@ -343,11 +357,18 @@ class LangHandle {
343357
#ifdef DEVICE_TIMER
344358
timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_SYCL_CONTEXT");
345359
#endif
346-
//auto propList = sycl::property_list{sycl::property::queue::in_order()};
360+
auto propList = sycl::property_list{
361+
#ifdef IN_ORDER_QUEUE
362+
sycl::property::queue::in_order{},
363+
#ifdef SYCL_EXT_ONEAPI_DISCARD_QUEUE_EVENTS
364+
sycl::ext::oneapi::property::queue::discard_events{}
365+
#endif
366+
#endif
367+
};
347368
#ifdef DEVICE_TIMER
348369
start = get_time_now();
349370
#endif
350-
sycl_queue_ = new sycl::queue(*context_, *dht_);
371+
sycl_queue_ = new sycl::queue(*context_, *dht_, propList);
351372
#ifdef DEVICE_TIMER
352373
timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_SYCL_QUEUE");
353374
#endif

dl-cifar/SYCL/upsample.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ void Upsampler::upsample(LangHandle *langHandle, float *d_src, float *d_dst, int
7878
d_dst[dst_pixCntUntilSegment + relPixIdxInSegment] = d_src[src_imgIdx*src_noOfPixelsPerImg + (pixelIY*srcWidth) + pixelIX];
7979
}
8080
});
81-
}).wait();
81+
});
82+
langHandle->getSyclQueue()->wait();
8283

8384
Tracer::func_end("Upsampler::upsample");
8485
}

0 commit comments

Comments
 (0)