Skip to content

Commit 6555657

Browse files
authored
Merge pull request #95 from rafbiels/dl-cifar-inorder-nativecmd
[dl-cifar][SYCL] In-order queue and native commands
2 parents 0957551 + b8068ac commit 6555657

File tree

10 files changed

+122
-127
lines changed

10 files changed

+122
-127
lines changed

dl-cifar/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ CC=clang CXX=clang++ **cmake** -DUSE_AMD_BACKEND=YES -DUSE_AMD_ARCH=gfx90a .. \
9696
**ONEAPI_DEVICE_SELECTOR=hip:gpu ./dl-cifar_sycl**
9797

9898
---------------------------------------------------------------------------------------------------------
99+
## In-order queue
100+
The CMake option `-DIN_ORDER_QUEUE` adds the `in_order` property to the SYCL
101+
queue, as well as `discard_events` if available. The default value of this
102+
option is `ON` for NVIDIA and AMD backends, and `OFF` otherwise.
103+
99104
## Workload logging/tracing
100105

101106
**DL-CIFAR provides function tracing:**

dl-cifar/SYCL/CMakeLists.txt

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,15 @@ set(CMAKE_CXX_EXTENSIONS OFF) # Use -std, not -gnu
3131

3232
option(GPU_AOT "Build AOT for Intel GPU" OFF)
3333
option(USE_NVIDIA_BACKEND "Build for NVIDIA backend" OFF)
34-
option(USE_AMDHIP_BACKEND "Build for AMD HIP backend" OFF)
34+
option(USE_AMD_BACKEND "Build for AMD backend" OFF)
3535
option(DEVICE_TIMER "Build using Device Timer" OFF)
3636

37+
set(IN_ORDER_QUEUE_DEFAULT OFF)
38+
if (${USE_NVIDIA_BACKEND} OR ${USE_AMD_BACKEND})
39+
set(IN_ORDER_QUEUE_DEFAULT ON)
40+
endif()
41+
option(IN_ORDER_QUEUE "Use in-order SYCL queue" ${IN_ORDER_QUEUE_DEFAULT})
42+
3743
set(DEF_INTEL_WL_CXX_FLAGS " -DMKL_ILP64 ")
3844
set(DEF_NVIDIA_WL_CXX_FLAGS " -DUSE_CUBLAS ")
3945
set(DEF_AMD_WL_CXX_FLAGS " -DUSE_ROCBLAS -D__HIP_PLATFORM_AMD__ ")
@@ -42,6 +48,11 @@ set(DEF_INTEL_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
4248
set(DEF_NVIDIA_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
4349
set(DEF_AMD_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
4450

51+
if (${IN_ORDER_QUEUE})
52+
string(APPEND DEF_INTEL_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ")
53+
string(APPEND DEF_NVIDIA_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ")
54+
string(APPEND DEF_AMD_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ")
55+
endif()
4556

4657
# -DCMAKE_CXX_FLAGS=" -blah -blah " overrides the default flags (BOTH general and WL specific)
4758
# -DOVERRIDE_GENERAL_CXX_FLAGS=" -blah -blah " overrides the general flags only (and not the workload specific flags)

dl-cifar/SYCL/basic-dl/lnorm_layer.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ void LNormLayer::doFw() {
8080
}
8181
}
8282
});
83-
}).wait();
83+
});
84+
langHandle_->getSyclQueue()->wait();
8485

8586

8687
Tracer::func_end("LNormLayer::doFw");
@@ -157,7 +158,8 @@ void LNormLayer::doBw() {
157158

158159
}
159160
});
160-
}).wait();
161+
});
162+
langHandle_->getSyclQueue()->wait();
161163

162164
Tracer::func_end("LNormLayer::doBw");
163165
}

dl-cifar/SYCL/basic-dl/lnorm_layer.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@ class LNormLayerController {
8888
float *d_input, *d_d_input;
8989
d_input = (float *)sycl::malloc_device(inputSize*sizeof(float), sycl_queue);
9090
d_d_input = (float *)sycl::malloc_device(inputSize*sizeof(float), sycl_queue);
91-
sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize).wait();
91+
sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize);
92+
sycl_queue.wait();
9293

9394
int outputSize = inputSize;
9495
float *h_d_output = (float*)calloc(outputSize, sizeof(float));
@@ -104,12 +105,14 @@ class LNormLayerController {
104105
for(int i=0; i<iterCount; i++) {
105106
// for some reason the compiler is not liking calls to ImageProcessor::initImage() from here
106107
//ImageProcessor::initImage(h_input, inputSize);
107-
sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize).wait();
108+
sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize);
109+
sycl_queue.wait();
108110
lNormLayer->doFw();
109111

110112
// for some reason the compiler is not liking calls to ImageProcessor::initImage() from here
111113
//ImageProcessor::initImage(h_d_output, outputSize);
112-
sycl_queue.memcpy(d_d_output, h_d_output, sizeof(float) * outputSize).wait();
114+
sycl_queue.memcpy(d_d_output, h_d_output, sizeof(float) * outputSize);
115+
sycl_queue.wait();
113116
lNormLayer->doBw();
114117
}
115118

dl-cifar/SYCL/basic-dl/softmax_layer.cpp

Lines changed: 9 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,7 @@ SoftmaxLayer::SoftmaxLayer(LangHandle *langHandle, Timer* timer,
108108
void SoftmaxLayer::doFw() {
109109
#if defined(USE_CUBLAS)
110110

111-
langHandle_->getSyclQueue()->submit([&](sycl::handler &cgh) {
112-
//auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
113-
cgh.host_task([=](sycl::interop_handle ih) {
111+
SYCL::ExecNativeCommand(*langHandle_->getSyclQueue(), [=](sycl::interop_handle ih) {
114112
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
115113
cublasSetStream(*(langHandle_->getCublasHandle()), ih.get_native_queue<sycl::backend::ext_oneapi_cuda>());
116114

@@ -129,14 +127,9 @@ void SoftmaxLayer::doFw() {
129127
d_output_));
130128
//cublasDestroy(handle);
131129
//cudaStreamSynchronize(cudaStreamHandle);
132-
assertDevApiInvar(cudaDeviceSynchronize());
133-
});
134-
});
135-
langHandle_->getSyclQueue()->wait_and_throw();
130+
}, []{assertDevApiInvar(cudaDeviceSynchronize())});
136131
#elif defined(USE_ROCBLAS)
137-
langHandle_->getSyclQueue()->submit([&](sycl::handler &cgh) {
138-
//auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
139-
cgh.host_task([=](sycl::interop_handle ih) {
132+
SYCL::ExecNativeCommand(*langHandle_->getSyclQueue(), [=](sycl::interop_handle ih) {
140133
//cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
141134
//cublasSetStream(*(langHandle_->getCublasHandle()), ih.get_native_queue<sycl::backend::ext_oneapi_cuda>());
142135

@@ -153,10 +146,7 @@ void SoftmaxLayer::doFw() {
153146
d_output_));
154147
//cublasDestroy(handle);
155148
//cudaStreamSynchronize(cudaStreamHandle);
156-
assertDevApiInvar(hipDeviceSynchronize());
157-
});
158-
});
159-
langHandle_->getSyclQueue()->wait_and_throw();
149+
}, []{assertDevApiInvar(hipDeviceSynchronize())});
160150
#else
161151
std::unordered_map<int, memory> softmax_args;
162152
softmax_args.insert({DNNL_ARG_SRC, src_mem});
@@ -170,9 +160,7 @@ void SoftmaxLayer::doFw() {
170160
void SoftmaxLayer::doBw() {
171161
#if defined(USE_CUBLAS)
172162

173-
langHandle_->getSyclQueue()->submit([&](sycl::handler &cgh) {
174-
//auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
175-
cgh.host_task([=](sycl::interop_handle ih) {
163+
SYCL::ExecNativeCommand(*langHandle_->getSyclQueue(), [=](sycl::interop_handle ih) {
176164
cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
177165
cublasSetStream(*(langHandle_->getCublasHandle()), ih.get_native_queue<sycl::backend::ext_oneapi_cuda>());
178166

@@ -193,14 +181,9 @@ void SoftmaxLayer::doBw() {
193181
d_d_input_));
194182
//cublasDestroy(handle);
195183
//cudaStreamSynchronize(cudaStreamHandle);
196-
assertDevApiInvar(cudaDeviceSynchronize());
197-
});
198-
});
199-
langHandle_->getSyclQueue()->wait_and_throw();
184+
}, []{assertDevApiInvar(cudaDeviceSynchronize())});
200185
#elif defined(USE_ROCBLAS)
201-
langHandle_->getSyclQueue()->submit([&](sycl::handler &cgh) {
202-
//auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
203-
cgh.host_task([=](sycl::interop_handle ih) {
186+
SYCL::ExecNativeCommand(*langHandle_->getSyclQueue(), [=](sycl::interop_handle ih) {
204187
//cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
205188
//cublasSetStream(*(langHandle_->getCublasHandle()), ih.get_native_queue<sycl::backend::ext_oneapi_cuda>());
206189

@@ -219,10 +202,7 @@ void SoftmaxLayer::doBw() {
219202
d_d_input_));
220203
//cublasDestroy(handle);
221204
//cudaStreamSynchronize(cudaStreamHandle);
222-
assertDevApiInvar(hipDeviceSynchronize());
223-
});
224-
});
225-
langHandle_->getSyclQueue()->wait_and_throw();
205+
}, []{assertDevApiInvar(hipDeviceSynchronize())});
226206
#else
227207
std::unordered_map<int, memory> softmax_args;
228208
softmax_args.insert({DNNL_ARG_SRC, src_mem});
@@ -237,4 +217,4 @@ void SoftmaxLayer::doBw() {
237217

238218
SoftmaxLayer::~SoftmaxLayer() {
239219

240-
}
220+
}

dl-cifar/SYCL/basic-dl/softmax_layer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "timing.h"
2929
#include "tracing.h"
3030
#include "handle.h"
31+
#include "SYCL.h"
3132

3233
#include <sycl/sycl.hpp>
3334

@@ -108,4 +109,4 @@ class SoftmaxLayer {
108109
};
109110

110111

111-
#endif
112+
#endif

0 commit comments

Comments
 (0)