oneapi-src
diff --git a/dl-cifar/README.md b/dl-cifar/README.md
Lines changed: 5 additions & 0 deletions
diff --git a/dl-cifar/SYCL/CMakeLists.txt b/dl-cifar/SYCL/CMakeLists.txt
Lines changed: 12 additions & 1 deletion
diff --git a/dl-cifar/SYCL/basic-dl/lnorm_layer.cpp b/dl-cifar/SYCL/basic-dl/lnorm_layer.cpp
Lines changed: 4 additions & 2 deletions
diff --git a/dl-cifar/SYCL/basic-dl/lnorm_layer.h b/dl-cifar/SYCL/basic-dl/lnorm_layer.h
Lines changed: 6 additions & 3 deletions
diff --git a/dl-cifar/SYCL/basic-dl/softmax_layer.cpp b/dl-cifar/SYCL/basic-dl/softmax_layer.cpp
Lines changed: 9 additions & 29 deletions
diff --git a/dl-cifar/SYCL/basic-dl/softmax_layer.h b/dl-cifar/SYCL/basic-dl/softmax_layer.h
Lines changed: 2 additions & 1 deletion
@@ -96,6 +96,11 @@ CC=clang CXX=clang++ **cmake** -DUSE_AMD_BACKEND=YES -DUSE_AMD_ARCH=gfx90a .. \
 **ONEAPI_DEVICE_SELECTOR=hip:gpu ./dl-cifar_sycl** 
 
 ---------------------------------------------------------------------------------------------------------
+## In-order queue
+The CMake option `IN_ORDER_QUEUE` (pass `-DIN_ORDER_QUEUE=ON` or `-DIN_ORDER_QUEUE=OFF`) adds the
+`in_order` property to the SYCL queue, plus `discard_events` when that property is available.
+The option defaults to `ON` for the NVIDIA and AMD backends and to `OFF` otherwise.
+
 ## Workload logging/tracing
 
 **DL-CIFAR provides function tracing:**
 
@@ -31,9 +31,15 @@ set(CMAKE_CXX_EXTENSIONS OFF)        # Use -std, not -gnu
 
 option(GPU_AOT                 "Build AOT for Intel GPU"      OFF)
 option(USE_NVIDIA_BACKEND      "Build for NVIDIA backend"     OFF)
-option(USE_AMDHIP_BACKEND      "Build for AMD HIP backend"    OFF)
+option(USE_AMD_BACKEND         "Build for AMD backend"        OFF)
 option(DEVICE_TIMER            "Build using Device Timer" OFF)
 
+set(IN_ORDER_QUEUE_DEFAULT OFF)
+if (${USE_NVIDIA_BACKEND} OR ${USE_AMD_BACKEND})
+    set(IN_ORDER_QUEUE_DEFAULT ON)
+endif()
+option(IN_ORDER_QUEUE "Use in-order SYCL queue" ${IN_ORDER_QUEUE_DEFAULT})
+
 set(DEF_INTEL_WL_CXX_FLAGS  " -DMKL_ILP64 ")
 set(DEF_NVIDIA_WL_CXX_FLAGS " -DUSE_CUBLAS ")
 set(DEF_AMD_WL_CXX_FLAGS    " -DUSE_ROCBLAS -D__HIP_PLATFORM_AMD__ ")
@@ -42,6 +48,11 @@ set(DEF_INTEL_GENERAL_CXX_FLAGS  " -O3 -fsycl -ffast-math ")
 set(DEF_NVIDIA_GENERAL_CXX_FLAGS " -O3 -fsycl -ffast-math ")
 set(DEF_AMD_GENERAL_CXX_FLAGS    " -O3 -fsycl -ffast-math ")
 
+if (${IN_ORDER_QUEUE})
+    string(APPEND DEF_INTEL_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ")
+    string(APPEND DEF_NVIDIA_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ")
+    string(APPEND DEF_AMD_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ")
+endif()
 
 # -DCMAKE_CXX_FLAGS=" -blah -blah " overrides the default flags (BOTH general and WL specific)
 # -DOVERRIDE_GENERAL_CXX_FLAGS=" -blah -blah " overrides the general flags only (and not the workload specific flags)
 
@@ -80,7 +80,8 @@ void LNormLayer::doFw() {
                 }
             }
         });
-    }).wait();
+    });
+    langHandle_->getSyclQueue()->wait();
 
 
     Tracer::func_end("LNormLayer::doFw");   
@@ -157,7 +158,8 @@ void LNormLayer::doBw() {
 
             }
         });
-    }).wait();    
+    });
+    langHandle_->getSyclQueue()->wait();
 
     Tracer::func_end("LNormLayer::doBw");   
 }
 
@@ -88,7 +88,8 @@ class LNormLayerController {
             float *d_input, *d_d_input;
             d_input   = (float *)sycl::malloc_device(inputSize*sizeof(float),   sycl_queue);
             d_d_input   = (float *)sycl::malloc_device(inputSize*sizeof(float),   sycl_queue);
-            sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize).wait();
+            sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize);
+            sycl_queue.wait();
 
             int outputSize   = inputSize;
             float *h_d_output   = (float*)calloc(outputSize,   sizeof(float));  
@@ -104,12 +105,14 @@ class LNormLayerController {
             for(int i=0; i<iterCount; i++) {
                 // for some reason the compiler is not liking calls to ImageProcessor::initImage() from here
                 //ImageProcessor::initImage(h_input, inputSize);
-                sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize).wait();
+                sycl_queue.memcpy(d_input, h_input, sizeof(float) * inputSize);
+                sycl_queue.wait();
                 lNormLayer->doFw();
 
                 // for some reason the compiler is not liking calls to ImageProcessor::initImage() from here
                 //ImageProcessor::initImage(h_d_output, outputSize);
-                sycl_queue.memcpy(d_d_output, h_d_output, sizeof(float) * outputSize).wait();
+                sycl_queue.memcpy(d_d_output, h_d_output, sizeof(float) * outputSize);
+                sycl_queue.wait();
                 lNormLayer->doBw();
             }
 
 
@@ -108,9 +108,7 @@ SoftmaxLayer::SoftmaxLayer(LangHandle *langHandle, Timer* timer,
 void SoftmaxLayer::doFw() {  
 #if defined(USE_CUBLAS)
 
-    langHandle_->getSyclQueue()->submit([&](sycl::handler &cgh) {
-        //auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
-        cgh.host_task([=](sycl::interop_handle ih) {
+    SYCL::ExecNativeCommand(*langHandle_->getSyclQueue(), [=](sycl::interop_handle ih) {
             cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
             cublasSetStream(*(langHandle_->getCublasHandle()), ih.get_native_queue<sycl::backend::ext_oneapi_cuda>());
 
@@ -129,14 +127,9 @@ void SoftmaxLayer::doFw() {
                                     d_output_));
             //cublasDestroy(handle);
             //cudaStreamSynchronize(cudaStreamHandle);
-            assertDevApiInvar(cudaDeviceSynchronize());
-        });
-    });
-    langHandle_->getSyclQueue()->wait_and_throw();
+        }, []{assertDevApiInvar(cudaDeviceSynchronize())});
 #elif defined(USE_ROCBLAS) 
-    langHandle_->getSyclQueue()->submit([&](sycl::handler &cgh) {
-        //auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
-        cgh.host_task([=](sycl::interop_handle ih) {
+    SYCL::ExecNativeCommand(*langHandle_->getSyclQueue(), [=](sycl::interop_handle ih) {
             //cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
             //cublasSetStream(*(langHandle_->getCublasHandle()), ih.get_native_queue<sycl::backend::ext_oneapi_cuda>());
 
@@ -153,10 +146,7 @@ void SoftmaxLayer::doFw() {
                                     d_output_));
             //cublasDestroy(handle);
             //cudaStreamSynchronize(cudaStreamHandle);
-            assertDevApiInvar(hipDeviceSynchronize());
-        });
-    });
-    langHandle_->getSyclQueue()->wait_and_throw();
+        }, []{assertDevApiInvar(hipDeviceSynchronize())});
 #else
     std::unordered_map<int, memory> softmax_args;
     softmax_args.insert({DNNL_ARG_SRC, src_mem});
@@ -170,9 +160,7 @@ void SoftmaxLayer::doFw() {
 void SoftmaxLayer::doBw() {
 #if defined(USE_CUBLAS)
 
-    langHandle_->getSyclQueue()->submit([&](sycl::handler &cgh) {
-        //auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
-        cgh.host_task([=](sycl::interop_handle ih) {
+    SYCL::ExecNativeCommand(*langHandle_->getSyclQueue(), [=](sycl::interop_handle ih) {
             cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
             cublasSetStream(*(langHandle_->getCublasHandle()), ih.get_native_queue<sycl::backend::ext_oneapi_cuda>());
 
@@ -193,14 +181,9 @@ void SoftmaxLayer::doBw() {
                                     d_d_input_));
             //cublasDestroy(handle);
             //cudaStreamSynchronize(cudaStreamHandle);
-            assertDevApiInvar(cudaDeviceSynchronize());
-        });
-    });
-    langHandle_->getSyclQueue()->wait_and_throw();
+        }, []{assertDevApiInvar(cudaDeviceSynchronize())});
 #elif defined(USE_ROCBLAS)
-    langHandle_->getSyclQueue()->submit([&](sycl::handler &cgh) {
-        //auto d_A = b_A.get_access<sycl::access::mode::read_write>(cgh);
-        cgh.host_task([=](sycl::interop_handle ih) {
+    SYCL::ExecNativeCommand(*langHandle_->getSyclQueue(), [=](sycl::interop_handle ih) {
             //cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
             //cublasSetStream(*(langHandle_->getCublasHandle()), ih.get_native_queue<sycl::backend::ext_oneapi_cuda>());
 
@@ -219,10 +202,7 @@ void SoftmaxLayer::doBw() {
                                     d_d_input_));
             //cublasDestroy(handle);
             //cudaStreamSynchronize(cudaStreamHandle);
-            assertDevApiInvar(hipDeviceSynchronize());
-        });
-    });
-    langHandle_->getSyclQueue()->wait_and_throw();
+        }, []{assertDevApiInvar(hipDeviceSynchronize())});
 #else    
     std::unordered_map<int, memory> softmax_args;
     softmax_args.insert({DNNL_ARG_SRC, src_mem});
@@ -237,4 +217,4 @@ void SoftmaxLayer::doBw() {
 
 SoftmaxLayer::~SoftmaxLayer() {
 
-}
+}
@@ -28,6 +28,7 @@
 #include "timing.h"
 #include "tracing.h"
 #include "handle.h"
+#include "SYCL.h"
 
 #include <sycl/sycl.hpp>
 
@@ -108,4 +109,4 @@ class SoftmaxLayer {
 };
 
 
-#endif
+#endif
Original file line number	Diff line number	Diff line change
`@@ -80,7 +80,8 @@ void LNormLayer::doFw() {`
`80`	`80`	`}`
`81`	`81`	`}`
`82`	`82`	`});`
`83`		`- }).wait();`
	`83`	`+ });`
	`84`	`+ langHandle_->getSyclQueue()->wait();`
`84`	`85`
`85`	`86`
`86`	`87`	`Tracer::func_end("LNormLayer::doFw");`
`@@ -157,7 +158,8 @@ void LNormLayer::doBw() {`
`157`	`158`
`158`	`159`	`}`
`159`	`160`	`});`
`160`		`- }).wait();`
	`161`	`+ });`
	`162`	`+ langHandle_->getSyclQueue()->wait();`
`161`	`163`
`162`	`164`	`Tracer::func_end("LNormLayer::doBw");`
`163`	`165`	`}`