From 4a858db63e8ce273d64340a6c659caee3fd3d459 Mon Sep 17 00:00:00 2001 From: Rafal Bielski Date: Thu, 28 Nov 2024 19:43:59 +0000 Subject: [PATCH] [svm] Print both total and compute-only times Align the time measurements between CUDA, HIP and SYCL versions to measure identical scopes. Add an extra timer to measure just the computation time without initialisation. Print both at the end. --- svm/CUDA/cuSVM/cuSVMSolver.cu | 25 ++++++++++++------------- svm/HIP/cuSVM/cuSVMSolver.cpp | 25 ++++++++++++++----------- svm/SYCL/cuSVM/cuSVMSolver.dp.cpp | 23 +++++++++++------------ 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/svm/CUDA/cuSVM/cuSVMSolver.cu b/svm/CUDA/cuSVM/cuSVMSolver.cu index 3a59438b..d0185db9 100644 --- a/svm/CUDA/cuSVM/cuSVMSolver.cu +++ b/svm/CUDA/cuSVM/cuSVMSolver.cu @@ -549,25 +549,23 @@ void SVMTrain(float *mexalpha,float* beta,float*y,float *x ,float _C, float _ker printf("_C %f\n", _C); - std::chrono::time_point start_ct1; - std::chrono::time_point stop_ct1; + std::chrono::time_point start_clock_init; + std::chrono::time_point start_clock_exec; + std::chrono::time_point stop_clock; - - start_ct1 = std::chrono::high_resolution_clock::now(); + start_clock_init = std::chrono::steady_clock::now(); + // Creating and recording an event implicitly initialises device queue cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); + cudaEventRecord(start,0); - - //cudaSetDevice(0); + start_clock_exec = std::chrono::steady_clock::now(); mxArray *mexelapsed =mxCreateNumericMatrix(1, 1,mxSINGLE_CLASS, mxREAL); float * elapsed=(float *)mxGetData(mexelapsed); - - cudaEventRecord(start,0); - int numBlocks=64; dim3 ReduceGrid(numBlocks, 1, 1); dim3 ReduceBlock(256, 1, 1); @@ -831,11 +829,12 @@ void SVMTrain(float *mexalpha,float* beta,float*y,float *x ,float _C, float _ker cudaEventElapsedTime(elapsed, start, stop); - stop_ct1 = std::chrono::high_resolution_clock::now(); + stop_clock = std::chrono::steady_clock::now(); - //stop.wait_and_throw(); - float duration = std::chrono::duration(stop_ct1 - start_ct1).count(); - printf("Total run time: %f seconds\n", duration/1000.00); + float duration_compute = std::chrono::duration(stop_clock - start_clock_exec).count(); + float duration_total = std::chrono::duration(stop_clock - start_clock_init).count(); + printf("Compute time: %f seconds\n", duration_compute/1000.00); + printf("Total run time: %f seconds\n", duration_total/1000.00); printf("Iter:%i\n", iter); printf("M:%i\n", m); diff --git a/svm/HIP/cuSVM/cuSVMSolver.cpp b/svm/HIP/cuSVM/cuSVMSolver.cpp index c2acacf8..48a95630 100644 --- a/svm/HIP/cuSVM/cuSVMSolver.cpp +++ b/svm/HIP/cuSVM/cuSVMSolver.cpp @@ -522,21 +522,23 @@ void SVMTrain(float *mexalpha,float* beta,float*y,float *x ,float _C, float _ker printf("_C %f\n", _C); + std::chrono::time_point start_clock_init; + std::chrono::time_point start_clock_exec; + std::chrono::time_point stop_clock; + + start_clock_init = std::chrono::steady_clock::now(); + + // Creating and recording an event implicitly initialises device queue hipEvent_t start, stop; hipEventCreate(&start); hipEventCreate(&stop); + hipEventRecord(start,0); - std::chrono::time_point start_ct1; - std::chrono::time_point stop_ct1; - - start_ct1 = std::chrono::high_resolution_clock::now(); + start_clock_exec = std::chrono::steady_clock::now(); mxArray *mexelapsed =mxCreateNumericMatrix(1, 1,mxSINGLE_CLASS, mxREAL); float * elapsed=(float *)mxGetData(mexelapsed); - - hipEventRecord(start,0); - int numBlocks=64; dim3 ReduceGrid(numBlocks, 1, 1); dim3 ReduceBlock(256, 1, 1); @@ -800,11 +802,12 @@ void SVMTrain(float *mexalpha,float* beta,float*y,float *x ,float _C, float _ker hipEventElapsedTime(elapsed, start, stop); - stop_ct1 = std::chrono::high_resolution_clock::now(); + stop_clock = std::chrono::steady_clock::now(); - //stop.wait_and_throw(); - float duration = std::chrono::duration(stop_ct1 - start_ct1).count(); - printf("Total run time: %f seconds\n", duration/1000.00); + float duration_compute = std::chrono::duration(stop_clock - start_clock_exec).count(); + float duration_total = std::chrono::duration(stop_clock - start_clock_init).count(); + printf("Compute time: %f seconds\n", duration_compute/1000.00); + printf("Total run time: %f seconds\n", duration_total/1000.00); printf("Iter:%i\n", iter); printf("M:%i\n", m); diff --git a/svm/SYCL/cuSVM/cuSVMSolver.dp.cpp b/svm/SYCL/cuSVM/cuSVMSolver.dp.cpp index 57232293..cba09ed5 100644 --- a/svm/SYCL/cuSVM/cuSVMSolver.dp.cpp +++ b/svm/SYCL/cuSVM/cuSVMSolver.dp.cpp @@ -794,18 +794,16 @@ extern "C" void SVMTrain(float *mexalpha, float *beta, float *y, float *x, sycl::event queue_event; sycl::event start, stop; - std::chrono::time_point start_ct1; - std::chrono::time_point stop_ct1; - - start_ct1 = std::chrono::high_resolution_clock::now(); + std::chrono::time_point start_clock_init; + std::chrono::time_point start_clock_exec; + std::chrono::time_point stop_clock; + start_clock_init = std::chrono::steady_clock::now(); + // Select device and initialise the queue sycl::device selected_device = sycl::device(sycl::default_selector()); sycl::context context({selected_device}); - auto max_wgroup_size = selected_device.get_info(); - printf("Workgroup Size: %lu\n", max_wgroup_size); - #if KERNEL_USE_PROFILE auto propList = sycl::property_list{sycl::property::queue::enable_profiling()}; sycl::queue q_ct1(context, selected_device, propList); @@ -813,6 +811,7 @@ extern "C" void SVMTrain(float *mexalpha, float *beta, float *y, float *x, sycl::queue q_ct1(context, selected_device); #endif + start_clock_exec = std::chrono::steady_clock::now(); mxArray *mexelapsed =mxCreateNumericMatrix(1, 1,mxSINGLE_CLASS, mxREAL); float * elapsed=(float *)mxGetData(mexelapsed); @@ -1280,12 +1279,12 @@ _kernelwidth*=-1; q_ct1.memcpy(mexalpha, d_alpha, m * sizeof(float)).wait(); - stop_ct1 = std::chrono::high_resolution_clock::now(); + stop_clock = std::chrono::steady_clock::now(); - //stop.wait_and_throw(); - float duration = std::chrono::duration(stop_ct1 - start_ct1).count(); - printf("Total run time: %f seconds\n", duration/1000.00); - + float duration_compute = std::chrono::duration(stop_clock - start_clock_exec).count(); + float duration_total = std::chrono::duration(stop_clock - start_clock_init).count(); + printf("Compute time: %f seconds\n", duration_compute/1000.00); + printf("Total run time: %f seconds\n", duration_total/1000.00); printf("Iter:%i\n", iter); printf("M:%i\n", m);