From 225abef4a0f12969ad874dc93d38eaac6a2df3aa Mon Sep 17 00:00:00 2001 From: Scott Straughan Date: Thu, 11 Jul 2024 11:42:56 +0100 Subject: [PATCH] Add numerous research papers. --- ...ative-study-of-sycl-open-cl-and-open-mp.md | 31 +++++++++++++ ...ons-with-modern-c-and-sycl-a-case-study.md | 39 ++++++++++++++++ ...medical-imaging-applications-using-sycl.md | 30 +++++++++++++ ...operator-mini-app-using-kokkos-and-sycl.md | 33 ++++++++++++++ ...-study-of-k-means-clustering-using-sycl.md | 42 ++++++++++++++++++ ...-routine-in-sycl-on-integrated-graphics.md | 29 ++++++++++++ ...hine-learning-systems-a-sycl-case-study.md | 25 +++++++++++ ...ty-of-contemporary-sycl-implementations.md | 31 +++++++++++++ ...-structured-mesh-computations-with-sycl.md | 25 +++++++++++ ...extending-sycl-hierarchical-parallelism.md | 28 ++++++++++++ ...benchmark-on-nvidia-amd-and-intel-gp-us.md | 44 +++++++++++++++++++ ...dbms-operator-implementation-using-sycl.md | 24 ++++++++++ ...rtability-of-the-sycl-programming-model.md | 21 +++++++++ ...rization-algorithm-with-openmp-and-sycl.md | 29 ++++++++++++ ...communication-on-sycl-programmed-fpg-as.md | 27 ++++++++++++ ...ds-with-perturbative-triples-using-sycl.md | 34 ++++++++++++++ ...form-reduction-in-hip-and-sycl-on-gp-us.md | 20 +++++++++ ...s-using-sycl-and-cuda-on-tesla-v100-gpu.md | 31 +++++++++++++ ...s-applications-in-sycl-on-an-nvidia-gpu.md | 21 +++++++++ ...mechanism-for-sycl-on-sx-aurora-tsubasa.md | 34 ++++++++++++++ ...-cl-and-sycl-applications-via-r-open-cl.md | 24 ++++++++++ ...-calculation-in-bioinformatics-on-gp-us.md | 21 +++++++++ ...th-gene-expression-connectivity-mapping.md | 24 ++++++++++ ...9-rna-guided-endonucleases-on-amd-gp-us.md | 24 ++++++++++ ...se-search-on-nvidia-amd-and-intel-gp-us.md | 36 +++++++++++++++ ...g-hyperdimensional-classifier-with-sycl.md | 33 ++++++++++++++ ...n-to-cross-platform-sycl-implementation.md | 35 +++++++++++++++ ...-kokkos-a-case-study-on-lbm-simulations.md | 41 +++++++++++++++++ 
...es-building-an-mlir-based-sycl-compiler.md | 41 +++++++++++++++++ ...ll-codes-on-different-gpu-architectures.md | 30 +++++++++++++ 30 files changed, 907 insertions(+) create mode 100644 content/research_papers/2016/2016-10-26-a-comparative-study-of-sycl-open-cl-and-open-mp.md create mode 100644 content/research_papers/2018/2018-07-10-solving-maxwells-equations-with-modern-c-and-sycl-a-case-study.md create mode 100644 content/research_papers/2019/2019-11-18-evaluation-of-medical-imaging-applications-using-sycl.md create mode 100644 content/research_papers/2019/2019-11-22-performance-portability-of-a-wilson-dslash-stencil-operator-mini-app-using-kokkos-and-sycl.md create mode 100644 content/research_papers/2019/2019-12-09-a-case-study-of-k-means-clustering-using-sycl.md create mode 100644 content/research_papers/2020/2020-05-18-a-case-study-on-the-hac-cmk-routine-in-sycl-on-integrated-graphics.md create mode 100644 content/research_papers/2020/2020-05-18-towards-automated-kernel-selection-in-machine-learning-systems-a-sycl-case-study.md create mode 100644 content/research_papers/2020/2020-11-13-evaluating-the-performance-and-portability-of-contemporary-sycl-implementations.md create mode 100644 content/research_papers/2021/2021-09-07-automatic-parallelization-of-structured-mesh-computations-with-sycl.md create mode 100644 content/research_papers/2021/2021-11-14-benchmarking-and-extending-sycl-hierarchical-parallelism.md create mode 100644 content/research_papers/2021/2021-11-14-case-study-of-using-kokkos-and-sycl-as-performance-portable-frameworks-for-milc-dslash-benchmark-on-nvidia-amd-and-intel-gp-us.md create mode 100644 content/research_papers/2021/2021-11-24-efficient-hardware-agnostic-dbms-operator-implementation-using-sycl.md create mode 100644 content/research_papers/2022/2022-07-10-a-benchmark-suite-for-improving-performance-portability-of-the-sycl-programming-model.md create mode 100644 content/research_papers/2022/2022-10-28-portability 
and-performance-assessment-of-the-non-negative-matrix-factorization-algorithm-with-openmp-and-sycl.md create mode 100644 content/research_papers/2022/2022-11-13-a-first-step-towards-support-for-mpi-partitioned-communication-on-sycl-programmed-fpg-as.md create mode 100644 content/research_papers/2022/2022-11-13-towards-cross-platform-portability-of-coupled-cluster-methods-with-perturbative-triples-using-sycl.md create mode 100644 content/research_papers/2022/2022-11-18-evaluating-nonuniform-reduction-in-hip-and-sycl-on-gp-us.md create mode 100644 content/research_papers/2022/2022-12-01-performance-study-of-gpu-applications-using-sycl-and-cuda-on-tesla-v100-gpu.md create mode 100644 content/research_papers/2022/2022-12-06-understanding-performance-portability-of-bioinformatics-applications-in-sycl-on-an-nvidia-gpu.md create mode 100644 content/research_papers/2022/2022-12-23-a-memory-bank-conflict-prevention-mechanism-for-sycl-on-sx-aurora-tsubasa.md create mode 100644 content/research_papers/2023/2023-05-15-remote-execution-of-open-cl-and-sycl-applications-via-r-open-cl.md create mode 100644 content/research_papers/2023/2023-05-15-understanding-performance-portability-of-sycl-kernels-a-case-study-with-the-all-pairs-distance-calculation-in-bioinformatics-on-gp-us.md create mode 100644 content/research_papers/2023/2023-05-15-understanding-sycl-portability-for-pseudorandom-number-generation-a-case-study-with-gene-expression-connectivity-mapping.md create mode 100644 content/research_papers/2023/2023-09-05-experience-migrating-open-cl-to-sycl-a-case-study-on-searches-for-potential-off-target-sites-of-cas-9-rna-guided-endonucleases-on-amd-gp-us.md create mode 100644 content/research_papers/2023/2023-10-17-comparing-performance-and-portability-between-cuda-and-sycl-for-protein-database-search-on-nvidia-amd-and-intel-gp-us.md create mode 100644 content/research_papers/2023/2023-10-31-accelerating-hyperdimensional-classifier-with-sycl.md create mode 100644 
content/research_papers/2023/2023-12-18-migration-of-cuda-based-seismic-application-to-cross-platform-sycl-implementation.md create mode 100644 content/research_papers/2023/2023-12-21-evaluating-performance-portability-of-sycl-and-kokkos-a-case-study-on-lbm-simulations.md create mode 100644 content/research_papers/2024/2024-03-02-experiences-building-an-mlir-based-sycl-compiler.md create mode 100644 content/research_papers/2024/2024-05-20-unveiling-performance-insights-and-portability-achievements-between-cuda-and-sycl-for-particle-in-cell-codes-on-different-gpu-architectures.md diff --git a/content/research_papers/2016/2016-10-26-a-comparative-study-of-sycl-open-cl-and-open-mp.md b/content/research_papers/2016/2016-10-26-a-comparative-study-of-sycl-open-cl-and-open-mp.md new file mode 100644 index 0000000..8d92f9d --- /dev/null +++ b/content/research_papers/2016/2016-10-26-a-comparative-study-of-sycl-open-cl-and-open-mp.md @@ -0,0 +1,31 @@ +--- +contributor: max +date: '2016-10-26T10:57:29+01:00' +title: 'A Comparative Study of SYCL, OpenCL, and OpenMP' +external_url: https://ieeexplore.ieee.org/document/7803697 +authors: + - name: Hércules Cardoso da Silva + affiliation: Institute of Computing + - name: Flávia Pisani + affiliation: Institute of Computing + - name: Edson Borin + affiliation: Institute of Computing +tags: + - opencl + - openmp + - parallel + - performance + - evaluation +--- + +Recent trends indicate that future computing systems will be composed by a group of heterogeneous computing devices, +including CPUs, GPUs, and other hardware accelerators. These devices provide increased processing performance, however, +creating efficient code for them may require that programmers manage memory assignments and use specialized APIs, +compilers, or runtime systems, thus making their programs dependent on specific tools.
In this scenario, SYCL is an +emerging C++ programming model for OpenCL that allows developers to write code for heterogeneous computing devices that +are compatible with standard C++ compilation frameworks. In this paper, we analyze the performance and programming +characteristics of SYCL, OpenMP, and OpenCL using both a benchmark and a real-world application. Our performance results +indicate that programs that rely on available SYCL runtimes are not on par with the ones based on OpenMP and OpenCL yet. +Nonetheless, the gap is getting smaller if we consider the results reported by previous studies. In terms of +programmability, SYCL presents itself as a competitive alternative to OpenCL, requiring fewer lines of code to implement +kernels and also fewer calls to essential API functions and methods. \ No newline at end of file diff --git a/content/research_papers/2018/2018-07-10-solving-maxwells-equations-with-modern-c-and-sycl-a-case-study.md b/content/research_papers/2018/2018-07-10-solving-maxwells-equations-with-modern-c-and-sycl-a-case-study.md new file mode 100644 index 0000000..53c4652 --- /dev/null +++ b/content/research_papers/2018/2018-07-10-solving-maxwells-equations-with-modern-c-and-sycl-a-case-study.md @@ -0,0 +1,39 @@ +--- +contributor: scott +date: '2018-07-10T08:08:10.490000+00:00' +title: 'Solving Maxwell''s Equations with Modern C++ and SYCL: A Case Study' +external_url: https://ieeexplore.ieee.org/document/8445127 +authors: + - name: Ayesha Afzal + affiliation: Friedrich-Alexander university Erlangen-Nurnberg + - name: Christian Schmitt + affiliation: Friedrich-Alexander university Erlangen-Nurnberg + - name: Samer Alhaddad + affiliation: Paderborn University + - name: Yevgen Grynko + affiliation: Paderborn University + - name: Jürgen Teich + affiliation: Friedrich-Alexander university Erlangen-Nurnberg + - name: Jens Förstner + affiliation: Paderborn University + - name: Frank Hannig + affiliation: Friedrich-Alexander university
Erlangen-Nurnberg +tags: + - maxwell + - c++ + - case-study +--- + +In scientific computing, unstructured meshes are a crucial foundation for the simulation of real-world physical +phenomena. Compared to regular grids, they allow resembling the computational domain with a much higher accuracy, which +in turn leads to more efficient computations. There exists a wealth of supporting libraries and frameworks that aid +programmers with the implementation of applications working on such grids, each built on top of existing parallelization +technologies. However, many approaches require the programmer to introduce a different programming paradigm into their +application or provide different variants of the code. SYCL is a new programming standard providing a remedy to this +dilemma by building on standard C++ 17 with its so-called single-source approach: Programmers write standard C++ code +and expose parallelism using C++ 17 keywords. The application is then transformed into a concrete implementation by the +SYCL implementation. By encapsulating the OpenCL ecosystem, different SYCL implementations enable not only the +programming of CPUs but also of heterogeneous platforms such as GPUs or other devices. For the first time, this paper +showcases a SYCL-based solver for the nodal Discontinuous Galerkin method for Maxwell's equations on unstructured +meshes. We compare our solution to a previous C-based implementation with respect to programmability and performance on +heterogeneous platforms.
diff --git a/content/research_papers/2019/2019-11-18-evaluation-of-medical-imaging-applications-using-sycl.md b/content/research_papers/2019/2019-11-18-evaluation-of-medical-imaging-applications-using-sycl.md new file mode 100644 index 0000000..c0c47db --- /dev/null +++ b/content/research_papers/2019/2019-11-18-evaluation-of-medical-imaging-applications-using-sycl.md @@ -0,0 +1,30 @@ +--- +contributor: scott +date: '2019-11-18T10:57:29+01:00' +title: Evaluation of Medical Imaging Applications using SYCL +external_url: https://ieeexplore.ieee.org/document/8982983 +authors: + - name: Zheming Jin + affiliation: Argonne National Laboratory + - name: Hal Finkel + affiliation: Argonne National Laboratory +tags: + - benchmark + - performance + - medical + - rodinia + - imaging +--- + +As opposed to the Open Computing Language (OpenCL) programming model in which host and device codes are written in +different languages, the SYCL programming model can combine host and device codes for an application in a type-safe way +to improve development productivity. In this paper, we chose two medical imaging applications (Heart Wall and Particle +Filter) in the Rodinia benchmark suite to study the performance and programming productivity of the SYCL programming +model. More specifically, we introduced the SYCL programming model, shared our experience of implementing the +applications using SYCL, and compared the performance and programming portability of the SYCL implementations with the +OpenCL implementations on an Intel® Xeon® CPU and an Iris® Pro integrated GPU. The results are promising. For the Heart +Wall application, the SYCL implementation is on average 15% faster than the OpenCL implementation on the GPU. For the +Particle Filter application, the SYCL implementation is 3% slower than the OpenCL implementation on the GPU, but it is +75% faster on the CPU.
Using lines of code as an indicator of programming productivity, the SYCL host program reduces +the lines of code of the OpenCL host program by 52% and 38% for the Heart Wall and Particle Filter applications, +respectively. diff --git a/content/research_papers/2019/2019-11-22-performance-portability-of-a-wilson-dslash-stencil-operator-mini-app-using-kokkos-and-sycl.md b/content/research_papers/2019/2019-11-22-performance-portability-of-a-wilson-dslash-stencil-operator-mini-app-using-kokkos-and-sycl.md new file mode 100644 index 0000000..cc009ab --- /dev/null +++ b/content/research_papers/2019/2019-11-22-performance-portability-of-a-wilson-dslash-stencil-operator-mini-app-using-kokkos-and-sycl.md @@ -0,0 +1,33 @@ +--- +contributor: scott +date: '2019-11-22T10:57:29+01:00' +title: 'Performance Portability of a Wilson Dslash Stencil Operator Mini-App Using Kokkos and SYCL' +external_url: https://ieeexplore.ieee.org/document/8945798 +authors: + - name: Bálint Joó + affiliation: Jefferson Lab + - name: Thorsten Kurth + affiliation: NERSC + - name: M. A. Clark + affiliation: NVIDIA + - name: Jeongnim Kim + affiliation: Intel Corporation + - name: Christian Robert Trott + affiliation: Sandia National Laboratories + - name: Dan Ibanez + affiliation: Sandia National Laboratories + - name: Daniel Sunderland + affiliation: Sandia National Laboratories + - name: Jack Deslippe + affiliation: NERSC +tags: + - kokkos + - performance + - portability + - lattice-qcd +--- + +We describe our experiences in creating mini-apps for the Wilson-Dslash stencil operator for Lattice Quantum +Chromodynamics using the Kokkos and SYCL programming models. In particular we comment on the performance achieved on a +variety of hardware architectures, limitations we have reached in both programming models and how these have been +resolved by us, or may be resolved by the developers of these models. 
diff --git a/content/research_papers/2019/2019-12-09-a-case-study-of-k-means-clustering-using-sycl.md b/content/research_papers/2019/2019-12-09-a-case-study-of-k-means-clustering-using-sycl.md new file mode 100644 index 0000000..bca62fe --- /dev/null +++ b/content/research_papers/2019/2019-12-09-a-case-study-of-k-means-clustering-using-sycl.md @@ -0,0 +1,42 @@ +--- +contributor: scott +date: '2019-12-09T10:57:29+01:00' +title: A Case Study of k-means Clustering using SYCL +external_url: https://ieeexplore.ieee.org/document/9005555 +authors: + - name: Zheming Jin + affiliation: Argonne National Laboratory + - name: Hal Finkel + affiliation: Argonne National Laboratory +tags: + - benchmark + - energy-consumption + - programming-language + - gpu + - lowest-consumption + - rodinia + - minimum-distance + - data-transfer + - api + - means-clustering + - fuzzy-clustering + - haswell + - broadwell + - skylake +--- + +As opposed to the OpenCL programming model in which host and device codes are written in two programming languages, the +SYCL programming model combines them for an application in a type-safe way to improve development productivity. As a +popular cluster analysis algorithm, k-means has been implemented using programming models such as OpenMP, OpenCL, and +CUDA. Developing a SYCL implementation of k-means as a case study allows us to have a better understanding of +performance portability and programming productivity of the SYCL programming model. Specifically, we explained the +k-means benchmark in Rodinia, described our efforts of porting the OpenCL k-means benchmark, and evaluated the +performance of the OpenCL and SYCL implementations on the Intel® Haswell, Broadwell, and Skylake processors.
We +summarized the migration steps from OpenCL to SYCL, compiled the SYCL program using Codeplay and Intel® SYCL compilers, +analyzed the SYCL and OpenCL programs using an open-source profiling tool which can intercept OpenCL runtime calls, and +compared the performance of the implementations on Intel® CPUs and integrated GPU. The experimental results show that +the SYCL version in which the kernels run on the GPU is 2% and 8% faster than the OpenCL version for the two large +datasets. However, the OpenCL version is still much faster than the SYCL version on the CPUs. Compared to the Intel® +Haswell and Skylake CPUs, running the k-means benchmark on the Intel® Broadwell low-power processor with a CPU and an +integrated GPU can achieve the lowest energy consumption. In terms of programming productivity, the lines of code of the +SYCL program are 51% fewer than those of the OpenCL program. \ No newline at end of file diff --git a/content/research_papers/2020/2020-05-18-a-case-study-on-the-hac-cmk-routine-in-sycl-on-integrated-graphics.md b/content/research_papers/2020/2020-05-18-a-case-study-on-the-hac-cmk-routine-in-sycl-on-integrated-graphics.md new file mode 100644 index 0000000..73ba67d --- /dev/null +++ b/content/research_papers/2020/2020-05-18-a-case-study-on-the-hac-cmk-routine-in-sycl-on-integrated-graphics.md @@ -0,0 +1,29 @@ +--- +contributor: scott +date: '2020-05-18T08:08:10.490000+00:00' +title: 'A Case Study on the HACCmk Routine in SYCL on Integrated Graphics' +external_url: https://ieeexplore.ieee.org/document/9150310 +authors: + - name: Zheming Jin + affiliation: Argonne National Laboratory + - name: Vitali Morozov + affiliation: Argonne National Laboratory + - name: Hal Finkel + affiliation: Argonne National Laboratory +tags: + - compute + - haccmk + - integrated-graphics + - case-study +--- + +As opposed to the Open Computing Language (OpenCL) programming model in which host and device codes are generally +written in different languages, the
SYCL programming model can combine host and device codes for an application in a +type-safe way to improve development productivity. In this paper, we chose the HACCmk routine, a representative +compute-bound kernel, as a case study on the performance of the SYCL programming model targeting a heterogeneous +computing device. More specifically, we introduced the SYCL programming model, presented the OpenCL and SYCL +implementations of the routine, and compared the performance of the two implementations using the offline and online +compilation on Intel® Iris™ Pro integrated GPUs. We found that the overhead of online compilation may become +significant compared to the execution time of a kernel. Compared to the performance of OpenCL implementations, the SYCL +implementation can maintain the performance using the offline compilation. The number of execution units in a GPU are +critical to improving the raw performance of a compute-bound kernel. diff --git a/content/research_papers/2020/2020-05-18-towards-automated-kernel-selection-in-machine-learning-systems-a-sycl-case-study.md b/content/research_papers/2020/2020-05-18-towards-automated-kernel-selection-in-machine-learning-systems-a-sycl-case-study.md new file mode 100644 index 0000000..303b334 --- /dev/null +++ b/content/research_papers/2020/2020-05-18-towards-automated-kernel-selection-in-machine-learning-systems-a-sycl-case-study.md @@ -0,0 +1,25 @@ +--- +contributor: scott +date: '2020-05-18T08:08:10.490000+00:00' +title: 'Towards automated kernel selection in machine learning systems: A SYCL case study' +external_url: https://ieeexplore.ieee.org/document/9150358 +authors: + - name: John Lawson + affiliation: Codeplay Software Ltd +tags: + - tuning + - sycl + - gpgpu + - machine-learning + - ai +--- + +Automated tuning of compute kernels is a popular area of research, mainly focused on finding optimal kernel parameters +for a problem with fixed input sizes.
This approach is good for deploying machine learning models, where the network +topology is constant, but machine learning research often involves changing network topologies and hyperparameters. +Traditional kernel auto-tuning has limited impact in this case; a more general selection of kernels is required for +libraries to accelerate machine learning research. In this paper we present initial results using machine learning to +select kernels in a case study deploying high performance SYCL kernels in libraries that target a range of heterogeneous +devices from desktop GPUs to embedded accelerators. The techniques investigated apply more generally and could similarly +be integrated with other heterogeneous programming systems. By combining auto-tuning and machine learning these kernel +selection processes can be deployed with little developer effort to achieve high performance on new hardware. diff --git a/content/research_papers/2020/2020-11-13-evaluating-the-performance-and-portability-of-contemporary-sycl-implementations.md b/content/research_papers/2020/2020-11-13-evaluating-the-performance-and-portability-of-contemporary-sycl-implementations.md new file mode 100644 index 0000000..8967236 --- /dev/null +++ b/content/research_papers/2020/2020-11-13-evaluating-the-performance-and-portability-of-contemporary-sycl-implementations.md @@ -0,0 +1,31 @@ +--- +contributor: scott +date: '2020-11-13T08:08:10.490000+00:00' +title: 'Evaluating the Performance and Portability of Contemporary SYCL Implementations' +external_url: https://ieeexplore.ieee.org/document/9309045 +authors: + - name: Beau Johnston + affiliation: Oak Ridge National Laboratory + - name: Jeffrey S. 
Vetter + affiliation: Oak Ridge National Laboratory + - name: Josh Milthorpe + affiliation: Australian National University +tags: + - benchmarks + - performance + - portability +--- + +SYCL is a single-source programming model for heterogeneous systems; it promises improved maintainability, productivity, +and opportunity for compiler optimization, when compared to accelerator specific programming models. Several +implementations of the SYCL standard have been developed over the past few years, including several backends using +contemporary accelerator languages, like OpenCL, CUDA, and HIP. These implementations vary widely in their support for +specific features of the standard and in their performance. As SYCL grows in popularity, developers need to know how +features are implemented across popular implementations in order to make proper design choices. In this paper, we +evaluate the existing SYCL implementations for important SYCL features across a range of hardware in order to understand +SYCL's performance and portability. This work uses the newest SYCL benchmark suite (SYCL-Bench, 38 kernels) to evaluate +these four existing implementations, comparing support of language features across backends and highlighting feature +completeness and performance. For features, we focus on the five major SYCL parallel constructs, using a motivating +example of the matrix multiplication benchmark. Our results show that the basic data parallelism construct is the best +choice for performance on current SYCL implementations, and we identify opportunities for improvement in several of the +SYCL implementations. 
diff --git a/content/research_papers/2021/2021-09-07-automatic-parallelization-of-structured-mesh-computations-with-sycl.md b/content/research_papers/2021/2021-09-07-automatic-parallelization-of-structured-mesh-computations-with-sycl.md new file mode 100644 index 0000000..f2e60ef --- /dev/null +++ b/content/research_papers/2021/2021-09-07-automatic-parallelization-of-structured-mesh-computations-with-sycl.md @@ -0,0 +1,25 @@ +--- +contributor: scott +date: '2021-09-07T08:08:10.490000+00:00' +title: 'Automatic Parallelization of Structured Mesh Computations with SYCL' +external_url: https://ieeexplore.ieee.org/document/9555976 +authors: + - name: Gábor Dániel Balogh + affiliation: Pázmány Péter Catholic University + - name: István Reguly + affiliation: Pázmány Péter Catholic University +tags: + - parallel-programming + - nvidia + - intel +--- + +Structured meshes are widely used for scientific computations such as Computational Fluid Dynamics (CFD) applications or +finance. Modern applications often have grid points in the millions. To perform such computations parallelisation is +crucial. However it is unfeasible to port each application every time a new architecture arrives, hence in recent years +the demand for automatic parallelisation and optimisation for the used hardware is increasing. The OPS (Oxford Parallel +library for Structured mesh solvers) has shown good performance and scaling on a wide range of HPC architectures. This +research aims to extend the OPS framework with a SYCL backend to extend the range of architectures that OPS can support +and further increase Performance Portability of OPS applications. The performance of the Intel OneAPI is struggling with +reductions due to high synchronisation cost, but shows promising performance gain on builtin reduction constructs on an +Intel® Xeon® Gold 6226R. We compare the performance of hipSYCL on NVidia V100 GPU to the CUDA implementations.
diff --git a/content/research_papers/2021/2021-11-14-benchmarking-and-extending-sycl-hierarchical-parallelism.md b/content/research_papers/2021/2021-11-14-benchmarking-and-extending-sycl-hierarchical-parallelism.md new file mode 100644 index 0000000..9ae98b1 --- /dev/null +++ b/content/research_papers/2021/2021-11-14-benchmarking-and-extending-sycl-hierarchical-parallelism.md @@ -0,0 +1,28 @@ +--- +contributor: scott +date: '2021-11-14T08:08:10.490000+00:00' +title: 'Benchmarking and Extending SYCL Hierarchical Parallelism' +external_url: https://ieeexplore.ieee.org/document/9654235 +authors: + - name: Tom Deakin + affiliation: University of Bristol + - name: Simon McIntosh-Smith + affiliation: University of Bristol + - name: Aksel Alpay + affiliation: Universität Heidelberg + - name: Vincent Heuveline + affiliation: Universität Heidelberg +tags: + - benchmarks + - extending + - parallelism +--- + +SYCL is an open-standard, parallel programming model for programming heterogeneous devices from Khronos. It allows +single-source programming of diverse attached devices in a cross-platform manner in modern C++. SYCL provides different +layers of parallel abstractions, including Same Instruction Multiple Thread (SIMT) kernels, data-parallel loop +concurrency and hierarchical parallelism. We discuss Scoped Parallelism as an extension to the existing Hierarchical +Parallelism in SYCL, and highlight the advantages and disadvantages of these models from the perspective of the +programmer and an implementer of SYCL. In this paper, we compare writing benchmark programs using SIMT kernel, +hierarchical parallelism and scoped parallelism paradigms, and present results running on a high-performance CPU and +GPU. 
diff --git a/content/research_papers/2021/2021-11-14-case-study-of-using-kokkos-and-sycl-as-performance-portable-frameworks-for-milc-dslash-benchmark-on-nvidia-amd-and-intel-gp-us.md b/content/research_papers/2021/2021-11-14-case-study-of-using-kokkos-and-sycl-as-performance-portable-frameworks-for-milc-dslash-benchmark-on-nvidia-amd-and-intel-gp-us.md new file mode 100644 index 0000000..00842cc --- /dev/null +++ b/content/research_papers/2021/2021-11-14-case-study-of-using-kokkos-and-sycl-as-performance-portable-frameworks-for-milc-dslash-benchmark-on-nvidia-amd-and-intel-gp-us.md @@ -0,0 +1,44 @@ +--- +contributor: scott +date: '2021-11-14T08:08:10.490000+00:00' +title: 'Case Study of Using Kokkos and SYCL as Performance-Portable Frameworks for Milc-Dslash Benchmark on NVIDIA, AMD and Intel GPUs' +external_url: https://ieeexplore.ieee.org/document/9652859 +authors: + - name: Amanda S. Dufek + affiliation: NERSC/LBNL + - name: Rahulkumar Gayatri + affiliation: NERSC/LBNL + - name: Neil Mehta + affiliation: NERSC/LBNL + - name: Douglas Doerfler + affiliation: NERSC/LBNL + - name: Brandon Cook + affiliation: NERSC/LBNL + - name: Yasaman Ghadar + affiliation: Argonne National Laboratory + - name: Carleton DeTar + affiliation: University of Utah +tags: + - kokkos + - milc-dslash + - performance + - portability + - nvidia + - intel + - amd +--- + +Six of the top ten supercomputers in the TOP500 list from June 2021 rely on NVIDIA GPUs to achieve their peak compute +bandwidth. With the announcement of Aurora, Frontier, and El Capitan, Intel and AMD have also entered the domain of +providing GPUs for scientific computing. A consequence of the increased diversity in the GPU landscape is the emergence +of portable programming models such as Kokkos, SYCL, OpenCL, and OpenMP, which allow application developers to maintain +a single-source code across a diverse range of hardware architectures. 
While the portable frameworks try to optimize the +compute resource usage on a given architecture, it is the programmer's responsibility to expose parallelism in an +application that can take advantage of thousands of processing elements available on GPUs. In this paper, we introduce a +GPU-friendly parallel implementation of Milc-Dslash that exposes multiple hierarchies of parallelism in the algorithm. +Milc-Dslash was designed to serve as a benchmark with highly optimized matrix-vector multiplications to measure the +resource utilization on the GPU systems. The parallel hierarchies in the Milc-Dslash algorithm are mapped onto a target +hardware using Kokkos and SYCL programming models. We present the performance achieved by Kokkos and SYCL +implementations of Milc-Dslash on NVIDIA A100 GPU, AMD MI100 GPU, and Intel Gen9 GPU. Additionally, we compare the +Kokkos and SYCL performances with those obtained from the versions written in CUDA and HIP programming models on NVIDIA +A100 GPU and AMD MI100 GPU, respectively.
diff --git a/content/research_papers/2021/2021-11-24-efficient-hardware-agnostic-dbms-operator-implementation-using-sycl.md b/content/research_papers/2021/2021-11-24-efficient-hardware-agnostic-dbms-operator-implementation-using-sycl.md new file mode 100644 index 0000000..45bb9ea --- /dev/null +++ b/content/research_papers/2021/2021-11-24-efficient-hardware-agnostic-dbms-operator-implementation-using-sycl.md @@ -0,0 +1,24 @@ +--- +contributor: scott +date: '2021-11-24T08:08:10.490000+00:00' +title: 'Efficient Hardware-Agnostic DBMS Operator Implementation Using SYCL' +external_url: https://ieeexplore.ieee.org/document/9681747 +authors: + - name: Daniil Kulikov + affiliation: SPbU + - name: Daria Nikolskaia + affiliation: ITMO + - name: Petr Kurapov + affiliation: MIPT +tags: + - parallel-programming + - hash-join + - gpu + - dbms +--- + +Heterogeneous hardware requires tedious optimization of DBMS algorithms for each platform it supports when implemented +with vendor-specific toolchains. Such an approach inevitably leads to specialization and maintainability issues. In this +paper we evaluate several hash-joins implemented with a high-level language SYCL and compare the data across different +execution devices. We provide roof-line performance estimations for algorithms and show these implementations are on +par with existing hardware specific implementations.
\ No newline at end of file diff --git a/content/research_papers/2022/2022-07-10-a-benchmark-suite-for-improving-performance-portability-of-the-sycl-programming-model.md b/content/research_papers/2022/2022-07-10-a-benchmark-suite-for-improving-performance-portability-of-the-sycl-programming-model.md new file mode 100644 index 0000000..5ec1129 --- /dev/null +++ b/content/research_papers/2022/2022-07-10-a-benchmark-suite-for-improving-performance-portability-of-the-sycl-programming-model.md @@ -0,0 +1,21 @@ +--- +contributor: scott +date: '2022-07-10T08:08:10.490000+00:00' +title: 'A Benchmark Suite for Improving Performance Portability of the SYCL Programming Model' +external_url: https://ieeexplore.ieee.org/document/10158214 +authors: + - name: Zheming Jin + affiliation: Oak Ridge National Laboratory + - name: Jeffrey S. Vetter + affiliation: Oak Ridge National Laboratory +tags: + - benchmarking + - performance + - portability +--- + +SYCL is a portable programming model for multivendor computing devices. Portability is critical for its success. The +heterogeneous computing benchmark suite (HeCBench) is a collection of samples, benchmarks, and mini-applications from +many open-source projects for heterogeneous computing. We hope that HeCBench is useful for understanding and improving +performance portability in the development of the SYCL ecosystem. This abstract is a summary of the background, use +cases, improvement, and future work of the benchmark suite. 
diff --git a/content/research_papers/2022/2022-10-28-portability and-performance-assessment-of-the-non-negative-matrix-factorization-algorithm-with-openmp-and-sycl.md b/content/research_papers/2022/2022-10-28-portability and-performance-assessment-of-the-non-negative-matrix-factorization-algorithm-with-openmp-and-sycl.md new file mode 100644 index 0000000..e28bf24 --- /dev/null +++ b/content/research_papers/2022/2022-10-28-portability and-performance-assessment-of-the-non-negative-matrix-factorization-algorithm-with-openmp-and-sycl.md @@ -0,0 +1,29 @@ +--- +contributor: scott +date: '2022-10-28T08:08:10.490000+00:00' +title: 'Portability and Performance Assessment of the Non-Negative Matrix Factorization Algorithm with OpenMP and SYCL' +external_url: https://ieeexplore.ieee.org/document/9959906 +authors: + - name: Youssef Faqir-Rhazoui + affiliation: Universidad Complutense de Madrid + - name: Carlos García + affiliation: Instituto de Tecnología del Conocimiento + - name: Francisco Tirado + affiliation: Informática +tags: + - openmp + - sycl + - dpc++ + - oneapi + - matrix + - hpc +--- + +The SYCL standard was released to improve code portability across heterogeneous environments. Intel released the oneAPI +toolkit, which includes the Data-Parallel C++ (DPC++) compiler which is the Intel’s SYCL implementation. SYCL is +designed to use a single source code to target multiple accelerators such as: multi-core CPUs, GPUs and even FPGAs. +Additionally, the C/C++ compiler provided in the oneAPI toolkit supports OpenMP which also allows targeting codes on +both CPU and GPU devices. In this paper, the performance of SYCL and OpenMP is evaluated using the well-known +non-negative matrix factorization (NMF) algorithm. Three different NMF implementations are developed: baseline, SYCL and +OpenMP versions to analyze the acceleration on CPU and GPU. 
Experimental results show that while the two programming +models perform almost identically on CPU, on GPU, SYCL outperforms its OpenMP counterpart slightly. diff --git a/content/research_papers/2022/2022-11-13-a-first-step-towards-support-for-mpi-partitioned-communication-on-sycl-programmed-fpg-as.md b/content/research_papers/2022/2022-11-13-a-first-step-towards-support-for-mpi-partitioned-communication-on-sycl-programmed-fpg-as.md new file mode 100644 index 0000000..d639c3c --- /dev/null +++ b/content/research_papers/2022/2022-11-13-a-first-step-towards-support-for-mpi-partitioned-communication-on-sycl-programmed-fpg-as.md @@ -0,0 +1,27 @@ +--- +contributor: scott +date: '2022-11-13T08:08:10.490000+00:00' +title: 'A First Step towards Support for MPI Partitioned Communication on SYCL-programmed FPGAs' +external_url: https://ieeexplore.ieee.org/document/10027494 +authors: + - name: Steffen Christgau + affiliation: Zuse Institute Berlin, Berlin + - name: Marius Knaust + affiliation: Zuse Institute Berlin, Berlin + - name: Thomas Steinke + affiliation: Zuse Institute Berlin, Berlin +tags: + - fpgas + - mpi + - oneapi + - dpc++ +--- + +Version 4.0 of the Message Passing Interface standard introduced the concept of Partitioned Communication, which adds +support for multiple contributions to a communication buffer. Although initially targeted at multithreaded MPI +applications, Partitioned Communication currently receives attraction in the context of accelerators, especially GPUs. +In this publication it is demonstrated that this communication concept can be implemented for SYCL-programmed FPGAs. +This includes a discussion of the design space and the presentation of a prototype implementation. Experimental results +show that a lightweight implementation on top of an existing MPI library is possible. The presented approach also +reveals issues in both the SYCL and the MPI standard, which needs to be addressed for improved support for the intended +communication style. 
diff --git a/content/research_papers/2022/2022-11-13-towards-cross-platform-portability-of-coupled-cluster-methods-with-perturbative-triples-using-sycl.md b/content/research_papers/2022/2022-11-13-towards-cross-platform-portability-of-coupled-cluster-methods-with-perturbative-triples-using-sycl.md new file mode 100644 index 0000000..106a70f --- /dev/null +++ b/content/research_papers/2022/2022-11-13-towards-cross-platform-portability-of-coupled-cluster-methods-with-perturbative-triples-using-sycl.md @@ -0,0 +1,34 @@ +--- +contributor: scott +date: '2022-11-13T08:08:10.490000+00:00' +title: 'Towards Cross-Platform Portability of Coupled-Cluster Methods with Perturbative Triples using SYCL' +external_url: https://ieeexplore.ieee.org/document/10024604 +authors: + - name: Abhishek Bagusetty + affiliation: Argonne National Laboratory + - name: Ajay Panyala + affiliation: Pacific Northwest National Laboratory + - name: Gordon Brown + affiliation: Codeplay Software Ltd + - name: Jack Kirk + affiliation: Codeplay Software Ltd +tags: + - performance + - nvidia + - perturbation + - triples + - coupled-cluster + - portability +--- + +Tensor contractions form the fundamental computational operation of computational chemistry, and these contractions +dictate the performance of widely used coupled-cluster (CC) methods in computational chemistry. In this work, we study a +single-source, cross-platform C++ abstraction layer programming model, SYCL, for applications related to the +computational chemistry methods such as CCSD(T) coupled-cluster formalism. An existing optimized CUDA implementation was +migrated to SYCL to make use of the novel algorithm that provides tractable GPU memory needs for solving +high-dimensional tensor contractions for accelerating CCSD(T). 
We present the cross-platform performance achieved using +SYCL implementations for the non-iterative triples contribution of the CCSD(T) formalism which is considered as the +performance bottleneck on NVIDIA A100 and AMD Instinct MI250X. Additionally, we also draw comparisons of similar +performance metrics from vendor-based native programming models such as CUDA and ROCm HIP. Our results indicate that the +performance of SYCL measured at-scale was on-par with the code written in HIP for AMD MI250X GPUs while the performance +is slightly lacking on NVIDIA A100 GPUs in comparison to CUDA. diff --git a/content/research_papers/2022/2022-11-18-evaluating-nonuniform-reduction-in-hip-and-sycl-on-gp-us.md b/content/research_papers/2022/2022-11-18-evaluating-nonuniform-reduction-in-hip-and-sycl-on-gp-us.md new file mode 100644 index 0000000..cba3ec9 --- /dev/null +++ b/content/research_papers/2022/2022-11-18-evaluating-nonuniform-reduction-in-hip-and-sycl-on-gp-us.md @@ -0,0 +1,20 @@ +--- +contributor: scott +date: '2022-11-18T08:08:10.490000+00:00' +title: 'Evaluating Nonuniform Reduction in HIP and SYCL on GPUs' +external_url: https://ieeexplore.ieee.org/document/10025472 +authors: + - name: Zheming Jin + affiliation: Oak Ridge National Laboratory + - name: Jeffrey S. Vetter + affiliation: Oak Ridge National Laboratory +tags: + - reduction + - nonuniform + - evaluating +--- + +Motivated by maturing programming models and portability for heterogeneous computing, we describe the challenges posed +by hardware architectures and programming models when migrating an optimized implementation of nonuniform reduction from +CUDA to HIP and SYCL. We explain the migration experience, evaluate the performance of the reduction on GPU-based +computing platforms, and provide feedback on improving portability for the development of the SYCL programming model. 
diff --git a/content/research_papers/2022/2022-12-01-performance-study-of-gpu-applications-using-sycl-and-cuda-on-tesla-v100-gpu.md b/content/research_papers/2022/2022-12-01-performance-study-of-gpu-applications-using-sycl-and-cuda-on-tesla-v100-gpu.md new file mode 100644 index 0000000..588f17e --- /dev/null +++ b/content/research_papers/2022/2022-12-01-performance-study-of-gpu-applications-using-sycl-and-cuda-on-tesla-v100-gpu.md @@ -0,0 +1,31 @@ +--- +contributor: scott +date: '2022-12-01T08:08:10.490000+00:00' +title: 'Performance Study of GPU applications using SYCL and CUDA on Tesla V100 GPU' +external_url: https://ieeexplore.ieee.org/document/9622813 +authors: + - name: Goutham Kalikrishna Reddy Kuncham + affiliation: NextGen R&D + - name: Rahul Vaidya + affiliation: NextGen R&D + - name: Mahesh Barve + affiliation: HPC Center Of Excellence +tags: + - performance + - runtime + - conferences + - gpu + - ram + - throughput + - hardware +--- + +SYCL standard enables single-source programs to run on heterogeneous platforms consisting of CPUs, GPUs, FPGAs across +different hardware vendors. SYCL combines modern C++ features along with OpenCL’s portability. SYCL runtime is also +capable of targeting the CUDA backend directly on NVIDIA GPUs. This approach can potentially improve the performance of +SYCL on NVIDIA devices. Although NVIDIA GPUs can be targeted via OpenCL backend, their features and capabilities are +limited, and the performance is inadequate. In this study, we compare the performance of the Nvidia V100 GPU using SYCL +and CUDA. For performance evaluation, we selected three GPU applications: BabelStream, Mixbench, and Tiled +Matrix-Multiplication. We conducted extensive tests to understand the performance in terms of DRAM bandwidth, kernel +execution time, compilation time, and throughput. As per our study, the performance of SYCL and CUDA were found to be +similar. However, in some cases, CUDA outperformed SYCL. 
\ No newline at end of file diff --git a/content/research_papers/2022/2022-12-06-understanding-performance-portability-of-bioinformatics-applications-in-sycl-on-an-nvidia-gpu.md b/content/research_papers/2022/2022-12-06-understanding-performance-portability-of-bioinformatics-applications-in-sycl-on-an-nvidia-gpu.md new file mode 100644 index 0000000..a4ce246 --- /dev/null +++ b/content/research_papers/2022/2022-12-06-understanding-performance-portability-of-bioinformatics-applications-in-sycl-on-an-nvidia-gpu.md @@ -0,0 +1,21 @@ +--- +contributor: scott +date: '2022-12-06T08:08:10.490000+00:00' +title: 'Understanding Performance Portability of Bioinformatics Applications in SYCL on an NVIDIA GPU' +external_url: https://ieeexplore.ieee.org/document/9995222 +authors: + - name: Zheming Jin + affiliation: Oak Ridge National Laboratory + - name: Jeffrey S. Vetter + affiliation: Oak Ridge National Laboratory +tags: + - performance + - nvidia + - bioinformatics + - portability +--- + +Our goal is to have a better understanding of performance portability of SYCL kernels on a GPU. Toward this goal, we +migrate representative kernels in bioinformatics applications from CUDA to SYCL, evaluate their performance on an NVIDIA +GPU, and explain the performance gaps through performance profiling and analyses. We hope that the findings provide +valuable feedback to the development of the SYCL ecosystem. 
diff --git a/content/research_papers/2022/2022-12-23-a-memory-bank-conflict-prevention-mechanism-for-sycl-on-sx-aurora-tsubasa.md b/content/research_papers/2022/2022-12-23-a-memory-bank-conflict-prevention-mechanism-for-sycl-on-sx-aurora-tsubasa.md new file mode 100644 index 0000000..0f7d8b5 --- /dev/null +++ b/content/research_papers/2022/2022-12-23-a-memory-bank-conflict-prevention-mechanism-for-sycl-on-sx-aurora-tsubasa.md @@ -0,0 +1,34 @@ +--- +contributor: scott +date: '2022-12-23T08:08:10.490000+00:00' +title: 'A memory bank conflict prevention mechanism for SYCL on SX-Aurora TSUBASA' +external_url: https://ieeexplore.ieee.org/document/9644088 +authors: + - name: Wenbin Wang + affiliation: Tohoku University + - name: Jiahao Li + affiliation: Tohoku University + - name: Yohichi Shimomura + affiliation: Tohoku University + - name: Hiroyuki Takizawa + affiliation: Tohoku University +tags: + - performance + - runtime + - bandwidth + - programming + - supercomputers + - libraries + - kernel +--- + +A modern vector supercomputer, NEC SX-Aurora TSUBASA, consists of Vector Hosts (VHs) and Vector Engines (VEs). A VH is a +standard CPU to perform general tasks and hosting the VEs, while a VE is a special device designed to operate on long +vectors, and provides world’s top-class theoretical memory bandwidth of 1.53 TB/s. However, in some cases, the sustained +memory bandwidth achieved in practical use is far from the theoretical one. This is because frequent memory bank +conflicts cause performance degradation. The purpose of this work is to achieve high sustained memory bandwidth by +introducing a bank conflict prevention mechanism to a SYCL implementation, named neoSYCL. The evaluation results using +several kernels clearly show that this mechanism can be used without changing the standard interface defined in the SYCL +specification. 
It is also demonstrated that the proposed approach can successfully prevent memory bank conflicts, and +thus achieve higher sustained memory bandwidths than the original one, meaning to expect higher sustained performance on +memory-intensive scientific applications. diff --git a/content/research_papers/2023/2023-05-15-remote-execution-of-open-cl-and-sycl-applications-via-r-open-cl.md b/content/research_papers/2023/2023-05-15-remote-execution-of-open-cl-and-sycl-applications-via-r-open-cl.md new file mode 100644 index 0000000..dd26860 --- /dev/null +++ b/content/research_papers/2023/2023-05-15-remote-execution-of-open-cl-and-sycl-applications-via-r-open-cl.md @@ -0,0 +1,24 @@ +--- +contributor: scott +date: '2023-05-15T08:08:10.490000+00:00' +title: 'Remote Execution of OpenCL and SYCL Applications via rOpenCL' +external_url: https://ieeexplore.ieee.org/document/10196646 +authors: + - name: Rui Alves + affiliation: Instituto Politécnico de Bragança Campus de Santa Apolónia + - name: José Rufino + affiliation: Instituto Politécnico de Bragança +tags: + - hpc + - api + - opencl + - remote +--- + +Here, we present the migration of a CUDA based seismic application, named SeisAcoMod2D, to SYCL codebase using Intel® +oneAPI. SYCL programming enables developers to have single source codebase across different computing architectures and +vendors of CPUs, GPUs, and FPGAs. SeisAcoMod2D performs acoustic wave propagation using finite difference time domain +modelling, which is useful in oil exploration applications. The migrated SYCL code has been optimized for GPUs and the +output data is validated. The migrated unified SYCL code is executed on GPUs from Intel and Nvidia and on CPUs from +Intel. The performance of the SYCL code is found similar to that of the CUDA code on Nvidia® A100 GPU. A speed up of +1.75x is obtained on Intel® Data Center GPU Max 1550 GPU (Ponte Vecchio) over Nvidia® A100 (80GB) GPU. 
diff --git a/content/research_papers/2023/2023-05-15-understanding-performance-portability-of-sycl-kernels-a-case-study-with-the-all-pairs-distance-calculation-in-bioinformatics-on-gp-us.md b/content/research_papers/2023/2023-05-15-understanding-performance-portability-of-sycl-kernels-a-case-study-with-the-all-pairs-distance-calculation-in-bioinformatics-on-gp-us.md new file mode 100644 index 0000000..147a274 --- /dev/null +++ b/content/research_papers/2023/2023-05-15-understanding-performance-portability-of-sycl-kernels-a-case-study-with-the-all-pairs-distance-calculation-in-bioinformatics-on-gp-us.md @@ -0,0 +1,21 @@ +--- +contributor: scott +date: '2023-05-15T08:08:10.490000+00:00' +title: 'Understanding Performance Portability of SYCL Kernels: A Case Study with the All-Pairs Distance Calculation in Bioinformatics on GPUs' +external_url: https://ieeexplore.ieee.org/document/10196541 +authors: + - name: Zheming Jin + affiliation: Oak Ridge National Laboratory + - name: Jeffrey S. Vetter + affiliation: Oak Ridge National Laboratory +tags: + - portability + - performance + - bioinformatics +--- + +SYCL is a portable programming model. Toward the goal of a better understanding of performance portability of SYCL +kernels on GPUs, we select a bioinformatics kernel for computing the all-pairs distance as a case study. After migrating +the kernel from CUDA to HIP and SYCL, we evaluate the performance of the CUDA, HIP, and SYCL kernels on NVIDIA V100 and +AMD MI210 GPUs. We analyze the GPU instructions from the kernels to explain performance gaps between SYCL and CUDA/HIP. +We hope that the findings are valuable for improving performance portability of SYCL. 
diff --git a/content/research_papers/2023/2023-05-15-understanding-sycl-portability-for-pseudorandom-number-generation-a-case-study-with-gene-expression-connectivity-mapping.md b/content/research_papers/2023/2023-05-15-understanding-sycl-portability-for-pseudorandom-number-generation-a-case-study-with-gene-expression-connectivity-mapping.md new file mode 100644 index 0000000..ae05e61 --- /dev/null +++ b/content/research_papers/2023/2023-05-15-understanding-sycl-portability-for-pseudorandom-number-generation-a-case-study-with-gene-expression-connectivity-mapping.md @@ -0,0 +1,24 @@ +--- +contributor: scott +date: '2023-05-15T08:08:10.490000+00:00' +title: 'Understanding SYCL Portability for Pseudorandom Number Generation: a Case Study with Gene-Expression Connectivity Mapping' +external_url: https://ieeexplore.ieee.org/document/10196601 +authors: + - name: Zheming Jin + affiliation: Oak Ridge National Laboratory + - name: Jeffrey S. Vetter + affiliation: Oak Ridge National Laboratory +tags: + - portability + - pseudorandom + - random-number + - bioinformatics +--- + +Towards the goal of improving functional and performance portability of SYCL, we study a bioinformatics application that +has been accelerated with CUDA and fast pseudorandom number generation on a GPU. We describe the experience of migrating +pseudorandom number generation from CUDA to SYCL, evaluate the performance of pseudorandom number generators using the +CUDA random number generation library, suggest the support of the XORWOW pseudorandom number generator in the oneAPI +math kernel library (oneMKL) interface for performance portability, and identify the performance gap using the MKL +interface in SYCL that supports pseudorandom number generation with third-party libraries. We hope that the results are +valuable for the development of the SYCL ecosystem. 
diff --git a/content/research_papers/2023/2023-09-05-experience-migrating-open-cl-to-sycl-a-case-study-on-searches-for-potential-off-target-sites-of-cas-9-rna-guided-endonucleases-on-amd-gp-us.md b/content/research_papers/2023/2023-09-05-experience-migrating-open-cl-to-sycl-a-case-study-on-searches-for-potential-off-target-sites-of-cas-9-rna-guided-endonucleases-on-amd-gp-us.md new file mode 100644 index 0000000..1f420e6 --- /dev/null +++ b/content/research_papers/2023/2023-09-05-experience-migrating-open-cl-to-sycl-a-case-study-on-searches-for-potential-off-target-sites-of-cas-9-rna-guided-endonucleases-on-amd-gp-us.md @@ -0,0 +1,24 @@ +--- +contributor: scott +date: '2023-09-05T08:08:10.490000+00:00' +title: 'Experience Migrating OpenCL to SYCL: A Case Study on Searches for Potential Off-Target Sites of Cas9 RNA-Guided Endonucleases on AMD GPUs' +external_url: https://ieeexplore.ieee.org/document/10256881 +authors: + - name: Zheming Jin + affiliation: Oak Ridge National Laboratory + - name: Jeffrey S. Vetter + affiliation: Oak Ridge National Laboratory +tags: + - sequence + - analysis + - opencl + - migrating + - cas9 + - rna +--- + +Cas-OFFinder is a popular application for genome editing. Its OpenCL implementation searches potential off-target sites +in parallel on a GPU. In this work, we describe our experience migrating the application from OpenCL to SYCL. Evaluating +the performance of the OpenCL and SYCL applications using human genome sequences shows that the SYCL program could +achieve performance portability on the target GPUs. Exploring the optimizations of the hotspot kernel in SYCL may +further improve the performance of the application by 9% to 23%. 
diff --git a/content/research_papers/2023/2023-10-17-comparing-performance-and-portability-between-cuda-and-sycl-for-protein-database-search-on-nvidia-amd-and-intel-gp-us.md b/content/research_papers/2023/2023-10-17-comparing-performance-and-portability-between-cuda-and-sycl-for-protein-database-search-on-nvidia-amd-and-intel-gp-us.md new file mode 100644 index 0000000..8178a20 --- /dev/null +++ b/content/research_papers/2023/2023-10-17-comparing-performance-and-portability-between-cuda-and-sycl-for-protein-database-search-on-nvidia-amd-and-intel-gp-us.md @@ -0,0 +1,36 @@ +--- +contributor: scott +date: '2023-10-17T08:08:10.490000+00:00' +title: 'Comparing Performance and Portability Between CUDA and SYCL for Protein Database Search on NVIDIA, AMD, and Intel GPUs' +external_url: https://ieeexplore.ieee.org/document/10306194 +authors: + - name: Manuel Costanzo + affiliation: UNLP - CIC, La Plata + - name: Enzo Rucci + affiliation: UNLP - CIC, La Plata + - name: Carlos García-Sánchez + affiliation: Universidad Complutense de Madrid + - name: Marcelo Naiouf + affiliation: UNLP - CIC, La Plata + - name: Manuel Prieto-Matías + affiliation: Universidad Complutense de Madrid +tags: + - cuda + - gpu + - portability + - performance + - comparison + - nvidia + - amd + - intel +--- + +The heterogeneous computing paradigm has led to the need for portable and efficient programming solutions that can +leverage the capabilities of various hardware devices, such as NVIDIA, Intel, and AMD GPUs. This study evaluates the +portability and performance of the SYCL and CUDA languages for one fundamental bioinformatics application +(Smith-Waterman protein database search) across different GPU architectures, considering single and multi-GPU +configurations from different vendors. 
The experimental work showed that, while both CUDA and SYCL versions achieve +similar performance on NVIDIA devices, the latter demonstrated remarkable code portability to other GPU architectures, +such as AMD and Intel. Furthermore, the architectural efficiency rates achieved on these devices were superior in 3 of +the 4 cases tested. This brief study highlights the potential of SYCL as a viable solution for achieving both +performance and portability in the heterogeneous computing ecosystem. diff --git a/content/research_papers/2023/2023-10-31-accelerating-hyperdimensional-classifier-with-sycl.md b/content/research_papers/2023/2023-10-31-accelerating-hyperdimensional-classifier-with-sycl.md new file mode 100644 index 0000000..e6845ea --- /dev/null +++ b/content/research_papers/2023/2023-10-31-accelerating-hyperdimensional-classifier-with-sycl.md @@ -0,0 +1,33 @@ +--- +contributor: scott +date: '2023-10-31T08:08:10.490000+00:00' +title: 'Accelerating Hyperdimensional Classifier with SYCL' +external_url: https://ieeexplore.ieee.org/document/10321902 +authors: + - name: Zheming Jin + affiliation: Oak Ridge National Laboratory + - name: Jeffrey S. Vetter + affiliation: Oak Ridge National Laboratory +tags: + - parallel + - search + - dimension + - accelerating + - performance + - mathematical +--- + +Hyperdimensional (HD) computing is based on mathematical properties of high-dimensional spaces which show remarkable +agreement with brain-controlled behaviors. Rahimi et al. describe an HD-based classifier for the task of +recognizing the languages of text samples. It consists of an encoding module that generates a hypervector for each +text sample and a search module that compares the generated vector with a set of trained hypervectors. One of the +challenges of the HD computing research is that hardware simulation of the classifier is extremely time-consuming with +many text samples. 
To address the challenge, the classifier may be modelled as a compute routine in Open Computing +Language (OpenCL) and executed on graphics processing units (GPUs) for acceleration. While OpenCL allows for +writing parallel and portable programs targeting vendors’ computing platforms, writing an OpenCL program tends to be +error-prone and time-consuming. Built on the underlying concepts, portability, and efficiency of OpenCL, SYCL defines a +single-source abstract layer in C++. In this work, we adopt the SYCL abstraction for productivity and performance. +Compared to the OpenCL application, the SYCL application approximately reduces the lines of code by 24% and increases +the performance by 2.13X on four GPUs. In addition, the speedups of executing the application in parallel over the +fastest serial execution on the four heterogeneous computing systems are approximately 2.11X, 1.23X, 1.56X, and 1.03X, +respectively. diff --git a/content/research_papers/2023/2023-12-18-migration-of-cuda-based-seismic-application-to-cross-platform-sycl-implementation.md b/content/research_papers/2023/2023-12-18-migration-of-cuda-based-seismic-application-to-cross-platform-sycl-implementation.md new file mode 100644 index 0000000..9f81793 --- /dev/null +++ b/content/research_papers/2023/2023-12-18-migration-of-cuda-based-seismic-application-to-cross-platform-sycl-implementation.md @@ -0,0 +1,35 @@ +--- +contributor: scott +date: '2023-12-18T08:08:10.490000+00:00' +title: 'Migration of CUDA Based Seismic Application to Cross-Platform SYCL Implementation' +external_url: https://ieeexplore.ieee.org/document/10502402 +authors: + - name: Om Jadhav + affiliation: HPC-Technologies Group + - name: Sandeep Agrawal + affiliation: HPC-Technologies Group + - name: Abhishek Srivastava + affiliation: HPC-SE&A Group + - name: Richa Rastogi + affiliation: HPC-SE&A Group + - name: Sanjay Wandhekar + affiliation: HPC-Technologies Group + - name: Vinutha SV + affiliation: Intel Technology India Pvt. 
Ltd + - name: Jyotsna Khemka + affiliation: Intel Technology India Pvt. Ltd +tags: + - acoustic-waves + - seismic-data + - manually-optimized + - cuda + - migration +--- + +Here, we present the migration of a CUDA based seismic application, named SeisAcoMod2D, to SYCL codebase using Intel® +oneAPI. SYCL programming enables developers to have single source codebase across different computing architectures and +vendors of CPUs, GPUs, and FPGAs. SeisAcoMod2D performs acoustic wave propagation using finite difference time domain +modelling, which is useful in oil exploration applications. The migrated SYCL code has been optimized for GPUs and the +output data is validated. The migrated unified SYCL code is executed on GPUs from Intel and Nvidia and on CPUs from +Intel. The performance of the SYCL code is found similar to that of the CUDA code on Nvidia® A100 GPU. A speed up of +1.75x is obtained on Intel® Data Center GPU Max 1550 GPU (Ponte Vecchio) over Nvidia® A100 (80GB) GPU. diff --git a/content/research_papers/2023/2023-12-21-evaluating-performance-portability-of-sycl-and-kokkos-a-case-study-on-lbm-simulations.md b/content/research_papers/2023/2023-12-21-evaluating-performance-portability-of-sycl-and-kokkos-a-case-study-on-lbm-simulations.md new file mode 100644 index 0000000..3f97b03 --- /dev/null +++ b/content/research_papers/2023/2023-12-21-evaluating-performance-portability-of-sycl-and-kokkos-a-case-study-on-lbm-simulations.md @@ -0,0 +1,41 @@ +--- +contributor: scott +date: '2023-12-21T08:08:10.490000+00:00' +title: 'Evaluating Performance Portability of SYCL and Kokkos: A Case Study on LBM Simulations' +external_url: https://ieeexplore.ieee.org/document/10491773 +authors: + - name: Yue Ding + affiliation: National University of Defense Technology + - name: Chuanfu Xu + affiliation: National University of Defense Technology + - name: Haozhong Qiu + affiliation: National University of Defense Technology + - name: Qingsong Wang + affiliation: National 
University of Defense Technology + - name: Weixi Dai + affiliation: National University of Defense Technology + - name: Yongzhen Lin + affiliation: National University of Defense Technology + - name: Yonggang Che + affiliation: National University of Defense Technology +tags: + - kokkos + - performance + - portability + - cross-platform + - lbm + - simulations +--- + +Since modern high performance computing systems are evolving towards diverse and heterogeneous architectures, the +emergence of high-level portable programming models leads to a particular focus on performance portability. In this +paper, we evaluate the performance portability and explore performance optimization methods for two portable programming +models SYCL and Kokkos. We take an open-source multi-phase Lattice Boltzmann Method (LBM) flow simulation code as a case +study and implement portable versions with different optimizations. Then we compare our portable implementations with +engineer-tuned OpenMP and CUDA versions on Intel CPUs and NVIDIA GPUs. Experimental results show that both SYCL and +Kokkos can deliver superior performance than traditional programming models, but the best performance of the portable +versions depends heavily on platform-specific optimizations. There is no single implementation that can achieve the best +performance on both CPUs and GPUs. Consequently, we conclude that the performance portability still needs to be further +improved for both SYCL and Kokkos. In addition, we present a comparative analysis of different optimization methods that +qualify the performance enhancement when using SYCL and Kokkos on CPUs and GPUs. Our work offers valuable references for +the development of both portable programming models and applications. 
diff --git a/content/research_papers/2024/2024-03-02-experiences-building-an-mlir-based-sycl-compiler.md b/content/research_papers/2024/2024-03-02-experiences-building-an-mlir-based-sycl-compiler.md new file mode 100644 index 0000000..895e7b3 --- /dev/null +++ b/content/research_papers/2024/2024-03-02-experiences-building-an-mlir-based-sycl-compiler.md @@ -0,0 +1,41 @@ +--- +contributor: scott +date: '2024-03-02T08:08:10.490000+00:00' +title: 'Experiences Building an MLIR-Based SYCL Compiler' +external_url: https://ieeexplore.ieee.org/document/10444866 +authors: + - name: Ettore Tiotto + affiliation: Intel Corporation + - name: Víctor Pérez + affiliation: Codeplay Software + - name: Whitney Tsang + affiliation: Intel Corporation + - name: Lukas Sommer + affiliation: Codeplay Software + - name: Julian Oppermann + affiliation: Codeplay Software + - name: Victor Lomüller + affiliation: Codeplay Software + - name: Mehdi Goli + affiliation: Codeplay Software + - name: James Brodman + affiliation: Intel Corporation +tags: + - SYCL + - MLIR + - compiler + - optimization + - heterogeneous-programming +--- + +Similar to other programming models, compilers for SYCL, the open programming model for heterogeneous computing based on +C++, would benefit from access to higher-level intermediate representations. The loss of high-level structure and +semantics caused by premature lowering to low-level intermediate representations and the inability to reason about host +and device code simultaneously present major challenges for SYCL compilers. The MLIR compiler framework, through its +dialect mechanism, allows to model domain-specific, high-level intermediate representations and provides the necessary +facilities to address these challenges. This work therefore describes practical experience with the design and +implementation of an MLIR-based SYCL compiler. 
By modeling key elements of the SYCL programming model in host and device +code in the MLIR dialect framework, the presented approach enables the implementation of powerful device code +optimizations as well as analyses across host and device code. Compared to two LLVM-based SYCL implementations, this +yields speedups of up to 4.3x on a collection of SYCL benchmark applications. Finally, this work also discusses +challenges encountered in the design and implementation and how these could be addressed in the future. diff --git a/content/research_papers/2024/2024-05-20-unveiling-performance-insights-and-portability-achievements-between-cuda-and-sycl-for-particle-in-cell-codes-on-different-gpu-architectures.md b/content/research_papers/2024/2024-05-20-unveiling-performance-insights-and-portability-achievements-between-cuda-and-sycl-for-particle-in-cell-codes-on-different-gpu-architectures.md new file mode 100644 index 0000000..bbb3fec --- /dev/null +++ b/content/research_papers/2024/2024-05-20-unveiling-performance-insights-and-portability-achievements-between-cuda-and-sycl-for-particle-in-cell-codes-on-different-gpu-architectures.md @@ -0,0 +1,30 @@ +--- +contributor: scott +date: '2024-05-20T08:08:10.490000+00:00' +title: 'Unveiling Performance Insights and Portability Achievements Between CUDA and SYCL for Particle-in-Cell Codes on Different GPU Architectures' +external_url: https://ieeexplore.ieee.org/document/10569866 +authors: + - name: Ivona Vasileska + affiliation: University of Ljubljana + - name: Pavel Tomšič + affiliation: University of Ljubljana + - name: Leon Kos + affiliation: University of Ljubljana + - name: Leon Bogdanović + affiliation: University of Ljubljana +tags: + - gpu + - cuda + - hpc + - pic + - plasma +--- + +The HPC systems worldwide are getting more powerful with the combination of CPU, GPU, and other accelerators (e.g., +FPGAs and Quantum Processors). 
Many programming frameworks mainly offer excellent portability support to the existing +scientific codes to use the exascale HPC systems. This study evaluates the performance and portability of CUDA and SYCL +for one of the most used plasma kinetic simulation codes, Particle-In-Cell (PIC). The PIC codes are numerical modelling +tools used for handling the extreme nonlinear methods in fusion devices. The experimental work showed that accelerating +the PIC code with CUDA and SYCL achieves similar performance on NVIDIA devices, while the latter demonstrated remarkable code +portability to other GPU architectures. This brief study highlights the potential of SYCL as a viable solution for +achieving both performance and portability in the heterogeneous computing ecosystem.