From 225abef4a0f12969ad874dc93d38eaac6a2df3aa Mon Sep 17 00:00:00 2001
From: Scott Straughan <scott.straughan@codeplay.com>
Date: Thu, 11 Jul 2024 11:42:56 +0100
Subject: [PATCH] Add numerous research papers.

---
 ...ative-study-of-sycl-open-cl-and-open-mp.md | 31 +++++++++++++
 ...ons-with-modern-c-and-sycl-a-case-study.md | 39 ++++++++++++++++
 ...medical-imaging-applications-using-sycl.md | 30 +++++++++++++
 ...operator-mini-app-using-kokkos-and-sycl.md | 33 ++++++++++++++
 ...-study-of-k-means-clustering-using-sycl.md | 42 ++++++++++++++++++
 ...-routine-in-sycl-on-integrated-graphics.md | 29 ++++++++++++
 ...hine-learning-systems-a-sycl-case-study.md | 25 +++++++++++
 ...ty-of-contemporary-sycl-implementations.md | 31 +++++++++++++
 ...-structured-mesh-computations-with-sycl.md | 25 +++++++++++
 ...extending-sycl-hierarchical-parallelism.md | 28 ++++++++++++
 ...benchmark-on-nvidia-amd-and-intel-gp-us.md | 44 +++++++++++++++++++
 ...dbms-operator-implementation-using-sycl.md | 24 ++++++++++
 ...rtability-of-the-sycl-programming-model.md | 21 +++++++++
 ...rization-algorithm-with-openmp-and-sycl.md | 29 ++++++++++++
 ...communication-on-sycl-programmed-fpg-as.md | 27 ++++++++++++
 ...ds-with-perturbative-triples-using-sycl.md | 34 ++++++++++++++
 ...form-reduction-in-hip-and-sycl-on-gp-us.md | 20 +++++++++
 ...s-using-sycl-and-cuda-on-tesla-v100-gpu.md | 31 +++++++++++++
 ...s-applications-in-sycl-on-an-nvidia-gpu.md | 21 +++++++++
 ...mechanism-for-sycl-on-sx-aurora-tsubasa.md | 34 ++++++++++++++
 ...-cl-and-sycl-applications-via-r-open-cl.md | 24 ++++++++++
 ...-calculation-in-bioinformatics-on-gp-us.md | 21 +++++++++
 ...th-gene-expression-connectivity-mapping.md | 24 ++++++++++
 ...9-rna-guided-endonucleases-on-amd-gp-us.md | 24 ++++++++++
 ...se-search-on-nvidia-amd-and-intel-gp-us.md | 36 +++++++++++++++
 ...g-hyperdimensional-classifier-with-sycl.md | 33 ++++++++++++++
 ...n-to-cross-platform-sycl-implementation.md | 35 +++++++++++++++
 ...-kokkos-a-case-study-on-lbm-simulations.md | 41 +++++++++++++++++
 ...es-building-an-mlir-based-sycl-compiler.md | 41 +++++++++++++++++
 ...ll-codes-on-different-gpu-architectures.md | 30 +++++++++++++
 30 files changed, 907 insertions(+)
 create mode 100644 content/research_papers/2016/2016-10-26-a-comparative-study-of-sycl-open-cl-and-open-mp.md
 create mode 100644 content/research_papers/2018/2018-07-10-solving-maxwells-equations-with-modern-c-and-sycl-a-case-study.md
 create mode 100644 content/research_papers/2019/2019-11-18-evaluation-of-medical-imaging-applications-using-sycl.md
 create mode 100644 content/research_papers/2019/2019-11-22-performance-portability-of-a-wilson-dslash-stencil-operator-mini-app-using-kokkos-and-sycl.md
 create mode 100644 content/research_papers/2019/2019-12-09-a-case-study-of-k-means-clustering-using-sycl.md
 create mode 100644 content/research_papers/2020/2020-05-18-a-case-study-on-the-hac-cmk-routine-in-sycl-on-integrated-graphics.md
 create mode 100644 content/research_papers/2020/2020-05-18-towards-automated-kernel-selection-in-machine-learning-systems-a-sycl-case-study.md
 create mode 100644 content/research_papers/2020/2020-11-13-evaluating-the-performance-and-portability-of-contemporary-sycl-implementations.md
 create mode 100644 content/research_papers/2021/2021-09-07-automatic-parallelization-of-structured-mesh-computations-with-sycl.md
 create mode 100644 content/research_papers/2021/2021-11-14-benchmarking-and-extending-sycl-hierarchical-parallelism.md
 create mode 100644 content/research_papers/2021/2021-11-14-case-study-of-using-kokkos-and-sycl-as-performance-portable-frameworks-for-milc-dslash-benchmark-on-nvidia-amd-and-intel-gp-us.md
 create mode 100644 content/research_papers/2021/2021-11-24-efficient-hardware-agnostic-dbms-operator-implementation-using-sycl.md
 create mode 100644 content/research_papers/2022/2022-07-10-a-benchmark-suite-for-improving-performance-portability-of-the-sycl-programming-model.md
 create mode 100644 content/research_papers/2022/2022-10-28-portability and-performance-assessment-of-the-non-negative-matrix-factorization-algorithm-with-openmp-and-sycl.md
 create mode 100644 content/research_papers/2022/2022-11-13-a-first-step-towards-support-for-mpi-partitioned-communication-on-sycl-programmed-fpg-as.md
 create mode 100644 content/research_papers/2022/2022-11-13-towards-cross-platform-portability-of-coupled-cluster-methods-with-perturbative-triples-using-sycl.md
 create mode 100644 content/research_papers/2022/2022-11-18-evaluating-nonuniform-reduction-in-hip-and-sycl-on-gp-us.md
 create mode 100644 content/research_papers/2022/2022-12-01-performance-study-of-gpu-applications-using-sycl-and-cuda-on-tesla-v100-gpu.md
 create mode 100644 content/research_papers/2022/2022-12-06-understanding-performance-portability-of-bioinformatics-applications-in-sycl-on-an-nvidia-gpu.md
 create mode 100644 content/research_papers/2022/2022-12-23-a-memory-bank-conflict-prevention-mechanism-for-sycl-on-sx-aurora-tsubasa.md
 create mode 100644 content/research_papers/2023/2023-05-15-remote-execution-of-open-cl-and-sycl-applications-via-r-open-cl.md
 create mode 100644 content/research_papers/2023/2023-05-15-understanding-performance-portability-of-sycl-kernels-a-case-study-with-the-all-pairs-distance-calculation-in-bioinformatics-on-gp-us.md
 create mode 100644 content/research_papers/2023/2023-05-15-understanding-sycl-portability-for-pseudorandom-number-generation-a-case-study-with-gene-expression-connectivity-mapping.md
 create mode 100644 content/research_papers/2023/2023-09-05-experience-migrating-open-cl-to-sycl-a-case-study-on-searches-for-potential-off-target-sites-of-cas-9-rna-guided-endonucleases-on-amd-gp-us.md
 create mode 100644 content/research_papers/2023/2023-10-17-comparing-performance-and-portability-between-cuda-and-sycl-for-protein-database-search-on-nvidia-amd-and-intel-gp-us.md
 create mode 100644 content/research_papers/2023/2023-10-31-accelerating-hyperdimensional-classifier-with-sycl.md
 create mode 100644 content/research_papers/2023/2023-12-18-migration-of-cuda-based-seismic-application-to-cross-platform-sycl-implementation.md
 create mode 100644 content/research_papers/2023/2023-12-21-evaluating-performance-portability-of-sycl-and-kokkos-a-case-study-on-lbm-simulations.md
 create mode 100644 content/research_papers/2024/2024-03-02-experiences-building-an-mlir-based-sycl-compiler.md
 create mode 100644 content/research_papers/2024/2024-05-20-unveiling-performance-insights-and-portability-achievements-between-cuda-and-sycl-for-particle-in-cell-codes-on-different-gpu-architectures.md

diff --git a/content/research_papers/2016/2016-10-26-a-comparative-study-of-sycl-open-cl-and-open-mp.md b/content/research_papers/2016/2016-10-26-a-comparative-study-of-sycl-open-cl-and-open-mp.md
new file mode 100644
index 0000000..8d92f9d
--- /dev/null
+++ b/content/research_papers/2016/2016-10-26-a-comparative-study-of-sycl-open-cl-and-open-mp.md
@@ -0,0 +1,31 @@
+---
+contributor: max
+date: '2016-10-26T10:57:29+01:00'
+title: 'A Comparative Study of SYCL, OpenCL, and OpenMP'
+external_url: https://ieeexplore.ieee.org/document/7803697
+authors:
+  - name: Hércules Cardoso da Silva
+    affiliation: Institute of Computing
+  - name: Flávia Pisani
+    affiliation: Institute of Computing
+  - name: Edson Borin
+    affiliation: Institute of Computing
+tags:
+  - opencl
+  - openmp
+  - parallel
+  - performance
+  - evaluation
+---
+
+Recent trends indicate that future computing systems will be composed by a group of heterogeneous computing devices,
+including CPUs, GPUs, and other hardware accelerators. These devices provide increased processing performance, however,
+creating efficient code for them may require that programmers manage memory assignments and use specialized APIs,
+compilers, or runtime systems, thus making their programs dependent on specific tools. In this scenario, SYCL is an
+emerging C++ programming model for OpenCL that allows developers to write code for heterogeneous computing devices that
+are compatible with standard C++ compilation frameworks. In this paper, we analyze the performance and programming
+characteristics of SYCL, OpenMP, and OpenCL using both a benchmark and a real-world application. Our performance results
+indicate that programs that rely on available SYCL runtimes are not on par with the ones based on OpenMP and OpenCL yet.
+Nonetheless, the gap is getting smaller if we consider the results reported by previous studies. In terms of
+programmability, SYCL presents itself as a competitive alternative to OpenCL, requiring fewer lines of code to implement
+kernels and also fewer calls to essential API functions and methods.
\ No newline at end of file
diff --git a/content/research_papers/2018/2018-07-10-solving-maxwells-equations-with-modern-c-and-sycl-a-case-study.md b/content/research_papers/2018/2018-07-10-solving-maxwells-equations-with-modern-c-and-sycl-a-case-study.md
new file mode 100644
index 0000000..53c4652
--- /dev/null
+++ b/content/research_papers/2018/2018-07-10-solving-maxwells-equations-with-modern-c-and-sycl-a-case-study.md
@@ -0,0 +1,39 @@
+---
+contributor: scott
+date: '2018-07-10T08:08:10.490000+00:00'
+title: 'Solving Maxwell''s Equations with Modern C++ and SYCL: A Case Study'
+external_url: https://ieeexplore.ieee.org/document/8445127
+authors:
+  - name: Ayesha Afzal
+    affiliation: Friedrich-Alexander University Erlangen-Nürnberg
+  - name: Christian Schmitt
+    affiliation: Friedrich-Alexander University Erlangen-Nürnberg
+  - name: Samer Alhaddad
+    affiliation: Paderborn University
+  - name: Yevgen Grynko
+    affiliation: Paderborn University
+  - name: Jürgen Teich
+    affiliation: Friedrich-Alexander University Erlangen-Nürnberg
+  - name: Jens Förstner
+    affiliation: Paderborn University
+  - name: Frank Hannig
+    affiliation: Friedrich-Alexander University Erlangen-Nürnberg
+tags:
+  - maxwell
+  - c++
+  - case-study
+---
+
+In scientific computing, unstructured meshes are a crucial foundation for the simulation of real-world physical
+phenomena. Compared to regular grids, they allow resembling the computational domain with a much higher accuracy, which
+in turn leads to more efficient computations. There exists a wealth of supporting libraries and frameworks that aid
+programmers with the implementation of applications working on such grids, each built on top of existing parallelization
+technologies. However, many approaches require the programmer to introduce a different programming paradigm into their
+application or provide different variants of the code. SYCL is a new programming standard providing a remedy to this
+dilemma by building on standard C++ 17 with its so-called single-source approach: Programmers write standard C++ code
+and expose parallelism using C++ 17 keywords. The application is then transformed into a concrete implementation by the
+SYCL implementation. By encapsulating the OpenCL ecosystem, different SYCL implementations enable not only the
+programming of CPUs but also of heterogeneous platforms such as GPUs or other devices. For the first time, this paper
+showcases a SYCL-based solver for the nodal Discontinuous Galerkin method for Maxwell's equations on unstructured
+meshes. We compare our solution to a previous C-based implementation with respect to programmability and performance on
+heterogeneous platforms.
diff --git a/content/research_papers/2019/2019-11-18-evaluation-of-medical-imaging-applications-using-sycl.md b/content/research_papers/2019/2019-11-18-evaluation-of-medical-imaging-applications-using-sycl.md
new file mode 100644
index 0000000..c0c47db
--- /dev/null
+++ b/content/research_papers/2019/2019-11-18-evaluation-of-medical-imaging-applications-using-sycl.md
@@ -0,0 +1,30 @@
+---
+contributor: scott
+date: '2019-11-18T10:57:29+01:00'
+title: Evaluation of Medical Imaging Applications using SYCL
+external_url: https://ieeexplore.ieee.org/document/8982983
+authors:
+  - name: Zheming Jin
+    affiliation: Argonne National Laboratory
+  - name: Hal Finkel
+    affiliation: Argonne National Laboratory
+tags:
+  - benchmark
+  - performance
+  - medical
+  - rodinia
+  - imaging
+---
+
+As opposed to the Open Computing Language (OpenCL) programming model in which host and device codes are written in
+different languages, the SYCL programming model can combine host and device codes for an application in a type-safe way
+to improve development productivity. In this paper, we chose two medical imaging applications (Heart Wall and Particle
+Filter) in the Rodinia benchmark suite to study the performance and programming productivity of the SYCL programming
+model. More specifically, we introduced the SYCL programming model, shared our experience of implementing the
+applications using SYCL, and compared the performance and programming portability of the SYCL implementations with the
+OpenCL implementations on an Intel® Xeon® CPU and an Iris® Pro integrated GPU. The results are promising. For the Heart
+Wall application, the SYCL implementation is on average 15% faster than the OpenCL implementation on the GPU. For the
+Particle Filter application, the SYCL implementation is 3% slower than the OpenCL implementation on the GPU, but it is
+75% faster on the CPU. Using lines of code as an indicator of programming productivity, the SYCL host program reduces
+the lines of code of the OpenCL host program by 52% and 38% for the Heart Wall and Particle Filter applications,
+respectively.
diff --git a/content/research_papers/2019/2019-11-22-performance-portability-of-a-wilson-dslash-stencil-operator-mini-app-using-kokkos-and-sycl.md b/content/research_papers/2019/2019-11-22-performance-portability-of-a-wilson-dslash-stencil-operator-mini-app-using-kokkos-and-sycl.md
new file mode 100644
index 0000000..cc009ab
--- /dev/null
+++ b/content/research_papers/2019/2019-11-22-performance-portability-of-a-wilson-dslash-stencil-operator-mini-app-using-kokkos-and-sycl.md
@@ -0,0 +1,33 @@
+---
+contributor: scott
+date: '2019-11-22T10:57:29+01:00'
+title: 'Performance Portability of a Wilson Dslash Stencil Operator Mini-App Using Kokkos and SYCL'
+external_url: https://ieeexplore.ieee.org/document/8945798
+authors:
+  - name: Bálint Joó
+    affiliation: Jefferson Lab
+  - name: Thorsten Kurth
+    affiliation: NERSC
+  - name: M. A. Clark
+    affiliation: NVIDIA
+  - name: Jeongnim Kim
+    affiliation: Intel Corporation
+  - name: Christian Robert Trott
+    affiliation: Sandia National Laboratories
+  - name: Dan Ibanez
+    affiliation: Sandia National Laboratories
+  - name: Daniel Sunderland
+    affiliation: Sandia National Laboratories
+  - name: Jack Deslippe
+    affiliation: NERSC
+tags:
+  - kokkos
+  - performance
+  - portability
+  - lattice-qcd
+---
+
+We describe our experiences in creating mini-apps for the Wilson-Dslash stencil operator for Lattice Quantum
+Chromodynamics using the Kokkos and SYCL programming models. In particular we comment on the performance achieved on a
+variety of hardware architectures, limitations we have reached in both programming models and how these have been
+resolved by us, or may be resolved by the developers of these models.
diff --git a/content/research_papers/2019/2019-12-09-a-case-study-of-k-means-clustering-using-sycl.md b/content/research_papers/2019/2019-12-09-a-case-study-of-k-means-clustering-using-sycl.md
new file mode 100644
index 0000000..bca62fe
--- /dev/null
+++ b/content/research_papers/2019/2019-12-09-a-case-study-of-k-means-clustering-using-sycl.md
@@ -0,0 +1,42 @@
+---
+contributor: scott
+date: '2019-12-09T10:57:29+01:00'
+title: A Case Study of k-means Clustering using SYCL
+external_url: https://ieeexplore.ieee.org/document/9005555
+authors:
+  - name: Zheming Jin
+    affiliation: Argonne National Laboratory
+  - name: Hal Finkel
+    affiliation: Argonne National Laboratory
+tags:
+  - benchmark
+  - energy-consumption
+  - programming-language
+  - gpu
+  - lowest-consumption
+  - rodinia
+  - minimum-distance
+  - data-transfer
+  - api
+  - means-clustering
+  - fuzzy-clustering
+  - haswell
+  - broadwell
+  - skylake
+---
+
+As opposed to the OpenCL programming model in which host and device codes are written in two programming languages, the
+SYCL programming model combines them for an application in a type-safe way to improve development productivity. As a
+popular cluster analysis algorithm, k-means has been implemented using programming models such as OpenMP, OpenCL, and
+CUDA. Developing a SYCL implementation of k-means as a case study allows us to have a better understanding of
+performance portability and programming productivity of the SYCL programming model. Specifically, we explained the
+k-means benchmark in Rodinia, described our efforts of porting the OpenCL k-means benchmark, and evaluated the
+performance of the OpenCL and SYCL implementations on the Intel® Haswell, Broadwell, and Skylake processors. We
+summarized the migration steps from OpenCL to SYCL, compiled the SYCL program using Codeplay and Intel® SYCL compilers,
+analyzed the SYCL and OpenCL programs using an open-source profiling tool which can intercept OpenCL runtime calls, and
+compared the performance of the implementations on Intel® CPUs and integrated GPU. The experimental results show that
+the SYCL version in which the kernels run on the GPU is 2% and 8% faster than the OpenCL version for the two large
+datasets. However, the OpenCL version is still much faster than the SYCL version on the CPUs. Compared to the Intel®
+Haswell and Skylake CPUs, running the k-means benchmark on the Intel® Broadwell low-power processor with a CPU and an
+integrated GPU can achieve the lowest energy consumption. In terms of programming productivity, the lines of code of the
+SYCL program are 51% fewer than those of the OpenCL program.
\ No newline at end of file
diff --git a/content/research_papers/2020/2020-05-18-a-case-study-on-the-hac-cmk-routine-in-sycl-on-integrated-graphics.md b/content/research_papers/2020/2020-05-18-a-case-study-on-the-hac-cmk-routine-in-sycl-on-integrated-graphics.md
new file mode 100644
index 0000000..73ba67d
--- /dev/null
+++ b/content/research_papers/2020/2020-05-18-a-case-study-on-the-hac-cmk-routine-in-sycl-on-integrated-graphics.md
@@ -0,0 +1,29 @@
+---
+contributor: scott
+date: '2020-05-18T08:08:10.490000+00:00'
+title: 'A Case Study on the HACCmk Routine in SYCL on Integrated Graphics'
+external_url: https://ieeexplore.ieee.org/document/9150310
+authors:
+  - name: Zheming Jin
+    affiliation: Argonne National Laboratory
+  - name: Vitali Morozov
+    affiliation: Argonne National Laboratory
+  - name: Hal Finkel
+    affiliation: Argonne National Laboratory
+tags:
+  - compute
+  - haccmk
+  - integrated-graphics
+  - case-study
+---
+
+As opposed to the Open Computing Language (OpenCL) programming model in which host and device codes are generally
+written in different languages, the SYCL programming model can combine host and device codes for an application in a
+type-safe way to improve development productivity. In this paper, we chose the HACCmk routine, a representative
+compute-bound kernel, as a case study on the performance of the SYCL programming model targeting a heterogeneous
+computing device. More specifically, we introduced the SYCL programming model, presented the OpenCL and SYCL
+implementations of the routine, and compared the performance of the two implementations using the offline and online
+compilation on Intel® Iris™ Pro integrated GPUs. We found that the overhead of online compilation may become
+significant compared to the execution time of a kernel. Compared to the performance of OpenCL implementations, the SYCL
+implementation can maintain the performance using the offline compilation. The number of execution units in a GPU are
+critical to improving the raw performance of a compute-bound kernel.
diff --git a/content/research_papers/2020/2020-05-18-towards-automated-kernel-selection-in-machine-learning-systems-a-sycl-case-study.md b/content/research_papers/2020/2020-05-18-towards-automated-kernel-selection-in-machine-learning-systems-a-sycl-case-study.md
new file mode 100644
index 0000000..303b334
--- /dev/null
+++ b/content/research_papers/2020/2020-05-18-towards-automated-kernel-selection-in-machine-learning-systems-a-sycl-case-study.md
@@ -0,0 +1,25 @@
+---
+contributor: scott
+date: '2020-05-18T08:08:10.490000+00:00'
+title: 'Towards automated kernel selection in machine learning systems: A SYCL case study'
+external_url: https://ieeexplore.ieee.org/document/9150358
+authors:
+  - name: John Lawson
+    affiliation: Codeplay Software Ltd
+tags:
+  - tuning
+  - sycl
+  - gpgpu
+  - machine-learning
+  - ai
+---
+
+Automated tuning of compute kernels is a popular area of research, mainly focused on finding optimal kernel parameters
+for a problem with fixed input sizes. This approach is good for deploying machine learning models, where the network
+topology is constant, but machine learning research often involves changing network topologies and hyperparameters.
+Traditional kernel auto-tuning has limited impact in this case; a more general selection of kernels is required for
+libraries to accelerate machine learning research. In this paper we present initial results using machine learning to
+select kernels in a case study deploying high performance SYCL kernels in libraries that target a range of heterogeneous
+devices from desktop GPUs to embedded accelerators. The techniques investigated apply more generally and could similarly
+be integrated with other heterogeneous programming systems. By combining auto-tuning and machine learning these kernel
+selection processes can be deployed with little developer effort to achieve high performance on new hardware.
diff --git a/content/research_papers/2020/2020-11-13-evaluating-the-performance-and-portability-of-contemporary-sycl-implementations.md b/content/research_papers/2020/2020-11-13-evaluating-the-performance-and-portability-of-contemporary-sycl-implementations.md
new file mode 100644
index 0000000..8967236
--- /dev/null
+++ b/content/research_papers/2020/2020-11-13-evaluating-the-performance-and-portability-of-contemporary-sycl-implementations.md
@@ -0,0 +1,31 @@
+---
+contributor: scott
+date: '2020-11-13T08:08:10.490000+00:00'
+title: 'Evaluating the Performance and Portability of Contemporary SYCL Implementations'
+external_url: https://ieeexplore.ieee.org/document/9309045
+authors:
+  - name: Beau Johnston
+    affiliation: Oak Ridge National Laboratory
+  - name: Jeffrey S. Vetter
+    affiliation: Oak Ridge National Laboratory
+  - name: Josh Milthorpe
+    affiliation: Australian National University
+tags:
+  - benchmarks
+  - performance
+  - portability
+---
+
+SYCL is a single-source programming model for heterogeneous systems; it promises improved maintainability, productivity,
+and opportunity for compiler optimization, when compared to accelerator specific programming models. Several
+implementations of the SYCL standard have been developed over the past few years, including several backends using
+contemporary accelerator languages, like OpenCL, CUDA, and HIP. These implementations vary widely in their support for
+specific features of the standard and in their performance. As SYCL grows in popularity, developers need to know how
+features are implemented across popular implementations in order to make proper design choices. In this paper, we
+evaluate the existing SYCL implementations for important SYCL features across a range of hardware in order to understand
+SYCL's performance and portability. This work uses the newest SYCL benchmark suite (SYCL-Bench, 38 kernels) to evaluate
+these four existing implementations, comparing support of language features across backends and highlighting feature
+completeness and performance. For features, we focus on the five major SYCL parallel constructs, using a motivating
+example of the matrix multiplication benchmark. Our results show that the basic data parallelism construct is the best
+choice for performance on current SYCL implementations, and we identify opportunities for improvement in several of the
+SYCL implementations.
diff --git a/content/research_papers/2021/2021-09-07-automatic-parallelization-of-structured-mesh-computations-with-sycl.md b/content/research_papers/2021/2021-09-07-automatic-parallelization-of-structured-mesh-computations-with-sycl.md
new file mode 100644
index 0000000..f2e60ef
--- /dev/null
+++ b/content/research_papers/2021/2021-09-07-automatic-parallelization-of-structured-mesh-computations-with-sycl.md
@@ -0,0 +1,25 @@
+---
+contributor: scott
+date: '2021-09-07T08:08:10.490000+00:00'
+title: 'Automatic Parallelization of Structured Mesh Computations with SYCL'
+external_url: https://ieeexplore.ieee.org/document/9555976
+authors:
+  - name: Gábor Dániel Balogh
+    affiliation: Pázmány Péter Catholic University
+  - name: István Reguly
+    affiliation: Pázmány Péter Catholic University
+tags:
+  - parallel-programming
+  - nvidia
+  - intel
+---
+
+Structured meshes are widely used for scientific computations such as Computational Fluid Dynamics (CFD) applications or
+finance. Modern applications often have grid points in the millions. To perform such computations parallelisation is
+crucial. However it is unfeasible to port each application every time a new architecture arrives, hence in recent years
+the demand for automatic parallelisation and optimisation for the used hardware is increasing. The OPS (Oxford Parallel
+library for Structured mesh solvers) has shown good performance and scaling on a wide range of HPC architectures. This
+research aims to extend the OPS framework with a SYCL backend to extend the range of architectures that OPS can support
+and further increase Performance Portability of OPS applications. The performance of the Intel OneAPI is struggling with
+reductions due to high synchronisation cost, but shows promising performance gain on builtin reduction constructs on an
+Intel® Xeon® Gold 6226R. We compare the performance of hipSYCL on NVidia V100 GPU to the CUDA implementations.
diff --git a/content/research_papers/2021/2021-11-14-benchmarking-and-extending-sycl-hierarchical-parallelism.md b/content/research_papers/2021/2021-11-14-benchmarking-and-extending-sycl-hierarchical-parallelism.md
new file mode 100644
index 0000000..9ae98b1
--- /dev/null
+++ b/content/research_papers/2021/2021-11-14-benchmarking-and-extending-sycl-hierarchical-parallelism.md
@@ -0,0 +1,28 @@
+---
+contributor: scott
+date: '2021-11-14T08:08:10.490000+00:00'
+title: 'Benchmarking and Extending SYCL Hierarchical Parallelism'
+external_url: https://ieeexplore.ieee.org/document/9654235
+authors:
+  - name: Tom Deakin
+    affiliation: University of Bristol
+  - name: Simon McIntosh-Smith
+    affiliation: University of Bristol
+  - name: Aksel Alpay
+    affiliation: Universität Heidelberg
+  - name: Vincent Heuveline
+    affiliation: Universität Heidelberg
+tags:
+  - benchmarks
+  - extending
+  - parallelism
+---
+
+SYCL is an open-standard, parallel programming model for programming heterogeneous devices from Khronos. It allows
+single-source programming of diverse attached devices in a cross-platform manner in modern C++. SYCL provides different
+layers of parallel abstractions, including Same Instruction Multiple Thread (SIMT) kernels, data-parallel loop
+concurrency and hierarchical parallelism. We discuss Scoped Parallelism as an extension to the existing Hierarchical
+Parallelism in SYCL, and highlight the advantages and disadvantages of these models from the perspective of the
+programmer and an implementer of SYCL. In this paper, we compare writing benchmark programs using SIMT kernel,
+hierarchical parallelism and scoped parallelism paradigms, and present results running on a high-performance CPU and
+GPU.
diff --git a/content/research_papers/2021/2021-11-14-case-study-of-using-kokkos-and-sycl-as-performance-portable-frameworks-for-milc-dslash-benchmark-on-nvidia-amd-and-intel-gp-us.md b/content/research_papers/2021/2021-11-14-case-study-of-using-kokkos-and-sycl-as-performance-portable-frameworks-for-milc-dslash-benchmark-on-nvidia-amd-and-intel-gp-us.md
new file mode 100644
index 0000000..00842cc
--- /dev/null
+++ b/content/research_papers/2021/2021-11-14-case-study-of-using-kokkos-and-sycl-as-performance-portable-frameworks-for-milc-dslash-benchmark-on-nvidia-amd-and-intel-gp-us.md
@@ -0,0 +1,44 @@
+---
+contributor: scott
+date: '2021-11-14T08:08:10.490000+00:00'
+title: 'Case Study of Using Kokkos and SYCL as Performance-Portable Frameworks for Milc-Dslash Benchmark on NVIDIA, AMD and Intel GPUs'
+external_url: https://ieeexplore.ieee.org/document/9652859
+authors:
+  - name: Amanda S. Dufek
+    affiliation: NERSC/LBNL
+  - name: Rahulkumar Gayatri
+    affiliation: NERSC/LBNL
+  - name: Neil Mehta
+    affiliation: NERSC/LBNL
+  - name: Douglas Doerfler
+    affiliation: NERSC/LBNL
+  - name: Brandon Cook
+    affiliation: NERSC/LBNL
+  - name: Yasaman Ghadar
+    affiliation: Argonne National Laboratory
+  - name: Carleton DeTar
+    affiliation: University of Utah
+tags:
+  - kokkos
+  - milc-dslash
+  - performance
+  - portability
+  - nvidia
+  - intel
+  - amd
+---
+
+Six of the top ten supercomputers in the TOP500 list from June 2021 rely on NVIDIA GPUs to achieve their peak compute
+bandwidth. With the announcement of Aurora, Frontier, and El Capitan, Intel and AMD have also entered the domain of
+providing GPUs for scientific computing. A consequence of the increased diversity in the GPU landscape is the emergence
+of portable programming models such as Kokkos, SYCL, OpenCL, and OpenMP, which allow application developers to maintain
+a single-source code across a diverse range of hardware architectures. While the portable frameworks try to optimize the
+compute resource usage on a given architecture, it is the programmer's responsibility to expose parallelism in an
+application that can take advantage of thousands of processing elements available on GPUs. In this paper, we introduce a
+GPU-friendly parallel implementation of Milc-Dslash that exposes multiple hierarchies of parallelism in the algorithm.
+Milc-Dslash was designed to serve as a benchmark with highly optimized matrix-vector multiplications to measure the
+resource utilization on the GPU systems. The parallel hierarchies in the Milc-Dslash algorithm are mapped onto a target
+hardware using Kokkos and SYCL programming models. We present the performance achieved by Kokkos and SYCL
+implementations of Milc-Dslash on NVIDIA A100 GPU, AMD MI100 GPU, and Intel Gen9 GPU. Additionally, we compare the
+Kokkos and SYCL performances with those obtained from the versions written in CUDA and HIP programming models on NVIDIA
+A100 GPU and AMD MI100 GPU, respectively.
diff --git a/content/research_papers/2021/2021-11-24-efficient-hardware-agnostic-dbms-operator-implementation-using-sycl.md b/content/research_papers/2021/2021-11-24-efficient-hardware-agnostic-dbms-operator-implementation-using-sycl.md
new file mode 100644
index 0000000..45bb9ea
--- /dev/null
+++ b/content/research_papers/2021/2021-11-24-efficient-hardware-agnostic-dbms-operator-implementation-using-sycl.md
@@ -0,0 +1,24 @@
+---
+contributor: scott
+date: '2021-11-24T08:08:10.490000+00:00'
+title: 'Efficient Hardware-Agnostic DBMS Operator Implementation Using SYCL'
+external_url: https://ieeexplore.ieee.org/document/9681747
+authors:
+  - name: Daniil Kulikov
+    affiliation: SPbU
+  - name: Daria Nikolskaia
+    affiliation: ITMO
+  - name: Petr Kurapov
+    affiliation: MIPT
+tags:
+  - parallel-programming
+  - hash-join
+  - gpu
+  - dbms
+---
+
+Heterogeneous hardware requires tedious optimization of DBMS algorithms for each platform it supports when implemented
+with vendor-specific toolchains. Such an approach inevitably leads to specialization and maintainability issues. In this
+paper we evaluate several hash-joins implemented with a high-level language SYCL and compare the data across different
+execution devices. We provide roof-line performance estimations for algorithms and show these implementations are on
+par with existing hardware specific implementations.
\ No newline at end of file
diff --git a/content/research_papers/2022/2022-07-10-a-benchmark-suite-for-improving-performance-portability-of-the-sycl-programming-model.md b/content/research_papers/2022/2022-07-10-a-benchmark-suite-for-improving-performance-portability-of-the-sycl-programming-model.md
new file mode 100644
index 0000000..5ec1129
--- /dev/null
+++ b/content/research_papers/2022/2022-07-10-a-benchmark-suite-for-improving-performance-portability-of-the-sycl-programming-model.md
@@ -0,0 +1,21 @@
+---
+contributor: scott
+date: '2022-07-10T08:08:10.490000+00:00'
+title: 'A Benchmark Suite for Improving Performance Portability of the SYCL Programming Model'
+external_url: https://ieeexplore.ieee.org/document/10158214
+authors:
+  - name: Zheming Jin
+    affiliation: Oak Ridge National Laboratory
+  - name: Jeffrey S. Vetter
+    affiliation: Oak Ridge National Laboratory
+tags:
+  - benchmarking
+  - performance
+  - portability
+---
+
+SYCL is a portable programming model for multivendor computing devices. Portability is critical for its success. The
+heterogeneous computing benchmark suite (HeCBench) is a collection of samples, benchmarks, and mini-applications from
+many open-source projects for heterogeneous computing. We hope that HeCBench is useful for understanding and improving
+performance portability in the development of the SYCL ecosystem. This abstract is a summary of the background, use
+cases, improvement, and future work of the benchmark suite.
diff --git a/content/research_papers/2022/2022-10-28-portability-and-performance-assessment-of-the-non-negative-matrix-factorization-algorithm-with-openmp-and-sycl.md b/content/research_papers/2022/2022-10-28-portability-and-performance-assessment-of-the-non-negative-matrix-factorization-algorithm-with-openmp-and-sycl.md
new file mode 100644
index 0000000..e28bf24
--- /dev/null
+++ b/content/research_papers/2022/2022-10-28-portability-and-performance-assessment-of-the-non-negative-matrix-factorization-algorithm-with-openmp-and-sycl.md
@@ -0,0 +1,29 @@
+---
+contributor: scott
+date: '2022-10-28T08:08:10.490000+00:00'
+title: 'Portability and Performance Assessment of the Non-Negative Matrix Factorization Algorithm with OpenMP and SYCL'
+external_url: https://ieeexplore.ieee.org/document/9959906
+authors:
+  - name: Youssef Faqir-Rhazoui
+    affiliation: Universidad Complutense de Madrid
+  - name: Carlos García
+    affiliation: Instituto de Tecnología del Conocimiento
+  - name: Francisco Tirado
+    affiliation: Informática
+tags:
+  - openmp
+  - sycl
+  - dpc++
+  - oneapi
+  - matrix
+  - hpc
+---
+
+The SYCL standard was released to improve code portability across heterogeneous environments. Intel released the oneAPI
+toolkit, which includes the Data-Parallel C++ (DPC++) compiler, which is Intel’s SYCL implementation. SYCL is
+designed to use a single source code to target multiple accelerators such as: multi-core CPUs, GPUs and even FPGAs.
+Additionally, the C/C++ compiler provided in the oneAPI toolkit supports OpenMP which also allows targeting codes on
+both CPU and GPU devices. In this paper, the performance of SYCL and OpenMP is evaluated using the well-known
+non-negative matrix factorization (NMF) algorithm. Three different NMF implementations are developed: baseline, SYCL and
+OpenMP versions to analyze the acceleration on CPU and GPU. Experimental results show that while the two programming
+models perform almost identically on CPU, on GPU, SYCL outperforms its OpenMP counterpart slightly.
diff --git a/content/research_papers/2022/2022-11-13-a-first-step-towards-support-for-mpi-partitioned-communication-on-sycl-programmed-fpg-as.md b/content/research_papers/2022/2022-11-13-a-first-step-towards-support-for-mpi-partitioned-communication-on-sycl-programmed-fpg-as.md
new file mode 100644
index 0000000..d639c3c
--- /dev/null
+++ b/content/research_papers/2022/2022-11-13-a-first-step-towards-support-for-mpi-partitioned-communication-on-sycl-programmed-fpg-as.md
@@ -0,0 +1,27 @@
+---
+contributor: scott
+date: '2022-11-13T08:08:10.490000+00:00'
+title: 'A First Step towards Support for MPI Partitioned Communication on SYCL-programmed FPGAs'
+external_url: https://ieeexplore.ieee.org/document/10027494
+authors:
+  - name: Steffen Christgau
+    affiliation: Zuse Institute Berlin, Berlin
+  - name: Marius Knaust
+    affiliation: Zuse Institute Berlin, Berlin
+  - name: Thomas Steinke
+    affiliation: Zuse Institute Berlin, Berlin
+tags:
+  - fpgas
+  - mpi
+  - oneapi
+  - dpc++
+---
+
+Version 4.0 of the Message Passing Interface standard introduced the concept of Partitioned Communication, which adds
+support for multiple contributions to a communication buffer. Although initially targeted at multithreaded MPI
+applications, Partitioned Communication currently receives attention in the context of accelerators, especially GPUs.
+In this publication it is demonstrated that this communication concept can be implemented for SYCL-programmed FPGAs.
+This includes a discussion of the design space and the presentation of a prototype implementation. Experimental results
+show that a lightweight implementation on top of an existing MPI library is possible. The presented approach also
+reveals issues in both the SYCL and the MPI standard, which need to be addressed for improved support for the intended
+communication style.
diff --git a/content/research_papers/2022/2022-11-13-towards-cross-platform-portability-of-coupled-cluster-methods-with-perturbative-triples-using-sycl.md b/content/research_papers/2022/2022-11-13-towards-cross-platform-portability-of-coupled-cluster-methods-with-perturbative-triples-using-sycl.md
new file mode 100644
index 0000000..106a70f
--- /dev/null
+++ b/content/research_papers/2022/2022-11-13-towards-cross-platform-portability-of-coupled-cluster-methods-with-perturbative-triples-using-sycl.md
@@ -0,0 +1,34 @@
+---
+contributor: scott
+date: '2022-11-13T08:08:10.490000+00:00'
+title: 'Towards Cross-Platform Portability of Coupled-Cluster Methods with Perturbative Triples using SYCL'
+external_url: https://ieeexplore.ieee.org/document/10024604
+authors:
+  - name: Abhishek Bagusetty
+    affiliation: Argonne National Laboratory
+  - name: Ajay Panyala
+    affiliation: Pacific Northwest National Laboratory
+  - name: Gordon Brown
+    affiliation: Codeplay Software Ltd
+  - name: Jack Kirk
+    affiliation: Codeplay Software Ltd
+tags:
+  - performance
+  - nvidia
+  - perturbation
+  - triples
+  - coupled-cluster
+  - portability
+---
+
+Tensor contractions form the fundamental computational operation of computational chemistry, and these contractions
+dictate the performance of widely used coupled-cluster (CC) methods in computational chemistry. In this work, we study a
+single-source, cross-platform C++ abstraction layer programming model, SYCL, for applications related to the
+computational chemistry methods such as CCSD(T) coupled-cluster formalism. An existing optimized CUDA implementation was
+migrated to SYCL to make use of the novel algorithm that provides tractable GPU memory needs for solving
+high-dimensional tensor contractions for accelerating CCSD(T). We present the cross-platform performance achieved using
+SYCL implementations for the non-iterative triples contribution of the CCSD(T) formalism which is considered as the
+performance bottleneck on NVIDIA A100 and AMD Instinct MI250X. Additionally, we also draw comparisons of similar
+performance metrics from vendor-based native programming models such as CUDA and ROCm HIP. Our results indicate that the
+performance of SYCL measured at-scale was on-par with the code written in HIP for AMD MI250X GPUs while the performance
+is slightly lacking on NVIDIA A100 GPUs in comparison to CUDA.
diff --git a/content/research_papers/2022/2022-11-18-evaluating-nonuniform-reduction-in-hip-and-sycl-on-gp-us.md b/content/research_papers/2022/2022-11-18-evaluating-nonuniform-reduction-in-hip-and-sycl-on-gp-us.md
new file mode 100644
index 0000000..cba3ec9
--- /dev/null
+++ b/content/research_papers/2022/2022-11-18-evaluating-nonuniform-reduction-in-hip-and-sycl-on-gp-us.md
@@ -0,0 +1,20 @@
+---
+contributor: scott
+date: '2022-11-18T08:08:10.490000+00:00'
+title: 'Evaluating Nonuniform Reduction in HIP and SYCL on GPUs'
+external_url: https://ieeexplore.ieee.org/document/10025472
+authors:
+  - name: Zheming Jin
+    affiliation: Oak Ridge National Laboratory
+  - name: Jeffrey S. Vetter
+    affiliation: Oak Ridge National Laboratory
+tags:
+  - reduction
+  - nonuniform
+  - evaluating
+---
+
+Motivated by maturing programming models and portability for heterogeneous computing, we describe the challenges posed
+by hardware architectures and programming models when migrating an optimized implementation of nonuniform reduction from
+CUDA to HIP and SYCL. We explain the migration experience, evaluate the performance of the reduction on GPU-based
+computing platforms, and provide feedback on improving portability for the development of the SYCL programming model.
diff --git a/content/research_papers/2022/2022-12-01-performance-study-of-gpu-applications-using-sycl-and-cuda-on-tesla-v100-gpu.md b/content/research_papers/2022/2022-12-01-performance-study-of-gpu-applications-using-sycl-and-cuda-on-tesla-v100-gpu.md
new file mode 100644
index 0000000..588f17e
--- /dev/null
+++ b/content/research_papers/2022/2022-12-01-performance-study-of-gpu-applications-using-sycl-and-cuda-on-tesla-v100-gpu.md
@@ -0,0 +1,31 @@
+---
+contributor: scott
+date: '2022-12-01T08:08:10.490000+00:00'
+title: 'Performance Study of GPU applications using SYCL and CUDA on Tesla V100 GPU'
+external_url: https://ieeexplore.ieee.org/document/9622813
+authors:
+  - name: Goutham Kalikrishna Reddy Kuncham
+    affiliation: NextGen R&D
+  - name: Rahul Vaidya
+    affiliation: NextGen R&D
+  - name: Mahesh Barve
+    affiliation: HPC Center Of Excellence
+tags:
+  - performance
+  - runtime
+  - conferences
+  - gpu
+  - ram
+  - throughput
+  - hardware
+---
+
+SYCL standard enables single-source programs to run on heterogeneous platforms consisting of CPUs, GPUs, FPGAs across
+different hardware vendors. SYCL combines modern C++ features along with OpenCL’s portability. SYCL runtime is also
+capable of targeting the CUDA backend directly on NVIDIA GPUs. This approach can potentially improve the performance of
+SYCL on NVIDIA devices. Although NVIDIA GPUs can be targeted via OpenCL backend, their features and capabilities are
+limited, and the performance is inadequate. In this study, we compare the performance of the Nvidia V100 GPU using SYCL
+and CUDA. For performance evaluation, we selected three GPU applications: BabelStream, Mixbench, and Tiled
+Matrix-Multiplication. We conducted extensive tests to understand the performance in terms of DRAM bandwidth, kernel
+execution time, compilation time, and throughput. As per our study, the performance of SYCL and CUDA were found to be
+similar. However, in some cases, CUDA outperformed SYCL.
\ No newline at end of file
diff --git a/content/research_papers/2022/2022-12-06-understanding-performance-portability-of-bioinformatics-applications-in-sycl-on-an-nvidia-gpu.md b/content/research_papers/2022/2022-12-06-understanding-performance-portability-of-bioinformatics-applications-in-sycl-on-an-nvidia-gpu.md
new file mode 100644
index 0000000..a4ce246
--- /dev/null
+++ b/content/research_papers/2022/2022-12-06-understanding-performance-portability-of-bioinformatics-applications-in-sycl-on-an-nvidia-gpu.md
@@ -0,0 +1,21 @@
+---
+contributor: scott
+date: '2022-12-06T08:08:10.490000+00:00'
+title: 'Understanding Performance Portability of Bioinformatics Applications in SYCL on an NVIDIA GPU'
+external_url: https://ieeexplore.ieee.org/document/9995222
+authors:
+  - name: Zheming Jin
+    affiliation: Oak Ridge National Laboratory
+  - name: Jeffrey S. Vetter
+    affiliation: Oak Ridge National Laboratory
+tags:
+  - performance
+  - nvidia
+  - bioinformatics
+  - portability
+---
+
+Our goal is to have a better understanding of performance portability of SYCL kernels on a GPU. Toward this goal, we
+migrate representative kernels in bioinformatics applications from CUDA to SYCL, evaluate their performance on an NVIDIA
+GPU, and explain the performance gaps through performance profiling and analyses. We hope that the findings provide
+valuable feedback to the development of the SYCL ecosystem.
diff --git a/content/research_papers/2022/2022-12-23-a-memory-bank-conflict-prevention-mechanism-for-sycl-on-sx-aurora-tsubasa.md b/content/research_papers/2022/2022-12-23-a-memory-bank-conflict-prevention-mechanism-for-sycl-on-sx-aurora-tsubasa.md
new file mode 100644
index 0000000..0f7d8b5
--- /dev/null
+++ b/content/research_papers/2022/2022-12-23-a-memory-bank-conflict-prevention-mechanism-for-sycl-on-sx-aurora-tsubasa.md
@@ -0,0 +1,34 @@
+---
+contributor: scott
+date: '2022-12-23T08:08:10.490000+00:00'
+title: 'A memory bank conflict prevention mechanism for SYCL on SX-Aurora TSUBASA'
+external_url: https://ieeexplore.ieee.org/document/9644088
+authors:
+  - name: Wenbin Wang
+    affiliation: Tohoku University
+  - name: Jiahao Li
+    affiliation: Tohoku University
+  - name: Yohichi Shimomura
+    affiliation: Tohoku University
+  - name: Hiroyuki Takizawa
+    affiliation: Tohoku University
+tags:
+  - performance
+  - runtime
+  - bandwidth
+  - programming
+  - supercomputers
+  - libraries
+  - kernel
+---
+
+A modern vector supercomputer, NEC SX-Aurora TSUBASA, consists of Vector Hosts (VHs) and Vector Engines (VEs). A VH is a
+standard CPU that performs general tasks and hosts the VEs, while a VE is a special device designed to operate on long
+vectors, and provides the world’s top-class theoretical memory bandwidth of 1.53 TB/s. However, in some cases, the sustained
+memory bandwidth achieved in practical use is far from the theoretical one. This is because frequent memory bank
+conflicts cause performance degradation. The purpose of this work is to achieve high sustained memory bandwidth by
+introducing a bank conflict prevention mechanism to a SYCL implementation, named neoSYCL. The evaluation results using
+several kernels clearly show that this mechanism can be used without changing the standard interface defined in the SYCL
+specification. It is also demonstrated that the proposed approach can successfully prevent memory bank conflicts, and
+thus achieve higher sustained memory bandwidths than the original one, meaning to expect higher sustained performance on
+memory-intensive scientific applications.
diff --git a/content/research_papers/2023/2023-05-15-remote-execution-of-open-cl-and-sycl-applications-via-r-open-cl.md b/content/research_papers/2023/2023-05-15-remote-execution-of-open-cl-and-sycl-applications-via-r-open-cl.md
new file mode 100644
index 0000000..dd26860
--- /dev/null
+++ b/content/research_papers/2023/2023-05-15-remote-execution-of-open-cl-and-sycl-applications-via-r-open-cl.md
@@ -0,0 +1,24 @@
+---
+contributor: scott
+date: '2023-05-15T08:08:10.490000+00:00'
+title: 'Remote Execution of OpenCL and SYCL Applications via rOpenCL'
+external_url: https://ieeexplore.ieee.org/document/10196646
+authors:
+  - name: Rui Alves
+    affiliation: Instituto Politécnico de Bragança Campus de Santa Apolónia
+  - name: José Rufino
+    affiliation: Instituto Politécnico de Bragança
+tags:
+  - hpc
+  - api
+  - opencl
+  - remote
+---
+
+Here, we present the migration of a CUDA based seismic application, named SeisAcoMod2D, to SYCL codebase using Intel®
+oneAPI. SYCL programming enables developers to have single source codebase across different computing architectures and
+vendors of CPUs, GPUs, and FPGAs. SeisAcoMod2D performs acoustic wave propagation using finite difference time domain
+modelling, which is useful in oil exploration applications. The migrated SYCL code has been optimized for GPUs and the
+output data is validated. The migrated unified SYCL code is executed on GPUs from Intel and Nvidia and on CPUs from
+Intel. The performance of the SYCL code is found similar to that of the CUDA code on Nvidia® A100 GPU. A speed up of
+1.75x is obtained on Intel® Data Center GPU Max 1550 GPU (Ponte Vecchio) over Nvidia® A100 (80GB) GPU.
diff --git a/content/research_papers/2023/2023-05-15-understanding-performance-portability-of-sycl-kernels-a-case-study-with-the-all-pairs-distance-calculation-in-bioinformatics-on-gp-us.md b/content/research_papers/2023/2023-05-15-understanding-performance-portability-of-sycl-kernels-a-case-study-with-the-all-pairs-distance-calculation-in-bioinformatics-on-gp-us.md
new file mode 100644
index 0000000..147a274
--- /dev/null
+++ b/content/research_papers/2023/2023-05-15-understanding-performance-portability-of-sycl-kernels-a-case-study-with-the-all-pairs-distance-calculation-in-bioinformatics-on-gp-us.md
@@ -0,0 +1,21 @@
+---
+contributor: scott
+date: '2023-05-15T08:08:10.490000+00:00'
+title: 'Understanding Performance Portability of SYCL Kernels: A Case Study with the All-Pairs Distance Calculation in Bioinformatics on GPUs'
+external_url: https://ieeexplore.ieee.org/document/10196541
+authors:
+  - name: Zheming Jin
+    affiliation: Oak Ridge National Laboratory
+  - name: Jeffrey S. Vetter
+    affiliation: Oak Ridge National Laboratory
+tags:
+  - portability
+  - performance
+  - bioinformatics
+---
+
+SYCL is a portable programming model. Toward the goal of a better understanding of performance portability of SYCL
+kernels on GPUs, we select a bioinformatics kernel for computing the all-pairs distance as a case study. After migrating
+the kernel from CUDA to HIP and SYCL, we evaluate the performance of the CUDA, HIP, and SYCL kernels on NVIDIA V100 and
+AMD MI210 GPUs. We analyze the GPU instructions from the kernels to explain performance gaps between SYCL and CUDA/HIP.
+We hope that the findings are valuable for improving performance portability of SYCL.
diff --git a/content/research_papers/2023/2023-05-15-understanding-sycl-portability-for-pseudorandom-number-generation-a-case-study-with-gene-expression-connectivity-mapping.md b/content/research_papers/2023/2023-05-15-understanding-sycl-portability-for-pseudorandom-number-generation-a-case-study-with-gene-expression-connectivity-mapping.md
new file mode 100644
index 0000000..ae05e61
--- /dev/null
+++ b/content/research_papers/2023/2023-05-15-understanding-sycl-portability-for-pseudorandom-number-generation-a-case-study-with-gene-expression-connectivity-mapping.md
@@ -0,0 +1,24 @@
+---
+contributor: scott
+date: '2023-05-15T08:08:10.490000+00:00'
+title: 'Understanding SYCL Portability for Pseudorandom Number Generation: a Case Study with Gene-Expression Connectivity Mapping'
+external_url: https://ieeexplore.ieee.org/document/10196601
+authors:
+  - name: Zheming Jin
+    affiliation: Oak Ridge National Laboratory
+  - name: Jeffrey S. Vetter
+    affiliation: Oak Ridge National Laboratory
+tags:
+  - portability
+  - pseudorandom
+  - random-number
+  - bioinformatics
+---
+
+Towards the goal of improving functional and performance portability of SYCL, we study a bioinformatics application that
+has been accelerated with CUDA and fast pseudorandom number generation on a GPU. We describe the experience of migrating
+pseudorandom number generation from CUDA to SYCL, evaluate the performance of pseudorandom number generators using the
+CUDA random number generation library, suggest the support of the XORWOW pseudorandom number generator in the oneAPI
+math kernel library (oneMKL) interface for performance portability, and identify the performance gap using the MKL
+interface in SYCL that supports pseudorandom number generation with third-party libraries. We hope that the results are
+valuable for the development of the SYCL ecosystem.
diff --git a/content/research_papers/2023/2023-09-05-experience-migrating-open-cl-to-sycl-a-case-study-on-searches-for-potential-off-target-sites-of-cas-9-rna-guided-endonucleases-on-amd-gp-us.md b/content/research_papers/2023/2023-09-05-experience-migrating-open-cl-to-sycl-a-case-study-on-searches-for-potential-off-target-sites-of-cas-9-rna-guided-endonucleases-on-amd-gp-us.md
new file mode 100644
index 0000000..1f420e6
--- /dev/null
+++ b/content/research_papers/2023/2023-09-05-experience-migrating-open-cl-to-sycl-a-case-study-on-searches-for-potential-off-target-sites-of-cas-9-rna-guided-endonucleases-on-amd-gp-us.md
@@ -0,0 +1,24 @@
+---
+contributor: scott
+date: '2023-09-05T08:08:10.490000+00:00'
+title: 'Experience Migrating OpenCL to SYCL: A Case Study on Searches for Potential Off-Target Sites of Cas9 RNA-Guided Endonucleases on AMD GPUs'
+external_url: https://ieeexplore.ieee.org/document/10256881
+authors:
+  - name: Zheming Jin
+    affiliation: Oak Ridge National Laboratory
+  - name: Jeffrey S. Vetter
+    affiliation: Oak Ridge National Laboratory
+tags:
+  - sequence
+  - analysis
+  - opencl
+  - migrating
+  - cas9
+  - rna
+---
+
+Cas-OFFinder is a popular application for genome editing. Its OpenCL implementation searches potential off-target sites
+in parallel on a GPU. In this work, we describe our experience migrating the application from OpenCL to SYCL. Evaluating
+the performance of the OpenCL and SYCL applications using human genome sequences shows that the SYCL program could
+achieve performance portability on the target GPUs. Exploring the optimizations of the hotspot kernel in SYCL may
+further improve the performance of the application by 9% to 23%.
diff --git a/content/research_papers/2023/2023-10-17-comparing-performance-and-portability-between-cuda-and-sycl-for-protein-database-search-on-nvidia-amd-and-intel-gp-us.md b/content/research_papers/2023/2023-10-17-comparing-performance-and-portability-between-cuda-and-sycl-for-protein-database-search-on-nvidia-amd-and-intel-gp-us.md
new file mode 100644
index 0000000..8178a20
--- /dev/null
+++ b/content/research_papers/2023/2023-10-17-comparing-performance-and-portability-between-cuda-and-sycl-for-protein-database-search-on-nvidia-amd-and-intel-gp-us.md
@@ -0,0 +1,36 @@
+---
+contributor: scott
+date: '2023-10-17T08:08:10.490000+00:00'
+title: 'Comparing Performance and Portability Between CUDA and SYCL for Protein Database Search on NVIDIA, AMD, and Intel GPUs'
+external_url: https://ieeexplore.ieee.org/document/10306194
+authors:
+  - name: Manuel Costanzo
+    affiliation: UNLP - CIC, La Plata
+  - name: Enzo Rucci
+    affiliation: UNLP - CIC, La Plata
+  - name: Carlos García-Sánchez
+    affiliation: Universidad Complutense de Madrid
+  - name: Marcelo Naiouf
+    affiliation: UNLP - CIC, La Plata
+  - name: Manuel Prieto-Matías
+    affiliation: Universidad Complutense de Madrid
+tags:
+  - cuda
+  - gpu
+  - portability
+  - performance
+  - comparison
+  - nvidia
+  - amd
+  - intel
+---
+
+The heterogeneous computing paradigm has led to the need for portable and efficient programming solutions that can
+leverage the capabilities of various hardware devices, such as NVIDIA, Intel, and AMD GPUs. This study evaluates the
+portability and performance of the SYCL and CUDA languages for one fundamental bioinformatics application
+(Smith-Waterman protein database search) across different GPU architectures, considering single and multi-GPU
+configurations from different vendors. The experimental work showed that, while both CUDA and SYCL versions achieve
+similar performance on NVIDIA devices, the latter demonstrated remarkable code portability to other GPU architectures,
+such as AMD and Intel. Furthermore, the architectural efficiency rates achieved on these devices were superior in 3 of
+the 4 cases tested. This brief study highlights the potential of SYCL as a viable solution for achieving both
+performance and portability in the heterogeneous computing ecosystem.
diff --git a/content/research_papers/2023/2023-10-31-accelerating-hyperdimensional-classifier-with-sycl.md b/content/research_papers/2023/2023-10-31-accelerating-hyperdimensional-classifier-with-sycl.md
new file mode 100644
index 0000000..e6845ea
--- /dev/null
+++ b/content/research_papers/2023/2023-10-31-accelerating-hyperdimensional-classifier-with-sycl.md
@@ -0,0 +1,33 @@
+---
+contributor: scott
+date: '2023-10-31T08:08:10.490000+00:00'
+title: 'Accelerating Hyperdimensional Classifier with SYCL'
+external_url: https://ieeexplore.ieee.org/document/10321902
+authors:
+  - name: Zheming Jin
+    affiliation: Oak Ridge National Laboratory
+  - name: Jeffrey S. Vetter
+    affiliation: Oak Ridge National Laboratory
+tags:
+  - parallel
+  - search
+  - dimension
+  - accelerating
+  - performance
+  - mathematical
+---
+
+Hyperdimensional (HD) computing is based on mathematical properties of high-dimensional spaces which show remarkable
+agreement with brain-controlled behaviors. Rahimi et al. describe an HD-based classifier for the task of
+recognizing the languages of text samples. It consists of an encoding module that generates a hypervector for each
+text sample and a search module that compares the generated vector with a set of trained hypervectors. One of the
+challenges of the HD computing research is that hardware simulation of the classifier is extremely time-consuming with
+many text samples. To address the challenge, the classifier may be modelled as a compute routine in Open Computing
+Language (OpenCL) and executed on graphics processing units (GPUs) for acceleration. While OpenCL allows for
+writing parallel and portable programs targeting vendors’ computing platforms, writing an OpenCL program tends to be
+error-prone and time-consuming. Built on the underlying concepts, portability, and efficiency of OpenCL, SYCL defines a
+single-source abstract layer in C++. In this work, we adopt the SYCL abstraction for productivity and performance.
+Compared to the OpenCL application, the SYCL application approximately reduces the lines of code by 24% and increases
+the performance by 2.13X on four GPUs. In addition, the speedups of executing the application in parallel over the
+fastest serial execution on the four heterogeneous computing systems are approximately 2.11X, 1.23X, 1.56X, and 1.03X,
+respectively.
diff --git a/content/research_papers/2023/2023-12-18-migration-of-cuda-based-seismic-application-to-cross-platform-sycl-implementation.md b/content/research_papers/2023/2023-12-18-migration-of-cuda-based-seismic-application-to-cross-platform-sycl-implementation.md
new file mode 100644
index 0000000..9f81793
--- /dev/null
+++ b/content/research_papers/2023/2023-12-18-migration-of-cuda-based-seismic-application-to-cross-platform-sycl-implementation.md
@@ -0,0 +1,35 @@
+---
+contributor: scott
+date: '2023-12-18T08:08:10.490000+00:00'
+title: 'Migration of CUDA Based Seismic Application to Cross-Platform SYCL Implementation'
+external_url: https://ieeexplore.ieee.org/document/10502402
+authors:
+  - name: Om Jadhav
+    affiliation: HPC-Technologies Group
+  - name: Sandeep Agrawal
+    affiliation: HPC-Technologies Group
+  - name: Abhishek Srivastava
+    affiliation: HPC-SE&A Group
+  - name: Richa Rastogi
+    affiliation: HPC-SE&A Group
+  - name: Sanjay Wandhekar
+    affiliation: HPC-Technologies Group
+  - name: Vinutha SV
+    affiliation: Intel Technology India Pvt. Ltd
+  - name: Jyotsna Khemka
+    affiliation: Intel Technology India Pvt. Ltd
+tags:
+  - acoustic-waves
+  - seismic-data
+  - manually-optimized
+  - cuda
+  - migration
+---
+
+Here, we present the migration of a CUDA based seismic application, named SeisAcoMod2D, to SYCL codebase using Intel®
+oneAPI. SYCL programming enables developers to have single source codebase across different computing architectures and
+vendors of CPUs, GPUs, and FPGAs. SeisAcoMod2D performs acoustic wave propagation using finite difference time domain
+modelling, which is useful in oil exploration applications. The migrated SYCL code has been optimized for GPUs and the
+output data is validated. The migrated unified SYCL code is executed on GPUs from Intel and Nvidia and on CPUs from
+Intel. The performance of the SYCL code is found similar to that of the CUDA code on Nvidia® A100 GPU. A speed up of
+1.75x is obtained on Intel® Data Center GPU Max 1550 GPU (Ponte Vecchio) over Nvidia® A100 (80GB) GPU.
diff --git a/content/research_papers/2023/2023-12-21-evaluating-performance-portability-of-sycl-and-kokkos-a-case-study-on-lbm-simulations.md b/content/research_papers/2023/2023-12-21-evaluating-performance-portability-of-sycl-and-kokkos-a-case-study-on-lbm-simulations.md
new file mode 100644
index 0000000..3f97b03
--- /dev/null
+++ b/content/research_papers/2023/2023-12-21-evaluating-performance-portability-of-sycl-and-kokkos-a-case-study-on-lbm-simulations.md
@@ -0,0 +1,41 @@
+---
+contributor: scott
+date: '2023-12-21T08:08:10.490000+00:00'
+title: 'Evaluating Performance Portability of SYCL and Kokkos: A Case Study on LBM Simulations'
+external_url: https://ieeexplore.ieee.org/document/10491773
+authors:
+  - name: Yue Ding
+    affiliation: National University of Defense Technology
+  - name: Chuanfu Xu
+    affiliation: National University of Defense Technology
+  - name: Haozhong Qiu
+    affiliation: National University of Defense Technology
+  - name: Qingsong Wang
+    affiliation: National University of Defense Technology
+  - name: Weixi Dai
+    affiliation: National University of Defense Technology
+  - name: Yongzhen Lin
+    affiliation: National University of Defense Technology
+  - name: Yonggang Che
+    affiliation: National University of Defense Technology
+tags:
+  - kokkos
+  - performance
+  - portability
+  - cross-platform
+  - lbm
+  - simulations
+---
+
+Since modern high performance computing systems are evolving towards diverse and heterogeneous architectures, the
+emergence of high-level portable programming models leads to a particular focus on performance portability. In this
+paper, we evaluate the performance portability and explore performance optimization methods for two portable programming
+models SYCL and Kokkos. We take an open-source multi-phase Lattice Boltzmann Method (LBM) flow simulation code as a case
+study and implement portable versions with different optimizations. Then we compare our portable implementations with
+engineer-tuned OpenMP and CUDA versions on Intel CPUs and NVIDIA GPUs. Experimental results show that both SYCL and
+Kokkos can deliver better performance than traditional programming models, but the best performance of the portable
+versions depends heavily on platform-specific optimizations. There is no single implementation that can achieve the best
+performance on both CPUs and GPUs. Consequently, we conclude that the performance portability still needs to be further
+improved for both SYCL and Kokkos. In addition, we present a comparative analysis of different optimization methods that
+qualify the performance enhancement when using SYCL and Kokkos on CPUs and GPUs. Our work offers valuable references for
+the development of both portable programming models and applications.
diff --git a/content/research_papers/2024/2024-03-02-experiences-building-an-mlir-based-sycl-compiler.md b/content/research_papers/2024/2024-03-02-experiences-building-an-mlir-based-sycl-compiler.md
new file mode 100644
index 0000000..895e7b3
--- /dev/null
+++ b/content/research_papers/2024/2024-03-02-experiences-building-an-mlir-based-sycl-compiler.md
@@ -0,0 +1,41 @@
+---
+contributor: scott
+date: '2024-03-02T08:08:10.490000+00:00'
+title: 'Experiences Building an MLIR-Based SYCL Compiler'
+external_url: https://ieeexplore.ieee.org/document/10444866
+authors:
+  - name: Ettore Tiotto
+    affiliation: Intel Corporation
+  - name: Víctor Pérez
+    affiliation: Codeplay Software
+  - name: Whitney Tsang
+    affiliation: Intel Corporation
+  - name: Lukas Sommer
+    affiliation: Codeplay Software
+  - name: Julian Oppermann
+    affiliation: Codeplay Software
+  - name: Victor Lomüller
+    affiliation: Codeplay Software
+  - name: Mehdi Goli
+    affiliation: Codeplay Software
+  - name: James Brodman
+    affiliation: Intel Corporation
+tags:
+  - sycl
+  - mlir
+  - compiler
+  - optimization
+  - heterogeneous-programming
+---
+
+Similar to other programming models, compilers for SYCL, the open programming model for heterogeneous computing based on
+C++, would benefit from access to higher-level intermediate representations. The loss of high-level structure and
+semantics caused by premature lowering to low-level intermediate representations and the inability to reason about host
+and device code simultaneously present major challenges for SYCL compilers. The MLIR compiler framework, through its
+dialect mechanism, allows to model domain-specific, high-level intermediate representations and provides the necessary
+facilities to address these challenges. This work therefore describes practical experience with the design and
+implementation of an MLIR-based SYCL compiler. By modeling key elements of the SYCL programming model in host and device
+code in the MLIR dialect framework, the presented approach enables the implementation of powerful device code
+optimizations as well as analyses across host and device code. Compared to two LLVM-based SYCL implementations, this
+yields speedups of up to 4.3x on a collection of SYCL benchmark applications. Finally, this work also discusses
+challenges encountered in the design and implementation and how these could be addressed in the future.
diff --git a/content/research_papers/2024/2024-05-20-unveiling-performance-insights-and-portability-achievements-between-cuda-and-sycl-for-particle-in-cell-codes-on-different-gpu-architectures.md b/content/research_papers/2024/2024-05-20-unveiling-performance-insights-and-portability-achievements-between-cuda-and-sycl-for-particle-in-cell-codes-on-different-gpu-architectures.md
new file mode 100644
index 0000000..bbb3fec
--- /dev/null
+++ b/content/research_papers/2024/2024-05-20-unveiling-performance-insights-and-portability-achievements-between-cuda-and-sycl-for-particle-in-cell-codes-on-different-gpu-architectures.md
@@ -0,0 +1,30 @@
+---
+contributor: scott
+date: '2024-05-20T08:08:10.490000+00:00'
+title: 'Unveiling Performance Insights and Portability Achievements Between CUDA and SYCL for Particle-in-Cell Codes on Different GPU Architectures'
+external_url: https://ieeexplore.ieee.org/document/10569866
+authors:
+  - name: Ivona Vasileska
+    affiliation: University of Ljubljana
+  - name: Pavel Tomšič
+    affiliation: University of Ljubljana
+  - name: Leon Kos
+    affiliation: University of Ljubljana
+  - name: Leon Bogdanović
+    affiliation: University of Ljubljana
+tags:
+  - gpu
+  - cuda
+  - hpc
+  - pic
+  - plasma
+---
+
+The HPC systems worldwide are getting more powerful with the combination of CPU, GPU, and other accelerators (e.g.,
+FPGAs and Quantum Processors). Many programming frameworks mainly offer excellent support portability to the existing
+scientific codes to use the exascale HPC systems. This study evaluates the performance and portability of CUDA and SYCL
+for one of the most used plasma kinetic simulation codes Particle-In-Cell (PIC). The PIC codes are numerical modelling
+tools used for handling the extreme nonlinear methods in fusion devices. The experimental work showed that, while
+accelerating the PIC code with CUDA and SYCL achieves similar performance on NVIDIA devices, the latter demonstrated
+remarkable code portability to other GPU architectures. This brief study highlights the potential of SYCL as a viable
+solution for achieving both performance and portability in the heterogeneous computing ecosystem.