
Commit 0c6deba

Enable SYCL NVIDIA and AMD backends (#2192)
* Enable SYCL NVIDIA backend

  Tested with

  ```
  CXX=icpx CC=icx AR=llvm-ar ./build.sh -Dsycl=nvidia
  ```

  on Ubuntu 24.04 with CUDA 12.9 and oneAPI 2025.1.

  The CUDA Compute Capability can optionally be specified with `-Dcc_cuda`. If it is not specified, the default CUDA target of the DPC++ compiler is used, which means SYCL device code is precompiled for the lowest supported CC. When executed on a GPU with a different CC, it is recompiled at runtime for that architecture.

  In addition to the meson.build changes, remove a redundant free(nullptr) that caused crashes in the SYCL NVIDIA backend.

* Enable SYCL AMD backend and fix its fp16 support

  Tested with

  ```
  CXX=icpx CC=icx AR=llvm-ar ./build.sh -Dsycl=amd -Damd_gfx=90a
  ```

  on Ubuntu 22.04 with ROCm 6.3.3 and oneAPI 2025.1.

  The new amd_gfx option is required because DPC++ does not support Just-In-Time compilation of AMD GPU code; it has to be precompiled for the right architecture when the application is built.

  Fix the SYCL AMD fp16 backend, which was missing calls to the fp16 hipBLAS functions where needed. Also fix the hardcoded sub-group / warp / wavefront size of 32: some AMD GPUs have a wavefront size of 64, and that size has to be used instead.

* Move the SYCL_SUB_GROUP_SIZE definition to a common header
1 parent 5cf0c8e commit 0c6deba

File tree

6 files changed: 99 additions, 29 deletions


meson.build

Lines changed: 19 additions & 2 deletions
```diff
@@ -682,9 +682,26 @@ if get_option('build_backends')
       deps += cc.find_library('mkl_core', required: true)
       deps += cc.find_library('OpenCL', required: true)
     elif get_option('sycl') == 'amd'
-      error('Building SYCL for AMD backend not yet supported')
+      deps += cc.find_library('hipblas', required: true)
+      deps += cc.find_library('amdhip64', required: true)
+      add_project_arguments('-DUSE_HIPBLAS=ON', language : 'cpp')
+      add_project_arguments('-D__HIP_PLATFORM_AMD__', language : 'cpp')
+      if get_option('amd_gfx') == ''
+        error('-Dsycl=amd requires specifying -Damd_gfx architecture identifier (e.g. 90a, 1100 or similar)')
+      endif
+      add_project_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp')
+      add_project_link_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp')
     else
-      error('Building SYCL for the NVIDIA backend not yet supported')
+      deps += cc.find_library('cublas', required: true)
+      deps += cc.find_library('cudart', required: true)
+      add_project_arguments('-DUSE_CUBLAS=ON', language : 'cpp')
+      if get_option('cc_cuda') != ''
+        sycl_nvidia_target = 'nvidia_gpu_sm_' + get_option('cc_cuda')
+      else
+        sycl_nvidia_target = 'nvptx64-nvidia-cuda'
+      endif
+      add_project_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp')
+      add_project_link_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp')
     endif
     if host_machine.system() == 'windows'
       # For sycl under windows we need to link using icx to generate the device code.
```

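As a quick sanity check (not part of this commit), a standalone SYCL program built with the same compiler and `-fsycl-targets=...` flag should list the NVIDIA or AMD GPU among the available devices. The sketch below uses only standard SYCL 2020 calls; it is illustrative, not something the backend itself needs.

```cpp
// Minimal sketch: enumerate SYCL platforms and devices to confirm the CUDA or
// HIP plugin of the DPC++ runtime sees the GPU.
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  for (const auto& platform : sycl::platform::get_platforms()) {
    std::cout << platform.get_info<sycl::info::platform::name>() << '\n';
    for (const auto& device : platform.get_devices()) {
      std::cout << "  " << device.get_info<sycl::info::device::name>() << '\n';
    }
  }
}
```
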
meson_options.txt

Lines changed: 5 additions & 0 deletions
```diff
@@ -178,6 +178,11 @@ option('cc_cuda',
        value: '',
        description: 'Build for a specific cuda CC, e.g. -Dcc_cuda=35 for CC 3.5')
 
+option('amd_gfx',
+       type: 'string',
+       value: '',
+       description: 'Build for a specific AMD GPU architecture, e.g. -Damd_gfx=90a for gfx90a')
+
 option('onnx_libdir',
        type: 'string',
        value: '',
```

src/neural/backends/sycl/common_kernels.dp.cpp

Lines changed: 5 additions & 5 deletions
```diff
@@ -936,7 +936,7 @@ void globalAvgPool(int N, int C, T* output, const T* input,
         sycl::nd_range<3>(
             sycl::range<3>(1, 1, blocks) * sycl::range<3>(1, 1, kBlockSize),
             sycl::range<3>(1, 1, kBlockSize)),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
           globalAvgPool_kernel(output, input, prevLayerBias, N * C * kPlaneSize,
                                N * C, C, item_ct1);
         });
@@ -1070,7 +1070,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
     cgh.parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
                           sycl::range<3>(1, 1, C)),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
           OutputTransform_SE_relu_InputTransform_kernel<float, activation,
                                                         use_bias, use_skip>(
               N, C, se_K, output, input, (float*)skip, bias, w1, b1, w2, b2,
@@ -1218,7 +1218,7 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que
         sycl::nd_range<3>(
             sycl::range<3>(1, 1, blocks) * sycl::range<3>(1, 1, kBlockSize),
             sycl::range<3>(1, 1, kBlockSize)),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
           softmax_opt_64_kernel<T>(output, input, input2, size, item_ct1);
         });
   }
@@ -1235,7 +1235,7 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que
     cgh.parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
                           sycl::range<3>(1, 1, C)),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
           softmax_kernel<T>(output, input, input2, item_ct1, sum_acc_ct1,
                             maxval_acc_ct1);
         });
@@ -1461,7 +1461,7 @@ void LayerNorm(int N, int C, T* output, const T* input, const T* bias,
 
     cgh.parallel_for(
         sycl::nd_range<3>(gridDim * blockDim, blockDim),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
           layer_norm_kernel<T>(N, C, output, input, bias, skip, gammas, betas,
                                ep, alpha, act, item_ct1, sum_acc_ct1);
         });
```

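For context, the attribute being parameterized here is DPC++'s `[[intel::reqd_sub_group_size(N)]]`: if N is not one of the sub-group sizes the device supports (32 on NVIDIA, 64 on wavefront-64 AMD GPUs), the kernel launch fails, which is why the hardcoded 32 becomes `SYCL_SUB_GROUP_SIZE`. The sketch below is illustrative only — the kernel, data, and sizes are made up, and it assumes `SYCL_SUB_GROUP_SIZE` is defined as in `sycl_common.h`.

```cpp
// Minimal sketch of a kernel with a required sub-group size that matches the
// hardware wavefront, plus a sub-group reduction over one work-group.
#include <sycl/sycl.hpp>
#include <cstdio>

#ifndef SYCL_SUB_GROUP_SIZE
#define SYCL_SUB_GROUP_SIZE 32  // stand-in when built outside the backend
#endif

int main() {
  sycl::queue q{sycl::gpu_selector_v};
  constexpr int kBlockSize = SYCL_SUB_GROUP_SIZE;
  float* data = sycl::malloc_shared<float>(kBlockSize, q);
  for (int i = 0; i < kBlockSize; i++) data[i] = 1.0f;

  q.submit([&](sycl::handler& cgh) {
     cgh.parallel_for(
         sycl::nd_range<1>(sycl::range<1>(kBlockSize),
                           sycl::range<1>(kBlockSize)),
         // Requesting a sub-group size the device cannot provide (e.g. 32 on
         // a wavefront-64 AMD GPU) makes the launch fail.
         [=](sycl::nd_item<1> item)
             [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
               auto sg = item.get_sub_group();
               float sum = sycl::reduce_over_group(
                   sg, data[item.get_global_id(0)], sycl::plus<float>());
               if (sg.get_local_linear_id() == 0) data[0] = sum;
             });
   }).wait();

  std::printf("sub-group sum = %g\n", data[0]);
  sycl::free(data, q);
  return 0;
}
```
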
src/neural/backends/sycl/fp16_kernels.dp.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -749,7 +749,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
             sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
             sycl::range<3>(1, 1, C)),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
-            32)]] {
+            SYCL_SUB_GROUP_SIZE)]] {
           OutputInputTransformKernel_fp16_shmem_board<activation,
                                                       use_bias, use_skip>(
               N, C, se_K, (sycl::half*)output, (const sycl::half*)input,
@@ -798,7 +798,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
     cgh.parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
                           sycl::range<3>(1, 1, C)),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
           OutputTransform_SE_relu_InputTransform_kernel<
               sycl::half, activation, use_bias, use_skip>(
               N, C, se_K, output, input, (sycl::half*)skip, bias, w1, b1, w2,
```

src/neural/backends/sycl/layers.cc.dp.cpp

Lines changed: 62 additions & 20 deletions
```diff
@@ -358,6 +358,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu
   half alpha = one_h;
   half beta = zero_h;
 
+#elif defined(USE_HIPBLAS)
+  hipblasHalf alpha{1};
+  hipblasHalf beta{0};
+
 #else
   sycl::half alpha = 1;
   sycl::half beta = 0;
@@ -393,10 +397,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu
           sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
       hipblasSetStream(handle, hipStreamHandle);
 
-      hipblasSgemm(handle, transpose_type_transpose,
+      hipblasHgemm(handle, transpose_type_transpose,
                    transpose_type_notranspose,numFc1Out_, N, C, &alpha,
-                   ((const sycl::half *)w1_), C, ((const sycl::half *)op2), C,
-                   &beta, ((sycl::half *)op1), numFc1Out_);
+                   ((const hipblasHalf *)w1_), C, ((const hipblasHalf *)op2), C,
+                   &beta, ((hipblasHalf *)op1), numFc1Out_);
 
       hipStreamSynchronize(hipStreamHandle);
   });
@@ -436,10 +440,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu
           sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
       hipblasSetStream(handle, hipStreamHandle);
 
-      hipblasSgemm(
+      hipblasHgemm(
           handle, transpose_type_transpose, transpose_type_notranspose, 2 * C,
-          N, numFc1Out_, &alpha,((const sycl::half *)w2_), numFc1Out_,
-          ((const sycl::half *)op1), numFc1Out_, &beta, ((sycl::half *)op2),
+          N, numFc1Out_, &alpha,((const hipblasHalf *)w2_), numFc1Out_,
+          ((const hipblasHalf *)op1), numFc1Out_, &beta, ((hipblasHalf *)op2),
           2 * C);
 
       hipStreamSynchronize(hipStreamHandle);
@@ -544,6 +548,10 @@ template <>
   half alpha = one_h;
   half beta = zero_h;
 
+#elif defined(USE_HIPBLAS)
+  hipblasHalf alpha{1};
+  hipblasHalf beta{0};
+
 #else
   sycl::half alpha = 1;
   sycl::half beta = 0;
@@ -576,11 +584,11 @@ template <>
           sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
       hipblasSetStream(handle, hipStreamHandle);
 
-      hipblasSgemm(
+      hipblasHgemm(
           handle, transpose_type_transpose, transpose_type_notranspose,
-          num_outputs, N, num_inputs, &alpha, ((const sycl::half *)weights_),
-          num_inputs, ((const sycl::half *)input_tensor), num_inputs, &beta,
-          ((sycl::half *)output_tensor), num_outputs);
+          num_outputs, N, num_inputs, &alpha, ((const hipblasHalf *)weights_),
+          num_inputs, ((const hipblasHalf *)input_tensor), num_inputs, &beta,
+          ((hipblasHalf *)output_tensor), num_outputs);
 
       hipStreamSynchronize(hipStreamHandle);
   });
@@ -964,7 +972,7 @@ template <>
 
       hipStreamSynchronize(hipStreamHandle);
     });
-  );
+  });
 #else
   int64_t M_ = M;
   int64_t N_ = N;
@@ -1807,7 +1815,20 @@ static void cublasXgemm(transpose_type transa,
   });
 }
 #elif defined(USE_HIPBLAS)
-  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    sycl_queue.submit([&](sycl::handler &cgh) {
+      cgh.host_task([=](sycl::interop_handle ih) {
+        auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
+        hipblasSetStream(handle, hipStreamHandle);
+        hipblasHgemm(handle, transa, transb, m, n, k, &alpha_h, (const hipblasHalf*)A,
+                     lda, (const hipblasHalf*)B, ldb, &beta_h, (hipblasHalf*)C, ldc);
+        hipStreamSynchronize(hipStreamHandle);
+      });
+    });
+  } else {
   sycl_queue.submit([&](sycl::handler &cgh) {
     cgh.host_task([=](sycl::interop_handle ih) {
       auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
@@ -1816,6 +1837,7 @@ static void cublasXgemm(transpose_type transa,
       hipStreamSynchronize(hipStreamHandle);
     });
   });
+  }
 #else
   oneapi::mkl::blas::column_major::gemm(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda,
                                         (const DataType *)B, ldb, beta, (DataType *)C, ldc);
@@ -1873,9 +1895,29 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran
   });
 }
 #elif defined(USE_HIPBLAS)
-  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+
+    sycl_queue.submit([&](sycl::handler &cgh) {
+
+      cgh.host_task([=](sycl::interop_handle ih) {
+
+        auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
+        hipblasSetStream(handle, hipStreamHandle);
+
+        hipblasGemmStridedBatchedEx(
+            handle, transa, transb, m, n, k, &alpha_h, A, HIPBLAS_R_16F, lda, strideA, B,
+            HIPBLAS_R_16F, ldb, strideB, &beta_h, C, HIPBLAS_R_16F, ldc, strideC,
+            batchCount, HIPBLAS_R_16F, HIPBLAS_GEMM_DEFAULT);
+
+        hipStreamSynchronize(hipStreamHandle);
 
-  sycl_queue.submit([&](sycl::handler &cgh) {
+      });
+    });
+  } else {
+    sycl_queue.submit([&](sycl::handler &cgh) {
 
     cgh.host_task([=](sycl::interop_handle ih) {
 
@@ -1891,9 +1933,10 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran
 
     });
   });
-#else
-  oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount);
-#endif
+  }
+#else
+  oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount);
+#endif
 }
 
 template <typename DataType>
@@ -1962,8 +2005,8 @@ static void cublasXGemmBatched(transpose_type transa,
       hipblasSetStream(handle, hipStreamHandle);
 
       hipblasHgemmBatched(
-          handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda,
-          (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount);
+          handle, transa, transb, m, n, k, (const hipblasHalf*)&alpha_h, (hipblasHalf**)A, lda,
+          (hipblasHalf**)B, ldb, (const hipblasHalf*)&beta_h, (hipblasHalf**)C, ldc, batchCount);
 
       hipStreamSynchronize(hipStreamHandle);
 
@@ -2507,7 +2550,6 @@ template <typename DataType>
 AttentionBody<DataType>::~AttentionBody() {
   sycl::free(ip_emb_w_, sycl_queue_);
   sycl::free(ip_emb_b_, sycl_queue_);
-  sycl::free(pos_encoding_, sycl_queue_);
   if (is_pe_dense_embedding_) {
     sycl::free(ip_emb_pre_w_, sycl_queue_);
     sycl::free(ip_emb_pre_b_, sycl_queue_);
```

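The pattern these hunks extend — submitting a `host_task` that binds hipBLAS to the queue's native HIP stream and then issues the half-precision GEMM — can be reduced to the hedged sketch below. The function name, matrix shapes, leading dimensions, and the plain `HIPBLAS_OP_N` operands are illustrative; only the interop/call sequence mirrors the patch. As in the patch, the submit returns immediately, so the caller still orders work through the same queue or waits on it.

```cpp
// Hedged sketch of the host_task + hipBLAS fp16 interop used above.
// Assumes a DPC++ build with the HIP (AMD) backend and ROCm's hipBLAS.
#include <sycl/sycl.hpp>
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>

void half_gemm_on_queue(sycl::queue q, hipblasHandle_t handle, int m, int n,
                        int k, const hipblasHalf* alpha,
                        const hipblasHalf* beta, const hipblasHalf* a,
                        const hipblasHalf* b, hipblasHalf* c) {
  // Copy the scalars so the asynchronous host_task does not dangle.
  hipblasHalf alpha_v = *alpha;
  hipblasHalf beta_v = *beta;
  q.submit([&](sycl::handler& cgh) {
    cgh.host_task([=](sycl::interop_handle) {
      // Bind hipBLAS to the HIP stream backing this SYCL queue, run the
      // half-precision GEMM, and wait before the host_task returns.
      auto stream = sycl::get_native<sycl::backend::ext_oneapi_hip>(q);
      hipblasSetStream(handle, stream);
      hipblasHgemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N, m, n, k, &alpha_v,
                   a, m, b, k, &beta_v, c, m);
      hipStreamSynchronize(stream);
    });
  });
}
```
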
src/neural/backends/sycl/sycl_common.h

Lines changed: 6 additions & 0 deletions
```diff
@@ -27,6 +27,12 @@
 
 #include "utils/exception.h"
 
+#if defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#define SYCL_SUB_GROUP_SIZE 64
+#else
+#define SYCL_SUB_GROUP_SIZE 32
+#endif
+
 namespace lczero {
 namespace sycldnn_backend {
 
```

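A device only honors `[[intel::reqd_sub_group_size(N)]]` for sub-group sizes it actually reports, so a quick runtime cross-check (a sketch, not part of the commit) is to print `sycl::info::device::sub_group_sizes` and confirm that the compile-time choice — 64 on gfx8/gfx9, 32 otherwise — appears in the list.

```cpp
// Sketch: list the sub-group sizes the selected GPU supports. The value chosen
// for SYCL_SUB_GROUP_SIZE at compile time must be one of them.
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q{sycl::gpu_selector_v};
  std::cout << q.get_device().get_info<sycl::info::device::name>()
            << " supports sub-group sizes:";
  for (auto size :
       q.get_device().get_info<sycl::info::device::sub_group_sizes>()) {
    std::cout << ' ' << size;
  }
  std::cout << '\n';
}
```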