@@ -254,8 +254,8 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
254
254
GGML_ASSERT (ncols % WARP_SIZE == 0 );
255
255
if (ncols < 1024 ) {
256
256
const sycl::range<3 > block_dims (1 , 1 , WARP_SIZE);
257
- stream-> submit ([&](sycl::handler& cgh) {
258
- cgh. parallel_for (
257
+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
258
+ syclex::nd_launch (cgh,
259
259
sycl::nd_range<3 >(global_dims * block_dims, block_dims),
260
260
[=](sycl::nd_item<3 > item_ct1)
261
261
[[sycl::reqd_sub_group_size (WARP_SIZE)]] {
@@ -272,10 +272,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
272
272
the limit. To get the device limit, query
273
273
info::device::max_work_group_size. Adjust the work-group size if needed.
274
274
*/
275
- stream-> submit ([&](sycl::handler& cgh) {
275
+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
276
276
sycl::local_accessor<sycl::float2, 1 > s_sum_acc_ct1 (
277
277
sycl::range<1 >(work_group_size / WARP_SIZE), cgh);
278
- cgh. parallel_for (
278
+ syclex::nd_launch (cgh,
279
279
sycl::nd_range<3 >(global_dims * block_dims, block_dims),
280
280
[=](sycl::nd_item<3 > item_ct1)
281
281
[[sycl::reqd_sub_group_size (WARP_SIZE)]] {
@@ -290,9 +290,9 @@ static void group_norm_f32_sycl(const float* x, float* dst,
290
290
const int ne_elements, queue_ptr stream, int device) {
291
291
if (group_size < 1024 ) {
292
292
const sycl::range<3 > block_dims (1 , 1 , WARP_SIZE);
293
- stream-> submit ([&](sycl::handler& cgh) {
293
+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
294
294
const float eps_ct4 = eps;
295
- cgh. parallel_for (
295
+ syclex::nd_launch (cgh,
296
296
sycl::nd_range<3 >(sycl::range<3 >(1 , 1 , num_groups) * block_dims,
297
297
block_dims),
298
298
[=](sycl::nd_item<3 > item_ct1)
@@ -313,13 +313,13 @@ static void group_norm_f32_sycl(const float* x, float* dst,
313
313
info::device::max_work_group_size. Adjust the work-group size if needed.
314
314
*/
315
315
316
- stream-> submit ([&](sycl::handler& cgh) {
316
+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
317
317
sycl::local_accessor<float , 1 > s_sum_acc_ct1 (sycl::range<1 >(work_group_size / WARP_SIZE),
318
318
cgh);
319
319
320
320
const float eps_ct4 = eps;
321
321
322
- cgh. parallel_for (
322
+ syclex::nd_launch (cgh,
323
323
sycl::nd_range<3 >(sycl::range<3 >(1 , 1 , num_groups) * block_dims,
324
324
block_dims),
325
325
[=](sycl::nd_item<3 > item_ct1)
@@ -340,8 +340,8 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
340
340
const sycl::range<3 > global_dims (nsamples, nchannels, nrows);
341
341
if (ncols < 1024 ) {
342
342
const sycl::range<3 > block_dims (1 , 1 , WARP_SIZE);
343
- stream-> submit ([&](sycl::handler& cgh) {
344
- cgh. parallel_for (
343
+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
344
+ syclex::nd_launch (cgh,
345
345
sycl::nd_range<3 >(global_dims * block_dims, block_dims),
346
346
[=](sycl::nd_item<3 > item_ct1)
347
347
[[sycl::reqd_sub_group_size (WARP_SIZE)]] {
@@ -358,10 +358,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
358
358
the limit. To get the device limit, query
359
359
info::device::max_work_group_size. Adjust the work-group size if needed.
360
360
*/
361
- stream-> submit ([&](sycl::handler& cgh) {
361
+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
362
362
sycl::local_accessor<float , 1 > s_sum_acc_ct1 (sycl::range<1 >(work_group_size / WARP_SIZE),
363
363
cgh);
364
- cgh. parallel_for (
364
+ syclex::nd_launch (cgh,
365
365
sycl::nd_range<3 >(global_dims * block_dims, block_dims),
366
366
[=](sycl::nd_item<3 > item_ct1)
367
367
[[sycl::reqd_sub_group_size (WARP_SIZE)]] {
@@ -378,8 +378,8 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
378
378
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
379
379
if (ncols < 1024 ) {
380
380
const sycl::range<3 > block_dims (1 , 1 , WARP_SIZE);
381
- stream-> submit ([&](sycl::handler& cgh) {
382
- cgh. parallel_for (
381
+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
382
+ syclex::nd_launch (cgh,
383
383
sycl::nd_range<3 >(sycl::range<3 >(1 , 1 , nrows) * block_dims,
384
384
block_dims),
385
385
[=](sycl::nd_item<3 > item_ct1)
@@ -398,10 +398,10 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
398
398
the limit. To get the device limit, query
399
399
info::device::max_work_group_size. Adjust the work-group size if needed.
400
400
*/
401
- stream-> submit ([&](sycl::handler& cgh) {
401
+ syclex:: submit (*stream, [&](sycl::handler& cgh) {
402
402
sycl::local_accessor<float , 1 > s_sum_acc_ct1 (sycl::range<1 >(work_group_size / WARP_SIZE),
403
403
cgh);
404
- cgh. parallel_for (
404
+ syclex::nd_launch (cgh,
405
405
sycl::nd_range<3 >(sycl::range<3 >(1 , 1 , nrows) * block_dims,
406
406
block_dims),
407
407
[=](sycl::nd_item<3 > item_ct1)
0 commit comments