Skip to content

Commit fd682bb

Browse files
committed
Convert submit and parallel_for in remaining files
1 parent da7059f commit fd682bb

File tree

3 files changed

+26
-26
lines changed

3 files changed

+26
-26
lines changed

ggml/src/ggml-sycl/norm.cpp

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -254,8 +254,8 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
254254
GGML_ASSERT(ncols % WARP_SIZE == 0);
255255
if (ncols < 1024) {
256256
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
257-
stream->submit([&](sycl::handler& cgh) {
258-
cgh.parallel_for(
257+
syclex::submit(*stream,[&](sycl::handler& cgh) {
258+
syclex::nd_launch(cgh,
259259
sycl::nd_range<3>(global_dims * block_dims, block_dims),
260260
[=](sycl::nd_item<3> item_ct1)
261261
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
@@ -272,10 +272,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
272272
the limit. To get the device limit, query
273273
info::device::max_work_group_size. Adjust the work-group size if needed.
274274
*/
275-
stream->submit([&](sycl::handler& cgh) {
275+
syclex::submit(*stream,[&](sycl::handler& cgh) {
276276
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
277277
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
278-
cgh.parallel_for(
278+
syclex::nd_launch(cgh,
279279
sycl::nd_range<3>(global_dims * block_dims, block_dims),
280280
[=](sycl::nd_item<3> item_ct1)
281281
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
@@ -290,9 +290,9 @@ static void group_norm_f32_sycl(const float* x, float* dst,
290290
const int ne_elements, queue_ptr stream, int device) {
291291
if (group_size < 1024) {
292292
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
293-
stream->submit([&](sycl::handler& cgh) {
293+
syclex::submit(*stream,[&](sycl::handler& cgh) {
294294
const float eps_ct4 = eps;
295-
cgh.parallel_for(
295+
syclex::nd_launch(cgh,
296296
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
297297
block_dims),
298298
[=](sycl::nd_item<3> item_ct1)
@@ -313,13 +313,13 @@ static void group_norm_f32_sycl(const float* x, float* dst,
313313
info::device::max_work_group_size. Adjust the work-group size if needed.
314314
*/
315315

316-
stream->submit([&](sycl::handler& cgh) {
316+
syclex::submit(*stream,[&](sycl::handler& cgh) {
317317
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
318318
cgh);
319319

320320
const float eps_ct4 = eps;
321321

322-
cgh.parallel_for(
322+
syclex::nd_launch(cgh,
323323
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
324324
block_dims),
325325
[=](sycl::nd_item<3> item_ct1)
@@ -340,8 +340,8 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
340340
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
341341
if (ncols < 1024) {
342342
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
343-
stream->submit([&](sycl::handler& cgh) {
344-
cgh.parallel_for(
343+
syclex::submit(*stream,[&](sycl::handler& cgh) {
344+
syclex::nd_launch(cgh,
345345
sycl::nd_range<3>(global_dims * block_dims, block_dims),
346346
[=](sycl::nd_item<3> item_ct1)
347347
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
@@ -358,10 +358,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
358358
the limit. To get the device limit, query
359359
info::device::max_work_group_size. Adjust the work-group size if needed.
360360
*/
361-
stream->submit([&](sycl::handler& cgh) {
361+
syclex::submit(*stream,[&](sycl::handler& cgh) {
362362
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
363363
cgh);
364-
cgh.parallel_for(
364+
syclex::nd_launch(cgh,
365365
sycl::nd_range<3>(global_dims * block_dims, block_dims),
366366
[=](sycl::nd_item<3> item_ct1)
367367
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
@@ -378,8 +378,8 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
378378
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
379379
if (ncols < 1024) {
380380
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
381-
stream->submit([&](sycl::handler& cgh) {
382-
cgh.parallel_for(
381+
syclex::submit(*stream,[&](sycl::handler& cgh) {
382+
syclex::nd_launch(cgh,
383383
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
384384
block_dims),
385385
[=](sycl::nd_item<3> item_ct1)
@@ -398,10 +398,10 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
398398
the limit. To get the device limit, query
399399
info::device::max_work_group_size. Adjust the work-group size if needed.
400400
*/
401-
stream->submit([&](sycl::handler& cgh) {
401+
syclex::submit(*stream,[&](sycl::handler& cgh) {
402402
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
403403
cgh);
404-
cgh.parallel_for(
404+
syclex::nd_launch(cgh,
405405
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
406406
block_dims),
407407
[=](sycl::nd_item<3> item_ct1)

ggml/src/ggml-sycl/softmax.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,10 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst,
127127
const int nrows_y, const float scale, const float max_bias, const float m0,
128128
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
129129
const size_t n_local_scratch, queue_ptr stream) {
130-
stream->submit([&](sycl::handler &cgh) {
130+
syclex::submit(*stream,[&](sycl::handler &cgh) {
131131
sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
132132

133-
cgh.parallel_for(
133+
syclex::nd_launch(cgh,
134134
sycl::nd_range<3>(block_nums * block_dims, block_dims),
135135
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
136136
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,

ggml/src/ggml-sycl/wkv.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -207,10 +207,10 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
207207

208208
// Submit kernel
209209
if (C / H == WKV_BLOCK_SIZE) {
210-
stream->submit([&](sycl::handler& cgh) {
210+
syclex::submit(*stream,[&](sycl::handler& cgh) {
211211
sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
212212

213-
cgh.parallel_for(
213+
syclex::nd_launch(cgh,
214214
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
215215
[=](sycl::nd_item<3> item_ct1) {
216216
rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE>(
@@ -220,10 +220,10 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
220220
});
221221
});
222222
} else {
223-
stream->submit([&](sycl::handler& cgh) {
223+
syclex::submit(*stream,[&](sycl::handler& cgh) {
224224
sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
225225

226-
cgh.parallel_for(
226+
syclex::nd_launch(cgh,
227227
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
228228
[=](sycl::nd_item<3> item_ct1) {
229229
rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE * 2>(
@@ -264,10 +264,10 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
264264

265265
// Submit kernel
266266
if (C / H == WKV_BLOCK_SIZE) {
267-
stream->submit([&](sycl::handler& cgh) {
267+
syclex::submit(*stream,[&](sycl::handler& cgh) {
268268
sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
269269

270-
cgh.parallel_for(
270+
syclex::nd_launch(cgh,
271271
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
272272
[=](sycl::nd_item<3> item_ct1) {
273273
rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE>(
@@ -277,10 +277,10 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
277277
});
278278
});
279279
} else {
280-
stream->submit([&](sycl::handler& cgh) {
280+
syclex::submit(*stream,[&](sycl::handler& cgh) {
281281
sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);
282282

283-
cgh.parallel_for(
283+
syclex::nd_launch(cgh,
284284
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
285285
[=](sycl::nd_item<3> item_ct1) {
286286
rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE * 2>(

0 commit comments

Comments
 (0)