@@ -434,6 +434,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
434
434
int ne0, int ne1, int ne2, int ne3,
435
435
int ne10, int ne11, int ne12, int ne13,
436
436
/* int s0, */ int s1, int s2, int s3,
437
+ /* int s00,*/ int s01, int s02, int s03,
437
438
/* int s10,*/ int s11, int s12, int s13,
438
439
const sycl::nd_item<3 > &item_ct1) {
439
440
const int i0s = item_ct1.get_local_range (2 ) * item_ct1.get_group (2 ) +
@@ -455,9 +456,9 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
455
456
const int i12 = i2 % ne12;
456
457
const int i13 = i3 % ne13;
457
458
458
- const size_t i_src0 = i3*s3 + i2*s2 + i1*s1 ;
459
+ const size_t i_src0 = i3*s03 + i2*s02 + i1*s01 ;
459
460
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
460
- const size_t i_dst = i_src0 ;
461
+ const size_t i_dst = i3*s3 + i2*s2 + i1*s1 ;
461
462
462
463
const src0_t * src0_row = src0 + i_src0;
463
464
const src1_t * src1_row = src1 + i_src1;
@@ -475,6 +476,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
475
476
int ne0, int ne1, int ne2, int ne3,
476
477
int ne10, int ne11, int ne12, int ne13,
477
478
/* int s0, */ int s1, int s2, int s3,
479
+ /* int s00,*/ int s01, int s02, int s03,
478
480
/* int s10,*/ int s11, int s12, int s13,
479
481
const sycl::nd_item<3 > &item_ct1) {
480
482
@@ -494,9 +496,9 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
494
496
const int i12 = i2 % ne12;
495
497
const int i13 = i3 % ne13;
496
498
497
- const size_t i_src0 = i3*s3 + i2*s2 + i1*s1 ;
499
+ const size_t i_src0 = i3*s03 + i2*s02 + i1*s01 ;
498
500
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
499
- const size_t i_dst = i_src0 ;
501
+ const size_t i_dst = i3*s3 + i2*s2 + i1*s1 ;
500
502
501
503
const src0_t * src0_row = src0 + i_src0;
502
504
const src1_t * src1_row = src1 + i_src1;
@@ -526,9 +528,11 @@ struct bin_bcast_sycl {
526
528
int nr[4 ] = { nr0, nr1, nr2, nr3 };
527
529
528
530
// collapse dimensions until first broadcast dimension
529
- int64_t cne0[] = {ne0, ne1, ne2, ne3};
531
+ int64_t cne[] = {ne0, ne1, ne2, ne3};
532
+ int64_t cne0[] = {ne00, ne01, ne02, ne03};
530
533
int64_t cne1[] = {ne10, ne11, ne12, ne13};
531
- size_t cnb0[] = {nb0, nb1, nb2, nb3};
534
+ size_t cnb[] = {nb0, nb1, nb2, nb3};
535
+ size_t cnb0[] = {nb00, nb01, nb02, nb03};
532
536
size_t cnb1[] = {nb10, nb11, nb12, nb13};
533
537
auto collapse = [](int64_t cne[]) {
534
538
cne[0 ] *= cne[1 ];
@@ -543,32 +547,41 @@ struct bin_bcast_sycl {
543
547
cnb[3 ] *= cne[3 ];
544
548
};
545
549
546
- for (int i = 0 ; i < 4 ; i++) {
547
- if (nr[i] != 1 ) {
548
- break ;
549
- }
550
- if (i > 0 ) {
551
- collapse_nb (cnb0, cne0);
552
- collapse_nb (cnb1, cne1);
553
- collapse (cne0);
554
- collapse (cne1);
550
+ if (ggml_is_contiguous (src0) && ggml_is_contiguous (src1) && ggml_is_contiguous (dst)) {
551
+ for (int i = 0 ; i < 4 ; i++) {
552
+ if (nr[i] != 1 ) {
553
+ break ;
554
+ }
555
+ if (i > 0 ) {
556
+ collapse_nb (cnb, cne);
557
+ collapse_nb (cnb0, cne0);
558
+ collapse_nb (cnb1, cne1);
559
+ collapse (cne);
560
+ collapse (cne0);
561
+ collapse (cne1);
562
+ }
555
563
}
556
564
}
557
565
{
558
- int64_t ne0 = cne0 [0 ];
559
- int64_t ne1 = cne0 [1 ];
560
- int64_t ne2 = cne0 [2 ];
561
- int64_t ne3 = cne0 [3 ];
566
+ int64_t ne0 = cne [0 ];
567
+ int64_t ne1 = cne [1 ];
568
+ int64_t ne2 = cne [2 ];
569
+ int64_t ne3 = cne [3 ];
562
570
563
571
int64_t ne10 = cne1[0 ];
564
572
int64_t ne11 = cne1[1 ];
565
573
int64_t ne12 = cne1[2 ];
566
574
int64_t ne13 = cne1[3 ];
567
575
568
- size_t nb0 = cnb0[0 ];
569
- size_t nb1 = cnb0[1 ];
570
- size_t nb2 = cnb0[2 ];
571
- size_t nb3 = cnb0[3 ];
576
+ size_t nb0 = cnb[0 ];
577
+ size_t nb1 = cnb[1 ];
578
+ size_t nb2 = cnb[2 ];
579
+ size_t nb3 = cnb[3 ];
580
+
581
+ size_t nb00 = cnb0[0 ];
582
+ size_t nb01 = cnb0[1 ];
583
+ size_t nb02 = cnb0[2 ];
584
+ size_t nb03 = cnb0[3 ];
572
585
573
586
size_t nb10 = cnb1[0 ];
574
587
size_t nb11 = cnb1[1 ];
@@ -585,6 +598,28 @@ struct bin_bcast_sycl {
585
598
size_t s12 = nb12 / sizeof (src1_t );
586
599
size_t s13 = nb13 / sizeof (src1_t );
587
600
601
+ size_t s00 = nb00 / sizeof (src0_t );
602
+ size_t s01 = nb01 / sizeof (src0_t );
603
+ size_t s02 = nb02 / sizeof (src0_t );
604
+ size_t s03 = nb03 / sizeof (src0_t );
605
+
606
+ GGML_UNUSED (s00);
607
+
608
+ GGML_ASSERT (nb0 % sizeof (dst_t ) == 0 );
609
+ GGML_ASSERT (nb1 % sizeof (dst_t ) == 0 );
610
+ GGML_ASSERT (nb2 % sizeof (dst_t ) == 0 );
611
+ GGML_ASSERT (nb3 % sizeof (dst_t ) == 0 );
612
+
613
+ GGML_ASSERT (nb00 % sizeof (src0_t ) == 0 );
614
+ GGML_ASSERT (nb01 % sizeof (src0_t ) == 0 );
615
+ GGML_ASSERT (nb02 % sizeof (src0_t ) == 0 );
616
+ GGML_ASSERT (nb03 % sizeof (src0_t ) == 0 );
617
+
618
+ GGML_ASSERT (nb10 % sizeof (src1_t ) == 0 );
619
+ GGML_ASSERT (nb11 % sizeof (src1_t ) == 0 );
620
+ GGML_ASSERT (nb12 % sizeof (src1_t ) == 0 );
621
+ GGML_ASSERT (nb13 % sizeof (src1_t ) == 0 );
622
+
588
623
GGML_ASSERT (s0 == 1 );
589
624
GGML_ASSERT (s10 == 1 );
590
625
@@ -621,8 +656,8 @@ struct bin_bcast_sycl {
621
656
[=](sycl::nd_item<3 > item_ct1) {
622
657
k_bin_bcast_unravel<bin_op>(
623
658
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
624
- ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12 ,
625
- s13, item_ct1);
659
+ ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02 ,
660
+ s03, s11, s12, s13, item_ct1);
626
661
});
627
662
}
628
663
} else {
@@ -640,7 +675,7 @@ struct bin_bcast_sycl {
640
675
[=](sycl::nd_item<3 > item_ct1) {
641
676
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
642
677
ne2, ne3, ne10, ne11, ne12, ne13,
643
- s1, s2, s3, s11, s12, s13,
678
+ s1, s2, s3, s01, s02, s03, s11, s12, s13,
644
679
item_ct1);
645
680
});
646
681
}
0 commit comments