@@ -237,6 +237,7 @@ CNAME(BLASLONG M,
237
237
#endif
238
238
{
239
239
const uint64_t v_size = svcntw ();
240
+ const uint64_t v_size2 = v_size * 2 ;
240
241
const svbool_t pg_true = svptrue_b32 ();
241
242
const svbool_t pg_quad = svwhilelt_b32 (0 , 4 );
242
243
const svbool_t pg_first = svwhilelt_b32 (0 , 1 );
@@ -245,10 +246,11 @@ CNAME(BLASLONG M,
245
246
const svfloat32_t beta_vec = svdup_f32 (beta );
246
247
#endif
247
248
const BLASLONG n4 = N & -4 ;
249
+ const BLASLONG v_m2 = M & - v_size2 ;
248
250
const BLASLONG v_m1 = M & - v_size ;
249
251
const BLASLONG k4 = K & -4 ;
250
252
251
- const int pack_b = M >= v_size && N >= 8 && K >= 8 ? 1 : 0 ;
253
+ const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0 ;
252
254
FLOAT * packed_b =
253
255
(pack_b ) ? packed_b = (FLOAT * )malloc (K * 4 * sizeof (FLOAT )) : NULL ;
254
256
@@ -269,16 +271,21 @@ CNAME(BLASLONG M,
269
271
CREATE_B_POINTER (3 , 3 );
270
272
271
273
BLASLONG i = 0 ;
272
- for (; i < v_m1 ; i += v_size ) {
274
+ for (; i < v_m2 ; i += v_size2 ) {
273
275
274
276
CREATE_A_POINTER (0 , 0 );
275
- UPDATE_A_POINTER (v_size );
277
+ CREATE_A_POINTER (1 , v_size );
278
+ UPDATE_A_POINTER (v_size2 );
276
279
277
280
BLASLONG k = 0 ;
278
281
DECLARE_RESULT_VECTOR (0 , 0 );
279
282
DECLARE_RESULT_VECTOR (0 , 1 );
280
283
DECLARE_RESULT_VECTOR (0 , 2 );
281
284
DECLARE_RESULT_VECTOR (0 , 3 );
285
+ DECLARE_RESULT_VECTOR (1 , 0 );
286
+ DECLARE_RESULT_VECTOR (1 , 1 );
287
+ DECLARE_RESULT_VECTOR (1 , 2 );
288
+ DECLARE_RESULT_VECTOR (1 , 3 );
282
289
283
290
if (LIKELY (packed_b != NULL )) {
284
291
if (i == 0 ) {
@@ -314,6 +321,26 @@ CNAME(BLASLONG M,
314
321
UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 3 );
315
322
UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 3 );
316
323
UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 3 );
324
+ VECTOR_LOAD_A (pg_true , 1 , 0 );
325
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
326
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
327
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 0 );
328
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 0 );
329
+ VECTOR_LOAD_A (pg_true , 1 , 1 );
330
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 1 );
331
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 1 );
332
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 1 );
333
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 1 );
334
+ VECTOR_LOAD_A (pg_true , 1 , 2 );
335
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 2 );
336
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 2 );
337
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 2 );
338
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 2 );
339
+ VECTOR_LOAD_A (pg_true , 1 , 3 );
340
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 3 );
341
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 3 );
342
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 3 );
343
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 3 );
317
344
}
318
345
for (; k < K ; k ++ ) {
319
346
@@ -324,12 +351,17 @@ CNAME(BLASLONG M,
324
351
BROADCAST_LOAD_B (1 , 0 );
325
352
PACK_B (1 , 0 );
326
353
UPDATE_RESULT_VECTOR (pg_true , 0 , 1 , 0 );
354
+ VECTOR_LOAD_A (pg_true , 1 , 0 );
355
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 0 , 0 );
356
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 1 , 0 );
327
357
BROADCAST_LOAD_B (2 , 0 );
328
358
PACK_B (2 , 0 );
329
359
UPDATE_RESULT_VECTOR (pg_true , 0 , 2 , 0 );
360
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 2 , 0 );
330
361
BROADCAST_LOAD_B (3 , 0 );
331
362
PACK_B (3 , 0 );
332
363
UPDATE_RESULT_VECTOR (pg_true , 0 , 3 , 0 );
364
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 3 , 0 );
333
365
}
334
366
} else {
335
367
for (; k < K ; k ++ ) {
@@ -340,11 +372,118 @@ CNAME(BLASLONG M,
340
372
UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
341
373
UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 0 );
342
374
UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 0 );
375
+ VECTOR_LOAD_A (pg_true , 1 , 0 );
376
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
377
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
378
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 0 );
379
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 0 );
343
380
}
344
381
}
345
382
} else {
346
383
for (; k < k4 ; k += 4 ) {
347
384
385
+ VECTOR_LOAD_B_K4 (0 , 0 );
386
+ VECTOR_LOAD_B_K4 (1 , 0 );
387
+ VECTOR_LOAD_B_K4 (2 , 0 );
388
+ VECTOR_LOAD_B_K4 (3 , 0 );
389
+ TRANSPOSE_B4_K4 (0 , 1 , 2 , 3 , 0 , 1 , 2 , 3 );
390
+ SCALE_B4_K4 (0 , 0 , 1 , 2 , 3 );
391
+ VECTOR_LOAD_A (pg_true , 0 , 0 );
392
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
393
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
394
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 0 );
395
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 0 );
396
+ VECTOR_LOAD_A (pg_true , 0 , 1 );
397
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
398
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
399
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 1 );
400
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 1 );
401
+ VECTOR_LOAD_A (pg_true , 0 , 2 );
402
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 2 );
403
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 2 );
404
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 2 );
405
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 2 );
406
+ VECTOR_LOAD_A (pg_true , 0 , 3 );
407
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 3 );
408
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 3 );
409
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 3 );
410
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 3 );
411
+ VECTOR_LOAD_A (pg_true , 1 , 0 );
412
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
413
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
414
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 0 );
415
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 0 );
416
+ VECTOR_LOAD_A (pg_true , 1 , 1 );
417
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 1 );
418
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 1 );
419
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 1 );
420
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 1 );
421
+ VECTOR_LOAD_A (pg_true , 1 , 2 );
422
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 2 );
423
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 2 );
424
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 2 );
425
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 2 );
426
+ VECTOR_LOAD_A (pg_true , 1 , 3 );
427
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 3 );
428
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 3 );
429
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 3 );
430
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 3 );
431
+ }
432
+ for (; k < K ; k ++ ) {
433
+
434
+ BROADCAST_LOAD_B (0 , 0 );
435
+ VECTOR_LOAD_A (pg_true , 0 , 0 );
436
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
437
+ BROADCAST_LOAD_B (1 , 0 );
438
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 1 , 0 );
439
+ VECTOR_LOAD_A (pg_true , 1 , 0 );
440
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 0 , 0 );
441
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 1 , 0 );
442
+ BROADCAST_LOAD_B (2 , 0 );
443
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 2 , 0 );
444
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 2 , 0 );
445
+ BROADCAST_LOAD_B (3 , 0 );
446
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 3 , 0 );
447
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 3 , 0 );
448
+ }
449
+ }
450
+ VECTOR_STORE (pg_true , 0 , 0 );
451
+ VECTOR_STORE (pg_true , 0 , 1 );
452
+ VECTOR_STORE (pg_true , 0 , 2 );
453
+ VECTOR_STORE (pg_true , 0 , 3 );
454
+ VECTOR_STORE (pg_true , 1 , 0 );
455
+ VECTOR_STORE (pg_true , 1 , 1 );
456
+ VECTOR_STORE (pg_true , 1 , 2 );
457
+ VECTOR_STORE (pg_true , 1 , 3 );
458
+ INCR_C_POINTER (0 , v_size2 );
459
+ INCR_C_POINTER (1 , v_size2 );
460
+ INCR_C_POINTER (2 , v_size2 );
461
+ INCR_C_POINTER (3 , v_size2 );
462
+ }
463
+ for (; i < v_m1 ; i += v_size ) {
464
+
465
+ CREATE_A_POINTER (0 , 0 );
466
+ UPDATE_A_POINTER (v_size );
467
+
468
+ BLASLONG k = 0 ;
469
+ DECLARE_RESULT_VECTOR (0 , 0 );
470
+ DECLARE_RESULT_VECTOR (0 , 1 );
471
+ DECLARE_RESULT_VECTOR (0 , 2 );
472
+ DECLARE_RESULT_VECTOR (0 , 3 );
473
+
474
+ if (LIKELY (packed_b != NULL )) {
475
+ for (; k < K ; k ++ ) {
476
+
477
+ UNPACK_QUADWORD_B (0 , 0 );
478
+ VECTOR_LOAD_A (pg_true , 0 , 0 );
479
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
480
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
481
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 0 );
482
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 0 );
483
+ }
484
+ } else {
485
+ for (; k < k4 ; k += 4 ) {
486
+
348
487
VECTOR_LOAD_B_K4 (0 , 0 );
349
488
VECTOR_LOAD_B_K4 (1 , 0 );
350
489
VECTOR_LOAD_B_K4 (2 , 0 );
@@ -478,6 +617,28 @@ CNAME(BLASLONG M,
478
617
CREATE_B_POINTER (0 , 0 );
479
618
480
619
BLASLONG i = 0 ;
620
+ for (; i < v_m2 ; i += v_size2 ) {
621
+
622
+ CREATE_A_POINTER (0 , 0 );
623
+ CREATE_A_POINTER (1 , v_size );
624
+ UPDATE_A_POINTER (v_size2 );
625
+
626
+ BLASLONG k = 0 ;
627
+ DECLARE_RESULT_VECTOR (0 , 0 );
628
+ DECLARE_RESULT_VECTOR (1 , 0 );
629
+
630
+ for (; k < K ; k ++ ) {
631
+
632
+ BROADCAST_LOAD_B (0 , 0 );
633
+ VECTOR_LOAD_A (pg_true , 0 , 0 );
634
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
635
+ VECTOR_LOAD_A (pg_true , 1 , 0 );
636
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 0 , 0 );
637
+ }
638
+ VECTOR_STORE (pg_true , 0 , 0 );
639
+ VECTOR_STORE (pg_true , 1 , 0 );
640
+ INCR_C_POINTER (0 , v_size2 );
641
+ }
481
642
for (; i < v_m1 ; i += v_size ) {
482
643
483
644
CREATE_A_POINTER (0 , 0 );
0 commit comments