@@ -219,6 +219,7 @@ CNAME(BLASLONG M,
219
219
220
220
const BLASLONG v_m2 = M & - v_size2 ;
221
221
const BLASLONG v_m1 = M & - v_size ;
222
+ const BLASLONG n8 = N & -8 ;
222
223
const BLASLONG n4 = N & -4 ;
223
224
224
225
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0 ;
@@ -238,23 +239,35 @@ CNAME(BLASLONG M,
238
239
CREATE_A_POINTER (1 , v_size );
239
240
240
241
BLASLONG j = 0 ;
241
- for (; j < n4 ; j += 4 ) {
242
+ for (; j < n8 ; j += 8 ) {
242
243
243
244
CREATE_B_POINTER (0 , 0 );
244
245
CREATE_B_POINTER (1 , 1 );
245
246
CREATE_B_POINTER (2 , 2 );
246
247
CREATE_B_POINTER (3 , 3 );
247
- UPDATE_B_POINTER (4 );
248
+ CREATE_B_POINTER (4 , 4 );
249
+ CREATE_B_POINTER (5 , 5 );
250
+ CREATE_B_POINTER (6 , 6 );
251
+ CREATE_B_POINTER (7 , 7 );
252
+ UPDATE_B_POINTER (8 );
248
253
249
254
BLASLONG k = 0 ;
250
255
DECLARE_RESULT_VECTOR (0 , 0 );
251
256
DECLARE_RESULT_VECTOR (0 , 1 );
252
257
DECLARE_RESULT_VECTOR (0 , 2 );
253
258
DECLARE_RESULT_VECTOR (0 , 3 );
259
+ DECLARE_RESULT_VECTOR (0 , 4 );
260
+ DECLARE_RESULT_VECTOR (0 , 5 );
261
+ DECLARE_RESULT_VECTOR (0 , 6 );
262
+ DECLARE_RESULT_VECTOR (0 , 7 );
254
263
DECLARE_RESULT_VECTOR (1 , 0 );
255
264
DECLARE_RESULT_VECTOR (1 , 1 );
256
265
DECLARE_RESULT_VECTOR (1 , 2 );
257
266
DECLARE_RESULT_VECTOR (1 , 3 );
267
+ DECLARE_RESULT_VECTOR (1 , 4 );
268
+ DECLARE_RESULT_VECTOR (1 , 5 );
269
+ DECLARE_RESULT_VECTOR (1 , 6 );
270
+ DECLARE_RESULT_VECTOR (1 , 7 );
258
271
259
272
if (LIKELY (packed_a != NULL )) {
260
273
if (j == 0 ) {
@@ -267,12 +280,21 @@ CNAME(BLASLONG M,
267
280
UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
268
281
UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 0 );
269
282
UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 0 );
283
+ QUADWORD_LOAD_B (4 , 0 );
284
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 4 , 4 , 0 , 0 );
285
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 5 , 4 , 1 , 0 );
286
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 6 , 4 , 2 , 0 );
287
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 7 , 4 , 3 , 0 );
270
288
GATHER_LOAD_A (pg_true , 1 , 0 );
271
289
VECTOR_PACK_A (1 , 0 );
272
290
UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
273
291
UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
274
292
UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 0 );
275
293
UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 0 );
294
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 4 , 4 , 0 , 0 );
295
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 5 , 4 , 1 , 0 );
296
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 6 , 4 , 2 , 0 );
297
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 7 , 4 , 3 , 0 );
276
298
}
277
299
} else {
278
300
for (; k < K ; k ++ ) {
@@ -283,16 +305,102 @@ CNAME(BLASLONG M,
283
305
UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
284
306
UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 0 );
285
307
UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 0 );
308
+ QUADWORD_LOAD_B (4 , 0 );
309
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 4 , 4 , 0 , 0 );
310
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 5 , 4 , 1 , 0 );
311
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 6 , 4 , 2 , 0 );
312
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 7 , 4 , 3 , 0 );
286
313
UNPACK_VECTOR_A (1 , 0 );
287
314
UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
288
315
UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
289
316
UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 0 );
290
317
UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 0 );
318
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 4 , 4 , 0 , 0 );
319
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 5 , 4 , 1 , 0 );
320
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 6 , 4 , 2 , 0 );
321
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 7 , 4 , 3 , 0 );
291
322
}
292
323
}
293
324
} else {
294
325
for (; k < K ; k ++ ) {
295
326
327
+ QUADWORD_LOAD_B (0 , 0 );
328
+ GATHER_LOAD_A (pg_true , 0 , 0 );
329
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
330
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
331
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 0 );
332
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 0 );
333
+ QUADWORD_LOAD_B (4 , 0 );
334
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 4 , 4 , 0 , 0 );
335
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 5 , 4 , 1 , 0 );
336
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 6 , 4 , 2 , 0 );
337
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 7 , 4 , 3 , 0 );
338
+ GATHER_LOAD_A (pg_true , 1 , 0 );
339
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
340
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
341
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 0 );
342
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 0 );
343
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 4 , 4 , 0 , 0 );
344
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 5 , 4 , 1 , 0 );
345
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 6 , 4 , 2 , 0 );
346
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 7 , 4 , 3 , 0 );
347
+ }
348
+ }
349
+ VECTOR_STORE (pg_true , 0 , 0 );
350
+ VECTOR_STORE (pg_true , 0 , 1 );
351
+ VECTOR_STORE (pg_true , 0 , 2 );
352
+ VECTOR_STORE (pg_true , 0 , 3 );
353
+ VECTOR_STORE (pg_true , 0 , 4 );
354
+ VECTOR_STORE (pg_true , 0 , 5 );
355
+ VECTOR_STORE (pg_true , 0 , 6 );
356
+ VECTOR_STORE (pg_true , 0 , 7 );
357
+ VECTOR_STORE (pg_true , 1 , 0 );
358
+ VECTOR_STORE (pg_true , 1 , 1 );
359
+ VECTOR_STORE (pg_true , 1 , 2 );
360
+ VECTOR_STORE (pg_true , 1 , 3 );
361
+ VECTOR_STORE (pg_true , 1 , 4 );
362
+ VECTOR_STORE (pg_true , 1 , 5 );
363
+ VECTOR_STORE (pg_true , 1 , 6 );
364
+ VECTOR_STORE (pg_true , 1 , 7 );
365
+ INCR_C_POINTER (0 , 8 );
366
+ INCR_C_POINTER (1 , 8 );
367
+ }
368
+ for (; j < n4 ; j += 4 ) {
369
+
370
+ CREATE_B_POINTER (0 , 0 );
371
+ CREATE_B_POINTER (1 , 1 );
372
+ CREATE_B_POINTER (2 , 2 );
373
+ CREATE_B_POINTER (3 , 3 );
374
+ UPDATE_B_POINTER (4 );
375
+
376
+ BLASLONG k = 0 ;
377
+ DECLARE_RESULT_VECTOR (0 , 0 );
378
+ DECLARE_RESULT_VECTOR (0 , 1 );
379
+ DECLARE_RESULT_VECTOR (0 , 2 );
380
+ DECLARE_RESULT_VECTOR (0 , 3 );
381
+ DECLARE_RESULT_VECTOR (1 , 0 );
382
+ DECLARE_RESULT_VECTOR (1 , 1 );
383
+ DECLARE_RESULT_VECTOR (1 , 2 );
384
+ DECLARE_RESULT_VECTOR (1 , 3 );
385
+
386
+ if (LIKELY (packed_a != NULL )) {
387
+ for (; k < K ; k ++ ) {
388
+
389
+ QUADWORD_LOAD_B (0 , 0 );
390
+ UNPACK_VECTOR_A (0 , 0 );
391
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
392
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
393
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 0 );
394
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 0 );
395
+ UNPACK_VECTOR_A (1 , 0 );
396
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
397
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
398
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 0 , 2 , 0 );
399
+ UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 0 , 3 , 0 );
400
+ }
401
+ } else {
402
+ for (; k < K ; k ++ ) {
403
+
296
404
QUADWORD_LOAD_B (0 , 0 );
297
405
GATHER_LOAD_A (pg_true , 0 , 0 );
298
406
UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
@@ -361,6 +469,52 @@ CNAME(BLASLONG M,
361
469
CREATE_A_POINTER (0 , 0 );
362
470
363
471
BLASLONG j = 0 ;
472
+ for (; j < n8 ; j += 8 ) {
473
+
474
+ CREATE_B_POINTER (0 , 0 );
475
+ CREATE_B_POINTER (1 , 1 );
476
+ CREATE_B_POINTER (2 , 2 );
477
+ CREATE_B_POINTER (3 , 3 );
478
+ CREATE_B_POINTER (4 , 4 );
479
+ CREATE_B_POINTER (5 , 5 );
480
+ CREATE_B_POINTER (6 , 6 );
481
+ CREATE_B_POINTER (7 , 7 );
482
+ UPDATE_B_POINTER (8 );
483
+
484
+ BLASLONG k = 0 ;
485
+ DECLARE_RESULT_VECTOR (0 , 0 );
486
+ DECLARE_RESULT_VECTOR (0 , 1 );
487
+ DECLARE_RESULT_VECTOR (0 , 2 );
488
+ DECLARE_RESULT_VECTOR (0 , 3 );
489
+ DECLARE_RESULT_VECTOR (0 , 4 );
490
+ DECLARE_RESULT_VECTOR (0 , 5 );
491
+ DECLARE_RESULT_VECTOR (0 , 6 );
492
+ DECLARE_RESULT_VECTOR (0 , 7 );
493
+
494
+ for (; k < K ; k ++ ) {
495
+
496
+ QUADWORD_LOAD_B (0 , 0 );
497
+ GATHER_LOAD_A (pg_true , 0 , 0 );
498
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
499
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
500
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 0 );
501
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 0 );
502
+ QUADWORD_LOAD_B (4 , 0 );
503
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 4 , 4 , 0 , 0 );
504
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 5 , 4 , 1 , 0 );
505
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 6 , 4 , 2 , 0 );
506
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 7 , 4 , 3 , 0 );
507
+ }
508
+ VECTOR_STORE (pg_true , 0 , 0 );
509
+ VECTOR_STORE (pg_true , 0 , 1 );
510
+ VECTOR_STORE (pg_true , 0 , 2 );
511
+ VECTOR_STORE (pg_true , 0 , 3 );
512
+ VECTOR_STORE (pg_true , 0 , 4 );
513
+ VECTOR_STORE (pg_true , 0 , 5 );
514
+ VECTOR_STORE (pg_true , 0 , 6 );
515
+ VECTOR_STORE (pg_true , 0 , 7 );
516
+ INCR_C_POINTER (0 , 8 );
517
+ }
364
518
for (; j < n4 ; j += 4 ) {
365
519
366
520
CREATE_B_POINTER (0 , 0 );
@@ -418,6 +572,52 @@ CNAME(BLASLONG M,
418
572
CREATE_A_POINTER (0 , 0 );
419
573
420
574
BLASLONG j = 0 ;
575
+ for (; j < n8 ; j += 8 ) {
576
+
577
+ CREATE_B_POINTER (0 , 0 );
578
+ CREATE_B_POINTER (1 , 1 );
579
+ CREATE_B_POINTER (2 , 2 );
580
+ CREATE_B_POINTER (3 , 3 );
581
+ CREATE_B_POINTER (4 , 4 );
582
+ CREATE_B_POINTER (5 , 5 );
583
+ CREATE_B_POINTER (6 , 6 );
584
+ CREATE_B_POINTER (7 , 7 );
585
+ UPDATE_B_POINTER (8 );
586
+
587
+ BLASLONG k = 0 ;
588
+ DECLARE_RESULT_VECTOR (0 , 0 );
589
+ DECLARE_RESULT_VECTOR (0 , 1 );
590
+ DECLARE_RESULT_VECTOR (0 , 2 );
591
+ DECLARE_RESULT_VECTOR (0 , 3 );
592
+ DECLARE_RESULT_VECTOR (0 , 4 );
593
+ DECLARE_RESULT_VECTOR (0 , 5 );
594
+ DECLARE_RESULT_VECTOR (0 , 6 );
595
+ DECLARE_RESULT_VECTOR (0 , 7 );
596
+
597
+ for (; k < K ; k ++ ) {
598
+
599
+ QUADWORD_LOAD_B (0 , 0 );
600
+ GATHER_LOAD_A (pg_tail , 0 , 0 );
601
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
602
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
603
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 0 , 2 , 0 );
604
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 0 , 3 , 0 );
605
+ QUADWORD_LOAD_B (4 , 0 );
606
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 4 , 4 , 0 , 0 );
607
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 5 , 4 , 1 , 0 );
608
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 6 , 4 , 2 , 0 );
609
+ UPDATE_RESULT_VECTOR_QUADWORD (0 , 7 , 4 , 3 , 0 );
610
+ }
611
+ VECTOR_STORE (pg_tail , 0 , 0 );
612
+ VECTOR_STORE (pg_tail , 0 , 1 );
613
+ VECTOR_STORE (pg_tail , 0 , 2 );
614
+ VECTOR_STORE (pg_tail , 0 , 3 );
615
+ VECTOR_STORE (pg_tail , 0 , 4 );
616
+ VECTOR_STORE (pg_tail , 0 , 5 );
617
+ VECTOR_STORE (pg_tail , 0 , 6 );
618
+ VECTOR_STORE (pg_tail , 0 , 7 );
619
+ INCR_C_POINTER (0 , 8 );
620
+ }
421
621
for (; j < n4 ; j += 4 ) {
422
622
423
623
CREATE_B_POINTER (0 , 0 );
0 commit comments