@@ -267,35 +267,44 @@
 
 #define aria_ark_8way(x0, x1, x2, x3,			\
 		      x4, x5, x6, x7,			\
-		      t0, rk, idx, round)		\
+		      t0, t1, t2, rk,			\
+		      idx, round)			\
 	/* AddRoundKey */				\
-	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
-	vpxor t0, x0, x0;				\
-	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
-	vpxor t0, x1, x1;				\
-	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
-	vpxor t0, x2, x2;				\
-	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
-	vpxor t0, x3, x3;				\
-	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
-	vpxor t0, x4, x4;				\
-	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
-	vpxor t0, x5, x5;				\
-	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
-	vpxor t0, x6, x6;				\
-	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
-	vpxor t0, x7, x7;
+	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
+	vpsrld $24, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x0, x0;				\
+	vpsrld $16, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x1, x1;				\
+	vpsrld $8, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x2, x2;				\
+	vpshufb t1, t0, t2;				\
+	vpxor t2, x3, x3;				\
+	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
+	vpsrld $24, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x4, x4;				\
+	vpsrld $16, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x5, x5;				\
+	vpsrld $8, t0, t2;				\
+	vpshufb t1, t2, t2;				\
+	vpxor t2, x6, x6;				\
+	vpshufb t1, t0, t2;				\
+	vpxor t2, x7, x7;
 
 #ifdef CONFIG_AS_GFNI
 #define aria_sbox_8way_gfni(x0, x1, x2, x3,	\
 			    x4, x5, x6, x7,	\
 			    t0, t1, t2, t3,	\
 			    t4, t5, t6, t7)	\
-	vpbroadcastq .Ltf_s2_bitmatrix, t0;	\
-	vpbroadcastq .Ltf_inv_bitmatrix, t1;	\
-	vpbroadcastq .Ltf_id_bitmatrix, t2;	\
-	vpbroadcastq .Ltf_aff_bitmatrix, t3;	\
-	vpbroadcastq .Ltf_x2_bitmatrix, t4;	\
+	vmovdqa .Ltf_s2_bitmatrix, t0;		\
+	vmovdqa .Ltf_inv_bitmatrix, t1;		\
+	vmovdqa .Ltf_id_bitmatrix, t2;		\
+	vmovdqa .Ltf_aff_bitmatrix, t3;		\
+	vmovdqa .Ltf_x2_bitmatrix, t4;		\
 	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
 	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
 	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
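The rewritten `aria_ark_8way` is the core of this change: `vpbroadcastb` is an AVX2 instruction, so the AVX build faulted with an invalid-opcode exception on CPUs that have AVX but not AVX2. The replacement stays within AVX by loading four round-key bytes at once with `vbroadcastss`, shifting the wanted byte into byte 0 of each dword with `vpsrld`, and replicating it to every byte position with `vpshufb` under an all-zero control mask (the new `t1` argument, which callers must pre-zero). A minimal stand-alone sketch of the idiom; the register choices and the `key` label are illustrative, not from the source:

	/* Broadcast the byte at key+3 to all 16 bytes of %xmm0
	 * using AVX only ("key" is a hypothetical label).
	 */
	vpxor		%xmm1, %xmm1, %xmm1	/* all-zero vpshufb control */
	vbroadcastss	key, %xmm0		/* every dword = bytes {3,2,1,0} */
	vpsrld		$24, %xmm0, %xmm0	/* byte 3 -> byte 0 of each dword */
	vpshufb		%xmm1, %xmm0, %xmm0	/* lane byte 0 -> every byte */

The index mapping is preserved: the old code read bytes `idx + 3` down to `idx + 0` one at a time, while the new code loads the dword at `idx + 0` once and reaches `idx + 3` via `vpsrld $24`, `idx + 2` via `$16`, `idx + 1` via `$8`, and `idx + 0` with no shift, which matches little-endian byte order.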

@@ -315,10 +324,9 @@
 			x4, x5, x6, x7,		\
 			t0, t1, t2, t3,		\
 			t4, t5, t6, t7)		\
-	vpxor t7, t7, t7;			\
 	vmovdqa .Linv_shift_row, t0;		\
 	vmovdqa .Lshift_row, t1;		\
-	vpbroadcastd .L0f0f0f0f, t6;		\
+	vbroadcastss .L0f0f0f0f, t6;		\
 	vmovdqa .Ltf_lo__inv_aff__and__s2, t2;	\
 	vmovdqa .Ltf_hi__inv_aff__and__s2, t3;	\
 	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4;	\
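The constant loads get the same AVX-only treatment. `vpbroadcastd` (AVX2) becomes `vbroadcastss` (AVX); for an integer bit pattern the two are interchangeable. The `vpbroadcastq` loads of the GFNI bitmatrices become plain `vmovdqa` loads, which is why each 64-bit matrix constant is stored twice in the data-section hunks at the end of this diff: `vmovdqa` cannot broadcast, so a full 16 aligned bytes must exist. The removed `vpxor t7, t7, t7` does not disappear; it moves into the round macros below, where the zeroed register doubles as the `vpshufb` mask for `aria_ark_8way`. An illustrative stand-alone comparison of the dword broadcast:

	/* Both leave 0x0f0f0f0f in every dword of %xmm6; only the
	 * required ISA level differs.
	 */
	vpbroadcastd	.L0f0f0f0f, %xmm6	/* AVX2: faults on AVX-only CPUs */
	vbroadcastss	.L0f0f0f0f, %xmm6	/* AVX: bit-identical result */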

@@ -413,8 +421,9 @@
 			y0, y1, y2, y3,		\
 			y4, y5, y6, y7,		\
 			mem_tmp, rk, round)	\
+	vpxor y7, y7, y7;			\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);	\
+		      y0, y7, y2, rk, 8, round);	\
 						\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
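Each round macro now zeroes `y7` before its `aria_ark_8way` calls and passes it as the macro's `t1`. This is a correctness requirement, not a tweak: the byte broadcast relies on `vpshufb` with an all-zero control, and any stale control byte with its top bit set would make `vpshufb` write zero instead of selecting a byte. Illustrative stand-alone lines showing both behaviors:

	vpxor	 %xmm1, %xmm1, %xmm1	/* control 0x00: select byte 0 everywhere */
	vpshufb	 %xmm1, %xmm0, %xmm2	/* %xmm2 = byte 0 of %xmm0, replicated */
	vpcmpeqb %xmm1, %xmm1, %xmm1	/* control 0xff: bit 7 set in every byte */
	vpshufb	 %xmm1, %xmm0, %xmm3	/* %xmm3 = all zeros, broadcast destroyed */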

@@ -429,7 +438,7 @@
 			x4, x5, x6, x7,		\
 			mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);	\
+		      y0, y7, y2, rk, 0, round);	\
 						\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\

@@ -467,8 +476,9 @@
 			y0, y1, y2, y3,		\
 			y4, y5, y6, y7,		\
 			mem_tmp, rk, round)	\
+	vpxor y7, y7, y7;			\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);	\
+		      y0, y7, y2, rk, 8, round);	\
 						\
 	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\

@@ -483,7 +493,7 @@
 			x4, x5, x6, x7,		\
 			mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);	\
+		      y0, y7, y2, rk, 0, round);	\
 						\
 	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\

@@ -521,14 +531,15 @@
 			y0, y1, y2, y3,			\
 			y4, y5, y6, y7,			\
 			mem_tmp, rk, round, last_round)	\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, last_round);		\
+		      y0, y7, y2, rk, 8, last_round);	\
 							\
 	aria_store_state_8way(x0, x1, x2, x3,		\
 			      x4, x5, x6, x7,		\

@@ -538,13 +549,13 @@
 			x4, x5, x6, x7,			\
 			mem_tmp, 0);			\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
 		       y0, y1, y2, y3, y4, y5, y6, y7);	\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, last_round);		\
+		      y0, y7, y2, rk, 0, last_round);	\
 							\
 	aria_load_state_8way(y0, y1, y2, y3,		\
 			     y4, y5, y6, y7,		\

@@ -556,8 +567,9 @@
 			y0, y1, y2, y3,		\
 			y4, y5, y6, y7,		\
 			mem_tmp, rk, round)	\
+	vpxor y7, y7, y7;			\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);	\
+		      y0, y7, y2, rk, 8, round);	\
 						\
 	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
 			    x6, x7, x4, x5,	\

@@ -574,7 +586,7 @@
 			x4, x5, x6, x7,		\
 			mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);	\
+		      y0, y7, y2, rk, 0, round);	\
 						\
 	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
 			    x6, x7, x4, x5,	\

@@ -614,8 +626,9 @@
 			y0, y1, y2, y3,		\
 			y4, y5, y6, y7,		\
 			mem_tmp, rk, round)	\
+	vpxor y7, y7, y7;			\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);	\
+		      y0, y7, y2, rk, 8, round);	\
 						\
 	aria_sbox_8way_gfni(x0, x1, x2, x3,	\
 			    x4, x5, x6, x7,	\

@@ -632,7 +645,7 @@
 			x4, x5, x6, x7,		\
 			mem_tmp, 0);		\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);	\
+		      y0, y7, y2, rk, 0, round);	\
 						\
 	aria_sbox_8way_gfni(x0, x1, x2, x3,	\
 			    x4, x5, x6, x7,	\

@@ -672,16 +685,17 @@
 			y0, y1, y2, y3,			\
 			y4, y5, y6, y7,			\
 			mem_tmp, rk, round, last_round)	\
+	vpxor y7, y7, y7;				\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, round);		\
+		      y0, y7, y2, rk, 8, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
 			    x6, x7, x4, x5,		\
 			    y0, y1, y2, y3,		\
 			    y4, y5, y6, y7);		\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 8, last_round);		\
+		      y0, y7, y2, rk, 8, last_round);	\
 							\
 	aria_store_state_8way(x0, x1, x2, x3,		\
 			      x4, x5, x6, x7,		\

@@ -691,15 +705,15 @@
 			x4, x5, x6, x7,			\
 			mem_tmp, 0);			\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, round);		\
+		      y0, y7, y2, rk, 0, round);	\
 							\
 	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
 			    x6, x7, x4, x5,		\
 			    y0, y1, y2, y3,		\
 			    y4, y5, y6, y7);		\
 							\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
-		      y0, rk, 0, last_round);		\
+		      y0, y7, y2, rk, 0, last_round);	\
 							\
 	aria_load_state_8way(y0, y1, y2, y3,		\
 			     y4, y5, y6, y7,		\

@@ -772,6 +786,14 @@
 		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
 		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
 		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
+	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
+		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
+		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
 
 /* AES inverse affine: */
 #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)

@@ -784,6 +806,14 @@
 		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
 		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
 		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
+	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
+		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
+		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
+		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
 
 /* S2: */
 #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)

@@ -796,6 +826,14 @@
 		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
 		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
 		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
+	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
+		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
+		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
+		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
 
 /* X2: */
 #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)

@@ -808,6 +846,14 @@
 		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
 		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
 		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
+	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
+		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
+		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
+		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
+		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
 
 /* Identity matrix: */
 .Ltf_id_bitmatrix:

@@ -819,6 +865,14 @@
 	      BV8(0, 0, 0, 0, 0, 1, 0, 0),
 	      BV8(0, 0, 0, 0, 0, 0, 1, 0),
 	      BV8(0, 0, 0, 0, 0, 0, 0, 1))
+	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 #endif /* CONFIG_AS_GFNI */
 
 /* 4-bit mask */
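The duplicated `.quad BM8X8(...)` rows above are the data-section half of the `vpbroadcastq` to `vmovdqa` swap: with no broadcast available, each 8x8 bitmatrix is simply stored twice so that an aligned 128-bit vector exists to load. A minimal sketch of the pattern; the label and value are illustrative, with `0x0102040810204080` being the value commonly used for the GF(2) identity matrix in `vgf2p8affineqb`'s encoding:

	.section .rodata.cst16, "aM", @progbits, 16
	.align 16
.Ltf_example_bitmatrix:			/* hypothetical label */
	.quad 0x0102040810204080	/* low 64 bits: the 8x8 matrix */
	.quad 0x0102040810204080	/* high 64 bits: the same matrix */

	.text
	/* one aligned 128-bit load, plain AVX, no broadcast needed: */
	vmovdqa .Ltf_example_bitmatrix, %xmm2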