
Commit 8b84475

TaeheeYoo authored and herbertx committed
crypto: x86/aria-avx - Do not use avx2 instructions
vpbroadcastb and vpbroadcastd are not AVX instructions, but the aria-avx assembly code contains them, so a kernel panic will occur if aria-avx runs on a CPU without AVX2 support.

vbroadcastss and vpshufb are used to avoid vpbroadcastb. Unfortunately, this change reduces performance by about 5%. Also, vpbroadcastd is simply replaced by vmovdqa.

Fixes: ba3579e ("crypto: aria-avx - add AES-NI/AVX/x86_64/GFNI assembler implementation of aria cipher")
Reported-by: Herbert Xu <herbert@gondor.apana.org.au>
Reported-by: Erhard F. <erhard_f@mailbox.org>
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
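For readers unfamiliar with the trick used in the diff below, the vbroadcastss + vpsrld + vpshufb sequence emulates an AVX2-style byte broadcast with AVX/SSE-era operations: broadcast the 32-bit round-key word to all lanes, shift the wanted byte down to bit 0 of each dword, then splat it with an all-zero vpshufb mask. The following C-intrinsics sketch is only an illustration of that idea (the helper name and standalone form are assumptions; the actual change is pure assembly and uses the immediate form vpsrld):

	#include <immintrin.h>
	#include <stdint.h>

	/*
	 * Illustrative only: broadcast one byte of a 32-bit round-key word to
	 * every byte of an XMM register without AVX2's vpbroadcastb.
	 * shift selects the byte: 24 -> byte 3, 16 -> byte 2, 8 -> byte 1, 0 -> byte 0.
	 */
	static inline __m128i broadcast_rk_byte(const uint32_t *rk, int shift)
	{
		/* vbroadcastss: copy the 32-bit word into all four dword lanes */
		__m128i dw = _mm_castps_si128(_mm_broadcast_ss((const float *)rk));

		/* shift the wanted byte down to bits 0..7 of each dword
		 * (the kernel code uses vpsrld with an immediate count) */
		__m128i sel = _mm_srl_epi32(dw, _mm_cvtsi32_si128(shift));

		/* vpshufb with an all-zero mask copies byte 0 into every byte */
		return _mm_shuffle_epi8(sel, _mm_setzero_si128());
	}

With shift = 24 this matches what the removed vpbroadcastb ((round * 16) + idx + 3)(rk) achieved for x0. In the patch, the all-zero shuffle mask is prepared once per round by the new vpxor y7, y7, y7 and passed to aria_ark_8way as t1.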
1 parent eb33108 · commit 8b84475

File tree

1 file changed: +94 −40 lines

arch/x86/crypto/aria-aesni-avx-asm_64.S

Lines changed: 94 additions & 40 deletions
@@ -267,35 +267,44 @@
 
 #define aria_ark_8way(x0, x1, x2, x3, \
 		      x4, x5, x6, x7, \
-		      t0, rk, idx, round) \
+		      t0, t1, t2, rk, \
+		      idx, round) \
 	/* AddRoundKey */ \
-	vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
-	vpxor t0, x0, x0; \
-	vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
-	vpxor t0, x1, x1; \
-	vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
-	vpxor t0, x2, x2; \
-	vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
-	vpxor t0, x3, x3; \
-	vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
-	vpxor t0, x4, x4; \
-	vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
-	vpxor t0, x5, x5; \
-	vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
-	vpxor t0, x6, x6; \
-	vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
-	vpxor t0, x7, x7;
+	vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
+	vpsrld $24, t0, t2; \
+	vpshufb t1, t2, t2; \
+	vpxor t2, x0, x0; \
+	vpsrld $16, t0, t2; \
+	vpshufb t1, t2, t2; \
+	vpxor t2, x1, x1; \
+	vpsrld $8, t0, t2; \
+	vpshufb t1, t2, t2; \
+	vpxor t2, x2, x2; \
+	vpshufb t1, t0, t2; \
+	vpxor t2, x3, x3; \
+	vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
+	vpsrld $24, t0, t2; \
+	vpshufb t1, t2, t2; \
+	vpxor t2, x4, x4; \
+	vpsrld $16, t0, t2; \
+	vpshufb t1, t2, t2; \
+	vpxor t2, x5, x5; \
+	vpsrld $8, t0, t2; \
+	vpshufb t1, t2, t2; \
+	vpxor t2, x6, x6; \
+	vpshufb t1, t0, t2; \
+	vpxor t2, x7, x7;
 
 #ifdef CONFIG_AS_GFNI
 #define aria_sbox_8way_gfni(x0, x1, x2, x3, \
 			    x4, x5, x6, x7, \
 			    t0, t1, t2, t3, \
 			    t4, t5, t6, t7) \
-	vpbroadcastq .Ltf_s2_bitmatrix, t0; \
-	vpbroadcastq .Ltf_inv_bitmatrix, t1; \
-	vpbroadcastq .Ltf_id_bitmatrix, t2; \
-	vpbroadcastq .Ltf_aff_bitmatrix, t3; \
-	vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+	vmovdqa .Ltf_s2_bitmatrix, t0; \
+	vmovdqa .Ltf_inv_bitmatrix, t1; \
+	vmovdqa .Ltf_id_bitmatrix, t2; \
+	vmovdqa .Ltf_aff_bitmatrix, t3; \
+	vmovdqa .Ltf_x2_bitmatrix, t4; \
 	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
 	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
 	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
@@ -315,10 +324,9 @@
 			x4, x5, x6, x7, \
 			t0, t1, t2, t3, \
 			t4, t5, t6, t7) \
-	vpxor t7, t7, t7; \
 	vmovdqa .Linv_shift_row, t0; \
 	vmovdqa .Lshift_row, t1; \
-	vpbroadcastd .L0f0f0f0f, t6; \
+	vbroadcastss .L0f0f0f0f, t6; \
 	vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
 	vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
 	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
@@ -413,8 +421,9 @@
 		      y0, y1, y2, y3, \
 		      y4, y5, y6, y7, \
 		      mem_tmp, rk, round) \
+	vpxor y7, y7, y7; \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 8, round); \
+		      y0, y7, y2, rk, 8, round); \
 	\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
 		       y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -429,7 +438,7 @@
 			     x4, x5, x6, x7, \
 			     mem_tmp, 0); \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 0, round); \
+		      y0, y7, y2, rk, 0, round); \
 	\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
 		       y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -467,8 +476,9 @@
 		      y0, y1, y2, y3, \
 		      y4, y5, y6, y7, \
 		      mem_tmp, rk, round) \
+	vpxor y7, y7, y7; \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 8, round); \
+		      y0, y7, y2, rk, 8, round); \
 	\
 	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
 		       y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -483,7 +493,7 @@
 			     x4, x5, x6, x7, \
 			     mem_tmp, 0); \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 0, round); \
+		      y0, y7, y2, rk, 0, round); \
 	\
 	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
 		       y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -521,14 +531,15 @@
 		      y0, y1, y2, y3, \
 		      y4, y5, y6, y7, \
 		      mem_tmp, rk, round, last_round) \
+	vpxor y7, y7, y7; \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 8, round); \
+		      y0, y7, y2, rk, 8, round); \
 	\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
 		       y0, y1, y2, y3, y4, y5, y6, y7); \
 	\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 8, last_round); \
+		      y0, y7, y2, rk, 8, last_round); \
 	\
 	aria_store_state_8way(x0, x1, x2, x3, \
 			      x4, x5, x6, x7, \
@@ -538,13 +549,13 @@
 			     x4, x5, x6, x7, \
 			     mem_tmp, 0); \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 0, round); \
+		      y0, y7, y2, rk, 0, round); \
 	\
 	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
 		       y0, y1, y2, y3, y4, y5, y6, y7); \
 	\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 0, last_round); \
+		      y0, y7, y2, rk, 0, last_round); \
 	\
 	aria_load_state_8way(y0, y1, y2, y3, \
 			     y4, y5, y6, y7, \
@@ -556,8 +567,9 @@
 		      y0, y1, y2, y3, \
 		      y4, y5, y6, y7, \
 		      mem_tmp, rk, round) \
+	vpxor y7, y7, y7; \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 8, round); \
+		      y0, y7, y2, rk, 8, round); \
 	\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, \
 			    x6, x7, x4, x5, \
@@ -574,7 +586,7 @@
 			     x4, x5, x6, x7, \
 			     mem_tmp, 0); \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 0, round); \
+		      y0, y7, y2, rk, 0, round); \
 	\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, \
 			    x6, x7, x4, x5, \
@@ -614,8 +626,9 @@
 		      y0, y1, y2, y3, \
 		      y4, y5, y6, y7, \
 		      mem_tmp, rk, round) \
+	vpxor y7, y7, y7; \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 8, round); \
+		      y0, y7, y2, rk, 8, round); \
 	\
 	aria_sbox_8way_gfni(x0, x1, x2, x3, \
 			    x4, x5, x6, x7, \
@@ -632,7 +645,7 @@
 			     x4, x5, x6, x7, \
 			     mem_tmp, 0); \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 0, round); \
+		      y0, y7, y2, rk, 0, round); \
 	\
 	aria_sbox_8way_gfni(x0, x1, x2, x3, \
 			    x4, x5, x6, x7, \
@@ -672,16 +685,17 @@
 		      y0, y1, y2, y3, \
 		      y4, y5, y6, y7, \
 		      mem_tmp, rk, round, last_round) \
+	vpxor y7, y7, y7; \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 8, round); \
+		      y0, y7, y2, rk, 8, round); \
 	\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, \
 			    x6, x7, x4, x5, \
 			    y0, y1, y2, y3, \
 			    y4, y5, y6, y7); \
 	\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 8, last_round); \
+		      y0, y7, y2, rk, 8, last_round); \
 	\
 	aria_store_state_8way(x0, x1, x2, x3, \
 			      x4, x5, x6, x7, \
@@ -691,15 +705,15 @@
 			     x4, x5, x6, x7, \
 			     mem_tmp, 0); \
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 0, round); \
+		      y0, y7, y2, rk, 0, round); \
 	\
 	aria_sbox_8way_gfni(x2, x3, x0, x1, \
 			    x6, x7, x4, x5, \
 			    y0, y1, y2, y3, \
 			    y4, y5, y6, y7); \
 	\
 	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
-		      y0, rk, 0, last_round); \
+		      y0, y7, y2, rk, 0, last_round); \
 	\
 	aria_load_state_8way(y0, y1, y2, y3, \
 			     y4, y5, y6, y7, \
@@ -772,6 +786,14 @@
 		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
 		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
 		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
+	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
+		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
+		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
 
 /* AES inverse affine: */
 #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
@@ -784,6 +806,14 @@
 		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
 		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
 		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
+	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
+		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
+		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
+		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
 
 /* S2: */
 #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
@@ -796,6 +826,14 @@
 		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
 		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
 		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
+	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
+		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
+		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
+		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
 
 /* X2: */
 #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
@@ -808,6 +846,14 @@
 		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
 		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
 		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
+	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
+		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
+		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
+		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
+		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
+		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
+		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
 
 /* Identity matrix: */
 .Ltf_id_bitmatrix:
@@ -819,6 +865,14 @@
 		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
 		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
 		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
+	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
 #endif /* CONFIG_AS_GFNI */
 
 /* 4-bit mask */
