@@ -163,9 +163,9 @@ NOTE: String length must be evenly divisible by 16byte (str_len % 16 == 0)
 // jcallan@github points out that declaring Multiply as a function
 // reduces code size considerably with the Keil ARM compiler.
 // See this link for more information: https://github.com/kokke/tiny-AES-C/pull/3
-#ifndef MULTIPLY_AS_A_FUNCTION
-  #define MULTIPLY_AS_A_FUNCTION 0
-#endif
+// #ifndef MULTIPLY_AS_A_FUNCTION /* Multiply is removed */
+// #define MULTIPLY_AS_A_FUNCTION 0
+// #endif
@@ -394,36 +394,18 @@ static void AddRoundKey(uint32_t round, state_t* state, const uint8_t* RoundKey)
 // state matrix with values in an S-box.
 static void SubBytes(state_t* state)
 {
-#if 1
   unsigned int i;
   for (i = 0; i < 4; i++)
   {
     uint32_t* pLine = ((uint32_t*)state + i);
     uint32_t line = *pLine;

-    uint32_t byte0 = (line & MASK32_BYTE0) >> OFS32_BYTE0;
-    uint32_t byte1 = (line & MASK32_BYTE1) >> OFS32_BYTE1;
-    uint32_t byte2 = (line & MASK32_BYTE2) >> OFS32_BYTE2;
-    uint32_t byte3 = (line & MASK32_BYTE3) >> OFS32_BYTE3;
-
     *pLine = (getSBoxValue((line & MASK32_BYTE0) >> OFS32_BYTE0) << OFS32_BYTE0) |
              (getSBoxValue((line & MASK32_BYTE1) >> OFS32_BYTE1) << OFS32_BYTE1) |
              (getSBoxValue((line & MASK32_BYTE2) >> OFS32_BYTE2) << OFS32_BYTE2) |
             (getSBoxValue((line & MASK32_BYTE3) >> OFS32_BYTE3) << OFS32_BYTE3);
   }

-#else
-  unsigned int i, j;
-  for (i = 0; i < 4; ++i)
-  {
-    for (j = 0; j < 4; ++j)
-    {
-      (*state)[j][i] = getSBoxValue((*state)[j][i]);
-      //(*state)[i][j] = getSBoxValue((*state)[i][j]);
-    }
-  }
-
-#endif
 }
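The word-based SubBytes above relies on byte-select masks and shift offsets (MASK32_BYTEn / OFS32_BYTEn) that are defined elsewhere in the file and not shown in this diff. A minimal sketch of what they presumably look like, assuming a little-endian layout where byte 0 is the least-significant byte of each 32-bit word of the state (consistent with the x64 InvMixColumns branch further down, which masks byte 0 with 0x000000ff):

/* Presumed definitions, for illustration only - the real ones live elsewhere in the file. */
#define MASK32_BYTE0 0x000000FFu
#define MASK32_BYTE1 0x0000FF00u
#define MASK32_BYTE2 0x00FF0000u
#define MASK32_BYTE3 0xFF000000u

#define OFS32_BYTE0  0
#define OFS32_BYTE1  8
#define OFS32_BYTE2  16
#define OFS32_BYTE3  24

With these values each getSBoxValue() call substitutes one byte of the 32-bit word, so the loop performs the same per-byte S-box lookup as the removed (*state)[j][i] version, but with a single 32-bit load and store per word instead of four separate byte accesses.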
@@ -505,6 +487,7 @@ static void ShiftRows(state_t* state)
 #endif
 }

+#if 0 /* removing because xtime has been redefined */
 #ifndef _DEBUG
 static inline uint32_t xtime(uint32_t x)
 {
@@ -517,56 +500,40 @@ static inline uint64_t xtime64(uint64_t x)
 #else
 #define xtime(x) ((((x)<<1) ^ ((((x)>>7)) * 0x1b)) & 0xFF)
 #endif
+#endif

+/* xtime
+ * The original xtime function operated on single bytes; this optimized version performs the
+ * same calculation on all bytes of a dword simultaneously, reducing operations and memory accesses.
+ */
+static inline uint32_t xtime(uint32_t x)
+{
+  return ((x & 0x7f7f7f7f) << 1) ^ (((x & 0x80808080) >> 7) * 0x1b);
+}
+static inline uint64_t xtime64(uint64_t x)
+{
+  return ((x << 1) ^ (((x >> 7)/* & 1*/) * 0x1b)) & 0xFF;
+}
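As a quick sanity check (not part of the diff), the packed xtime above gives the same result in every byte lane as the classic single-byte xtime it replaces: the 0x7f7f7f7f mask keeps shifted bits from spilling into the next lane, and multiplying the collected carry bits by 0x1b applies the GF(2^8) reduction in each lane independently. A small standalone sketch, with helper names that are mine, not from the file:

#include <stdint.h>
#include <assert.h>

static uint8_t xtime_byte(uint8_t x)          /* classic per-byte xtime */
{
  return (uint8_t)((x << 1) ^ (((x >> 7) & 1) * 0x1b));
}

static uint32_t xtime_dword(uint32_t x)       /* packed form, same expression as the new xtime() */
{
  return ((x & 0x7f7f7f7fu) << 1) ^ (((x & 0x80808080u) >> 7) * 0x1b);
}

static void check_xtime(uint32_t w)
{
  uint32_t packed = xtime_dword(w);
  for (int lane = 0; lane < 4; ++lane)        /* every byte lane matches the scalar result */
    assert((uint8_t)(packed >> (8 * lane)) == xtime_byte((uint8_t)(w >> (8 * lane))));
}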
-
-// MixColumns function mixes the columns of the state matrix
+/* MixColumns
+ * Replaced byte-wise operations with word-based operations and eliminated repeated calculations.
+ */
 static void MixColumns(state_t* state)
 {
-  unsigned int i;
-  for (i = 0; i < 4; ++i)
-  {
-
-#if 1
-    uint32_t* pLine = ((uint32_t*)state + i);
-    uint32_t line = *pLine;
-
-    uint32_t byte0 = (line & MASK32_BYTE0) >> OFS32_BYTE0;
-    uint32_t byte1 = (line & MASK32_BYTE1) >> OFS32_BYTE1;
-    uint32_t byte2 = (line & MASK32_BYTE2) >> OFS32_BYTE2;
-    uint32_t byte3 = (line & MASK32_BYTE3) >> OFS32_BYTE3;
-
-    uint32_t t   = byte0;
-    uint32_t Tmp = byte0 ^ byte1 ^ byte2 ^ byte3;
-    byte0 ^= xtime(byte0 ^ byte1) ^ Tmp;
-    byte1 ^= xtime(byte1 ^ byte2) ^ Tmp;
-    byte2 ^= xtime(byte2 ^ byte3) ^ Tmp;
-    byte3 ^= xtime(byte3 ^ t) ^ Tmp;
-
-    *pLine = (byte0 << OFS32_BYTE0) | (byte1 << OFS32_BYTE1) | (byte2 << OFS32_BYTE2) | (byte3 << OFS32_BYTE3);
-#else
-    uint32_t Tm;
-    /* GK
-
-    t   = (*state)[i][0];
-    Tmp = (*state)[i][0] ^ (*state)[i][1] ^ (*state)[i][2] ^ (*state)[i][3];
-    Tm  = (*state)[i][0] ^ (*state)[i][1]; Tm = xtime(Tm); (*state)[i][0] ^= Tm ^ Tmp;
-    Tm  = (*state)[i][1] ^ (*state)[i][2]; Tm = xtime(Tm); (*state)[i][1] ^= Tm ^ Tmp;
-    Tm  = (*state)[i][2] ^ (*state)[i][3]; Tm = xtime(Tm); (*state)[i][2] ^= Tm ^ Tmp;
-    Tm  = (*state)[i][3] ^ t;              Tm = xtime(Tm); (*state)[i][3] ^= Tm ^ Tmp;
-    */
-    // GK - slightly more optimal and simple
-    t   = (*state)[i][0];
-    Tmp = (*state)[i][0] ^ (*state)[i][1] ^ (*state)[i][2] ^ (*state)[i][3];
-    Tm  = xtime((*state)[i][0] ^ (*state)[i][1]); (*state)[i][0] ^= Tm ^ Tmp;
-    Tm  = xtime((*state)[i][1] ^ (*state)[i][2]); (*state)[i][1] ^= Tm ^ Tmp;
-    Tm  = xtime((*state)[i][2] ^ (*state)[i][3]); (*state)[i][2] ^= Tm ^ Tmp;
-    Tm  = xtime((*state)[i][3] ^ t);              (*state)[i][3] ^= Tm ^ Tmp;
-
-#endif
-  }
+  unsigned int* sp = (unsigned int*)state;
+
+  for (int i = 4; i; --i, sp++)
+    *sp = xtime((*sp) ^ (((*sp) >> 8) | ((*sp) << 24))) ^
+          (((*sp) << 8)  | ((*sp) >> 24)) ^
+          (((*sp) << 16) | ((*sp) >> 16)) ^ (((*sp) << 24) | ((*sp) >> 8));
 }
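The rotate-based expression above computes, in each byte lane, the same value as the byte-wise code it replaces: with one group of four state bytes packed as b0 | b1<<8 | b2<<16 | b3<<24, rotating the word right by 8 lines b1 up under b0, rotating by 16 lines up b2, and so on. A small standalone check against the removed byte formula (helper names are mine, not from the file):

#include <stdint.h>
#include <assert.h>

static uint32_t ror32(uint32_t w, unsigned n) { return (w >> n) | (w << (32 - n)); }

static uint8_t  xtime_byte(uint8_t x)   { return (uint8_t)((x << 1) ^ (((x >> 7) & 1) * 0x1b)); }
static uint32_t xtime_dword(uint32_t x) { return ((x & 0x7f7f7f7fu) << 1) ^ (((x & 0x80808080u) >> 7) * 0x1b); }

static void check_mixcolumn(uint32_t w)
{
  /* Word form, as in the new MixColumns loop body. */
  uint32_t mixed = xtime_dword(w ^ ror32(w, 8)) ^ ror32(w, 24) ^ ror32(w, 16) ^ ror32(w, 8);

  uint8_t b[4];
  for (int i = 0; i < 4; ++i)
    b[i] = (uint8_t)(w >> (8 * i));

  /* Byte form, exactly as the removed code computed it. */
  uint8_t Tmp = (uint8_t)(b[0] ^ b[1] ^ b[2] ^ b[3]);
  uint8_t r[4];
  r[0] = (uint8_t)(b[0] ^ xtime_byte((uint8_t)(b[0] ^ b[1])) ^ Tmp);
  r[1] = (uint8_t)(b[1] ^ xtime_byte((uint8_t)(b[1] ^ b[2])) ^ Tmp);
  r[2] = (uint8_t)(b[2] ^ xtime_byte((uint8_t)(b[2] ^ b[3])) ^ Tmp);
  r[3] = (uint8_t)(b[3] ^ xtime_byte((uint8_t)(b[3] ^ b[0])) ^ Tmp);

  for (int i = 0; i < 4; ++i)
    assert((uint8_t)(mixed >> (8 * i)) == r[i]);
}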
+/* removed old Multiply function - now integrated into InvMixColumns */
+// Multiply is used to multiply numbers in the field GF(2^8)
+// Note: The last call to xtime() is unneeded, but often ends up generating a smaller binary
+// The compiler seems to be able to vectorize the operation better this way.
+// See https://github.com/kokke/tiny-AES-c/pull/34
 // Multiply is used to multiply numbers in the field GF(2^8)
 // Note: The last call to xtime() is unneeded, but often ends up generating a smaller binary
 // The compiler seems to be able to vectorize the operation better this way.
@@ -575,118 +542,99 @@ static void MixColumns(state_t* state)
 #ifndef _DEBUG
 static inline uint32_t Multiply(uint32_t x, uint32_t y)
 {
-  uint32_t xtimeX   = xtime(x);
-  uint32_t xtimeXX  = xtime(xtimeX);
-  uint32_t xtimeXXX = xtime(xtimeXX);
-
-  return ((~((y & 1)-1) & x) ^
-          (~((y>>1 & 1)-1) & xtimeX) ^
-          (~((y>>2 & 1)-1) & xtimeXX) ^
-          (~((y>>3 & 1)-1) & xtimeXXX)
+  uint32_t xtimeX   = xtime(x);
+  uint32_t xtimeXX  = xtime(xtimeX);
+  uint32_t xtimeXXX = xtime(xtimeXX);
+
+  return ((~((y & 1)-1) & x) ^
+          (~((y>>1 & 1)-1) & xtimeX) ^
+          (~((y>>2 & 1)-1) & xtimeXX) ^
+          (~((y>>3 & 1)-1) & xtimeXXX)
 #if defined(_MSC_VER) && defined(_M_AMD64)
-          ^
+          ^
          (~((y>>4 & 1)-1) & xtime(xtimeXXX))
 #endif
-         ); /* this last call to xtime() can be omitted */
-}
-
-static inline uint64_t Multiply64(uint64_t x, uint64_t y)
-{
-  uint64_t xtimeX   = xtime64(x);
-  uint64_t xtimeXX  = xtime64(xtimeX);
-  uint64_t xtimeXXX = xtime64(xtimeXX);
-
-  return ((~((y & 1) - 1) & x) ^
-          (~((y >> 1 & 1) - 1) & xtimeX) ^
-          (~((y >> 2 & 1) - 1) & xtimeXX) ^
-          (~((y >> 3 & 1) - 1) & xtimeXXX)
-#if defined(_MSC_VER) && defined(_M_AMD64)
-          ^
-          (~((y >> 4 & 1) - 1) & xtime64(xtimeXXX))
-#endif
-         ); /* this last call to xtime() can be omitted */
+         ); /* this last call to xtime() can be omitted */
 }
+
 #else
 #define Multiply(x, y) \
       ((~((y & 1)-1) & x) ^ \
       (~((y>>1 & 1)-1) & xtime(x)) ^ \
      (~((y>>2 & 1)-1) & xtime(xtime(x))) ^ \
      (~((y>>3 & 1)-1) & xtime(xtime(xtime(x)))) ) \

-#define Multiply64(x, y) \
-      ((~((y & 1)-1) & x) ^ \
-      (~((y>>1 & 1)-1) & xtime(x)) ^ \
-      (~((y>>2 & 1)-1) & xtime(xtime(x))) ^ \
-      (~((y>>3 & 1)-1) & xtime(xtime(xtime(x)))) ) \

 #endif

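A brief aside on the construct Multiply uses above: for a single bit b, the expression ~((b) - 1) evaluates to all-ones when b is 1 and to zero when b is 0, so `~((y>>k & 1) - 1) & term` selects each xtime power of x according to bit k of y without a data-dependent branch. Since the InvMixColumns constants 0x09, 0x0b, 0x0d and 0x0e fit in four bits, the y>>4 term really is optional, as the comment says. A tiny illustrative sketch (the helper below is mine, not from the file):

#include <stdint.h>
#include <assert.h>

/* Illustrative helper: keep x when bit == 1, drop it when bit == 0, with no branch. */
static uint32_t select_if(uint32_t bit, uint32_t x)
{
  return ~((bit & 1u) - 1u) & x;
}

static void demo_select(void)
{
  assert(select_if(1u, 0xdeadbeefu) == 0xdeadbeefu);  /* ~(1-1) == 0xFFFFFFFF */
  assert(select_if(0u, 0xdeadbeefu) == 0u);           /* ~(0-1) == ~0xFFFFFFFF == 0 */
}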
 #if (defined(CBC) && CBC == 1) || (defined(ECB) && ECB == 1)
 // MixColumns function mixes the columns of the state matrix.
 // The method used to multiply may be difficult to understand for the inexperienced.
 // Please use the references to gain more information.
+/* InvMixColumns
+ * This is a more optimized version that performs the Multiply-and-XOR computation on all bytes
+ * of a dword in parallel, unrolling the separate Multiply calls for the 0x09, 0x0b, 0x0d and
+ * 0x0e coefficients.
+ */
 static void InvMixColumns(state_t* state)
 {
-  int i;
-#if (USE32_ARITHMETIC == 1)
-  for (i = 0; i < 4; ++i)
-  {
-    uint32_t* pLine = ((uint32_t*)state + i);
-    uint32_t line = *pLine;
-
-    uint32_t byte0 = (line & MASK32_BYTE0) >> OFS32_BYTE0;
-    uint32_t byte1 = (line & MASK32_BYTE1) >> OFS32_BYTE1;
-    uint32_t byte2 = (line & MASK32_BYTE2) >> OFS32_BYTE2;
-    uint32_t byte3 = (line & MASK32_BYTE3) >> OFS32_BYTE3;
-
-    line  = (Multiply(byte0, 0x0e) ^ Multiply(byte1, 0x0b) ^ Multiply(byte2, 0x0d) ^ Multiply(byte3, 0x09)) << OFS32_BYTE0;
-    line |= (Multiply(byte0, 0x09) ^ Multiply(byte1, 0x0e) ^ Multiply(byte2, 0x0b) ^ Multiply(byte3, 0x0d)) << OFS32_BYTE1;
-    line |= (Multiply(byte0, 0x0d) ^ Multiply(byte1, 0x09) ^ Multiply(byte2, 0x0e) ^ Multiply(byte3, 0x0b)) << OFS32_BYTE2;
-    line |= (Multiply(byte0, 0x0b) ^ Multiply(byte1, 0x0d) ^ Multiply(byte2, 0x09) ^ Multiply(byte3, 0x0e)) << OFS32_BYTE3;
+#if !defined(_M_X64) // This approach runs a bit faster on arm64-v8a and possibly others
+  for (int i = 0; i < 4; ++i)
+  {
+    uint32_t* pLine = ((uint32_t*)state + i);
+    uint32_t line = *pLine;

-    *pLine = line;
+    uint32_t byte0 = (line & MASK32_BYTE0) >> OFS32_BYTE0;
+    uint32_t byte1 = (line & MASK32_BYTE1) >> OFS32_BYTE1;
+    uint32_t byte2 = (line & MASK32_BYTE2) >> OFS32_BYTE2;
+    uint32_t byte3 = (line & MASK32_BYTE3) >> OFS32_BYTE3;

+    line  = (Multiply(byte0, 0x0e) ^ Multiply(byte1, 0x0b) ^ Multiply(byte2, 0x0d) ^
+             Multiply(byte3, 0x09)) << OFS32_BYTE0;
+    line |= (Multiply(byte0, 0x09) ^ Multiply(byte1, 0x0e) ^ Multiply(byte2, 0x0b) ^
+             Multiply(byte3, 0x0d)) << OFS32_BYTE1;
+    line |= (Multiply(byte0, 0x0d) ^ Multiply(byte1, 0x09) ^ Multiply(byte2, 0x0e) ^
+             Multiply(byte3, 0x0b)) << OFS32_BYTE2;
+    line |= (Multiply(byte0, 0x0b) ^ Multiply(byte1, 0x0d) ^ Multiply(byte2, 0x09) ^
+             Multiply(byte3, 0x0e)) << OFS32_BYTE3;

-    /* GK
-    a = (*state)[i][0];
-    b = (*state)[i][1];
-    c = (*state)[i][2];
-    d = (*state)[i][3];
-
-    (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^ Multiply(d, 0x09);
-    (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^ Multiply(d, 0x0d);
-    (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^ Multiply(d, 0x0b);
-    (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0e);
-    */
-  }
-#else
-  for (i = 0; i < 2; ++i)
-  {
-    uint64_t* pDoubleLine = ((uint64_t*)state + i);
-    uint64_t doubleLine = *pDoubleLine;
-
-    uint64_t byte0 = (doubleLine & MASK64_BYTE0) >> OFS64_BYTE0;
-    uint64_t byte1 = (doubleLine & MASK64_BYTE1) >> OFS64_BYTE1;
-    uint64_t byte2 = (doubleLine & MASK64_BYTE2) >> OFS64_BYTE2;
-    uint64_t byte3 = (doubleLine & MASK64_BYTE3) >> OFS64_BYTE3;
-
-    uint64_t result = (Multiply64(byte0, 0x0e) ^ Multiply64(byte1, 0x0b) ^ Multiply64(byte2, 0x0d) ^ Multiply64(byte3, 0x09)) << OFS64_BYTE0;
-    result |= (Multiply64(byte0, 0x09) ^ Multiply64(byte1, 0x0e) ^ Multiply64(byte2, 0x0b) ^ Multiply64(byte3, 0x0d)) << OFS64_BYTE1;
-    result |= (Multiply64(byte0, 0x0d) ^ Multiply64(byte1, 0x09) ^ Multiply64(byte2, 0x0e) ^ Multiply64(byte3, 0x0b)) << OFS64_BYTE2;
-    result |= (Multiply64(byte0, 0x0b) ^ Multiply64(byte1, 0x0d) ^ Multiply64(byte2, 0x09) ^ Multiply64(byte3, 0x0e)) << OFS64_BYTE3;
-
-    byte0 = (doubleLine & MASK64_BYTE4) >> OFS64_BYTE4;
-    byte1 = (doubleLine & MASK64_BYTE5) >> OFS64_BYTE5;
-    byte2 = (doubleLine & MASK64_BYTE6) >> OFS64_BYTE6;
-    byte3 = (doubleLine & MASK64_BYTE7) >> OFS64_BYTE7;
-
-    result |= (Multiply64(byte0, 0x0e) ^ Multiply64(byte1, 0x0b) ^ Multiply64(byte2, 0x0d) ^ Multiply64(byte3, 0x09)) << OFS64_BYTE4;
-    result |= (Multiply64(byte0, 0x09) ^ Multiply64(byte1, 0x0e) ^ Multiply64(byte2, 0x0b) ^ Multiply64(byte3, 0x0d)) << OFS64_BYTE5;
-    result |= (Multiply64(byte0, 0x0d) ^ Multiply64(byte1, 0x09) ^ Multiply64(byte2, 0x0e) ^ Multiply64(byte3, 0x0b)) << OFS64_BYTE6;
-    result |= (Multiply64(byte0, 0x0b) ^ Multiply64(byte1, 0x0d) ^ Multiply64(byte2, 0x09) ^ Multiply64(byte3, 0x0e)) << OFS64_BYTE7;
-
-    *pDoubleLine = result;
-  }
+    *pLine = line;
+  }
+#else // This way is more efficient on the x64 Intel/AMD architecture
+  uint32_t* sp = (uint32_t*)state;
+  uint32_t xtimeX;
+  uint32_t xtimeXX;
+  uint32_t xtimeXXX;
+  uint32_t xtime_x9;
+  uint32_t xtime_xb;
+  uint32_t xtime_xd;
+  uint32_t xtime_xe;
+  //*sp++ = i;
+
+  for (int i = 4; i; --i, sp++)
+  {
+    uint32_t spVal = *sp;
+    xtimeX   = xtime(spVal);
+    xtimeXX  = xtime(xtimeX);
+    xtimeXXX = xtime(xtimeXX);
+
+    xtime_x9 = xtimeXXX ^ spVal;
+    xtime_xb = xtimeXXX ^ xtimeX ^ spVal;
+    xtime_xd = xtimeXXX ^ xtimeXX ^ spVal;
+    xtime_xe = xtimeXXX ^ xtimeXX ^ xtimeX;
+
+    uint32_t xtime_xb_r8  = xtime_xb >> 8;
+    uint32_t xtime_xd_r16 = xtime_xd >> 16;
+    uint32_t xtime_x9_l8  = xtime_x9 << 8;
+    uint32_t xtime_xd_l16 = xtime_xd << 16;
+
+    // this next assignment incorporates all of the Multiply calls, eliminating the repeated recalculations
+    *sp =
+      /* byte 0: */ (((xtime_xe ^ xtime_xb_r8 ^ xtime_xd_r16 ^ (xtime_x9 >> 24)) & 0x000000ff) |
+      /* byte 1: */  ((xtime_x9_l8 ^ xtime_xe ^ xtime_xb_r8 ^ xtime_xd_r16) & 0x0000ff00) |
+      /* byte 2: */  ((xtime_xd_l16 ^ xtime_x9_l8 ^ xtime_xe ^ xtime_xb_r8) & 0x00ff0000) |
+      /* byte 3: */  (((xtime_xb << 24) ^ xtime_xd_l16 ^ xtime_x9_l8 ^ xtime_xe) & 0xff000000));
+  }
 #endif
 }
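The x64 branch above leans on the fact that the InvMixColumns coefficients decompose into XORs of xtime powers: 0x09 = 8^1, 0x0b = 8^2^1, 0x0d = 8^4^1 and 0x0e = 8^4^2, which is exactly what the xtime_x9/xtime_xb/xtime_xd/xtime_xe values compute per byte lane. A small standalone check of that decomposition against a reference GF(2^8) multiply (helper names are mine, not from the file):

#include <stdint.h>
#include <assert.h>

static uint8_t xtime_b(uint8_t x) { return (uint8_t)((x << 1) ^ (((x >> 7) & 1) * 0x1b)); }

/* Reference GF(2^8) multiply by shift-and-add (Russian peasant style). */
static uint8_t gf_mul(uint8_t x, uint8_t y)
{
  uint8_t r = 0;
  while (y) {
    if (y & 1) r ^= x;
    x = xtime_b(x);
    y >>= 1;
  }
  return r;
}

/* Multiplication distributes over XOR, so each product is a XOR of xtime powers of x. */
static void check_decomposition(uint8_t x)
{
  uint8_t x2 = xtime_b(x), x4 = xtime_b(x2), x8 = xtime_b(x4);
  assert(gf_mul(x, 0x09) == (uint8_t)(x8 ^ x));            /* xtime_x9 */
  assert(gf_mul(x, 0x0b) == (uint8_t)(x8 ^ x2 ^ x));       /* xtime_xb */
  assert(gf_mul(x, 0x0d) == (uint8_t)(x8 ^ x4 ^ x));       /* xtime_xd */
  assert(gf_mul(x, 0x0e) == (uint8_t)(x8 ^ x4 ^ x2));      /* xtime_xe */
}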