
Commit ce85c7a

Optimized TinyAES code for better performance
1 parent 2828ecb commit ce85c7a

File tree

1 file changed: +102, -154 lines
  • Thirdparty/encryption/tiny-AES-c


Thirdparty/encryption/tiny-AES-c/aes.c

Lines changed: 102 additions & 154 deletions
@@ -163,9 +163,9 @@ NOTE: String length must be evenly divisible by 16byte (str_len % 16 == 0)
 // jcallan@github points out that declaring Multiply as a function
 // reduces code size considerably with the Keil ARM compiler.
 // See this link for more information: https://github.com/kokke/tiny-AES-C/pull/3
-#ifndef MULTIPLY_AS_A_FUNCTION
-#define MULTIPLY_AS_A_FUNCTION 0
-#endif
+//#ifndef MULTIPLY_AS_A_FUNCTION /*Multiply is removed*/
+// #define MULTIPLY_AS_A_FUNCTION 0
+//#endif
 
 
 
@@ -394,36 +394,18 @@ static void AddRoundKey(uint32_t round, state_t* state, const uint8_t* RoundKey)
 // state matrix with values in an S-box.
 static void SubBytes(state_t* state)
 {
-#if 1
   unsigned int i;
   for (i = 0; i < 4; i++)
   {
     uint32_t* pLine = ((uint32_t*)state + i);
     uint32_t line = *pLine;
 
-    uint32_t byte0 = (line & MASK32_BYTE0) >> OFS32_BYTE0;
-    uint32_t byte1 = (line & MASK32_BYTE1) >> OFS32_BYTE1;
-    uint32_t byte2 = (line & MASK32_BYTE2) >> OFS32_BYTE2;
-    uint32_t byte3 = (line & MASK32_BYTE3) >> OFS32_BYTE3;
-
     *pLine = (getSBoxValue((line & MASK32_BYTE0) >> OFS32_BYTE0) << OFS32_BYTE0) |
              (getSBoxValue((line & MASK32_BYTE1) >> OFS32_BYTE1) << OFS32_BYTE1) |
              (getSBoxValue((line & MASK32_BYTE2) >> OFS32_BYTE2) << OFS32_BYTE2) |
             (getSBoxValue((line & MASK32_BYTE3) >> OFS32_BYTE3) << OFS32_BYTE3);
   }
 
-#else
-  unsigned int i, j;
-  for (i = 0; i < 4; ++i)
-  {
-    for (j = 0; j < 4; ++j)
-    {
-      (*state)[j][i] = getSBoxValue((*state)[j][i]);
-      //(*state)[i][j] = getSBoxValue((*state)[i][j]);
-    }
-  }
-
-#endif
 
 }
 
@@ -505,6 +487,7 @@ static void ShiftRows(state_t* state)
 #endif
 }
 
+#if 0 /*removing because xtime has been redefined*/
 #ifndef _DEBUG
 static inline uint32_t xtime(uint32_t x)
 {
@@ -517,56 +500,40 @@ static inline uint64_t xtime64(uint64_t x)
 #else
 #define xtime(x) ((((x)<<1) ^ ((((x)>>7)) * 0x1b)) & 0xFF)
 #endif
+#endif
 
+/* xtime
+ * original xtime function operated on bytes, this optimized version performs the same
+ * calculation on all bytes in a dword simultaneously, reducing operations and memory accesses
+ */
+static inline uint32_t xtime(uint32_t x)
+{
+  return ((x&0x7f7f7f7f)<<1)^(((x&0x80808080)>>7)*0x1b);
+}
+static inline uint64_t xtime64(uint64_t x)
+{
+  return ((x << 1) ^ (((x >> 7)/* & 1*/) * 0x1b)) & 0xFF;
+}
 
-
-// MixColumns function mixes the columns of the state matrix
+/* MixColumns
+ * replaced byte-wise operations with word-based operations
+ * eliminated repeated calculations
+ */
 static void MixColumns(state_t* state)
 {
-  unsigned int i;
-  for (i = 0; i < 4; ++i)
-  {
-
-#if 1
-    uint32_t* pLine = ((uint32_t*)state + i);
-    uint32_t line = *pLine;
-
-    uint32_t byte0 = (line & MASK32_BYTE0) >> OFS32_BYTE0;
-    uint32_t byte1 = (line & MASK32_BYTE1) >> OFS32_BYTE1;
-    uint32_t byte2 = (line & MASK32_BYTE2) >> OFS32_BYTE2;
-    uint32_t byte3 = (line & MASK32_BYTE3) >> OFS32_BYTE3;
-
-    uint32_t t = byte0;
-    uint32_t Tmp = byte0 ^ byte1 ^ byte2 ^ byte3;
-    byte0 ^= xtime(byte0 ^ byte1) ^ Tmp;
-    byte1 ^= xtime(byte1 ^ byte2) ^ Tmp;
-    byte2 ^= xtime(byte2 ^ byte3) ^ Tmp;
-    byte3 ^= xtime(byte3 ^ t) ^ Tmp;
-
-    *pLine = (byte0 << OFS32_BYTE0) | (byte1 << OFS32_BYTE1) | (byte2 << OFS32_BYTE2) | (byte3 << OFS32_BYTE3);
-#else
-    uint32_t Tm;
-    /* GK
-
-    t = (*state)[i][0];
-    Tmp = (*state)[i][0] ^ (*state)[i][1] ^ (*state)[i][2] ^ (*state)[i][3] ;
-    Tm = (*state)[i][0] ^ (*state)[i][1] ; Tm = xtime(Tm); (*state)[i][0] ^= Tm ^ Tmp ;
-    Tm = (*state)[i][1] ^ (*state)[i][2] ; Tm = xtime(Tm); (*state)[i][1] ^= Tm ^ Tmp ;
-    Tm = (*state)[i][2] ^ (*state)[i][3] ; Tm = xtime(Tm); (*state)[i][2] ^= Tm ^ Tmp ;
-    Tm = (*state)[i][3] ^ t ; Tm = xtime(Tm); (*state)[i][3] ^= Tm ^ Tmp ;
-    */
-    // GK - slightly more optimal and simple
-    t = (*state)[i][0];
-    Tmp = (*state)[i][0] ^ (*state)[i][1] ^ (*state)[i][2] ^ (*state)[i][3];
-    Tm = xtime((*state)[i][0] ^ (*state)[i][1]); (*state)[i][0] ^= Tm ^ Tmp;
-    Tm = xtime((*state)[i][1] ^ (*state)[i][2]); (*state)[i][1] ^= Tm ^ Tmp;
-    Tm = xtime((*state)[i][2] ^ (*state)[i][3]); (*state)[i][2] ^= Tm ^ Tmp;
-    Tm = xtime((*state)[i][3] ^ t); (*state)[i][3] ^= Tm ^ Tmp;
-
-#endif
-  }
+  unsigned int *sp = (unsigned int *) state;
+
+  for (int i=4;i;--i,sp++)
+    *sp = xtime((*sp) ^ (((*sp)>>8)|((*sp)<<24))) ^
+          (((*sp)<<8)|((*sp)>>24)) ^
+          (((*sp)<<16)|((*sp)>>16)) ^ (((*sp)<<24)|((*sp)>>8));
 }
 
+/*removed old Multiply function - now integrated into InvMixColumns */
+// Multiply is used to multiply numbers in the field GF(2^8)
+// Note: The last call to xtime() is unneeded, but often ends up generating a smaller binary
+// The compiler seems to be able to vectorize the operation better this way.
+// See https://github.com/kokke/tiny-AES-c/pull/34
 // Multiply is used to multiply numbers in the field GF(2^8)
 // Note: The last call to xtime() is unneeded, but often ends up generating a smaller binary
 // The compiler seems to be able to vectorize the operation better this way.
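Note on the two word-parallel formulas added in this hunk: the new xtime() clears the top bit of every byte lane before shifting left (so nothing carries into the neighbouring byte) and then folds the AES reduction constant 0x1b back into exactly the lanes whose top bit was set, while the new MixColumns() expresses the old per-byte recurrence (byteN ^= xtime(byteN ^ byteN+1) ^ Tmp) as one xtime of the column word plus three rotations of that word. The standalone check below is not part of this commit; it assumes the four bytes of a column are packed into the uint32_t little-endian (byte 0 in the least-significant lane), which is what the MASK32_BYTE0/OFS32_BYTE0 usage elsewhere in aes.c suggests, and the helper names xtime1/xtime4/mix_ref are local to the sketch.

#include <stdint.h>
#include <stdio.h>

/* Classic byte-wise xtime, for reference. */
static uint8_t xtime1(uint8_t x) { return (uint8_t)((x << 1) ^ ((x >> 7) * 0x1b)); }

/* Dword-parallel xtime from the patch: all four byte lanes at once. */
static uint32_t xtime4(uint32_t x) { return ((x & 0x7f7f7f7fu) << 1) ^ (((x & 0x80808080u) >> 7) * 0x1b); }

/* Byte-wise MixColumns of one column, as in the removed loop body. */
static void mix_ref(uint8_t b[4])
{
    uint8_t t = b[0];
    uint8_t Tmp = b[0] ^ b[1] ^ b[2] ^ b[3];
    uint8_t c0 = b[0] ^ xtime1(b[0] ^ b[1]) ^ Tmp;
    uint8_t c1 = b[1] ^ xtime1(b[1] ^ b[2]) ^ Tmp;
    uint8_t c2 = b[2] ^ xtime1(b[2] ^ b[3]) ^ Tmp;
    uint8_t c3 = b[3] ^ xtime1(b[3] ^ t) ^ Tmp;
    b[0] = c0; b[1] = c1; b[2] = c2; b[3] = c3;
}

int main(void)
{
    /* 1) xtime4 must match xtime1 independently in every byte lane. */
    for (unsigned v = 0; v < 256; v++) {
        uint32_t w = v * 0x01010101u;                      /* same byte in all four lanes */
        uint32_t e = xtime1((uint8_t)v) * 0x01010101u;
        if (xtime4(w) != e) { puts("xtime mismatch"); return 1; }
    }

    /* 2) The rotation-based word formula must match byte-wise MixColumns. */
    uint8_t col[4] = { 0xdb, 0x13, 0x53, 0x45 };           /* well-known MixColumns test column */
    uint32_t w = 0;
    for (int i = 0; i < 4; i++) w |= (uint32_t)col[i] << (8 * i);  /* byte 0 in the low lane */

    uint32_t ror8  = (w >> 8)  | (w << 24);
    uint32_t rol8  = (w << 8)  | (w >> 24);
    uint32_t rol16 = (w << 16) | (w >> 16);
    uint32_t rol24 = (w << 24) | (w >> 8);
    uint32_t got = xtime4(w ^ ror8) ^ rol8 ^ rol16 ^ rol24;  /* formula from the patched MixColumns */

    mix_ref(col);
    uint32_t expect = 0;
    for (int i = 0; i < 4; i++) expect |= (uint32_t)col[i] << (8 * i);

    printf("xtime lanes OK, MixColumns word formula %s reference\n",
           got == expect ? "matches" : "differs from");
    return 0;
}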
@@ -575,118 +542,99 @@ static void MixColumns(state_t* state)
 #ifndef _DEBUG
 static inline uint32_t Multiply(uint32_t x, uint32_t y)
 {
-  uint32_t xtimeX = xtime(x);
-  uint32_t xtimeXX = xtime(xtimeX);
-  uint32_t xtimeXXX = xtime(xtimeXX);
-
-  return ((~((y & 1)-1) & x) ^
-          (~((y>>1 & 1)-1) & xtimeX) ^
-          (~((y>>2 & 1)-1) & xtimeXX) ^
-          (~((y>>3 & 1)-1) & xtimeXXX)
+  uint32_t xtimeX = xtime(x);
+  uint32_t xtimeXX = xtime(xtimeX);
+  uint32_t xtimeXXX = xtime(xtimeXX);
+
+  return ((~((y & 1)-1) & x) ^
+          (~((y>>1 & 1)-1) & xtimeX) ^
+          (~((y>>2 & 1)-1) & xtimeXX) ^
+          (~((y>>3 & 1)-1) & xtimeXXX)
 #if defined(_MSC_VER) && defined(_M_AMD64)
-          ^
+          ^
           (~((y>>4 & 1)-1) & xtime(xtimeXXX))
 #endif
-  ); /* this last call to xtime() can be omitted */
-}
-
-static inline uint64_t Multiply64(uint64_t x, uint64_t y)
-{
-  uint64_t xtimeX = xtime64(x);
-  uint64_t xtimeXX = xtime64(xtimeX);
-  uint64_t xtimeXXX = xtime64(xtimeXX);
-
-  return ((~((y & 1) - 1) & x) ^
-          (~((y >> 1 & 1) - 1) & xtimeX) ^
-          (~((y >> 2 & 1) - 1) & xtimeXX) ^
-          (~((y >> 3 & 1) - 1) & xtimeXXX)
-#if defined(_MSC_VER) && defined(_M_AMD64)
-          ^
-          (~((y >> 4 & 1) - 1) & xtime64(xtimeXXX))
-#endif
-  ); /* this last call to xtime() can be omitted */
+  ); /* this last call to xtime() can be omitted */
 }
+
 #else
 #define Multiply(x, y) \
   ((~((y & 1)-1) & x) ^ \
   (~((y>>1 & 1)-1) & xtime(x)) ^ \
   (~((y>>2 & 1)-1) & xtime(xtime(x))) ^ \
   (~((y>>3 & 1)-1) & xtime(xtime(xtime(x)))) ) \
 
-#define Multiply64(x, y) \
-  ((~((y & 1)-1) & x) ^ \
-  (~((y>>1 & 1)-1) & xtime(x)) ^ \
-  (~((y>>2 & 1)-1) & xtime(xtime(x))) ^ \
-  (~((y>>3 & 1)-1) & xtime(xtime(xtime(x)))) ) \
 
 #endif
 
 #if (defined(CBC) && CBC == 1) || (defined(ECB) && ECB == 1)
 // MixColumns function mixes the columns of the state matrix.
 // The method used to multiply may be difficult to understand for the inexperienced.
 // Please use the references to gain more information.
+/* InvMixColumns
+ * this a more optimal version which performs parallel computation on all bytes in
+ * a dword and applies the Multiply & xor and unrolls the multiple calls to Multiply
+ * for each 0x9,0xb,0xd and 0xe perturbation
+ */
 static void InvMixColumns(state_t* state)
 {
-  int i;
-#if (USE32_ARITHMETIC == 1)
-  for (i = 0; i < 4; ++i)
-  {
-    uint32_t* pLine = ((uint32_t*)state + i);
-    uint32_t line = *pLine;
-
-    uint32_t byte0 = (line & MASK32_BYTE0) >> OFS32_BYTE0;
-    uint32_t byte1 = (line & MASK32_BYTE1) >> OFS32_BYTE1;
-    uint32_t byte2 = (line & MASK32_BYTE2) >> OFS32_BYTE2;
-    uint32_t byte3 = (line & MASK32_BYTE3) >> OFS32_BYTE3;
-
-    line = (Multiply(byte0, 0x0e) ^ Multiply(byte1, 0x0b) ^ Multiply(byte2, 0x0d) ^ Multiply(byte3, 0x09)) << OFS32_BYTE0;
-    line |= (Multiply(byte0, 0x09) ^ Multiply(byte1, 0x0e) ^ Multiply(byte2, 0x0b) ^ Multiply(byte3, 0x0d)) << OFS32_BYTE1;
-    line |= (Multiply(byte0, 0x0d) ^ Multiply(byte1, 0x09) ^ Multiply(byte2, 0x0e) ^ Multiply(byte3, 0x0b)) << OFS32_BYTE2;
-    line |= (Multiply(byte0, 0x0b) ^ Multiply(byte1, 0x0d) ^ Multiply(byte2, 0x09) ^ Multiply(byte3, 0x0e)) << OFS32_BYTE3;
+#if !defined(_M_X64) // This approach runs a bit faster on arm64-v8a and possibly others
+  for (int i = 0; i < 4; ++i)
+  {
+    uint32_t *pLine = ((uint32_t *) state + i);
+    uint32_t line = *pLine;
 
-    *pLine = line;
+    uint32_t byte0 = (line & MASK32_BYTE0) >> OFS32_BYTE0;
+    uint32_t byte1 = (line & MASK32_BYTE1) >> OFS32_BYTE1;
+    uint32_t byte2 = (line & MASK32_BYTE2) >> OFS32_BYTE2;
+    uint32_t byte3 = (line & MASK32_BYTE3) >> OFS32_BYTE3;
 
+    line = (Multiply(byte0, 0x0e) ^ Multiply(byte1, 0x0b) ^ Multiply(byte2, 0x0d) ^
+            Multiply(byte3, 0x09)) << OFS32_BYTE0;
+    line |= (Multiply(byte0, 0x09) ^ Multiply(byte1, 0x0e) ^ Multiply(byte2, 0x0b) ^
+             Multiply(byte3, 0x0d)) << OFS32_BYTE1;
+    line |= (Multiply(byte0, 0x0d) ^ Multiply(byte1, 0x09) ^ Multiply(byte2, 0x0e) ^
+             Multiply(byte3, 0x0b)) << OFS32_BYTE2;
+    line |= (Multiply(byte0, 0x0b) ^ Multiply(byte1, 0x0d) ^ Multiply(byte2, 0x09) ^
+             Multiply(byte3, 0x0e)) << OFS32_BYTE3;
 
-    /* GK
-    a = (*state)[i][0];
-    b = (*state)[i][1];
-    c = (*state)[i][2];
-    d = (*state)[i][3];
-
-    (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^ Multiply(d, 0x09);
-    (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^ Multiply(d, 0x0d);
-    (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^ Multiply(d, 0x0b);
-    (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0e);
-    */
-  }
-#else
-  for (i = 0; i < 2; ++i)
-  {
-    uint64_t* pDoubleLine = ((uint64_t*)state + i);
-    uint64_t doubleLine = *pDoubleLine;
-
-    uint64_t byte0 = (doubleLine & MASK64_BYTE0) >> OFS64_BYTE0;
-    uint64_t byte1 = (doubleLine & MASK64_BYTE1) >> OFS64_BYTE1;
-    uint64_t byte2 = (doubleLine & MASK64_BYTE2) >> OFS64_BYTE2;
-    uint64_t byte3 = (doubleLine & MASK64_BYTE3) >> OFS64_BYTE3;
-
-    uint64_t result = (Multiply64(byte0, 0x0e) ^ Multiply64(byte1, 0x0b) ^ Multiply64(byte2, 0x0d) ^ Multiply64(byte3, 0x09)) << OFS64_BYTE0;
-    result |= (Multiply64(byte0, 0x09) ^ Multiply64(byte1, 0x0e) ^ Multiply64(byte2, 0x0b) ^ Multiply64(byte3, 0x0d)) << OFS64_BYTE1;
-    result |= (Multiply64(byte0, 0x0d) ^ Multiply64(byte1, 0x09) ^ Multiply64(byte2, 0x0e) ^ Multiply64(byte3, 0x0b)) << OFS64_BYTE2;
-    result |= (Multiply64(byte0, 0x0b) ^ Multiply64(byte1, 0x0d) ^ Multiply64(byte2, 0x09) ^ Multiply64(byte3, 0x0e)) << OFS64_BYTE3;
-
-    byte0 = (doubleLine & MASK64_BYTE4) >> OFS64_BYTE4;
-    byte1 = (doubleLine & MASK64_BYTE5) >> OFS64_BYTE5;
-    byte2 = (doubleLine & MASK64_BYTE6) >> OFS64_BYTE6;
-    byte3 = (doubleLine & MASK64_BYTE7) >> OFS64_BYTE7;
-
-    result |= (Multiply64(byte0, 0x0e) ^ Multiply64(byte1, 0x0b) ^ Multiply64(byte2, 0x0d) ^ Multiply64(byte3, 0x09)) << OFS64_BYTE4;
-    result |= (Multiply64(byte0, 0x09) ^ Multiply64(byte1, 0x0e) ^ Multiply64(byte2, 0x0b) ^ Multiply64(byte3, 0x0d)) << OFS64_BYTE5;
-    result |= (Multiply64(byte0, 0x0d) ^ Multiply64(byte1, 0x09) ^ Multiply64(byte2, 0x0e) ^ Multiply64(byte3, 0x0b)) << OFS64_BYTE6;
-    result |= (Multiply64(byte0, 0x0b) ^ Multiply64(byte1, 0x0d) ^ Multiply64(byte2, 0x09) ^ Multiply64(byte3, 0x0e)) << OFS64_BYTE7;
-
-    *pDoubleLine = result;
-  }
+    *pLine = line;
+  }
+#else // This way is more efficient on the x64 Intel/AMD architecture
+  uint32_t *sp=(uint32_t*)state;
+  uint32_t xtimeX;
+  uint32_t xtimeXX;
+  uint32_t xtimeXXX;
+  uint32_t xtime_x9;
+  uint32_t xtime_xb;
+  uint32_t xtime_xd;
+  uint32_t xtime_xe;
+  //*sp++ = i;
+
+  for (int i=4; i; --i, sp++)
+  {
+    uint32_t spVal = *sp;
+    xtimeX = xtime(spVal);
+    xtimeXX = xtime(xtimeX);
+    xtimeXXX = xtime(xtimeXX);
+
+    xtime_x9 = xtimeXXX ^ spVal;
+    xtime_xb = xtimeXXX ^ xtimeX ^ spVal;
+    xtime_xd = xtimeXXX ^ xtimeXX ^ spVal;
+    xtime_xe = xtimeXXX ^ xtimeXX ^ xtimeX;
+
+    uint32_t xtime_xb_r8 = xtime_xb >> 8;
+    uint32_t xtime_xd_r16 = xtime_xd >> 16;
+    uint32_t xtime_x9_l8 = xtime_x9 << 8;
+    uint32_t xtime_xd_l16 = xtime_xd << 16;
+
+    //this next assignment incorporates all of the Multiply calls, eliminating the repeated re-calculations
+    *sp =
+      /* byte 0:*/ (((xtime_xe ^ xtime_xb_r8 ^ xtime_xd_r16 ^ (xtime_x9 >> 24)) & 0x000000ff) |
+      /* byte 1:*/ ((xtime_x9_l8 ^ xtime_xe ^ xtime_xb_r8 ^ xtime_xd_r16 ) & 0x0000ff00) |
+      /* byte 2:*/ ((xtime_xd_l16 ^ xtime_x9_l8 ^ xtime_xe ^ xtime_xb_r8 ) & 0x00ff0000) |
+      /* byte 3:*/ (((xtime_xb << 24) ^ xtime_xd_l16 ^ xtime_x9_l8 ^ xtime_xe ) & 0xff000000));
+  }
 #endif
 }
 
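The unrolled InvMixColumns above relies on writing the inverse-MixColumns coefficients as sums of powers of two in GF(2^8): 0x09 = 8+1, 0x0b = 8+2+1, 0x0d = 8+4+1, 0x0e = 8+4+2, so each Multiply(x, c) collapses to XORs of x, xtime(x), xtime(xtime(x)) and xtime(xtime(xtime(x))). Those are exactly the xtime_x9/xtime_xb/xtime_xd/xtime_xe temporaries, computed once per column word with the dword-parallel xtime and then shifted and masked into the correct output byte lanes. Below is a standalone sanity check of just the scalar identities, not part of this commit; gmul is a reference GF(2^8) multiply written for the test.

#include <stdint.h>
#include <stdio.h>

/* Byte-wise xtime (multiply by x in GF(2^8), AES polynomial x^8 + x^4 + x^3 + x + 1). */
static uint8_t xt(uint8_t x) { return (uint8_t)((x << 1) ^ ((x >> 7) * 0x1b)); }

/* Reference GF(2^8) multiplication (Russian-peasant style). */
static uint8_t gmul(uint8_t a, uint8_t b)
{
    uint8_t p = 0;
    while (b) {
        if (b & 1) p ^= a;
        a = xt(a);
        b >>= 1;
    }
    return p;
}

int main(void)
{
    for (unsigned v = 0; v < 256; v++) {
        uint8_t x = (uint8_t)v;
        uint8_t x2 = xt(x), x4 = xt(x2), x8 = xt(x4);   /* x*2, x*4, x*8 */
        uint8_t m9 = x8 ^ x;                            /* corresponds to xtime_x9 */
        uint8_t mb = x8 ^ x2 ^ x;                       /* corresponds to xtime_xb */
        uint8_t md = x8 ^ x4 ^ x;                       /* corresponds to xtime_xd */
        uint8_t me = x8 ^ x4 ^ x2;                      /* corresponds to xtime_xe */
        if (m9 != gmul(x, 0x09) || mb != gmul(x, 0x0b) ||
            md != gmul(x, 0x0d) || me != gmul(x, 0x0e)) {
            puts("mismatch"); return 1;
        }
    }
    puts("Multiply-by-0x09/0x0b/0x0d/0x0e identities hold for all byte values");
    return 0;
}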

0 commit comments
