diff --git a/.github/workflows/aes.yml b/.github/workflows/aes.yml index 3dcace83..50cc07ce 100644 --- a/.github/workflows/aes.yml +++ b/.github/workflows/aes.yml @@ -16,6 +16,7 @@ defaults: env: CARGO_INCREMENTAL: 0 RUSTFLAGS: "-Dwarnings" + SDE_FULL_VERSION: "9.53.0-2025-03-16" jobs: # Builds for no_std platforms @@ -68,7 +69,7 @@ jobs: env: CARGO_INCREMENTAL: 0 RUSTDOCFLAGS: "-C target-feature=+aes,+ssse3" - RUSTFLAGS: "-Dwarnings -C target-feature=+aes,+ssse3" + RUSTFLAGS: "-Dwarnings -C target-feature=+aes,+ssse3 --cfg aes_avx512_disable --cfg aes_avx256_disable" strategy: matrix: include: @@ -97,6 +98,80 @@ jobs: - run: cargo test --target ${{ matrix.target }} --features hazmat - run: cargo test --target ${{ matrix.target }} --all-features + # Tests for the VAES AVX backend + vaes256: + runs-on: ubuntu-latest + env: + CARGO_INCREMENTAL: 0 + RUSTFLAGS: "-Dwarnings --cfg aes_avx512_disable" + strategy: + matrix: + include: + - target: x86_64-unknown-linux-gnu + rust: nightly-2025-05-28 + steps: + - uses: actions/checkout@v4 + - name: Install Intel SDE + run: | + curl -JLO "https://downloadmirror.intel.com/850782/sde-external-${{ env.SDE_FULL_VERSION }}-lin.tar.xz" + tar xvf sde-external-${{ env.SDE_FULL_VERSION }}-lin.tar.xz -C /opt + echo "/opt/sde-external-${{ env.SDE_FULL_VERSION }}-lin" >> $GITHUB_PATH + - uses: RustCrypto/actions/cargo-cache@master + - uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.rust }} + targets: ${{ matrix.target }} + # NOTE: Write a `.cargo/config.toml` to configure the target for VAES + # NOTE: We use intel-sde as the runner since not all GitHub CI hosts support AVX512 + - name: write .cargo/config.toml + shell: bash + run: | + cd ../aes/.. + mkdir -p .cargo + echo '[target.${{ matrix.target }}]' > .cargo/config.toml + echo 'runner = "sde64 -future --"' >> .cargo/config.toml + - run: ${{ matrix.deps }} + - run: cargo test --target ${{ matrix.target }} + - run: cargo test --target ${{ matrix.target }} --features hazmat + - run: cargo test --target ${{ matrix.target }} --all-features + + # Tests for the VAES AVX512 backend + vaes512: + runs-on: ubuntu-latest + env: + CARGO_INCREMENTAL: 0 + strategy: + matrix: + include: + - target: x86_64-unknown-linux-gnu + rust: nightly-2025-05-28 + steps: + - uses: actions/checkout@v4 + - name: Install Intel SDE + run: | + curl -JLO "https://downloadmirror.intel.com/850782/sde-external-${{ env.SDE_FULL_VERSION }}-lin.tar.xz" + tar xvf sde-external-${{ env.SDE_FULL_VERSION }}-lin.tar.xz -C /opt + echo "/opt/sde-external-${{ env.SDE_FULL_VERSION }}-lin" >> $GITHUB_PATH + - uses: RustCrypto/actions/cargo-cache@master + - uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.rust }} + targets: ${{ matrix.target }} + # NOTE: Write a `.cargo/config.toml` to configure the target for VAES + # NOTE: We use intel-sde as the runner since not all GitHub CI hosts support AVX512 + - name: write .cargo/config.toml + shell: bash + run: | + cd ../aes/.. 
+ mkdir -p .cargo + echo '[target.${{ matrix.target }}]' > .cargo/config.toml + echo 'runner = "sde64 -future --"' >> .cargo/config.toml + - run: ${{ matrix.deps }} + - run: cargo test --target ${{ matrix.target }} + - run: cargo test --target ${{ matrix.target }} --features hazmat + - run: cargo test --target ${{ matrix.target }} --all-features + + # Tests for CPU feature autodetection with fallback to portable software implementation autodetect: runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index f63847af..d6d58063 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -243,7 +243,7 @@ checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "xtea" -version = "0.0.1-rc.0" +version = "0.0.1-pre.0" dependencies = [ "cipher", ] diff --git a/aes/Cargo.toml b/aes/Cargo.toml index 2818b4b9..09b3343d 100644 --- a/aes/Cargo.toml +++ b/aes/Cargo.toml @@ -31,7 +31,7 @@ hazmat = [] # Expose cryptographically hazardous APIs [lints.rust.unexpected_cfgs] level = "warn" -check-cfg = ["cfg(aes_compact)", "cfg(aes_force_soft)"] +check-cfg = ["cfg(aes_compact)", "cfg(aes_force_soft)", "cfg(aes_avx256_disable)", "cfg(aes_avx512_disable)"] [package.metadata.docs.rs] all-features = true diff --git a/aes/src/armv8.rs b/aes/src/armv8.rs index 1a79fa25..995e6bae 100644 --- a/aes/src/armv8.rs +++ b/aes/src/armv8.rs @@ -25,6 +25,13 @@ use cipher::{ }; use core::fmt; +pub(crate) mod features { + cpufeatures::new!(features_aes, "aes"); + pub(crate) mod aes { + pub use super::features_aes::*; + } +} + impl_backends!( enc_name = Aes128BackEnc, dec_name = Aes128BackDec, @@ -86,18 +93,6 @@ macro_rules! define_aes_impl { decrypt: $name_back_dec, } - impl $name { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> &$name_back_enc { - &self.encrypt - } - - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> &$name_back_dec { - &self.decrypt - } - } - impl KeySizeUser for $name { type KeySize = $key_size; } @@ -182,13 +177,6 @@ macro_rules! define_aes_impl { backend: $name_back_enc, } - impl $name_enc { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> &$name_back_enc { - &self.backend - } - } - impl KeySizeUser for $name_enc { type KeySize = $key_size; } @@ -248,13 +236,6 @@ macro_rules! define_aes_impl { backend: $name_back_dec, } - impl $name_dec { - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> &$name_back_dec { - &self.backend - } - } - impl KeySizeUser for $name_dec { type KeySize = $key_size; } diff --git a/aes/src/autodetect.rs b/aes/src/autodetect.rs index 802680ab..3f8a2a42 100644 --- a/aes/src/autodetect.rs +++ b/aes/src/autodetect.rs @@ -12,12 +12,10 @@ use core::fmt; use core::mem::ManuallyDrop; #[cfg(target_arch = "aarch64")] -use crate::armv8 as intrinsics; +use crate::armv8 as arch; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::ni as intrinsics; - -cpufeatures::new!(aes_intrinsics, "aes"); +use crate::x86 as arch; macro_rules! define_aes_impl { ( @@ -29,21 +27,21 @@ macro_rules! 
define_aes_impl { doc = $doc:expr, ) => { mod $module { - use super::{intrinsics, soft}; + use super::{arch, soft}; use core::mem::ManuallyDrop; pub(super) union Inner { - pub(super) intrinsics: ManuallyDrop, + pub(super) arch: ManuallyDrop, pub(super) soft: ManuallyDrop, } pub(super) union InnerEnc { - pub(super) intrinsics: ManuallyDrop, + pub(super) arch: ManuallyDrop, pub(super) soft: ManuallyDrop, } pub(super) union InnerDec { - pub(super) intrinsics: ManuallyDrop, + pub(super) arch: ManuallyDrop, pub(super) soft: ManuallyDrop, } } @@ -52,7 +50,7 @@ macro_rules! define_aes_impl { #[doc = "block cipher"] pub struct $name { inner: $module::Inner, - token: aes_intrinsics::InitToken, + token: arch::features::aes::InitToken, } impl KeySizeUser for $name { @@ -70,9 +68,7 @@ macro_rules! define_aes_impl { use core::ops::Deref; let inner = if enc.token.get() { $module::Inner { - intrinsics: ManuallyDrop::new(unsafe { - enc.inner.intrinsics.deref().into() - }), + arch: ManuallyDrop::new(unsafe { enc.inner.arch.deref().into() }), } } else { $module::Inner { @@ -90,11 +86,11 @@ macro_rules! define_aes_impl { impl KeyInit for $name { #[inline] fn new(key: &Key) -> Self { - let (token, aesni_present) = aes_intrinsics::init_get(); + let (token, aes_features) = arch::features::aes::init_get(); - let inner = if aesni_present { + let inner = if aes_features { $module::Inner { - intrinsics: ManuallyDrop::new(intrinsics::$name::new(key)), + arch: ManuallyDrop::new(arch::$name::new(key)), } } else { $module::Inner { @@ -115,7 +111,7 @@ macro_rules! define_aes_impl { fn clone(&self) -> Self { let inner = if self.token.get() { $module::Inner { - intrinsics: unsafe { self.inner.intrinsics.clone() }, + arch: unsafe { self.inner.arch.clone() }, } } else { $module::Inner { @@ -136,38 +132,20 @@ macro_rules! define_aes_impl { impl BlockCipherEncrypt for $name { fn encrypt_with_backend(&self, f: impl BlockCipherEncClosure) { - unsafe { - if self.token.get() { - #[target_feature(enable = "aes")] - unsafe fn inner( - state: &intrinsics::$name, - f: impl BlockCipherEncClosure, - ) { - f.call(state.get_enc_backend()); - } - inner(&self.inner.intrinsics, f); - } else { - f.call(&self.inner.soft.get_enc_backend()); - } + if self.token.get() { + unsafe { &self.inner.arch }.encrypt_with_backend(f) + } else { + unsafe { &self.inner.soft }.encrypt_with_backend(f) } } } impl BlockCipherDecrypt for $name { fn decrypt_with_backend(&self, f: impl BlockCipherDecClosure) { - unsafe { - if self.token.get() { - #[target_feature(enable = "aes")] - unsafe fn inner( - state: &intrinsics::$name, - f: impl BlockCipherDecClosure, - ) { - f.call(state.get_dec_backend()); - } - inner(&self.inner.intrinsics, f); - } else { - f.call(&self.inner.soft.get_dec_backend()); - } + if self.token.get() { + unsafe { &self.inner.arch }.decrypt_with_backend(f) + } else { + unsafe { &self.inner.soft }.decrypt_with_backend(f) } } } @@ -188,7 +166,7 @@ macro_rules! define_aes_impl { #[inline] fn drop(&mut self) { if self.token.get() { - unsafe { ManuallyDrop::drop(&mut self.inner.intrinsics) }; + unsafe { ManuallyDrop::drop(&mut self.inner.arch) }; } else { unsafe { ManuallyDrop::drop(&mut self.inner.soft) }; }; @@ -202,7 +180,7 @@ macro_rules! define_aes_impl { #[doc = "block cipher (encrypt-only)"] pub struct $name_enc { inner: $module::InnerEnc, - token: aes_intrinsics::InitToken, + token: arch::features::aes::InitToken, } impl KeySizeUser for $name_enc { @@ -212,11 +190,11 @@ macro_rules! 
define_aes_impl { impl KeyInit for $name_enc { #[inline] fn new(key: &Key) -> Self { - let (token, aesni_present) = aes_intrinsics::init_get(); + let (token, aes_features) = arch::features::aes::init_get(); - let inner = if aesni_present { + let inner = if aes_features { $module::InnerEnc { - intrinsics: ManuallyDrop::new(intrinsics::$name_enc::new(key)), + arch: ManuallyDrop::new(arch::$name_enc::new(key)), } } else { $module::InnerEnc { @@ -237,7 +215,7 @@ macro_rules! define_aes_impl { fn clone(&self) -> Self { let inner = if self.token.get() { $module::InnerEnc { - intrinsics: unsafe { self.inner.intrinsics.clone() }, + arch: unsafe { self.inner.arch.clone() }, } } else { $module::InnerEnc { @@ -258,19 +236,10 @@ macro_rules! define_aes_impl { impl BlockCipherEncrypt for $name_enc { fn encrypt_with_backend(&self, f: impl BlockCipherEncClosure) { - unsafe { - if self.token.get() { - #[target_feature(enable = "aes")] - unsafe fn inner( - state: &intrinsics::$name_enc, - f: impl BlockCipherEncClosure, - ) { - f.call(state.get_enc_backend()); - } - inner(&self.inner.intrinsics, f); - } else { - f.call(&self.inner.soft.get_enc_backend()); - } + if self.token.get() { + unsafe { &self.inner.arch }.encrypt_with_backend(f) + } else { + unsafe { &self.inner.soft }.encrypt_with_backend(f) } } } @@ -291,7 +260,7 @@ macro_rules! define_aes_impl { #[inline] fn drop(&mut self) { if self.token.get() { - unsafe { ManuallyDrop::drop(&mut self.inner.intrinsics) }; + unsafe { ManuallyDrop::drop(&mut self.inner.arch) }; } else { unsafe { ManuallyDrop::drop(&mut self.inner.soft) }; }; @@ -305,7 +274,7 @@ macro_rules! define_aes_impl { #[doc = "block cipher (decrypt-only)"] pub struct $name_dec { inner: $module::InnerDec, - token: aes_intrinsics::InitToken, + token: arch::features::aes::InitToken, } impl KeySizeUser for $name_dec { @@ -324,9 +293,7 @@ macro_rules! define_aes_impl { use core::ops::Deref; let inner = if enc.token.get() { $module::InnerDec { - intrinsics: ManuallyDrop::new(unsafe { - enc.inner.intrinsics.deref().into() - }), + arch: ManuallyDrop::new(unsafe { enc.inner.arch.deref().into() }), } } else { $module::InnerDec { @@ -344,11 +311,11 @@ macro_rules! define_aes_impl { impl KeyInit for $name_dec { #[inline] fn new(key: &Key) -> Self { - let (token, aesni_present) = aes_intrinsics::init_get(); + let (token, aes_features) = arch::features::aes::init_get(); - let inner = if aesni_present { + let inner = if aes_features { $module::InnerDec { - intrinsics: ManuallyDrop::new(intrinsics::$name_dec::new(key)), + arch: ManuallyDrop::new(arch::$name_dec::new(key)), } } else { $module::InnerDec { @@ -369,7 +336,7 @@ macro_rules! define_aes_impl { fn clone(&self) -> Self { let inner = if self.token.get() { $module::InnerDec { - intrinsics: unsafe { self.inner.intrinsics.clone() }, + arch: unsafe { self.inner.arch.clone() }, } } else { $module::InnerDec { @@ -390,19 +357,10 @@ macro_rules! define_aes_impl { impl BlockCipherDecrypt for $name_dec { fn decrypt_with_backend(&self, f: impl BlockCipherDecClosure) { - unsafe { - if self.token.get() { - #[target_feature(enable = "aes")] - unsafe fn inner( - state: &intrinsics::$name_dec, - f: impl BlockCipherDecClosure, - ) { - f.call(state.get_dec_backend()); - } - inner(&self.inner.intrinsics, f); - } else { - f.call(&self.inner.soft.get_dec_backend()); - } + if self.token.get() { + unsafe { &self.inner.arch }.decrypt_with_backend(f) + } else { + unsafe { &self.inner.soft }.decrypt_with_backend(f) } } } @@ -423,7 +381,7 @@ macro_rules! 
define_aes_impl { #[inline] fn drop(&mut self) { if self.token.get() { - unsafe { ManuallyDrop::drop(&mut self.inner.intrinsics) }; + unsafe { ManuallyDrop::drop(&mut self.inner.arch) }; } else { unsafe { ManuallyDrop::drop(&mut self.inner.soft) }; }; diff --git a/aes/src/hazmat.rs b/aes/src/hazmat.rs index 068cbd40..39631f83 100644 --- a/aes/src/hazmat.rs +++ b/aes/src/hazmat.rs @@ -21,7 +21,7 @@ pub type Block8 = cipher::array::Array; use crate::armv8::hazmat as intrinsics; #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), not(aes_force_soft)))] -use crate::ni::hazmat as intrinsics; +use crate::x86::ni::hazmat as intrinsics; #[cfg(all( any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"), diff --git a/aes/src/lib.rs b/aes/src/lib.rs index 6b27ee39..babd6884 100644 --- a/aes/src/lib.rs +++ b/aes/src/lib.rs @@ -35,18 +35,26 @@ //! runtime. On other platforms the `aes` target feature must be enabled via //! RUSTFLAGS. //! -//! ## `x86`/`x86_64` intrinsics (AES-NI) +//! ## `x86`/`x86_64` intrinsics (AES-NI and VAES) //! By default this crate uses runtime detection on `i686`/`x86_64` targets -//! in order to determine if AES-NI is available, and if it is not, it will -//! fallback to using a constant-time software implementation. +//! in order to determine if AES-NI and VAES are available, and if they are +//! not, it will fall back to using a constant-time software implementation. +//! +//! Passing `RUSTFLAGS=-Ctarget-feature=+aes,+ssse3` explicitly at +//! compile-time will override runtime detection and ensure that AES-NI is +//! always used, while passing `RUSTFLAGS=-Ctarget-feature=+aes,+avx512f,+ssse3,+vaes` +//! will ensure that AES-NI and VAES are always used. //! -//! Passing `RUSTFLAGS=-C target-feature=+aes,+ssse3` explicitly at compile-time -//! will override runtime detection and ensure that AES-NI is always used. //! Programs built in this manner will crash with an illegal instruction on -//! CPUs which do not have AES-NI enabled. +//! CPUs which do not have AES-NI and VAES enabled. +//! +//! Note: It is possible to disable the use of AVX512 for the VAES backend +//! and limit it to AVX (256-bit) by specifying `--cfg aes_avx512_disable`. +//! For CPUs which support VAES but not AVX512, the 256-bit VAES backend will +//! be selected automatically without needing to specify this flag. //! //! Note: runtime detection is not possible on SGX targets. Please use the -//! afforementioned `RUSTFLAGS` to leverage AES-NI on these targets. +//! aforementioned `RUSTFLAGS` to leverage AES-NI and VAES on these targets. //! //! # Examples //! ``` @@ -134,8 +142,8 @@ cfg_if!
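(Aside for reviewers: a minimal sketch of how the flags documented in the lib.rs hunk above might be combined. The `RUSTFLAGS` values are taken verbatim from these docs and from the `vaes256` CI job in this PR; the concrete cargo invocations are only illustrative.)

```sh
# Override runtime detection and always use the AES-NI + VAES backends.
# Binaries built this way crash with an illegal instruction on CPUs lacking these features:
RUSTFLAGS="-Ctarget-feature=+aes,+avx512f,+ssse3,+vaes" cargo build --release

# Keep runtime detection, but restrict the VAES backend to 256-bit AVX registers,
# as the `vaes256` CI job does via `--cfg aes_avx512_disable`:
RUSTFLAGS="--cfg aes_avx512_disable" cargo test
```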
{ any(target_arch = "x86", target_arch = "x86_64"), not(aes_force_soft) ))] { + mod x86; mod autodetect; - mod ni; pub use autodetect::*; } else { pub use soft::*; @@ -216,19 +224,19 @@ mod tests { #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(aes_force_soft)))] { - use super::ni; + use super::x86; cpufeatures::new!(aes_intrinsics, "aes"); if aes_intrinsics::get() { - test_for(ni::Aes128::new(&key_128)); - test_for(ni::Aes128Enc::new(&key_128)); - test_for(ni::Aes128Dec::new(&key_128)); - test_for(ni::Aes192::new(&key_192)); - test_for(ni::Aes192Enc::new(&key_192)); - test_for(ni::Aes192Dec::new(&key_192)); - test_for(ni::Aes256::new(&key_256)); - test_for(ni::Aes256Enc::new(&key_256)); - test_for(ni::Aes256Dec::new(&key_256)); + test_for(x86::Aes128::new(&key_128)); + test_for(x86::Aes128Enc::new(&key_128)); + test_for(x86::Aes128Dec::new(&key_128)); + test_for(x86::Aes192::new(&key_192)); + test_for(x86::Aes192Enc::new(&key_192)); + test_for(x86::Aes192Dec::new(&key_192)); + test_for(x86::Aes256::new(&key_256)); + test_for(x86::Aes256Enc::new(&key_256)); + test_for(x86::Aes256Dec::new(&key_256)); } } diff --git a/aes/src/ni.rs b/aes/src/ni.rs deleted file mode 100644 index 9a798102..00000000 --- a/aes/src/ni.rs +++ /dev/null @@ -1,355 +0,0 @@ -//! AES block ciphers implementation using AES-NI instruction set. -//! -//! Ciphers functionality is accessed using `BlockCipher` trait from the -//! [`cipher`](https://docs.rs/cipher) crate. -//! -//! # Vulnerability -//! Lazy FP state restory vulnerability can allow local process to leak content -//! of the FPU register, in which round keys are stored. This vulnerability -//! can be mitigated at the operating system level by installing relevant -//! patches. (i.e. keep your OS updated!) More info: -//! - [Intel advisory](https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00145.html) -//! - [Wikipedia](https://en.wikipedia.org/wiki/Lazy_FP_state_restore) -//! -//! # Related documents -//! - [Intel AES-NI whitepaper](https://software.intel.com/sites/default/files/article/165683/aes-wp-2012-09-22-v01.pdf) -//! 
- [Use of the AES Instruction Set](https://www.cosic.esat.kuleuven.be/ecrypt/AESday/slides/Use_of_the_AES_Instruction_Set.pdf) - -mod encdec; -mod expand; -#[cfg(test)] -mod test_expand; - -#[cfg(feature = "hazmat")] -pub(crate) mod hazmat; - -#[cfg(target_arch = "x86")] -use core::arch::x86 as arch; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64 as arch; - -use cipher::{ - AlgorithmName, BlockCipherDecClosure, BlockCipherDecrypt, BlockCipherEncClosure, - BlockCipherEncrypt, BlockSizeUser, Key, KeyInit, KeySizeUser, - consts::{self, U16, U24, U32}, - crypto_common::WeakKeyError, -}; -use core::fmt; - -impl_backends!( - enc_name = Aes128BackEnc, - dec_name = Aes128BackDec, - key_size = consts::U16, - keys_ty = expand::Aes128RoundKeys, - par_size = consts::U9, - expand_keys = expand::aes128_expand_key, - inv_keys = expand::inv_keys, - encrypt = encdec::encrypt, - encrypt_par = encdec::encrypt_par, - decrypt = encdec::decrypt, - decrypt_par = encdec::decrypt_par, -); - -impl_backends!( - enc_name = Aes192BackEnc, - dec_name = Aes192BackDec, - key_size = consts::U24, - keys_ty = expand::Aes192RoundKeys, - par_size = consts::U9, - expand_keys = expand::aes192_expand_key, - inv_keys = expand::inv_keys, - encrypt = encdec::encrypt, - encrypt_par = encdec::encrypt_par, - decrypt = encdec::decrypt, - decrypt_par = encdec::decrypt_par, -); - -impl_backends!( - enc_name = Aes256BackEnc, - dec_name = Aes256BackDec, - key_size = consts::U32, - keys_ty = expand::Aes256RoundKeys, - par_size = consts::U9, - expand_keys = expand::aes256_expand_key, - inv_keys = expand::inv_keys, - encrypt = encdec::encrypt, - encrypt_par = encdec::encrypt_par, - decrypt = encdec::decrypt, - decrypt_par = encdec::decrypt_par, -); - -macro_rules! define_aes_impl { - ( - $name:tt, - $name_enc:ident, - $name_dec:ident, - $name_back_enc:ident, - $name_back_dec:ident, - $key_size:ty, - $doc:expr $(,)? - ) => { - #[doc=$doc] - #[doc = "block cipher"] - #[derive(Clone)] - pub struct $name { - encrypt: $name_enc, - decrypt: $name_dec, - } - - impl $name { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> &$name_back_enc { - self.encrypt.get_enc_backend() - } - - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> &$name_back_dec { - self.decrypt.get_dec_backend() - } - } - - impl KeySizeUser for $name { - type KeySize = $key_size; - } - - impl KeyInit for $name { - #[inline] - fn new(key: &Key) -> Self { - let encrypt = $name_enc::new(key); - let decrypt = $name_dec::from(&encrypt); - Self { encrypt, decrypt } - } - - #[inline] - fn weak_key_test(key: &Key) -> Result<(), WeakKeyError> { - crate::weak_key_test(&key.0) - } - } - - impl From<$name_enc> for $name { - #[inline] - fn from(encrypt: $name_enc) -> $name { - let decrypt = (&encrypt).into(); - Self { encrypt, decrypt } - } - } - - impl From<&$name_enc> for $name { - #[inline] - fn from(encrypt: &$name_enc) -> $name { - let decrypt = encrypt.into(); - let encrypt = encrypt.clone(); - Self { encrypt, decrypt } - } - } - - impl BlockSizeUser for $name { - type BlockSize = U16; - } - - impl BlockCipherEncrypt for $name { - fn encrypt_with_backend(&self, f: impl BlockCipherEncClosure) { - self.encrypt.encrypt_with_backend(f) - } - } - - impl BlockCipherDecrypt for $name { - fn decrypt_with_backend(&self, f: impl BlockCipherDecClosure) { - self.decrypt.decrypt_with_backend(f) - } - } - - impl fmt::Debug for $name { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - f.write_str(concat!(stringify!($name), " { .. 
}")) - } - } - - impl AlgorithmName for $name { - fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(stringify!($name)) - } - } - - #[cfg(feature = "zeroize")] - impl zeroize::ZeroizeOnDrop for $name {} - - #[doc=$doc] - #[doc = "block cipher (encrypt-only)"] - #[derive(Clone)] - pub struct $name_enc { - backend: $name_back_enc, - } - - impl $name_enc { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> &$name_back_enc { - &self.backend - } - } - - impl KeySizeUser for $name_enc { - type KeySize = $key_size; - } - - impl KeyInit for $name_enc { - #[inline] - fn new(key: &Key) -> Self { - Self { - backend: $name_back_enc::new(key), - } - } - - #[inline] - fn weak_key_test(key: &Key) -> Result<(), WeakKeyError> { - crate::weak_key_test(&key.0) - } - } - - impl BlockSizeUser for $name_enc { - type BlockSize = U16; - } - - impl BlockCipherEncrypt for $name_enc { - fn encrypt_with_backend(&self, f: impl BlockCipherEncClosure) { - f.call(&self.backend) - } - } - - impl fmt::Debug for $name_enc { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - f.write_str(concat!(stringify!($name_enc), " { .. }")) - } - } - - impl AlgorithmName for $name_enc { - fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(stringify!($name_enc)) - } - } - - impl Drop for $name_enc { - #[inline] - fn drop(&mut self) { - #[cfg(feature = "zeroize")] - unsafe { - zeroize::zeroize_flat_type(&mut self.backend) - } - } - } - - #[cfg(feature = "zeroize")] - impl zeroize::ZeroizeOnDrop for $name_enc {} - - #[doc=$doc] - #[doc = "block cipher (decrypt-only)"] - #[derive(Clone)] - pub struct $name_dec { - backend: $name_back_dec, - } - - impl $name_dec { - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> &$name_back_dec { - &self.backend - } - } - - impl KeySizeUser for $name_dec { - type KeySize = $key_size; - } - - impl KeyInit for $name_dec { - #[inline] - fn new(key: &Key) -> Self { - $name_enc::new(key).into() - } - - #[inline] - fn weak_key_test(key: &Key) -> Result<(), WeakKeyError> { - crate::weak_key_test(&key.0) - } - } - - impl From<$name_enc> for $name_dec { - #[inline] - fn from(enc: $name_enc) -> $name_dec { - Self::from(&enc) - } - } - - impl From<&$name_enc> for $name_dec { - #[inline] - fn from(enc: &$name_enc) -> $name_dec { - Self { - backend: enc.backend.clone().into(), - } - } - } - - impl BlockSizeUser for $name_dec { - type BlockSize = U16; - } - - impl BlockCipherDecrypt for $name_dec { - fn decrypt_with_backend(&self, f: impl BlockCipherDecClosure) { - f.call(self.get_dec_backend()); - } - } - - impl fmt::Debug for $name_dec { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - f.write_str(concat!(stringify!($name_dec), " { .. 
}")) - } - } - - impl AlgorithmName for $name_dec { - fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(stringify!($name_dec)) - } - } - - impl Drop for $name_dec { - #[inline] - fn drop(&mut self) { - #[cfg(feature = "zeroize")] - unsafe { - zeroize::zeroize_flat_type(&mut self.backend) - } - } - } - - #[cfg(feature = "zeroize")] - impl zeroize::ZeroizeOnDrop for $name_dec {} - }; -} - -define_aes_impl!( - Aes128, - Aes128Enc, - Aes128Dec, - Aes128BackEnc, - Aes128BackDec, - U16, - "AES-128", -); - -define_aes_impl!( - Aes192, - Aes192Enc, - Aes192Dec, - Aes192BackEnc, - Aes192BackDec, - U24, - "AES-192", -); - -define_aes_impl!( - Aes256, - Aes256Enc, - Aes256Dec, - Aes256BackEnc, - Aes256BackDec, - U32, - "AES-256", -); diff --git a/aes/src/ni/expand.rs b/aes/src/ni/expand.rs deleted file mode 100644 index 325c4c34..00000000 --- a/aes/src/ni/expand.rs +++ /dev/null @@ -1,204 +0,0 @@ -#![allow(unsafe_op_in_unsafe_fn)] - -use super::arch::*; -use core::mem::{transmute, zeroed}; - -pub(super) type Aes128RoundKeys = [__m128i; 11]; -pub(super) type Aes192RoundKeys = [__m128i; 13]; -pub(super) type Aes256RoundKeys = [__m128i; 15]; - -#[target_feature(enable = "aes")] -pub(super) unsafe fn aes128_expand_key(key: &[u8; 16]) -> Aes128RoundKeys { - unsafe fn expand_round(keys: &mut Aes128RoundKeys, pos: usize) { - let mut t1 = keys[pos - 1]; - let mut t2; - let mut t3; - - t2 = _mm_aeskeygenassist_si128(t1, RK); - t2 = _mm_shuffle_epi32(t2, 0xff); - t3 = _mm_slli_si128(t1, 0x4); - t1 = _mm_xor_si128(t1, t3); - t3 = _mm_slli_si128(t3, 0x4); - t1 = _mm_xor_si128(t1, t3); - t3 = _mm_slli_si128(t3, 0x4); - t1 = _mm_xor_si128(t1, t3); - t1 = _mm_xor_si128(t1, t2); - - keys[pos] = t1; - } - - let mut keys: Aes128RoundKeys = zeroed(); - let k = _mm_loadu_si128(key.as_ptr().cast()); - keys[0] = k; - - let kr = &mut keys; - expand_round::<0x01>(kr, 1); - expand_round::<0x02>(kr, 2); - expand_round::<0x04>(kr, 3); - expand_round::<0x08>(kr, 4); - expand_round::<0x10>(kr, 5); - expand_round::<0x20>(kr, 6); - expand_round::<0x40>(kr, 7); - expand_round::<0x80>(kr, 8); - expand_round::<0x1B>(kr, 9); - expand_round::<0x36>(kr, 10); - - keys -} - -#[target_feature(enable = "aes")] -pub(super) unsafe fn aes192_expand_key(key: &[u8; 24]) -> Aes192RoundKeys { - unsafe fn shuffle(a: __m128i, b: __m128i, i: usize) -> __m128i { - let a: [u64; 2] = transmute(a); - let b: [u64; 2] = transmute(b); - transmute([a[i], b[0]]) - } - - #[target_feature(enable = "aes")] - unsafe fn expand_round(mut t1: __m128i, mut t3: __m128i) -> (__m128i, __m128i) { - let (mut t2, mut t4); - - t2 = _mm_aeskeygenassist_si128(t3, RK); - t2 = _mm_shuffle_epi32(t2, 0x55); - t4 = _mm_slli_si128(t1, 0x4); - t1 = _mm_xor_si128(t1, t4); - t4 = _mm_slli_si128(t4, 0x4); - t1 = _mm_xor_si128(t1, t4); - t4 = _mm_slli_si128(t4, 0x4); - t1 = _mm_xor_si128(t1, t4); - t1 = _mm_xor_si128(t1, t2); - t2 = _mm_shuffle_epi32(t1, 0xff); - t4 = _mm_slli_si128(t3, 0x4); - t3 = _mm_xor_si128(t3, t4); - t3 = _mm_xor_si128(t3, t2); - - (t1, t3) - } - - let mut keys: Aes192RoundKeys = zeroed(); - // We are being extra pedantic here to remove out-of-bound access. - // This should be optimized into movups, movsd sequence. 
- let (k0, k1l) = { - let mut t = [0u8; 32]; - t[..key.len()].copy_from_slice(key); - ( - _mm_loadu_si128(t.as_ptr().cast()), - _mm_loadu_si128(t.as_ptr().offset(16).cast()), - ) - }; - - keys[0] = k0; - - let (k1_2, k2r) = expand_round::<0x01>(k0, k1l); - keys[1] = shuffle(k1l, k1_2, 0); - keys[2] = shuffle(k1_2, k2r, 1); - - let (k3, k4l) = expand_round::<0x02>(k1_2, k2r); - keys[3] = k3; - - let (k4_5, k5r) = expand_round::<0x04>(k3, k4l); - let k4 = shuffle(k4l, k4_5, 0); - let k5 = shuffle(k4_5, k5r, 1); - keys[4] = k4; - keys[5] = k5; - - let (k6, k7l) = expand_round::<0x08>(k4_5, k5r); - keys[6] = k6; - - let (k7_8, k8r) = expand_round::<0x10>(k6, k7l); - keys[7] = shuffle(k7l, k7_8, 0); - keys[8] = shuffle(k7_8, k8r, 1); - - let (k9, k10l) = expand_round::<0x20>(k7_8, k8r); - keys[9] = k9; - - let (k10_11, k11r) = expand_round::<0x40>(k9, k10l); - keys[10] = shuffle(k10l, k10_11, 0); - keys[11] = shuffle(k10_11, k11r, 1); - - let (k12, _) = expand_round::<0x80>(k10_11, k11r); - keys[12] = k12; - - keys -} - -#[target_feature(enable = "aes")] -pub(super) unsafe fn aes256_expand_key(key: &[u8; 32]) -> Aes256RoundKeys { - unsafe fn expand_round(keys: &mut Aes256RoundKeys, pos: usize) { - let mut t1 = keys[pos - 2]; - let mut t2; - let mut t3 = keys[pos - 1]; - let mut t4; - - t2 = _mm_aeskeygenassist_si128(t3, RK); - t2 = _mm_shuffle_epi32(t2, 0xff); - t4 = _mm_slli_si128(t1, 0x4); - t1 = _mm_xor_si128(t1, t4); - t4 = _mm_slli_si128(t4, 0x4); - t1 = _mm_xor_si128(t1, t4); - t4 = _mm_slli_si128(t4, 0x4); - t1 = _mm_xor_si128(t1, t4); - t1 = _mm_xor_si128(t1, t2); - - keys[pos] = t1; - - t4 = _mm_aeskeygenassist_si128(t1, 0x00); - t2 = _mm_shuffle_epi32(t4, 0xaa); - t4 = _mm_slli_si128(t3, 0x4); - t3 = _mm_xor_si128(t3, t4); - t4 = _mm_slli_si128(t4, 0x4); - t3 = _mm_xor_si128(t3, t4); - t4 = _mm_slli_si128(t4, 0x4); - t3 = _mm_xor_si128(t3, t4); - t3 = _mm_xor_si128(t3, t2); - - keys[pos + 1] = t3; - } - - unsafe fn expand_round_last(keys: &mut Aes256RoundKeys, pos: usize) { - let mut t1 = keys[pos - 2]; - let mut t2; - let t3 = keys[pos - 1]; - let mut t4; - - t2 = _mm_aeskeygenassist_si128(t3, RK); - t2 = _mm_shuffle_epi32(t2, 0xff); - t4 = _mm_slli_si128(t1, 0x4); - t1 = _mm_xor_si128(t1, t4); - t4 = _mm_slli_si128(t4, 0x4); - t1 = _mm_xor_si128(t1, t4); - t4 = _mm_slli_si128(t4, 0x4); - t1 = _mm_xor_si128(t1, t4); - t1 = _mm_xor_si128(t1, t2); - - keys[pos] = t1; - } - - let mut keys: Aes256RoundKeys = zeroed(); - - let kp = key.as_ptr().cast::<__m128i>(); - keys[0] = _mm_loadu_si128(kp); - keys[1] = _mm_loadu_si128(kp.add(1)); - - let k = &mut keys; - expand_round::<0x01>(k, 2); - expand_round::<0x02>(k, 4); - expand_round::<0x04>(k, 6); - expand_round::<0x08>(k, 8); - expand_round::<0x10>(k, 10); - expand_round::<0x20>(k, 12); - expand_round_last::<0x40>(k, 14); - - keys -} - -#[target_feature(enable = "aes")] -pub(super) unsafe fn inv_keys(keys: &[__m128i; N]) -> [__m128i; N] { - let mut inv_keys: [__m128i; N] = zeroed(); - inv_keys[0] = keys[N - 1]; - for i in 1..N - 1 { - inv_keys[i] = _mm_aesimc_si128(keys[N - 1 - i]); - } - inv_keys[N - 1] = keys[0]; - inv_keys -} diff --git a/aes/src/soft.rs b/aes/src/soft.rs index f8d60617..9e3fe2be 100644 --- a/aes/src/soft.rs +++ b/aes/src/soft.rs @@ -45,18 +45,6 @@ macro_rules! 
define_aes_impl { keys: $fixslice_keys, } - impl $name { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> { - $name_back_enc(self) - } - - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> { - $name_back_dec(self) - } - } - impl KeySizeUser for $name { type KeySize = $key_size; } @@ -81,13 +69,13 @@ macro_rules! define_aes_impl { impl BlockCipherEncrypt for $name { fn encrypt_with_backend(&self, f: impl BlockCipherEncClosure) { - f.call(&self.get_enc_backend()) + f.call(&$name_back_enc(self)) } } impl BlockCipherDecrypt for $name { fn decrypt_with_backend(&self, f: impl BlockCipherDecClosure) { - f.call(&self.get_dec_backend()) + f.call(&$name_back_dec(self)) } } @@ -135,13 +123,6 @@ macro_rules! define_aes_impl { inner: $name, } - impl $name_enc { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> { - self.inner.get_enc_backend() - } - } - impl KeySizeUser for $name_enc { type KeySize = $key_size; } @@ -165,7 +146,7 @@ macro_rules! define_aes_impl { impl BlockCipherEncrypt for $name_enc { fn encrypt_with_backend(&self, f: impl BlockCipherEncClosure) { - f.call(&self.get_enc_backend()) + f.call(&mut $name_back_enc(&self.inner)) } } @@ -191,13 +172,6 @@ macro_rules! define_aes_impl { inner: $name, } - impl $name_dec { - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> { - self.inner.get_dec_backend() - } - } - impl KeySizeUser for $name_dec { type KeySize = $key_size; } @@ -237,7 +211,7 @@ macro_rules! define_aes_impl { impl BlockCipherDecrypt for $name_dec { fn decrypt_with_backend(&self, f: impl BlockCipherDecClosure) { - f.call(&self.get_dec_backend()); + f.call(&$name_back_dec(&self.inner)); } } diff --git a/aes/src/x86.rs b/aes/src/x86.rs new file mode 100644 index 00000000..5ab0d1da --- /dev/null +++ b/aes/src/x86.rs @@ -0,0 +1,714 @@ +pub(crate) mod ni; +#[cfg(target_arch = "x86_64")] +pub(crate) mod vaes256; +#[cfg(target_arch = "x86_64")] +pub(crate) mod vaes512; + +#[cfg(target_arch = "x86")] +use core::arch::x86 as arch; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64 as arch; + +use self::arch::*; +use crate::Block; +use cipher::{ + AlgorithmName, BlockCipherDecBackend, BlockCipherDecClosure, BlockCipherDecrypt, + BlockCipherEncBackend, BlockCipherEncClosure, BlockCipherEncrypt, BlockSizeUser, InOut, Key, + KeyInit, KeySizeUser, ParBlocksSizeUser, + consts::{U9, U16, U24, U32}, + crypto_common::WeakKeyError, +}; +#[cfg(target_arch = "x86_64")] +use cipher::{ + Array, InOutBuf, + consts::{U30, U64}, + typenum::Unsigned, +}; +#[cfg(target_arch = "x86_64")] +use core::cell::OnceCell; +use core::fmt; + +#[cfg(target_arch = "x86_64")] +pub(crate) type Block30 = Array; +#[cfg(target_arch = "x86_64")] +pub(crate) type Block64 = Array; + +pub(crate) mod features { + cpufeatures::new!(features_aes, "aes"); + cpufeatures::new!(features_avx, "avx"); + cpufeatures::new!(features_avx512f, "avx512f"); + cpufeatures::new!(features_vaes, "vaes"); + pub(crate) mod aes { + pub use super::features_aes::*; + } + #[cfg(target_arch = "x86_64")] + pub(crate) mod avx { + pub use super::features_avx::*; + } + #[cfg(target_arch = "x86_64")] + pub(crate) mod avx512f { + pub use super::features_avx512f::*; + } + #[cfg(target_arch = "x86_64")] + pub(crate) mod vaes { + pub use super::features_vaes::*; + } +} + +type Simd128RoundKeys = [__m128i; ROUNDS]; +#[cfg(target_arch = "x86_64")] +type Simd256RoundKeys = [__m256i; ROUNDS]; +#[cfg(target_arch = "x86_64")] +type 
Simd512RoundKeys = [__m512i; ROUNDS]; + +#[derive(Clone)] +enum Backend { + Ni, + #[cfg(target_arch = "x86_64")] + Vaes256, + #[cfg(target_arch = "x86_64")] + Vaes512, +} + +#[derive(Clone, Copy)] +struct Features { + #[cfg(target_arch = "x86_64")] + avx: self::features::avx::InitToken, + #[cfg(target_arch = "x86_64")] + avx512f: self::features::avx512f::InitToken, + #[cfg(target_arch = "x86_64")] + vaes: self::features::vaes::InitToken, +} + +impl Features { + fn new() -> Self { + Self { + #[cfg(target_arch = "x86_64")] + avx: self::features::avx::init(), + #[cfg(target_arch = "x86_64")] + avx512f: self::features::avx512f::init(), + #[cfg(target_arch = "x86_64")] + vaes: self::features::vaes::init(), + } + } + + #[cfg(target_arch = "x86_64")] + fn has_vaes256(&self) -> bool { + #[cfg(target_arch = "x86_64")] + if self.vaes.get() && self.avx.get() && !cfg!(aes_avx256_disable) { + return true; + } + false + } + + #[cfg(target_arch = "x86_64")] + fn has_vaes512(&self) -> bool { + #[cfg(target_arch = "x86_64")] + if self.vaes.get() && self.avx512f.get() && !cfg!(aes_avx512_disable) { + return true; + } + false + } + + fn dispatch(&self) -> Backend { + #[cfg(target_arch = "x86_64")] + if self.has_vaes512() { + return self::Backend::Vaes512; + } + #[cfg(target_arch = "x86_64")] + if self.has_vaes256() { + return self::Backend::Vaes256; + } + Backend::Ni + } +} + +macro_rules! define_aes_impl { + ( + $name:tt, + $name_enc:ident, + $name_dec:ident, + $name_backend:ident, + $module:tt, + $key_size:ty, + $rounds:tt, + $doc:expr $(,)? + ) => { + mod $name_backend { + use super::*; + + #[derive(Clone)] + pub(crate) struct Ni<'a> { + pub(crate) keys: &'a Simd128RoundKeys<$rounds>, + } + #[cfg(target_arch = "x86_64")] + impl<'a> Ni<'a> { + pub const fn par_blocks(&self) -> usize { + ::ParBlocksSize::USIZE + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> From<&Vaes256<'a>> for Ni<'a> { + fn from(backend: &Vaes256<'a>) -> Self { + Self { keys: backend.keys } + } + } + + #[derive(Clone)] + #[cfg(target_arch = "x86_64")] + pub(crate) struct Vaes256<'a> { + pub(crate) features: Features, + pub(crate) keys: &'a Simd128RoundKeys<$rounds>, + pub(crate) simd_256_keys: OnceCell>, + } + #[cfg(target_arch = "x86_64")] + impl<'a> Vaes256<'a> { + pub const fn par_blocks(&self) -> usize { + ::ParBlocksSize::USIZE + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> From<&Vaes512<'a>> for Vaes256<'a> { + fn from(backend: &Vaes512<'a>) -> Self { + Self { + features: backend.features, + keys: backend.keys, + simd_256_keys: OnceCell::new(), + } + } + } + + #[cfg(target_arch = "x86_64")] + pub(crate) struct Vaes512<'a> { + pub(crate) features: Features, + pub(crate) keys: &'a Simd128RoundKeys<$rounds>, + pub(crate) simd_512_keys: OnceCell>, + } + } + + #[doc=$doc] + #[doc = "block cipher"] + #[derive(Clone)] + pub struct $name { + encrypt: $name_enc, + decrypt: $name_dec, + } + + #[cfg(feature = "zeroize")] + impl zeroize::ZeroizeOnDrop for $name {} + + impl KeySizeUser for $name { + type KeySize = $key_size; + } + + impl KeyInit for $name { + #[inline] + fn new(key: &Key) -> Self { + let encrypt = $name_enc::new(key); + let decrypt = $name_dec::from(&encrypt); + Self { encrypt, decrypt } + } + + #[inline] + fn weak_key_test(key: &Key) -> Result<(), WeakKeyError> { + crate::weak_key_test(&key.0) + } + } + + impl From<$name_enc> for $name { + #[inline] + fn from(encrypt: $name_enc) -> $name { + let decrypt = (&encrypt).into(); + Self { encrypt, decrypt } + } + } + + impl From<&$name_enc> for $name { + #[inline] + fn 
from(encrypt: &$name_enc) -> $name { + let decrypt = encrypt.into(); + let encrypt = encrypt.clone(); + Self { encrypt, decrypt } + } + } + + impl BlockSizeUser for $name { + type BlockSize = U16; + } + + impl BlockCipherEncrypt for $name { + #[inline] + fn encrypt_with_backend(&self, f: impl BlockCipherEncClosure) { + self.encrypt.encrypt_with_backend(f) + } + } + + impl BlockCipherDecrypt for $name { + #[inline] + fn decrypt_with_backend(&self, f: impl BlockCipherDecClosure) { + self.decrypt.decrypt_with_backend(f) + } + } + + impl fmt::Debug for $name { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + f.write_str(concat!(stringify!($name), " { .. }")) + } + } + + impl AlgorithmName for $name { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(stringify!($name)) + } + } + + #[doc=$doc] + #[doc = "block cipher (encrypt-only)"] + #[derive(Clone)] + pub struct $name_enc { + keys: Simd128RoundKeys<$rounds>, + features: Features, + } + + impl Drop for $name_enc { + fn drop(&mut self) { + #[cfg(feature = "zeroize")] + unsafe { + zeroize::zeroize_flat_type(&mut self.keys) + } + } + } + + #[cfg(feature = "zeroize")] + impl zeroize::ZeroizeOnDrop for $name_enc {} + + impl KeySizeUser for $name_enc { + type KeySize = $key_size; + } + + impl KeyInit for $name_enc { + #[inline] + fn new(key: &Key) -> Self { + // SAFETY: we enforce that this code is called only when + // target features required by `expand` were properly checked. + Self { + keys: unsafe { self::ni::expand::$module::expand_key(key.as_ref()) }, + features: Features::new(), + } + } + + #[inline] + fn weak_key_test(key: &Key) -> Result<(), WeakKeyError> { + crate::weak_key_test(&key.0) + } + } + + impl BlockSizeUser for $name_enc { + type BlockSize = U16; + } + + impl BlockCipherEncrypt for $name_enc { + #[inline] + fn encrypt_with_backend(&self, f: impl BlockCipherEncClosure) { + let features = self.features; + let keys = &self.keys; + match features.dispatch() { + self::Backend::Ni => f.call(&mut $name_backend::Ni { keys }), + #[cfg(target_arch = "x86_64")] + self::Backend::Vaes256 => f.call(&mut $name_backend::Vaes256 { + features, + keys, + simd_256_keys: OnceCell::new(), + }), + #[cfg(target_arch = "x86_64")] + self::Backend::Vaes512 => f.call(&mut $name_backend::Vaes512 { + features, + keys, + simd_512_keys: OnceCell::new(), + }), + } + } + } + + impl fmt::Debug for $name_enc { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + f.write_str(concat!(stringify!($name_enc), " { .. 
}")) + } + } + + impl AlgorithmName for $name_enc { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(stringify!($name_enc)) + } + } + + #[doc=$doc] + #[doc = "block cipher (decrypt-only)"] + #[derive(Clone)] + pub struct $name_dec { + keys: Simd128RoundKeys<$rounds>, + features: Features, + } + + impl Drop for $name_dec { + fn drop(&mut self) { + #[cfg(feature = "zeroize")] + unsafe { + zeroize::zeroize_flat_type(&mut self.keys) + } + } + } + + #[cfg(feature = "zeroize")] + impl zeroize::ZeroizeOnDrop for $name_dec {} + + impl KeySizeUser for $name_dec { + type KeySize = $key_size; + } + + impl KeyInit for $name_dec { + #[inline] + fn new(key: &Key) -> Self { + $name_enc::new(key).into() + } + + #[inline] + fn weak_key_test(key: &Key) -> Result<(), WeakKeyError> { + crate::weak_key_test(&key.0) + } + } + + impl From<$name_enc> for $name_dec { + #[inline] + fn from(enc: $name_enc) -> $name_dec { + Self::from(&enc) + } + } + + impl From<&$name_enc> for $name_dec { + #[inline] + fn from(enc: &$name_enc) -> $name_dec { + Self { + keys: unsafe { self::ni::expand::inv_keys(&enc.keys) }, + features: enc.features.clone(), + } + } + } + + impl BlockSizeUser for $name_dec { + type BlockSize = U16; + } + + impl BlockCipherDecrypt for $name_dec { + #[inline] + fn decrypt_with_backend(&self, f: impl BlockCipherDecClosure) { + let features = self.features; + let keys = &self.keys; + match features.dispatch() { + self::Backend::Ni => f.call(&mut $name_backend::Ni { keys }), + #[cfg(target_arch = "x86_64")] + self::Backend::Vaes256 => f.call(&mut $name_backend::Vaes256 { + features, + keys, + simd_256_keys: OnceCell::new(), + }), + #[cfg(target_arch = "x86_64")] + self::Backend::Vaes512 => f.call(&mut $name_backend::Vaes512 { + features, + keys, + simd_512_keys: OnceCell::new(), + }), + } + } + } + + impl fmt::Debug for $name_dec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + f.write_str(concat!(stringify!($name_dec), " { .. 
}")) + } + } + + impl AlgorithmName for $name_dec { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(stringify!($name_dec)) + } + } + + impl<'a> BlockSizeUser for $name_backend::Ni<'a> { + type BlockSize = U16; + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockSizeUser for $name_backend::Vaes256<'a> { + type BlockSize = U16; + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockSizeUser for $name_backend::Vaes512<'a> { + type BlockSize = U16; + } + + impl<'a> ParBlocksSizeUser for $name_backend::Ni<'a> { + type ParBlocksSize = U9; + } + #[cfg(target_arch = "x86_64")] + impl<'a> ParBlocksSizeUser for $name_backend::Vaes256<'a> { + type ParBlocksSize = U30; + } + #[cfg(target_arch = "x86_64")] + impl<'a> ParBlocksSizeUser for $name_backend::Vaes512<'a> { + type ParBlocksSize = U64; + } + + impl<'a> BlockCipherEncBackend for $name_backend::Ni<'a> { + #[inline] + fn encrypt_block(&self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::encdec::encrypt(self.keys, block); + } + } + #[inline] + fn encrypt_par_blocks(&self, blocks: InOut<'_, '_, cipher::ParBlocks>) { + unsafe { + self::ni::encdec::encrypt_par(self.keys, blocks); + } + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockCipherEncBackend for $name_backend::Vaes256<'a> { + #[inline] + fn encrypt_block(&self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::encdec::encrypt(self.keys, block); + } + } + #[inline] + fn encrypt_par_blocks(&self, blocks: InOut<'_, '_, cipher::ParBlocks>) { + unsafe { + let simd_256_keys = self + .simd_256_keys + .get_or_init(|| vaes256::$module::broadcast_keys(&self.keys)); + vaes256::$module::encrypt30(simd_256_keys, blocks); + } + } + #[inline] + fn encrypt_tail_blocks(&self, blocks: InOutBuf<'_, '_, Block>) { + let backend = self; + + let mut rem = blocks.len(); + let (mut iptr, mut optr) = blocks.into_raw(); + + let backend = $name_backend::Ni::from(backend); + while rem >= backend.par_blocks() { + let blocks = unsafe { InOut::from_raw(iptr.cast(), optr.cast()) }; + backend.encrypt_par_blocks(blocks); + rem -= backend.par_blocks(); + iptr = unsafe { iptr.add(backend.par_blocks()) }; + optr = unsafe { optr.add(backend.par_blocks()) }; + } + + while rem > 0 { + let block = unsafe { InOut::from_raw(iptr, optr) }; + backend.encrypt_block(block); + rem -= 1; + iptr = unsafe { iptr.add(1) }; + optr = unsafe { optr.add(1) }; + } + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockCipherEncBackend for $name_backend::Vaes512<'a> { + #[inline] + fn encrypt_block(&self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::encdec::encrypt(self.keys, block); + } + } + #[inline] + fn encrypt_par_blocks(&self, blocks: InOut<'_, '_, cipher::ParBlocks>) { + unsafe { + let simd_512_keys = self + .simd_512_keys + .get_or_init(|| vaes512::$module::broadcast_keys(&self.keys)); + vaes512::$module::encrypt64(simd_512_keys, blocks); + } + } + #[inline] + fn encrypt_tail_blocks(&self, blocks: InOutBuf<'_, '_, Block>) { + let backend = self; + + let mut rem = blocks.len(); + let (mut iptr, mut optr) = blocks.into_raw(); + + let backend = &$name_backend::Vaes256::from(backend); + if backend.features.has_vaes256() { + while rem >= backend.par_blocks() { + let blocks = unsafe { InOut::from_raw(iptr.cast(), optr.cast()) }; + backend.encrypt_par_blocks(blocks); + rem -= backend.par_blocks(); + iptr = unsafe { iptr.add(backend.par_blocks()) }; + optr = unsafe { optr.add(backend.par_blocks()) }; + } + } + + let backend = &$name_backend::Ni::from(backend); + while rem >= 
backend.par_blocks() { + let blocks = unsafe { InOut::from_raw(iptr.cast(), optr.cast()) }; + backend.encrypt_par_blocks(blocks); + rem -= backend.par_blocks(); + iptr = unsafe { iptr.add(backend.par_blocks()) }; + optr = unsafe { optr.add(backend.par_blocks()) }; + } + + while rem > 0 { + let block = unsafe { InOut::from_raw(iptr, optr) }; + backend.encrypt_block(block); + rem -= 1; + iptr = unsafe { iptr.add(1) }; + optr = unsafe { optr.add(1) }; + } + } + } + + impl<'a> BlockCipherDecBackend for $name_backend::Ni<'a> { + #[inline] + fn decrypt_block(&self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::encdec::decrypt(self.keys, block); + } + } + #[inline] + fn decrypt_par_blocks(&self, blocks: InOut<'_, '_, cipher::ParBlocks>) { + unsafe { + self::ni::encdec::decrypt_par(self.keys, blocks); + } + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockCipherDecBackend for $name_backend::Vaes256<'a> { + #[inline] + fn decrypt_block(&self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::encdec::decrypt(self.keys, block); + } + } + #[inline] + fn decrypt_par_blocks(&self, blocks: InOut<'_, '_, cipher::ParBlocks>) { + unsafe { + let simd_256_keys = self + .simd_256_keys + .get_or_init(|| vaes256::$module::broadcast_keys(&self.keys)); + vaes256::$module::decrypt30(simd_256_keys, blocks); + } + } + #[inline] + fn decrypt_tail_blocks(&self, blocks: InOutBuf<'_, '_, Block>) { + let backend = self; + + let mut rem = blocks.len(); + let (mut iptr, mut optr) = blocks.into_raw(); + + let backend = $name_backend::Ni::from(backend); + while rem >= backend.par_blocks() { + let blocks = unsafe { InOut::from_raw(iptr.cast(), optr.cast()) }; + backend.decrypt_par_blocks(blocks); + rem -= backend.par_blocks(); + iptr = unsafe { iptr.add(backend.par_blocks()) }; + optr = unsafe { optr.add(backend.par_blocks()) }; + } + + while rem > 0 { + let block = unsafe { InOut::from_raw(iptr, optr) }; + backend.decrypt_block(block); + rem -= 1; + iptr = unsafe { iptr.add(1) }; + optr = unsafe { optr.add(1) }; + } + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockCipherDecBackend for $name_backend::Vaes512<'a> { + #[inline] + fn decrypt_block(&self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::encdec::decrypt(self.keys, block); + } + } + #[inline] + fn decrypt_par_blocks(&self, blocks: InOut<'_, '_, cipher::ParBlocks>) { + unsafe { + let simd_512_keys = self + .simd_512_keys + .get_or_init(|| vaes512::$module::broadcast_keys(&self.keys)); + vaes512::$module::decrypt64(simd_512_keys, blocks); + } + } + #[inline] + fn decrypt_tail_blocks(&self, blocks: InOutBuf<'_, '_, Block>) { + let backend = self; + + let mut rem = blocks.len(); + let (mut iptr, mut optr) = blocks.into_raw(); + + let backend = &$name_backend::Vaes256::from(backend); + if backend.features.has_vaes256() { + while rem >= backend.par_blocks() { + let blocks = unsafe { InOut::from_raw(iptr.cast(), optr.cast()) }; + backend.decrypt_par_blocks(blocks); + rem -= backend.par_blocks(); + iptr = unsafe { iptr.add(backend.par_blocks()) }; + optr = unsafe { optr.add(backend.par_blocks()) }; + } + } + + let backend = &$name_backend::Ni::from(backend); + while rem >= backend.par_blocks() { + let blocks = unsafe { InOut::from_raw(iptr.cast(), optr.cast()) }; + backend.decrypt_par_blocks(blocks); + rem -= backend.par_blocks(); + iptr = unsafe { iptr.add(backend.par_blocks()) }; + optr = unsafe { optr.add(backend.par_blocks()) }; + } + + while rem > 0 { + let block = unsafe { InOut::from_raw(iptr, optr) }; + backend.decrypt_block(block); + 
rem -= 1; + iptr = unsafe { iptr.add(1) }; + optr = unsafe { optr.add(1) }; + } + } + } + }; +} + +define_aes_impl!( + Aes128, + Aes128Enc, + Aes128Dec, + aes128_backend, + aes128, + U16, + 11, + "AES-128", +); + +define_aes_impl!( + Aes192, + Aes192Enc, + Aes192Dec, + aes192_backend, + aes192, + U24, + 13, + "AES-192", +); + +define_aes_impl!( + Aes256, + Aes256Enc, + Aes256Dec, + aes256_backend, + aes256, + U32, + 15, + "AES-256", +); diff --git a/aes/src/x86/ni.rs b/aes/src/x86/ni.rs new file mode 100644 index 00000000..c35b1b86 --- /dev/null +++ b/aes/src/x86/ni.rs @@ -0,0 +1,24 @@ +//! AES block ciphers implementation using AES-NI instruction set. +//! +//! Cipher functionality is accessed using the `BlockCipher` trait from the +//! [`cipher`](https://docs.rs/cipher) crate. +//! +//! # Vulnerability +//! The Lazy FP state restore vulnerability can allow a local process to leak the +//! contents of the FPU registers, in which round keys are stored. This vulnerability +//! can be mitigated at the operating system level by installing relevant +//! patches. (i.e. keep your OS updated!) More info: +//! - [Intel advisory](https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00145.html) +//! - [Wikipedia](https://en.wikipedia.org/wiki/Lazy_FP_state_restore) +//! +//! # Related documents +//! - [Intel AES-NI whitepaper](https://software.intel.com/sites/default/files/article/165683/aes-wp-2012-09-22-v01.pdf) +//! - [Use of the AES Instruction Set](https://www.cosic.esat.kuleuven.be/ecrypt/AESday/slides/Use_of_the_AES_Instruction_Set.pdf) + +pub(super) mod encdec; +pub(super) mod expand; +#[cfg(test)] +mod test_expand; + +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat; diff --git a/aes/src/ni/encdec.rs b/aes/src/x86/ni/encdec.rs similarity index 94% rename from aes/src/ni/encdec.rs rename to aes/src/x86/ni/encdec.rs index 12db0da2..b68ed5f9 100644 --- a/aes/src/ni/encdec.rs +++ b/aes/src/x86/ni/encdec.rs @@ -1,14 +1,14 @@ #![allow(unsafe_op_in_unsafe_fn)] -use super::arch::*; use crate::Block; +use crate::x86::arch::*; use cipher::{ array::{Array, ArraySize}, inout::InOut, }; #[target_feature(enable = "aes")] -pub(super) unsafe fn encrypt( +pub(crate) unsafe fn encrypt( keys: &[__m128i; KEYS], block: InOut<'_, '_, Block>, ) { @@ -25,7 +25,7 @@ pub(super) unsafe fn encrypt( } #[target_feature(enable = "aes")] -pub(super) unsafe fn decrypt( +pub(crate) unsafe fn decrypt( keys: &[__m128i; KEYS], block: InOut<'_, '_, Block>, ) { @@ -42,7 +42,7 @@ pub(super) unsafe fn decrypt( } #[target_feature(enable = "aes")] -pub(super) unsafe fn encrypt_par( +pub(crate) unsafe fn encrypt_par( keys: &[__m128i; KEYS], blocks: InOut<'_, '_, Array>, ) { @@ -75,7 +75,7 @@ pub(super) unsafe fn encrypt_par( } #[target_feature(enable = "aes")] -pub(super) unsafe fn decrypt_par( +pub(crate) unsafe fn decrypt_par( keys: &[__m128i; KEYS], blocks: InOut<'_, '_, Array>, ) { diff --git a/aes/src/x86/ni/expand.rs b/aes/src/x86/ni/expand.rs new file mode 100644 index 00000000..68bdb357 --- /dev/null +++ b/aes/src/x86/ni/expand.rs @@ -0,0 +1,219 @@ +#![allow(unsafe_op_in_unsafe_fn)] + +use crate::x86::arch::*; +use core::mem::{transmute, zeroed}; + +pub(super) type Aes128RoundKeys = [__m128i; 11]; +pub(super) type Aes192RoundKeys = [__m128i; 13]; +pub(super) type Aes256RoundKeys = [__m128i; 15]; + +pub(crate) mod aes128 { + use super::*; + + #[target_feature(enable = "aes")] + pub(crate) unsafe fn expand_key(key: &[u8; 16]) -> Aes128RoundKeys { + unsafe fn expand_round(keys: &mut Aes128RoundKeys, pos: usize) { + let 
mut t1 = keys[pos - 1]; + let mut t2; + let mut t3; + + t2 = _mm_aeskeygenassist_si128(t1, RK); + t2 = _mm_shuffle_epi32(t2, 0xff); + t3 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t3); + t3 = _mm_slli_si128(t3, 0x4); + t1 = _mm_xor_si128(t1, t3); + t3 = _mm_slli_si128(t3, 0x4); + t1 = _mm_xor_si128(t1, t3); + t1 = _mm_xor_si128(t1, t2); + + keys[pos] = t1; + } + + let mut keys: Aes128RoundKeys = zeroed(); + let k = _mm_loadu_si128(key.as_ptr().cast()); + keys[0] = k; + + let kr = &mut keys; + expand_round::<0x01>(kr, 1); + expand_round::<0x02>(kr, 2); + expand_round::<0x04>(kr, 3); + expand_round::<0x08>(kr, 4); + expand_round::<0x10>(kr, 5); + expand_round::<0x20>(kr, 6); + expand_round::<0x40>(kr, 7); + expand_round::<0x80>(kr, 8); + expand_round::<0x1B>(kr, 9); + expand_round::<0x36>(kr, 10); + + keys + } +} + +pub(crate) mod aes192 { + use super::*; + + #[target_feature(enable = "aes")] + pub(crate) unsafe fn expand_key(key: &[u8; 24]) -> Aes192RoundKeys { + unsafe fn shuffle(a: __m128i, b: __m128i, i: usize) -> __m128i { + let a: [u64; 2] = transmute(a); + let b: [u64; 2] = transmute(b); + transmute([a[i], b[0]]) + } + + #[target_feature(enable = "aes")] + unsafe fn expand_round( + mut t1: __m128i, + mut t3: __m128i, + ) -> (__m128i, __m128i) { + let (mut t2, mut t4); + + t2 = _mm_aeskeygenassist_si128(t3, RK); + t2 = _mm_shuffle_epi32(t2, 0x55); + t4 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t1 = _mm_xor_si128(t1, t2); + t2 = _mm_shuffle_epi32(t1, 0xff); + t4 = _mm_slli_si128(t3, 0x4); + t3 = _mm_xor_si128(t3, t4); + t3 = _mm_xor_si128(t3, t2); + + (t1, t3) + } + + let mut keys: Aes192RoundKeys = zeroed(); + // We are being extra pedantic here to remove out-of-bound access. + // This should be optimized into movups, movsd sequence. 
+ let (k0, k1l) = { + let mut t = [0u8; 32]; + t[..key.len()].copy_from_slice(key); + ( + _mm_loadu_si128(t.as_ptr().cast()), + _mm_loadu_si128(t.as_ptr().offset(16).cast()), + ) + }; + + keys[0] = k0; + + let (k1_2, k2r) = expand_round::<0x01>(k0, k1l); + keys[1] = shuffle(k1l, k1_2, 0); + keys[2] = shuffle(k1_2, k2r, 1); + + let (k3, k4l) = expand_round::<0x02>(k1_2, k2r); + keys[3] = k3; + + let (k4_5, k5r) = expand_round::<0x04>(k3, k4l); + let k4 = shuffle(k4l, k4_5, 0); + let k5 = shuffle(k4_5, k5r, 1); + keys[4] = k4; + keys[5] = k5; + + let (k6, k7l) = expand_round::<0x08>(k4_5, k5r); + keys[6] = k6; + + let (k7_8, k8r) = expand_round::<0x10>(k6, k7l); + keys[7] = shuffle(k7l, k7_8, 0); + keys[8] = shuffle(k7_8, k8r, 1); + + let (k9, k10l) = expand_round::<0x20>(k7_8, k8r); + keys[9] = k9; + + let (k10_11, k11r) = expand_round::<0x40>(k9, k10l); + keys[10] = shuffle(k10l, k10_11, 0); + keys[11] = shuffle(k10_11, k11r, 1); + + let (k12, _) = expand_round::<0x80>(k10_11, k11r); + keys[12] = k12; + + keys + } +} + +pub(crate) mod aes256 { + use super::*; + + #[target_feature(enable = "aes")] + pub(crate) unsafe fn expand_key(key: &[u8; 32]) -> Aes256RoundKeys { + unsafe fn expand_round(keys: &mut Aes256RoundKeys, pos: usize) { + let mut t1 = keys[pos - 2]; + let mut t2; + let mut t3 = keys[pos - 1]; + let mut t4; + + t2 = _mm_aeskeygenassist_si128(t3, RK); + t2 = _mm_shuffle_epi32(t2, 0xff); + t4 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t1 = _mm_xor_si128(t1, t2); + + keys[pos] = t1; + + t4 = _mm_aeskeygenassist_si128(t1, 0x00); + t2 = _mm_shuffle_epi32(t4, 0xaa); + t4 = _mm_slli_si128(t3, 0x4); + t3 = _mm_xor_si128(t3, t4); + t4 = _mm_slli_si128(t4, 0x4); + t3 = _mm_xor_si128(t3, t4); + t4 = _mm_slli_si128(t4, 0x4); + t3 = _mm_xor_si128(t3, t4); + t3 = _mm_xor_si128(t3, t2); + + keys[pos + 1] = t3; + } + + unsafe fn expand_round_last(keys: &mut Aes256RoundKeys, pos: usize) { + let mut t1 = keys[pos - 2]; + let mut t2; + let t3 = keys[pos - 1]; + let mut t4; + + t2 = _mm_aeskeygenassist_si128(t3, RK); + t2 = _mm_shuffle_epi32(t2, 0xff); + t4 = _mm_slli_si128(t1, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + t1 = _mm_xor_si128(t1, t4); + t1 = _mm_xor_si128(t1, t2); + + keys[pos] = t1; + } + + let mut keys: Aes256RoundKeys = zeroed(); + + let kp = key.as_ptr().cast::<__m128i>(); + keys[0] = _mm_loadu_si128(kp); + keys[1] = _mm_loadu_si128(kp.add(1)); + + let k = &mut keys; + expand_round::<0x01>(k, 2); + expand_round::<0x02>(k, 4); + expand_round::<0x04>(k, 6); + expand_round::<0x08>(k, 8); + expand_round::<0x10>(k, 10); + expand_round::<0x20>(k, 12); + expand_round_last::<0x40>(k, 14); + + keys + } +} + +#[target_feature(enable = "aes")] +pub(crate) unsafe fn inv_keys(keys: &[__m128i; N]) -> [__m128i; N] { + let mut inv_keys: [__m128i; N] = zeroed(); + inv_keys[0] = keys[N - 1]; + for i in 1..N - 1 { + inv_keys[i] = _mm_aesimc_si128(keys[N - 1 - i]); + } + inv_keys[N - 1] = keys[0]; + inv_keys +} diff --git a/aes/src/ni/hazmat.rs b/aes/src/x86/ni/hazmat.rs similarity index 99% rename from aes/src/ni/hazmat.rs rename to aes/src/x86/ni/hazmat.rs index 0e9a9cd2..24a365a5 100644 --- a/aes/src/ni/hazmat.rs +++ b/aes/src/x86/ni/hazmat.rs @@ -5,8 +5,8 @@ //! access to the AES round function gated under the `hazmat` crate feature. 
#![allow(unsafe_op_in_unsafe_fn)] -use super::arch::*; use crate::hazmat::{Block, Block8}; +use crate::x86::arch::*; use cipher::array::{Array, ArraySize}; #[target_feature(enable = "sse2")] diff --git a/aes/src/ni/test_expand.rs b/aes/src/x86/ni/test_expand.rs similarity index 94% rename from aes/src/ni/test_expand.rs rename to aes/src/x86/ni/test_expand.rs index 973f827d..6524ef74 100644 --- a/aes/src/ni/test_expand.rs +++ b/aes/src/x86/ni/test_expand.rs @@ -1,4 +1,5 @@ -use super::{arch::*, expand::*}; +use crate::x86::arch::*; +use crate::x86::ni::expand::*; use hex_literal::hex; pub(crate) fn check(a: &[__m128i], b: &[[u64; 2]]) { @@ -18,7 +19,7 @@ pub(crate) fn check(a: &[__m128i], b: &[[u64; 2]]) { fn aes128_expand_key_test() { let keys = [0x00; 16]; check( - &unsafe { aes128_expand_key(&keys) }, + &unsafe { aes128::expand_key(&keys) }, &[ [0x0000000000000000, 0x0000000000000000], [0x6263636362636363, 0x6263636362636363], @@ -36,7 +37,7 @@ fn aes128_expand_key_test() { let keys = [0xff; 16]; check( - &unsafe { aes128_expand_key(&keys) }, + &unsafe { aes128::expand_key(&keys) }, &[ [0xffffffffffffffff, 0xffffffffffffffff], [0xe8e9e9e917161616, 0xe8e9e9e917161616], @@ -54,7 +55,7 @@ fn aes128_expand_key_test() { let keys = hex!("000102030405060708090a0b0c0d0e0f"); check( - &unsafe { aes128_expand_key(&keys) }, + &unsafe { aes128::expand_key(&keys) }, &[ [0x0001020304050607, 0x08090a0b0c0d0e0f], [0xd6aa74fdd2af72fa, 0xdaa678f1d6ab76fe], @@ -72,7 +73,7 @@ fn aes128_expand_key_test() { let keys = hex!("6920e299a5202a6d656e636869746f2a"); check( - &unsafe { aes128_expand_key(&keys) }, + &unsafe { aes128::expand_key(&keys) }, &[ [0x6920e299a5202a6d, 0x656e636869746f2a], [0xfa8807605fa82d0d, 0x3ac64e6553b2214f], @@ -90,7 +91,7 @@ fn aes128_expand_key_test() { let keys = hex!("2b7e151628aed2a6abf7158809cf4f3c"); check( - &unsafe { aes128_expand_key(&keys) }, + &unsafe { aes128::expand_key(&keys) }, &[ [0x2b7e151628aed2a6, 0xabf7158809cf4f3c], [0xa0fafe1788542cb1, 0x23a339392a6c7605], @@ -115,7 +116,7 @@ fn aes128_expand_key_test() { fn aes192_expand_key_test() { let keys = [0x00; 24]; check( - &unsafe { aes192_expand_key(&keys) }, + &unsafe { aes192::expand_key(&keys) }, &[ [0x0000000000000000, 0x0000000000000000], [0x0000000000000000, 0x6263636362636363], @@ -135,7 +136,7 @@ fn aes192_expand_key_test() { let keys = [0xff; 24]; check( - &unsafe { aes192_expand_key(&keys) }, + &unsafe { aes192::expand_key(&keys) }, &[ [0xffffffffffffffff, 0xffffffffffffffff], [0xffffffffffffffff, 0xe8e9e9e917161616], @@ -155,7 +156,7 @@ fn aes192_expand_key_test() { let keys = hex!("000102030405060708090a0b0c0d0e0f1011121314151617"); check( - &unsafe { aes192_expand_key(&keys) }, + &unsafe { aes192::expand_key(&keys) }, &[ [0x0001020304050607, 0x08090a0b0c0d0e0f], [0x1011121314151617, 0x5846f2f95c43f4fe], @@ -175,7 +176,7 @@ fn aes192_expand_key_test() { let keys = hex!("8e73b0f7da0e6452c810f32b809079e562f8ead2522c6b7b"); check( - &unsafe { aes192_expand_key(&keys) }, + &unsafe { aes192::expand_key(&keys) }, &[ [0x8e73b0f7da0e6452, 0xc810f32b809079e5], [0x62f8ead2522c6b7b, 0xfe0c91f72402f5a5], @@ -202,7 +203,7 @@ fn aes192_expand_key_test() { fn aes256_expand_key_test() { let keys = [0x00; 32]; check( - &unsafe { aes256_expand_key(&keys) }, + &unsafe { aes256::expand_key(&keys) }, &[ [0x0000000000000000, 0x0000000000000000], [0x0000000000000000, 0x0000000000000000], @@ -224,7 +225,7 @@ fn aes256_expand_key_test() { let keys = [0xff; 32]; check( - &unsafe { aes256_expand_key(&keys) }, + &unsafe { 
aes256::expand_key(&keys) }, &[ [0xffffffffffffffff, 0xffffffffffffffff], [0xffffffffffffffff, 0xffffffffffffffff], @@ -246,7 +247,7 @@ fn aes256_expand_key_test() { let keys = hex!("000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f"); check( - &unsafe { aes256_expand_key(&keys) }, + &unsafe { aes256::expand_key(&keys) }, &[ [0x0001020304050607, 0x08090a0b0c0d0e0f], [0x1011121314151617, 0x18191a1b1c1d1e1f], @@ -268,7 +269,7 @@ fn aes256_expand_key_test() { let keys = hex!("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"); check( - &unsafe { aes256_expand_key(&keys) }, + &unsafe { aes256::expand_key(&keys) }, &[ [0x603deb1015ca71be, 0x2b73aef0857d7781], [0x1f352c073b6108d7, 0x2d9810a30914dff4], diff --git a/aes/src/x86/vaes256.rs b/aes/src/x86/vaes256.rs new file mode 100644 index 00000000..abb566f0 --- /dev/null +++ b/aes/src/x86/vaes256.rs @@ -0,0 +1,3 @@ +pub(super) mod aes128; +pub(super) mod aes192; +pub(super) mod aes256; diff --git a/aes/src/x86/vaes256/aes128.rs b/aes/src/x86/vaes256/aes128.rs new file mode 100644 index 00000000..7ce5b8dc --- /dev/null +++ b/aes/src/x86/vaes256/aes128.rs @@ -0,0 +1,534 @@ +#![allow(unsafe_op_in_unsafe_fn)] + +use crate::x86::{Block30, Simd128RoundKeys, Simd256RoundKeys, arch::*}; +use cipher::inout::InOut; +use core::arch::asm; + +#[inline] +pub(crate) unsafe fn broadcast_keys(keys: &Simd128RoundKeys<11>) -> Simd256RoundKeys<11> { + [ + _mm256_broadcastsi128_si256(keys[0]), + _mm256_broadcastsi128_si256(keys[1]), + _mm256_broadcastsi128_si256(keys[2]), + _mm256_broadcastsi128_si256(keys[3]), + _mm256_broadcastsi128_si256(keys[4]), + _mm256_broadcastsi128_si256(keys[5]), + _mm256_broadcastsi128_si256(keys[6]), + _mm256_broadcastsi128_si256(keys[7]), + _mm256_broadcastsi128_si256(keys[8]), + _mm256_broadcastsi128_si256(keys[9]), + _mm256_broadcastsi128_si256(keys[10]), + ] +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn encrypt30( + simd_256_keys: &Simd256RoundKeys<11>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load plain-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-128 round 0 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 0 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-128 round 1 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 2 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 3 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 4 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 5 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 6 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , 
ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 7 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 8 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 9 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 10 encrypt + "vmovdqu ymm0, [{simd_256_keys} + 10 * 32]", + "vaesenclast ymm1 , ymm1 , ymm0", + "vaesenclast ymm2 , ymm2 , ymm0", + "vaesenclast ymm3 , ymm3 , ymm0", + "vaesenclast ymm4 , ymm4 , ymm0", + "vaesenclast ymm5 , ymm5 , ymm0", + "vaesenclast ymm6 , ymm6 , ymm0", + "vaesenclast ymm7 , ymm7 , ymm0", + "vaesenclast ymm8 , ymm8 , ymm0", + "vaesenclast ymm9 , ymm9 , ymm0", + "vaesenclast ymm10, ymm10, ymm0", + "vaesenclast ymm11, ymm11, ymm0", + "vaesenclast ymm12, ymm12, ymm0", + "vaesenclast ymm13, ymm13, ymm0", + "vaesenclast ymm14, ymm14, ymm0", + "vaesenclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save cipher-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + 
optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn decrypt30( + simd_256_keys: &Simd256RoundKeys<11>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load cipher-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! { + // aes-128 round 10 decrypt + "vmovdqu ymm0, [{simd_256_keys} + 0 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-128 round 9 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 8 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 7 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 6 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + 
"vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 5 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 4 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 3 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 2 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 1 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 0 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 10 * 32]", + "vaesdeclast ymm1 , ymm1 , ymm0", + "vaesdeclast ymm2 , ymm2 , ymm0", + "vaesdeclast ymm3 , ymm3 , ymm0", + "vaesdeclast ymm4 , ymm4 , ymm0", + "vaesdeclast ymm5 , ymm5 , ymm0", + "vaesdeclast ymm6 , ymm6 , ymm0", + "vaesdeclast ymm7 , ymm7 , ymm0", + "vaesdeclast ymm8 , ymm8 , 
ymm0", + "vaesdeclast ymm9 , ymm9 , ymm0", + "vaesdeclast ymm10, ymm10, ymm0", + "vaesdeclast ymm11, ymm11, ymm0", + "vaesdeclast ymm12, ymm12, ymm0", + "vaesdeclast ymm13, ymm13, ymm0", + "vaesdeclast ymm14, ymm14, ymm0", + "vaesdeclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save plain-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} diff --git a/aes/src/x86/vaes256/aes192.rs b/aes/src/x86/vaes256/aes192.rs new file mode 100644 index 00000000..38c37dea --- /dev/null +++ b/aes/src/x86/vaes256/aes192.rs @@ -0,0 +1,604 @@ +#![allow(unsafe_op_in_unsafe_fn)] + +use crate::x86::{Block30, Simd128RoundKeys, Simd256RoundKeys, arch::*}; +use cipher::inout::InOut; +use core::arch::asm; + +#[inline] +pub(crate) unsafe fn broadcast_keys(keys: &Simd128RoundKeys<13>) -> Simd256RoundKeys<13> { + [ + _mm256_broadcastsi128_si256(keys[0]), + _mm256_broadcastsi128_si256(keys[1]), + _mm256_broadcastsi128_si256(keys[2]), + _mm256_broadcastsi128_si256(keys[3]), + _mm256_broadcastsi128_si256(keys[4]), + _mm256_broadcastsi128_si256(keys[5]), + _mm256_broadcastsi128_si256(keys[6]), + _mm256_broadcastsi128_si256(keys[7]), + _mm256_broadcastsi128_si256(keys[8]), + _mm256_broadcastsi128_si256(keys[9]), + _mm256_broadcastsi128_si256(keys[10]), + _mm256_broadcastsi128_si256(keys[11]), + _mm256_broadcastsi128_si256(keys[12]), + ] +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn encrypt30( + simd_256_keys: &Simd256RoundKeys<13>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load plain-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-128 round 0 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 0 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-128 round 1 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 2 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 3 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 4 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 5 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 6 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , 
ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 7 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 8 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 9 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 10 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 10 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 11 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 11 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 12 encrypt + "vmovdqu ymm0, [{simd_256_keys} + 12 * 32]", + "vaesenclast ymm1 , ymm1 , ymm0", + "vaesenclast ymm2 , ymm2 , ymm0", + "vaesenclast ymm3 , ymm3 , ymm0", + "vaesenclast ymm4 , ymm4 , ymm0", + "vaesenclast ymm5 , ymm5 , ymm0", + "vaesenclast ymm6 , ymm6 , ymm0", + "vaesenclast ymm7 , ymm7 , ymm0", + "vaesenclast ymm8 , ymm8 , ymm0", + "vaesenclast ymm9 , ymm9 , ymm0", + "vaesenclast ymm10, ymm10, ymm0", + 
"vaesenclast ymm11, ymm11, ymm0", + "vaesenclast ymm12, ymm12, ymm0", + "vaesenclast ymm13, ymm13, ymm0", + "vaesenclast ymm14, ymm14, ymm0", + "vaesenclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save cipher-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn decrypt30( + simd_256_keys: &Simd256RoundKeys<13>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load cipher-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-192 round 12 decrypt + "vmovdqu ymm0, [{simd_256_keys} + 0 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-192 round 11 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 10 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 9 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 8 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 7 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 6 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , 
ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 5 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 4 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 3 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 2 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 10 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 1 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 11 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 0 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 12 * 32]", + "vaesdeclast ymm1 , ymm1 , ymm0", + "vaesdeclast ymm2 , ymm2 , ymm0", + "vaesdeclast ymm3 , ymm3 , ymm0", + "vaesdeclast ymm4 , ymm4 , ymm0", + "vaesdeclast ymm5 , ymm5 , ymm0", + "vaesdeclast ymm6 , ymm6 , ymm0", + "vaesdeclast ymm7 , ymm7 , ymm0", + "vaesdeclast ymm8 , ymm8 , ymm0", + "vaesdeclast ymm9 , ymm9 , ymm0", + "vaesdeclast ymm10, ymm10, ymm0", + 
"vaesdeclast ymm11, ymm11, ymm0", + "vaesdeclast ymm12, ymm12, ymm0", + "vaesdeclast ymm13, ymm13, ymm0", + "vaesdeclast ymm14, ymm14, ymm0", + "vaesdeclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save plain-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} diff --git a/aes/src/x86/vaes256/aes256.rs b/aes/src/x86/vaes256/aes256.rs new file mode 100644 index 00000000..c3772c94 --- /dev/null +++ b/aes/src/x86/vaes256/aes256.rs @@ -0,0 +1,674 @@ +#![allow(unsafe_op_in_unsafe_fn)] + +use crate::x86::{Block30, Simd128RoundKeys, Simd256RoundKeys, arch::*}; +use cipher::inout::InOut; +use core::arch::asm; + +#[inline] +pub(crate) unsafe fn broadcast_keys(keys: &Simd128RoundKeys<15>) -> Simd256RoundKeys<15> { + [ + _mm256_broadcastsi128_si256(keys[0]), + _mm256_broadcastsi128_si256(keys[1]), + _mm256_broadcastsi128_si256(keys[2]), + _mm256_broadcastsi128_si256(keys[3]), + _mm256_broadcastsi128_si256(keys[4]), + _mm256_broadcastsi128_si256(keys[5]), + _mm256_broadcastsi128_si256(keys[6]), + _mm256_broadcastsi128_si256(keys[7]), + _mm256_broadcastsi128_si256(keys[8]), + _mm256_broadcastsi128_si256(keys[9]), + _mm256_broadcastsi128_si256(keys[10]), + _mm256_broadcastsi128_si256(keys[11]), + _mm256_broadcastsi128_si256(keys[12]), + _mm256_broadcastsi128_si256(keys[13]), + _mm256_broadcastsi128_si256(keys[14]), + ] +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn encrypt30( + simd_256_keys: &Simd256RoundKeys<15>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load plain-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-128 round 0 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 0 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-128 round 1 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 2 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 3 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 4 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 5 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 6 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , 
ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 7 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 8 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 9 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 10 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 10 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 11 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 11 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 12 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 12 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc 
ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 13 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 13 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 14 encrypt + "vmovdqu ymm0, [{simd_256_keys} + 14 * 32]", + "vaesenclast ymm1 , ymm1 , ymm0", + "vaesenclast ymm2 , ymm2 , ymm0", + "vaesenclast ymm3 , ymm3 , ymm0", + "vaesenclast ymm4 , ymm4 , ymm0", + "vaesenclast ymm5 , ymm5 , ymm0", + "vaesenclast ymm6 , ymm6 , ymm0", + "vaesenclast ymm7 , ymm7 , ymm0", + "vaesenclast ymm8 , ymm8 , ymm0", + "vaesenclast ymm9 , ymm9 , ymm0", + "vaesenclast ymm10, ymm10, ymm0", + "vaesenclast ymm11, ymm11, ymm0", + "vaesenclast ymm12, ymm12, ymm0", + "vaesenclast ymm13, ymm13, ymm0", + "vaesenclast ymm14, ymm14, ymm0", + "vaesenclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save cipher-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn decrypt30( + simd_256_keys: &Simd256RoundKeys<15>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load cipher-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-256 round 14 decrypt + "vmovdqu ymm0, [{simd_256_keys} + 0 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0",
+ // aes-256 round 13 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 12 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 11 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 10 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 9 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 8 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 7 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 6 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 5 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 4 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 10 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 3 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 11 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 2 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 12 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 1 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 13 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0",
+ // aes-256 round 0 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 14 * 32]", + "vaesdeclast ymm1 , ymm1 , ymm0", + "vaesdeclast ymm2 , ymm2 , ymm0", + "vaesdeclast ymm3 , ymm3 , ymm0", + "vaesdeclast ymm4 , ymm4 , ymm0", + "vaesdeclast ymm5 , ymm5 , ymm0", + "vaesdeclast ymm6 , ymm6 , ymm0", + "vaesdeclast ymm7 , ymm7 , ymm0", + "vaesdeclast ymm8 , ymm8 , ymm0", + "vaesdeclast ymm9 , ymm9 , ymm0", + "vaesdeclast ymm10, ymm10, ymm0", + "vaesdeclast ymm11, ymm11, ymm0", + "vaesdeclast ymm12, ymm12, ymm0", + "vaesdeclast ymm13, ymm13, ymm0", + "vaesdeclast ymm14, ymm14, ymm0", + "vaesdeclast ymm15, ymm15, ymm0",
+ + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + };
+ + // save plain-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} diff --git a/aes/src/x86/vaes512.rs b/aes/src/x86/vaes512.rs new file mode 100644 index 00000000..abb566f0 --- /dev/null +++ b/aes/src/x86/vaes512.rs @@ -0,0 +1,3 @@ +pub(super) mod aes128; +pub(super) mod aes192; +pub(super) mod aes256; diff --git a/aes/src/x86/vaes512/aes128.rs b/aes/src/x86/vaes512/aes128.rs new file mode 100644 index 00000000..91320071 --- /dev/null +++ b/aes/src/x86/vaes512/aes128.rs @@ -0,0 +1,603 @@ +#![allow(unsafe_op_in_unsafe_fn)] + +use crate::x86::{Block64, Simd128RoundKeys, Simd512RoundKeys, arch::*}; +use cipher::inout::InOut; +use core::{arch::asm, mem::MaybeUninit}; + +#[inline] +pub(crate) unsafe fn broadcast_keys(keys: &Simd128RoundKeys<11>) -> Simd512RoundKeys<11> { + let mut v512: [MaybeUninit<__m512i>; 11] = MaybeUninit::uninit().assume_init(); + asm!
{ + "vbroadcasti32x4 zmm0 , [{keys} + 0 * 16]", + "vbroadcasti32x4 zmm1 , [{keys} + 1 * 16]", + "vbroadcasti32x4 zmm2 , [{keys} + 2 * 16]", + "vbroadcasti32x4 zmm3 , [{keys} + 3 * 16]", + "vbroadcasti32x4 zmm4 , [{keys} + 4 * 16]", + "vbroadcasti32x4 zmm5 , [{keys} + 5 * 16]", + "vbroadcasti32x4 zmm6 , [{keys} + 6 * 16]", + "vbroadcasti32x4 zmm7 , [{keys} + 7 * 16]", + "vbroadcasti32x4 zmm8 , [{keys} + 8 * 16]", + "vbroadcasti32x4 zmm9 , [{keys} + 9 * 16]", + "vbroadcasti32x4 zmm10, [{keys} + 10 * 16]", + + "vmovdqu32 [{optr} + 0 * 64], zmm0", + "vmovdqu32 [{optr} + 1 * 64], zmm1", + "vmovdqu32 [{optr} + 2 * 64], zmm2", + "vmovdqu32 [{optr} + 3 * 64], zmm3", + "vmovdqu32 [{optr} + 4 * 64], zmm4", + "vmovdqu32 [{optr} + 5 * 64], zmm5", + "vmovdqu32 [{optr} + 6 * 64], zmm6", + "vmovdqu32 [{optr} + 7 * 64], zmm7", + "vmovdqu32 [{optr} + 8 * 64], zmm8", + "vmovdqu32 [{optr} + 9 * 64], zmm9", + "vmovdqu32 [{optr} + 10 * 64], zmm10", + + keys = in(reg) keys.as_ptr(), + optr = in(reg) v512.as_mut_ptr().cast::<__m512i>(), + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + + options(nostack, preserves_flags), + }; + core::mem::transmute(v512) +} + +#[inline] +pub(crate) unsafe fn encrypt64(keys: &Simd512RoundKeys<11>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! { + // load keys + "vmovdqu32 zmm0 , [{keys} + 0 * 64]", + "vmovdqu32 zmm1 , [{keys} + 1 * 64]", + "vmovdqu32 zmm2 , [{keys} + 2 * 64]", + "vmovdqu32 zmm3 , [{keys} + 3 * 64]", + "vmovdqu32 zmm4 , [{keys} + 4 * 64]", + "vmovdqu32 zmm5 , [{keys} + 5 * 64]", + "vmovdqu32 zmm6 , [{keys} + 6 * 64]", + "vmovdqu32 zmm7 , [{keys} + 7 * 64]", + "vmovdqu32 zmm8 , [{keys} + 8 * 64]", + "vmovdqu32 zmm9 , [{keys} + 9 * 64]", + "vmovdqu32 zmm10, [{keys} + 10 * 64]", + // load plain-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-128 round 0 encrypt + "vpxord zmm16, zmm16, zmm0", + "vpxord zmm17, zmm17, zmm0", + "vpxord zmm18, zmm18, zmm0", + "vpxord zmm19, zmm19, zmm0", + "vpxord zmm20, zmm20, zmm0", + "vpxord zmm21, zmm21, zmm0", + "vpxord zmm22, zmm22, zmm0", + "vpxord zmm23, zmm23, zmm0", + "vpxord zmm24, zmm24, zmm0", + "vpxord zmm25, zmm25, zmm0", + "vpxord zmm26, zmm26, zmm0", + "vpxord zmm27, zmm27, zmm0", + "vpxord zmm28, zmm28, zmm0", + "vpxord zmm29, zmm29, zmm0", + "vpxord zmm30, zmm30, zmm0", + "vpxord zmm31, zmm31, zmm0", + // aes-128 round 1 encrypt + "vaesenc zmm16, zmm16, zmm1", + "vaesenc zmm17, zmm17, zmm1", + "vaesenc zmm18, zmm18, zmm1", + "vaesenc zmm19, zmm19, zmm1", + "vaesenc zmm20, zmm20, zmm1", + "vaesenc zmm21, zmm21, zmm1", + "vaesenc zmm22, zmm22, zmm1", + "vaesenc zmm23, zmm23, zmm1", + "vaesenc zmm24, zmm24, zmm1", + "vaesenc zmm25, zmm25, zmm1", + "vaesenc zmm26, zmm26, zmm1", + "vaesenc zmm27, zmm27, zmm1", + "vaesenc zmm28, zmm28, zmm1", + 
"vaesenc zmm29, zmm29, zmm1", + "vaesenc zmm30, zmm30, zmm1", + "vaesenc zmm31, zmm31, zmm1", + // aes-128 round 2 encrypt + "vaesenc zmm16, zmm16, zmm2", + "vaesenc zmm17, zmm17, zmm2", + "vaesenc zmm18, zmm18, zmm2", + "vaesenc zmm19, zmm19, zmm2", + "vaesenc zmm20, zmm20, zmm2", + "vaesenc zmm21, zmm21, zmm2", + "vaesenc zmm22, zmm22, zmm2", + "vaesenc zmm23, zmm23, zmm2", + "vaesenc zmm24, zmm24, zmm2", + "vaesenc zmm25, zmm25, zmm2", + "vaesenc zmm26, zmm26, zmm2", + "vaesenc zmm27, zmm27, zmm2", + "vaesenc zmm28, zmm28, zmm2", + "vaesenc zmm29, zmm29, zmm2", + "vaesenc zmm30, zmm30, zmm2", + "vaesenc zmm31, zmm31, zmm2", + // aes-128 round 3 encrypt + "vaesenc zmm16, zmm16, zmm3", + "vaesenc zmm17, zmm17, zmm3", + "vaesenc zmm18, zmm18, zmm3", + "vaesenc zmm19, zmm19, zmm3", + "vaesenc zmm20, zmm20, zmm3", + "vaesenc zmm21, zmm21, zmm3", + "vaesenc zmm22, zmm22, zmm3", + "vaesenc zmm23, zmm23, zmm3", + "vaesenc zmm24, zmm24, zmm3", + "vaesenc zmm25, zmm25, zmm3", + "vaesenc zmm26, zmm26, zmm3", + "vaesenc zmm27, zmm27, zmm3", + "vaesenc zmm28, zmm28, zmm3", + "vaesenc zmm29, zmm29, zmm3", + "vaesenc zmm30, zmm30, zmm3", + "vaesenc zmm31, zmm31, zmm3", + // aes-128 round 4 encrypt + "vaesenc zmm16, zmm16, zmm4", + "vaesenc zmm17, zmm17, zmm4", + "vaesenc zmm18, zmm18, zmm4", + "vaesenc zmm19, zmm19, zmm4", + "vaesenc zmm20, zmm20, zmm4", + "vaesenc zmm21, zmm21, zmm4", + "vaesenc zmm22, zmm22, zmm4", + "vaesenc zmm23, zmm23, zmm4", + "vaesenc zmm24, zmm24, zmm4", + "vaesenc zmm25, zmm25, zmm4", + "vaesenc zmm26, zmm26, zmm4", + "vaesenc zmm27, zmm27, zmm4", + "vaesenc zmm28, zmm28, zmm4", + "vaesenc zmm29, zmm29, zmm4", + "vaesenc zmm30, zmm30, zmm4", + "vaesenc zmm31, zmm31, zmm4", + // aes-128 round 5 encrypt + "vaesenc zmm16, zmm16, zmm5", + "vaesenc zmm17, zmm17, zmm5", + "vaesenc zmm18, zmm18, zmm5", + "vaesenc zmm19, zmm19, zmm5", + "vaesenc zmm20, zmm20, zmm5", + "vaesenc zmm21, zmm21, zmm5", + "vaesenc zmm22, zmm22, zmm5", + "vaesenc zmm23, zmm23, zmm5", + "vaesenc zmm24, zmm24, zmm5", + "vaesenc zmm25, zmm25, zmm5", + "vaesenc zmm26, zmm26, zmm5", + "vaesenc zmm27, zmm27, zmm5", + "vaesenc zmm28, zmm28, zmm5", + "vaesenc zmm29, zmm29, zmm5", + "vaesenc zmm30, zmm30, zmm5", + "vaesenc zmm31, zmm31, zmm5", + // aes-128 round 6 encrypt + "vaesenc zmm16, zmm16, zmm6", + "vaesenc zmm17, zmm17, zmm6", + "vaesenc zmm18, zmm18, zmm6", + "vaesenc zmm19, zmm19, zmm6", + "vaesenc zmm20, zmm20, zmm6", + "vaesenc zmm21, zmm21, zmm6", + "vaesenc zmm22, zmm22, zmm6", + "vaesenc zmm23, zmm23, zmm6", + "vaesenc zmm24, zmm24, zmm6", + "vaesenc zmm25, zmm25, zmm6", + "vaesenc zmm26, zmm26, zmm6", + "vaesenc zmm27, zmm27, zmm6", + "vaesenc zmm28, zmm28, zmm6", + "vaesenc zmm29, zmm29, zmm6", + "vaesenc zmm30, zmm30, zmm6", + "vaesenc zmm31, zmm31, zmm6", + // aes-128 round 7 encrypt + "vaesenc zmm16, zmm16, zmm7", + "vaesenc zmm17, zmm17, zmm7", + "vaesenc zmm18, zmm18, zmm7", + "vaesenc zmm19, zmm19, zmm7", + "vaesenc zmm20, zmm20, zmm7", + "vaesenc zmm21, zmm21, zmm7", + "vaesenc zmm22, zmm22, zmm7", + "vaesenc zmm23, zmm23, zmm7", + "vaesenc zmm24, zmm24, zmm7", + "vaesenc zmm25, zmm25, zmm7", + "vaesenc zmm26, zmm26, zmm7", + "vaesenc zmm27, zmm27, zmm7", + "vaesenc zmm28, zmm28, zmm7", + "vaesenc zmm29, zmm29, zmm7", + "vaesenc zmm30, zmm30, zmm7", + "vaesenc zmm31, zmm31, zmm7", + // aes-128 round 8 encrypt + "vaesenc zmm16, zmm16, zmm8", + "vaesenc zmm17, zmm17, zmm8", + "vaesenc zmm18, zmm18, zmm8", + "vaesenc zmm19, zmm19, zmm8", + "vaesenc zmm20, zmm20, zmm8", + "vaesenc zmm21, zmm21, 
zmm8", + "vaesenc zmm22, zmm22, zmm8", + "vaesenc zmm23, zmm23, zmm8", + "vaesenc zmm24, zmm24, zmm8", + "vaesenc zmm25, zmm25, zmm8", + "vaesenc zmm26, zmm26, zmm8", + "vaesenc zmm27, zmm27, zmm8", + "vaesenc zmm28, zmm28, zmm8", + "vaesenc zmm29, zmm29, zmm8", + "vaesenc zmm30, zmm30, zmm8", + "vaesenc zmm31, zmm31, zmm8", + // aes-128 round 9 encrypt + "vaesenc zmm16, zmm16, zmm9", + "vaesenc zmm17, zmm17, zmm9", + "vaesenc zmm18, zmm18, zmm9", + "vaesenc zmm19, zmm19, zmm9", + "vaesenc zmm20, zmm20, zmm9", + "vaesenc zmm21, zmm21, zmm9", + "vaesenc zmm22, zmm22, zmm9", + "vaesenc zmm23, zmm23, zmm9", + "vaesenc zmm24, zmm24, zmm9", + "vaesenc zmm25, zmm25, zmm9", + "vaesenc zmm26, zmm26, zmm9", + "vaesenc zmm27, zmm27, zmm9", + "vaesenc zmm28, zmm28, zmm9", + "vaesenc zmm29, zmm29, zmm9", + "vaesenc zmm30, zmm30, zmm9", + "vaesenc zmm31, zmm31, zmm9", + // aes-128 round 10 encrypt + "vaesenclast zmm16, zmm16, zmm10", + "vaesenclast zmm17, zmm17, zmm10", + "vaesenclast zmm18, zmm18, zmm10", + "vaesenclast zmm19, zmm19, zmm10", + "vaesenclast zmm20, zmm20, zmm10", + "vaesenclast zmm21, zmm21, zmm10", + "vaesenclast zmm22, zmm22, zmm10", + "vaesenclast zmm23, zmm23, zmm10", + "vaesenclast zmm24, zmm24, zmm10", + "vaesenclast zmm25, zmm25, zmm10", + "vaesenclast zmm26, zmm26, zmm10", + "vaesenclast zmm27, zmm27, zmm10", + "vaesenclast zmm28, zmm28, zmm10", + "vaesenclast zmm29, zmm29, zmm10", + "vaesenclast zmm30, zmm30, zmm10", + "vaesenclast zmm31, zmm31, zmm10", + // save cipher-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + out("zmm22") _, + out("zmm23") _, + out("zmm24") _, + out("zmm25") _, + out("zmm26") _, + out("zmm27") _, + out("zmm28") _, + out("zmm29") _, + out("zmm30") _, + out("zmm31") _, + + options(nostack, preserves_flags), + }; +} + +#[inline] +pub(crate) unsafe fn decrypt64(keys: &Simd512RoundKeys<11>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! 
{ + // load keys + "vmovdqu32 zmm10, [{keys} + 0 * 64]", + "vmovdqu32 zmm9 , [{keys} + 1 * 64]", + "vmovdqu32 zmm8 , [{keys} + 2 * 64]", + "vmovdqu32 zmm7 , [{keys} + 3 * 64]", + "vmovdqu32 zmm6 , [{keys} + 4 * 64]", + "vmovdqu32 zmm5 , [{keys} + 5 * 64]", + "vmovdqu32 zmm4 , [{keys} + 6 * 64]", + "vmovdqu32 zmm3 , [{keys} + 7 * 64]", + "vmovdqu32 zmm2 , [{keys} + 8 * 64]", + "vmovdqu32 zmm1 , [{keys} + 9 * 64]", + "vmovdqu32 zmm0 , [{keys} + 10 * 64]", + // load cipher-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-128 round 10 decrypt + "vpxord zmm16, zmm16, zmm10", + "vpxord zmm17, zmm17, zmm10", + "vpxord zmm18, zmm18, zmm10", + "vpxord zmm19, zmm19, zmm10", + "vpxord zmm20, zmm20, zmm10", + "vpxord zmm21, zmm21, zmm10", + "vpxord zmm22, zmm22, zmm10", + "vpxord zmm23, zmm23, zmm10", + "vpxord zmm24, zmm24, zmm10", + "vpxord zmm25, zmm25, zmm10", + "vpxord zmm26, zmm26, zmm10", + "vpxord zmm27, zmm27, zmm10", + "vpxord zmm28, zmm28, zmm10", + "vpxord zmm29, zmm29, zmm10", + "vpxord zmm30, zmm30, zmm10", + "vpxord zmm31, zmm31, zmm10", + // aes-128 round 9 decrypt + "vaesdec zmm16, zmm16, zmm9", + "vaesdec zmm17, zmm17, zmm9", + "vaesdec zmm18, zmm18, zmm9", + "vaesdec zmm19, zmm19, zmm9", + "vaesdec zmm20, zmm20, zmm9", + "vaesdec zmm21, zmm21, zmm9", + "vaesdec zmm22, zmm22, zmm9", + "vaesdec zmm23, zmm23, zmm9", + "vaesdec zmm24, zmm24, zmm9", + "vaesdec zmm25, zmm25, zmm9", + "vaesdec zmm26, zmm26, zmm9", + "vaesdec zmm27, zmm27, zmm9", + "vaesdec zmm28, zmm28, zmm9", + "vaesdec zmm29, zmm29, zmm9", + "vaesdec zmm30, zmm30, zmm9", + "vaesdec zmm31, zmm31, zmm9", + // aes-128 round 8 decrypt + "vaesdec zmm16, zmm16, zmm8", + "vaesdec zmm17, zmm17, zmm8", + "vaesdec zmm18, zmm18, zmm8", + "vaesdec zmm19, zmm19, zmm8", + "vaesdec zmm20, zmm20, zmm8", + "vaesdec zmm21, zmm21, zmm8", + "vaesdec zmm22, zmm22, zmm8", + "vaesdec zmm23, zmm23, zmm8", + "vaesdec zmm24, zmm24, zmm8", + "vaesdec zmm25, zmm25, zmm8", + "vaesdec zmm26, zmm26, zmm8", + "vaesdec zmm27, zmm27, zmm8", + "vaesdec zmm28, zmm28, zmm8", + "vaesdec zmm29, zmm29, zmm8", + "vaesdec zmm30, zmm30, zmm8", + "vaesdec zmm31, zmm31, zmm8", + // aes-128 round 7 decrypt + "vaesdec zmm16, zmm16, zmm7", + "vaesdec zmm17, zmm17, zmm7", + "vaesdec zmm18, zmm18, zmm7", + "vaesdec zmm19, zmm19, zmm7", + "vaesdec zmm20, zmm20, zmm7", + "vaesdec zmm21, zmm21, zmm7", + "vaesdec zmm22, zmm22, zmm7", + "vaesdec zmm23, zmm23, zmm7", + "vaesdec zmm24, zmm24, zmm7", + "vaesdec zmm25, zmm25, zmm7", + "vaesdec zmm26, zmm26, zmm7", + "vaesdec zmm27, zmm27, zmm7", + "vaesdec zmm28, zmm28, zmm7", + "vaesdec zmm29, zmm29, zmm7", + "vaesdec zmm30, zmm30, zmm7", + "vaesdec zmm31, zmm31, zmm7", + // aes-128 round 6 decrypt + "vaesdec zmm16, zmm16, zmm6", + "vaesdec zmm17, zmm17, zmm6", + "vaesdec zmm18, zmm18, zmm6", + "vaesdec zmm19, zmm19, zmm6", + "vaesdec zmm20, zmm20, zmm6", + "vaesdec zmm21, zmm21, zmm6", + "vaesdec zmm22, zmm22, zmm6", + "vaesdec 
zmm23, zmm23, zmm6", + "vaesdec zmm24, zmm24, zmm6", + "vaesdec zmm25, zmm25, zmm6", + "vaesdec zmm26, zmm26, zmm6", + "vaesdec zmm27, zmm27, zmm6", + "vaesdec zmm28, zmm28, zmm6", + "vaesdec zmm29, zmm29, zmm6", + "vaesdec zmm30, zmm30, zmm6", + "vaesdec zmm31, zmm31, zmm6", + // aes-128 round 5 decrypt + "vaesdec zmm16, zmm16, zmm5", + "vaesdec zmm17, zmm17, zmm5", + "vaesdec zmm18, zmm18, zmm5", + "vaesdec zmm19, zmm19, zmm5", + "vaesdec zmm20, zmm20, zmm5", + "vaesdec zmm21, zmm21, zmm5", + "vaesdec zmm22, zmm22, zmm5", + "vaesdec zmm23, zmm23, zmm5", + "vaesdec zmm24, zmm24, zmm5", + "vaesdec zmm25, zmm25, zmm5", + "vaesdec zmm26, zmm26, zmm5", + "vaesdec zmm27, zmm27, zmm5", + "vaesdec zmm28, zmm28, zmm5", + "vaesdec zmm29, zmm29, zmm5", + "vaesdec zmm30, zmm30, zmm5", + "vaesdec zmm31, zmm31, zmm5", + // aes-128 round 4 decrypt + "vaesdec zmm16, zmm16, zmm4", + "vaesdec zmm17, zmm17, zmm4", + "vaesdec zmm18, zmm18, zmm4", + "vaesdec zmm19, zmm19, zmm4", + "vaesdec zmm20, zmm20, zmm4", + "vaesdec zmm21, zmm21, zmm4", + "vaesdec zmm22, zmm22, zmm4", + "vaesdec zmm23, zmm23, zmm4", + "vaesdec zmm24, zmm24, zmm4", + "vaesdec zmm25, zmm25, zmm4", + "vaesdec zmm26, zmm26, zmm4", + "vaesdec zmm27, zmm27, zmm4", + "vaesdec zmm28, zmm28, zmm4", + "vaesdec zmm29, zmm29, zmm4", + "vaesdec zmm30, zmm30, zmm4", + "vaesdec zmm31, zmm31, zmm4", + // aes-128 round 3 decrypt + "vaesdec zmm16, zmm16, zmm3", + "vaesdec zmm17, zmm17, zmm3", + "vaesdec zmm18, zmm18, zmm3", + "vaesdec zmm19, zmm19, zmm3", + "vaesdec zmm20, zmm20, zmm3", + "vaesdec zmm21, zmm21, zmm3", + "vaesdec zmm22, zmm22, zmm3", + "vaesdec zmm23, zmm23, zmm3", + "vaesdec zmm24, zmm24, zmm3", + "vaesdec zmm25, zmm25, zmm3", + "vaesdec zmm26, zmm26, zmm3", + "vaesdec zmm27, zmm27, zmm3", + "vaesdec zmm28, zmm28, zmm3", + "vaesdec zmm29, zmm29, zmm3", + "vaesdec zmm30, zmm30, zmm3", + "vaesdec zmm31, zmm31, zmm3", + // aes-128 round 2 decrypt + "vaesdec zmm16, zmm16, zmm2", + "vaesdec zmm17, zmm17, zmm2", + "vaesdec zmm18, zmm18, zmm2", + "vaesdec zmm19, zmm19, zmm2", + "vaesdec zmm20, zmm20, zmm2", + "vaesdec zmm21, zmm21, zmm2", + "vaesdec zmm22, zmm22, zmm2", + "vaesdec zmm23, zmm23, zmm2", + "vaesdec zmm24, zmm24, zmm2", + "vaesdec zmm25, zmm25, zmm2", + "vaesdec zmm26, zmm26, zmm2", + "vaesdec zmm27, zmm27, zmm2", + "vaesdec zmm28, zmm28, zmm2", + "vaesdec zmm29, zmm29, zmm2", + "vaesdec zmm30, zmm30, zmm2", + "vaesdec zmm31, zmm31, zmm2", + // aes-128 round 1 decrypt + "vaesdec zmm16, zmm16, zmm1", + "vaesdec zmm17, zmm17, zmm1", + "vaesdec zmm18, zmm18, zmm1", + "vaesdec zmm19, zmm19, zmm1", + "vaesdec zmm20, zmm20, zmm1", + "vaesdec zmm21, zmm21, zmm1", + "vaesdec zmm22, zmm22, zmm1", + "vaesdec zmm23, zmm23, zmm1", + "vaesdec zmm24, zmm24, zmm1", + "vaesdec zmm25, zmm25, zmm1", + "vaesdec zmm26, zmm26, zmm1", + "vaesdec zmm27, zmm27, zmm1", + "vaesdec zmm28, zmm28, zmm1", + "vaesdec zmm29, zmm29, zmm1", + "vaesdec zmm30, zmm30, zmm1", + "vaesdec zmm31, zmm31, zmm1", + // aes-128 round 0 decrypt + "vaesdeclast zmm16, zmm16, zmm0", + "vaesdeclast zmm17, zmm17, zmm0", + "vaesdeclast zmm18, zmm18, zmm0", + "vaesdeclast zmm19, zmm19, zmm0", + "vaesdeclast zmm20, zmm20, zmm0", + "vaesdeclast zmm21, zmm21, zmm0", + "vaesdeclast zmm22, zmm22, zmm0", + "vaesdeclast zmm23, zmm23, zmm0", + "vaesdeclast zmm24, zmm24, zmm0", + "vaesdeclast zmm25, zmm25, zmm0", + "vaesdeclast zmm26, zmm26, zmm0", + "vaesdeclast zmm27, zmm27, zmm0", + "vaesdeclast zmm28, zmm28, zmm0", + "vaesdeclast zmm29, zmm29, zmm0", + "vaesdeclast zmm30, zmm30, zmm0", + 
"vaesdeclast zmm31, zmm31, zmm0", + // save plain-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + out("zmm22") _, + out("zmm23") _, + out("zmm24") _, + out("zmm25") _, + out("zmm26") _, + out("zmm27") _, + out("zmm28") _, + out("zmm29") _, + out("zmm30") _, + out("zmm31") _, + + options(nostack, preserves_flags), + }; +} diff --git a/aes/src/x86/vaes512/aes192.rs b/aes/src/x86/vaes512/aes192.rs new file mode 100644 index 00000000..1daceb6e --- /dev/null +++ b/aes/src/x86/vaes512/aes192.rs @@ -0,0 +1,685 @@ +#![allow(unsafe_op_in_unsafe_fn)] + +use crate::x86::{Block64, Simd128RoundKeys, Simd512RoundKeys, arch::*}; +use cipher::inout::InOut; +use core::{arch::asm, mem::MaybeUninit}; + +#[inline] +pub(crate) unsafe fn broadcast_keys(keys: &Simd128RoundKeys<13>) -> Simd512RoundKeys<13> { + let mut v512: [MaybeUninit<__m512i>; 13] = MaybeUninit::uninit().assume_init(); + asm! { + "vbroadcasti32x4 zmm0 , [{keys} + 0 * 16]", + "vbroadcasti32x4 zmm1 , [{keys} + 1 * 16]", + "vbroadcasti32x4 zmm2 , [{keys} + 2 * 16]", + "vbroadcasti32x4 zmm3 , [{keys} + 3 * 16]", + "vbroadcasti32x4 zmm4 , [{keys} + 4 * 16]", + "vbroadcasti32x4 zmm5 , [{keys} + 5 * 16]", + "vbroadcasti32x4 zmm6 , [{keys} + 6 * 16]", + "vbroadcasti32x4 zmm7 , [{keys} + 7 * 16]", + "vbroadcasti32x4 zmm8 , [{keys} + 8 * 16]", + "vbroadcasti32x4 zmm9 , [{keys} + 9 * 16]", + "vbroadcasti32x4 zmm10, [{keys} + 10 * 16]", + "vbroadcasti32x4 zmm11, [{keys} + 11 * 16]", + "vbroadcasti32x4 zmm12, [{keys} + 12 * 16]", + + "vmovdqu32 [{optr} + 0 * 64], zmm0", + "vmovdqu32 [{optr} + 1 * 64], zmm1", + "vmovdqu32 [{optr} + 2 * 64], zmm2", + "vmovdqu32 [{optr} + 3 * 64], zmm3", + "vmovdqu32 [{optr} + 4 * 64], zmm4", + "vmovdqu32 [{optr} + 5 * 64], zmm5", + "vmovdqu32 [{optr} + 6 * 64], zmm6", + "vmovdqu32 [{optr} + 7 * 64], zmm7", + "vmovdqu32 [{optr} + 8 * 64], zmm8", + "vmovdqu32 [{optr} + 9 * 64], zmm9", + "vmovdqu32 [{optr} + 10 * 64], zmm10", + "vmovdqu32 [{optr} + 11 * 64], zmm11", + "vmovdqu32 [{optr} + 12 * 64], zmm12", + + keys = in(reg) keys.as_ptr(), + optr = in(reg) v512.as_mut_ptr().cast::<__m512i>(), + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + + options(nostack, preserves_flags), + }; + core::mem::transmute(v512) +} + +#[inline] +pub(crate) unsafe fn encrypt64(keys: &Simd512RoundKeys<13>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! 
{ + // load keys + "vmovdqu32 zmm0 , [{keys} + 0 * 64]", + "vmovdqu32 zmm1 , [{keys} + 1 * 64]", + "vmovdqu32 zmm2 , [{keys} + 2 * 64]", + "vmovdqu32 zmm3 , [{keys} + 3 * 64]", + "vmovdqu32 zmm4 , [{keys} + 4 * 64]", + "vmovdqu32 zmm5 , [{keys} + 5 * 64]", + "vmovdqu32 zmm6 , [{keys} + 6 * 64]", + "vmovdqu32 zmm7 , [{keys} + 7 * 64]", + "vmovdqu32 zmm8 , [{keys} + 8 * 64]", + "vmovdqu32 zmm9 , [{keys} + 9 * 64]", + "vmovdqu32 zmm10, [{keys} + 10 * 64]", + "vmovdqu32 zmm11, [{keys} + 11 * 64]", + "vmovdqu32 zmm12, [{keys} + 12 * 64]", + // load plain-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-192 round 0 encrypt + "vpxord zmm16, zmm16, zmm0", + "vpxord zmm17, zmm17, zmm0", + "vpxord zmm18, zmm18, zmm0", + "vpxord zmm19, zmm19, zmm0", + "vpxord zmm20, zmm20, zmm0", + "vpxord zmm21, zmm21, zmm0", + "vpxord zmm22, zmm22, zmm0", + "vpxord zmm23, zmm23, zmm0", + "vpxord zmm24, zmm24, zmm0", + "vpxord zmm25, zmm25, zmm0", + "vpxord zmm26, zmm26, zmm0", + "vpxord zmm27, zmm27, zmm0", + "vpxord zmm28, zmm28, zmm0", + "vpxord zmm29, zmm29, zmm0", + "vpxord zmm30, zmm30, zmm0", + "vpxord zmm31, zmm31, zmm0", + // aes-192 round 1 encrypt + "vaesenc zmm16, zmm16, zmm1", + "vaesenc zmm17, zmm17, zmm1", + "vaesenc zmm18, zmm18, zmm1", + "vaesenc zmm19, zmm19, zmm1", + "vaesenc zmm20, zmm20, zmm1", + "vaesenc zmm21, zmm21, zmm1", + "vaesenc zmm22, zmm22, zmm1", + "vaesenc zmm23, zmm23, zmm1", + "vaesenc zmm24, zmm24, zmm1", + "vaesenc zmm25, zmm25, zmm1", + "vaesenc zmm26, zmm26, zmm1", + "vaesenc zmm27, zmm27, zmm1", + "vaesenc zmm28, zmm28, zmm1", + "vaesenc zmm29, zmm29, zmm1", + "vaesenc zmm30, zmm30, zmm1", + "vaesenc zmm31, zmm31, zmm1", + // aes-192 round 2 encrypt + "vaesenc zmm16, zmm16, zmm2", + "vaesenc zmm17, zmm17, zmm2", + "vaesenc zmm18, zmm18, zmm2", + "vaesenc zmm19, zmm19, zmm2", + "vaesenc zmm20, zmm20, zmm2", + "vaesenc zmm21, zmm21, zmm2", + "vaesenc zmm22, zmm22, zmm2", + "vaesenc zmm23, zmm23, zmm2", + "vaesenc zmm24, zmm24, zmm2", + "vaesenc zmm25, zmm25, zmm2", + "vaesenc zmm26, zmm26, zmm2", + "vaesenc zmm27, zmm27, zmm2", + "vaesenc zmm28, zmm28, zmm2", + "vaesenc zmm29, zmm29, zmm2", + "vaesenc zmm30, zmm30, zmm2", + "vaesenc zmm31, zmm31, zmm2", + // aes-192 round 3 encrypt + "vaesenc zmm16, zmm16, zmm3", + "vaesenc zmm17, zmm17, zmm3", + "vaesenc zmm18, zmm18, zmm3", + "vaesenc zmm19, zmm19, zmm3", + "vaesenc zmm20, zmm20, zmm3", + "vaesenc zmm21, zmm21, zmm3", + "vaesenc zmm22, zmm22, zmm3", + "vaesenc zmm23, zmm23, zmm3", + "vaesenc zmm24, zmm24, zmm3", + "vaesenc zmm25, zmm25, zmm3", + "vaesenc zmm26, zmm26, zmm3", + "vaesenc zmm27, zmm27, zmm3", + "vaesenc zmm28, zmm28, zmm3", + "vaesenc zmm29, zmm29, zmm3", + "vaesenc zmm30, zmm30, zmm3", + "vaesenc zmm31, zmm31, zmm3", + // aes-192 round 4 encrypt + "vaesenc zmm16, zmm16, zmm4", + "vaesenc zmm17, zmm17, zmm4", + "vaesenc zmm18, zmm18, zmm4", + "vaesenc zmm19, zmm19, zmm4", + "vaesenc zmm20, zmm20, zmm4", + "vaesenc 
zmm21, zmm21, zmm4", + "vaesenc zmm22, zmm22, zmm4", + "vaesenc zmm23, zmm23, zmm4", + "vaesenc zmm24, zmm24, zmm4", + "vaesenc zmm25, zmm25, zmm4", + "vaesenc zmm26, zmm26, zmm4", + "vaesenc zmm27, zmm27, zmm4", + "vaesenc zmm28, zmm28, zmm4", + "vaesenc zmm29, zmm29, zmm4", + "vaesenc zmm30, zmm30, zmm4", + "vaesenc zmm31, zmm31, zmm4", + // aes-192 round 5 encrypt + "vaesenc zmm16, zmm16, zmm5", + "vaesenc zmm17, zmm17, zmm5", + "vaesenc zmm18, zmm18, zmm5", + "vaesenc zmm19, zmm19, zmm5", + "vaesenc zmm20, zmm20, zmm5", + "vaesenc zmm21, zmm21, zmm5", + "vaesenc zmm22, zmm22, zmm5", + "vaesenc zmm23, zmm23, zmm5", + "vaesenc zmm24, zmm24, zmm5", + "vaesenc zmm25, zmm25, zmm5", + "vaesenc zmm26, zmm26, zmm5", + "vaesenc zmm27, zmm27, zmm5", + "vaesenc zmm28, zmm28, zmm5", + "vaesenc zmm29, zmm29, zmm5", + "vaesenc zmm30, zmm30, zmm5", + "vaesenc zmm31, zmm31, zmm5", + // aes-192 round 6 encrypt + "vaesenc zmm16, zmm16, zmm6", + "vaesenc zmm17, zmm17, zmm6", + "vaesenc zmm18, zmm18, zmm6", + "vaesenc zmm19, zmm19, zmm6", + "vaesenc zmm20, zmm20, zmm6", + "vaesenc zmm21, zmm21, zmm6", + "vaesenc zmm22, zmm22, zmm6", + "vaesenc zmm23, zmm23, zmm6", + "vaesenc zmm24, zmm24, zmm6", + "vaesenc zmm25, zmm25, zmm6", + "vaesenc zmm26, zmm26, zmm6", + "vaesenc zmm27, zmm27, zmm6", + "vaesenc zmm28, zmm28, zmm6", + "vaesenc zmm29, zmm29, zmm6", + "vaesenc zmm30, zmm30, zmm6", + "vaesenc zmm31, zmm31, zmm6", + // aes-192 round 7 encrypt + "vaesenc zmm16, zmm16, zmm7", + "vaesenc zmm17, zmm17, zmm7", + "vaesenc zmm18, zmm18, zmm7", + "vaesenc zmm19, zmm19, zmm7", + "vaesenc zmm20, zmm20, zmm7", + "vaesenc zmm21, zmm21, zmm7", + "vaesenc zmm22, zmm22, zmm7", + "vaesenc zmm23, zmm23, zmm7", + "vaesenc zmm24, zmm24, zmm7", + "vaesenc zmm25, zmm25, zmm7", + "vaesenc zmm26, zmm26, zmm7", + "vaesenc zmm27, zmm27, zmm7", + "vaesenc zmm28, zmm28, zmm7", + "vaesenc zmm29, zmm29, zmm7", + "vaesenc zmm30, zmm30, zmm7", + "vaesenc zmm31, zmm31, zmm7", + // aes-192 round 8 encrypt + "vaesenc zmm16, zmm16, zmm8", + "vaesenc zmm17, zmm17, zmm8", + "vaesenc zmm18, zmm18, zmm8", + "vaesenc zmm19, zmm19, zmm8", + "vaesenc zmm20, zmm20, zmm8", + "vaesenc zmm21, zmm21, zmm8", + "vaesenc zmm22, zmm22, zmm8", + "vaesenc zmm23, zmm23, zmm8", + "vaesenc zmm24, zmm24, zmm8", + "vaesenc zmm25, zmm25, zmm8", + "vaesenc zmm26, zmm26, zmm8", + "vaesenc zmm27, zmm27, zmm8", + "vaesenc zmm28, zmm28, zmm8", + "vaesenc zmm29, zmm29, zmm8", + "vaesenc zmm30, zmm30, zmm8", + "vaesenc zmm31, zmm31, zmm8", + // aes-192 round 9 encrypt + "vaesenc zmm16, zmm16, zmm9", + "vaesenc zmm17, zmm17, zmm9", + "vaesenc zmm18, zmm18, zmm9", + "vaesenc zmm19, zmm19, zmm9", + "vaesenc zmm20, zmm20, zmm9", + "vaesenc zmm21, zmm21, zmm9", + "vaesenc zmm22, zmm22, zmm9", + "vaesenc zmm23, zmm23, zmm9", + "vaesenc zmm24, zmm24, zmm9", + "vaesenc zmm25, zmm25, zmm9", + "vaesenc zmm26, zmm26, zmm9", + "vaesenc zmm27, zmm27, zmm9", + "vaesenc zmm28, zmm28, zmm9", + "vaesenc zmm29, zmm29, zmm9", + "vaesenc zmm30, zmm30, zmm9", + "vaesenc zmm31, zmm31, zmm9", + // aes-192 round 10 encrypt + "vaesenc zmm16, zmm16, zmm10", + "vaesenc zmm17, zmm17, zmm10", + "vaesenc zmm18, zmm18, zmm10", + "vaesenc zmm19, zmm19, zmm10", + "vaesenc zmm20, zmm20, zmm10", + "vaesenc zmm21, zmm21, zmm10", + "vaesenc zmm22, zmm22, zmm10", + "vaesenc zmm23, zmm23, zmm10", + "vaesenc zmm24, zmm24, zmm10", + "vaesenc zmm25, zmm25, zmm10", + "vaesenc zmm26, zmm26, zmm10", + "vaesenc zmm27, zmm27, zmm10", + "vaesenc zmm28, zmm28, zmm10", + "vaesenc zmm29, zmm29, zmm10", + "vaesenc 
zmm30, zmm30, zmm10", + "vaesenc zmm31, zmm31, zmm10", + // aes-192 round 11 encrypt + "vaesenc zmm16, zmm16, zmm11", + "vaesenc zmm17, zmm17, zmm11", + "vaesenc zmm18, zmm18, zmm11", + "vaesenc zmm19, zmm19, zmm11", + "vaesenc zmm20, zmm20, zmm11", + "vaesenc zmm21, zmm21, zmm11", + "vaesenc zmm22, zmm22, zmm11", + "vaesenc zmm23, zmm23, zmm11", + "vaesenc zmm24, zmm24, zmm11", + "vaesenc zmm25, zmm25, zmm11", + "vaesenc zmm26, zmm26, zmm11", + "vaesenc zmm27, zmm27, zmm11", + "vaesenc zmm28, zmm28, zmm11", + "vaesenc zmm29, zmm29, zmm11", + "vaesenc zmm30, zmm30, zmm11", + "vaesenc zmm31, zmm31, zmm11", + // aes-192 round 12 encrypt + "vaesenclast zmm16, zmm16, zmm12", + "vaesenclast zmm17, zmm17, zmm12", + "vaesenclast zmm18, zmm18, zmm12", + "vaesenclast zmm19, zmm19, zmm12", + "vaesenclast zmm20, zmm20, zmm12", + "vaesenclast zmm21, zmm21, zmm12", + "vaesenclast zmm22, zmm22, zmm12", + "vaesenclast zmm23, zmm23, zmm12", + "vaesenclast zmm24, zmm24, zmm12", + "vaesenclast zmm25, zmm25, zmm12", + "vaesenclast zmm26, zmm26, zmm12", + "vaesenclast zmm27, zmm27, zmm12", + "vaesenclast zmm28, zmm28, zmm12", + "vaesenclast zmm29, zmm29, zmm12", + "vaesenclast zmm30, zmm30, zmm12", + "vaesenclast zmm31, zmm31, zmm12", + // save cipher-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + out("zmm22") _, + out("zmm23") _, + out("zmm24") _, + out("zmm25") _, + out("zmm26") _, + out("zmm27") _, + out("zmm28") _, + out("zmm29") _, + out("zmm30") _, + out("zmm31") _, + + options(nostack, preserves_flags), + }; +} + +#[inline] +pub(crate) unsafe fn decrypt64(keys: &Simd512RoundKeys<13>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! 
{ + // load keys + "vmovdqu32 zmm12, [{keys} + 0 * 64]", + "vmovdqu32 zmm11, [{keys} + 1 * 64]", + "vmovdqu32 zmm10, [{keys} + 2 * 64]", + "vmovdqu32 zmm9 , [{keys} + 3 * 64]", + "vmovdqu32 zmm8 , [{keys} + 4 * 64]", + "vmovdqu32 zmm7 , [{keys} + 5 * 64]", + "vmovdqu32 zmm6 , [{keys} + 6 * 64]", + "vmovdqu32 zmm5 , [{keys} + 7 * 64]", + "vmovdqu32 zmm4 , [{keys} + 8 * 64]", + "vmovdqu32 zmm3 , [{keys} + 9 * 64]", + "vmovdqu32 zmm2 , [{keys} + 10 * 64]", + "vmovdqu32 zmm1 , [{keys} + 11 * 64]", + "vmovdqu32 zmm0 , [{keys} + 12 * 64]", + // load cipher-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-192 round 12 decrypt + "vpxord zmm16, zmm16, zmm12", + "vpxord zmm17, zmm17, zmm12", + "vpxord zmm18, zmm18, zmm12", + "vpxord zmm19, zmm19, zmm12", + "vpxord zmm20, zmm20, zmm12", + "vpxord zmm21, zmm21, zmm12", + "vpxord zmm22, zmm22, zmm12", + "vpxord zmm23, zmm23, zmm12", + "vpxord zmm24, zmm24, zmm12", + "vpxord zmm25, zmm25, zmm12", + "vpxord zmm26, zmm26, zmm12", + "vpxord zmm27, zmm27, zmm12", + "vpxord zmm28, zmm28, zmm12", + "vpxord zmm29, zmm29, zmm12", + "vpxord zmm30, zmm30, zmm12", + "vpxord zmm31, zmm31, zmm12", + // aes-192 round 11 decrypt + "vaesdec zmm16, zmm16, zmm11", + "vaesdec zmm17, zmm17, zmm11", + "vaesdec zmm18, zmm18, zmm11", + "vaesdec zmm19, zmm19, zmm11", + "vaesdec zmm20, zmm20, zmm11", + "vaesdec zmm21, zmm21, zmm11", + "vaesdec zmm22, zmm22, zmm11", + "vaesdec zmm23, zmm23, zmm11", + "vaesdec zmm24, zmm24, zmm11", + "vaesdec zmm25, zmm25, zmm11", + "vaesdec zmm26, zmm26, zmm11", + "vaesdec zmm27, zmm27, zmm11", + "vaesdec zmm28, zmm28, zmm11", + "vaesdec zmm29, zmm29, zmm11", + "vaesdec zmm30, zmm30, zmm11", + "vaesdec zmm31, zmm31, zmm11", + // aes-192 round 10 decrypt + "vaesdec zmm16, zmm16, zmm10", + "vaesdec zmm17, zmm17, zmm10", + "vaesdec zmm18, zmm18, zmm10", + "vaesdec zmm19, zmm19, zmm10", + "vaesdec zmm20, zmm20, zmm10", + "vaesdec zmm21, zmm21, zmm10", + "vaesdec zmm22, zmm22, zmm10", + "vaesdec zmm23, zmm23, zmm10", + "vaesdec zmm24, zmm24, zmm10", + "vaesdec zmm25, zmm25, zmm10", + "vaesdec zmm26, zmm26, zmm10", + "vaesdec zmm27, zmm27, zmm10", + "vaesdec zmm28, zmm28, zmm10", + "vaesdec zmm29, zmm29, zmm10", + "vaesdec zmm30, zmm30, zmm10", + "vaesdec zmm31, zmm31, zmm10", + // aes-192 round 9 decrypt + "vaesdec zmm16, zmm16, zmm9", + "vaesdec zmm17, zmm17, zmm9", + "vaesdec zmm18, zmm18, zmm9", + "vaesdec zmm19, zmm19, zmm9", + "vaesdec zmm20, zmm20, zmm9", + "vaesdec zmm21, zmm21, zmm9", + "vaesdec zmm22, zmm22, zmm9", + "vaesdec zmm23, zmm23, zmm9", + "vaesdec zmm24, zmm24, zmm9", + "vaesdec zmm25, zmm25, zmm9", + "vaesdec zmm26, zmm26, zmm9", + "vaesdec zmm27, zmm27, zmm9", + "vaesdec zmm28, zmm28, zmm9", + "vaesdec zmm29, zmm29, zmm9", + "vaesdec zmm30, zmm30, zmm9", + "vaesdec zmm31, zmm31, zmm9", + // aes-192 round 8 decrypt + "vaesdec zmm16, zmm16, zmm8", + "vaesdec zmm17, zmm17, zmm8", + "vaesdec zmm18, zmm18, zmm8", + "vaesdec zmm19, 
zmm19, zmm8", + "vaesdec zmm20, zmm20, zmm8", + "vaesdec zmm21, zmm21, zmm8", + "vaesdec zmm22, zmm22, zmm8", + "vaesdec zmm23, zmm23, zmm8", + "vaesdec zmm24, zmm24, zmm8", + "vaesdec zmm25, zmm25, zmm8", + "vaesdec zmm26, zmm26, zmm8", + "vaesdec zmm27, zmm27, zmm8", + "vaesdec zmm28, zmm28, zmm8", + "vaesdec zmm29, zmm29, zmm8", + "vaesdec zmm30, zmm30, zmm8", + "vaesdec zmm31, zmm31, zmm8", + // aes-192 round 7 decrypt + "vaesdec zmm16, zmm16, zmm7", + "vaesdec zmm17, zmm17, zmm7", + "vaesdec zmm18, zmm18, zmm7", + "vaesdec zmm19, zmm19, zmm7", + "vaesdec zmm20, zmm20, zmm7", + "vaesdec zmm21, zmm21, zmm7", + "vaesdec zmm22, zmm22, zmm7", + "vaesdec zmm23, zmm23, zmm7", + "vaesdec zmm24, zmm24, zmm7", + "vaesdec zmm25, zmm25, zmm7", + "vaesdec zmm26, zmm26, zmm7", + "vaesdec zmm27, zmm27, zmm7", + "vaesdec zmm28, zmm28, zmm7", + "vaesdec zmm29, zmm29, zmm7", + "vaesdec zmm30, zmm30, zmm7", + "vaesdec zmm31, zmm31, zmm7", + // aes-192 round 6 decrypt + "vaesdec zmm16, zmm16, zmm6", + "vaesdec zmm17, zmm17, zmm6", + "vaesdec zmm18, zmm18, zmm6", + "vaesdec zmm19, zmm19, zmm6", + "vaesdec zmm20, zmm20, zmm6", + "vaesdec zmm21, zmm21, zmm6", + "vaesdec zmm22, zmm22, zmm6", + "vaesdec zmm23, zmm23, zmm6", + "vaesdec zmm24, zmm24, zmm6", + "vaesdec zmm25, zmm25, zmm6", + "vaesdec zmm26, zmm26, zmm6", + "vaesdec zmm27, zmm27, zmm6", + "vaesdec zmm28, zmm28, zmm6", + "vaesdec zmm29, zmm29, zmm6", + "vaesdec zmm30, zmm30, zmm6", + "vaesdec zmm31, zmm31, zmm6", + // aes-192 round 5 decrypt + "vaesdec zmm16, zmm16, zmm5", + "vaesdec zmm17, zmm17, zmm5", + "vaesdec zmm18, zmm18, zmm5", + "vaesdec zmm19, zmm19, zmm5", + "vaesdec zmm20, zmm20, zmm5", + "vaesdec zmm21, zmm21, zmm5", + "vaesdec zmm22, zmm22, zmm5", + "vaesdec zmm23, zmm23, zmm5", + "vaesdec zmm24, zmm24, zmm5", + "vaesdec zmm25, zmm25, zmm5", + "vaesdec zmm26, zmm26, zmm5", + "vaesdec zmm27, zmm27, zmm5", + "vaesdec zmm28, zmm28, zmm5", + "vaesdec zmm29, zmm29, zmm5", + "vaesdec zmm30, zmm30, zmm5", + "vaesdec zmm31, zmm31, zmm5", + // aes-192 round 4 decrypt + "vaesdec zmm16, zmm16, zmm4", + "vaesdec zmm17, zmm17, zmm4", + "vaesdec zmm18, zmm18, zmm4", + "vaesdec zmm19, zmm19, zmm4", + "vaesdec zmm20, zmm20, zmm4", + "vaesdec zmm21, zmm21, zmm4", + "vaesdec zmm22, zmm22, zmm4", + "vaesdec zmm23, zmm23, zmm4", + "vaesdec zmm24, zmm24, zmm4", + "vaesdec zmm25, zmm25, zmm4", + "vaesdec zmm26, zmm26, zmm4", + "vaesdec zmm27, zmm27, zmm4", + "vaesdec zmm28, zmm28, zmm4", + "vaesdec zmm29, zmm29, zmm4", + "vaesdec zmm30, zmm30, zmm4", + "vaesdec zmm31, zmm31, zmm4", + // aes-192 round 3 decrypt + "vaesdec zmm16, zmm16, zmm3", + "vaesdec zmm17, zmm17, zmm3", + "vaesdec zmm18, zmm18, zmm3", + "vaesdec zmm19, zmm19, zmm3", + "vaesdec zmm20, zmm20, zmm3", + "vaesdec zmm21, zmm21, zmm3", + "vaesdec zmm22, zmm22, zmm3", + "vaesdec zmm23, zmm23, zmm3", + "vaesdec zmm24, zmm24, zmm3", + "vaesdec zmm25, zmm25, zmm3", + "vaesdec zmm26, zmm26, zmm3", + "vaesdec zmm27, zmm27, zmm3", + "vaesdec zmm28, zmm28, zmm3", + "vaesdec zmm29, zmm29, zmm3", + "vaesdec zmm30, zmm30, zmm3", + "vaesdec zmm31, zmm31, zmm3", + // aes-192 round 2 decrypt + "vaesdec zmm16, zmm16, zmm2", + "vaesdec zmm17, zmm17, zmm2", + "vaesdec zmm18, zmm18, zmm2", + "vaesdec zmm19, zmm19, zmm2", + "vaesdec zmm20, zmm20, zmm2", + "vaesdec zmm21, zmm21, zmm2", + "vaesdec zmm22, zmm22, zmm2", + "vaesdec zmm23, zmm23, zmm2", + "vaesdec zmm24, zmm24, zmm2", + "vaesdec zmm25, zmm25, zmm2", + "vaesdec zmm26, zmm26, zmm2", + "vaesdec zmm27, zmm27, zmm2", + "vaesdec zmm28, zmm28, zmm2", + 
"vaesdec zmm29, zmm29, zmm2", + "vaesdec zmm30, zmm30, zmm2", + "vaesdec zmm31, zmm31, zmm2", + // aes-192 round 1 decrypt + "vaesdec zmm16, zmm16, zmm1", + "vaesdec zmm17, zmm17, zmm1", + "vaesdec zmm18, zmm18, zmm1", + "vaesdec zmm19, zmm19, zmm1", + "vaesdec zmm20, zmm20, zmm1", + "vaesdec zmm21, zmm21, zmm1", + "vaesdec zmm22, zmm22, zmm1", + "vaesdec zmm23, zmm23, zmm1", + "vaesdec zmm24, zmm24, zmm1", + "vaesdec zmm25, zmm25, zmm1", + "vaesdec zmm26, zmm26, zmm1", + "vaesdec zmm27, zmm27, zmm1", + "vaesdec zmm28, zmm28, zmm1", + "vaesdec zmm29, zmm29, zmm1", + "vaesdec zmm30, zmm30, zmm1", + "vaesdec zmm31, zmm31, zmm1", + // aes-192 round 0 decrypt + "vaesdeclast zmm16, zmm16, zmm0", + "vaesdeclast zmm17, zmm17, zmm0", + "vaesdeclast zmm18, zmm18, zmm0", + "vaesdeclast zmm19, zmm19, zmm0", + "vaesdeclast zmm20, zmm20, zmm0", + "vaesdeclast zmm21, zmm21, zmm0", + "vaesdeclast zmm22, zmm22, zmm0", + "vaesdeclast zmm23, zmm23, zmm0", + "vaesdeclast zmm24, zmm24, zmm0", + "vaesdeclast zmm25, zmm25, zmm0", + "vaesdeclast zmm26, zmm26, zmm0", + "vaesdeclast zmm27, zmm27, zmm0", + "vaesdeclast zmm28, zmm28, zmm0", + "vaesdeclast zmm29, zmm29, zmm0", + "vaesdeclast zmm30, zmm30, zmm0", + "vaesdeclast zmm31, zmm31, zmm0", + // save plain-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + out("zmm22") _, + out("zmm23") _, + out("zmm24") _, + out("zmm25") _, + out("zmm26") _, + out("zmm27") _, + out("zmm28") _, + out("zmm29") _, + out("zmm30") _, + out("zmm31") _, + + options(nostack, preserves_flags), + }; +} diff --git a/aes/src/x86/vaes512/aes256.rs b/aes/src/x86/vaes512/aes256.rs new file mode 100644 index 00000000..e4623c9e --- /dev/null +++ b/aes/src/x86/vaes512/aes256.rs @@ -0,0 +1,767 @@ +#![allow(unsafe_op_in_unsafe_fn)] + +use crate::x86::{Block64, Simd128RoundKeys, Simd512RoundKeys, arch::*}; +use cipher::inout::InOut; +use core::{arch::asm, mem::MaybeUninit}; + +#[inline] +pub(crate) unsafe fn broadcast_keys(keys: &Simd128RoundKeys<15>) -> Simd512RoundKeys<15> { + let mut v512: [MaybeUninit<__m512i>; 15] = MaybeUninit::uninit().assume_init(); + asm! 
{ + "vbroadcasti32x4 zmm0 , [{keys} + 0 * 16]", + "vbroadcasti32x4 zmm1 , [{keys} + 1 * 16]", + "vbroadcasti32x4 zmm2 , [{keys} + 2 * 16]", + "vbroadcasti32x4 zmm3 , [{keys} + 3 * 16]", + "vbroadcasti32x4 zmm4 , [{keys} + 4 * 16]", + "vbroadcasti32x4 zmm5 , [{keys} + 5 * 16]", + "vbroadcasti32x4 zmm6 , [{keys} + 6 * 16]", + "vbroadcasti32x4 zmm7 , [{keys} + 7 * 16]", + "vbroadcasti32x4 zmm8 , [{keys} + 8 * 16]", + "vbroadcasti32x4 zmm9 , [{keys} + 9 * 16]", + "vbroadcasti32x4 zmm10, [{keys} + 10 * 16]", + "vbroadcasti32x4 zmm11, [{keys} + 11 * 16]", + "vbroadcasti32x4 zmm12, [{keys} + 12 * 16]", + "vbroadcasti32x4 zmm13, [{keys} + 13 * 16]", + "vbroadcasti32x4 zmm14, [{keys} + 14 * 16]", + + "vmovdqu32 [{optr} + 0 * 64], zmm0", + "vmovdqu32 [{optr} + 1 * 64], zmm1", + "vmovdqu32 [{optr} + 2 * 64], zmm2", + "vmovdqu32 [{optr} + 3 * 64], zmm3", + "vmovdqu32 [{optr} + 4 * 64], zmm4", + "vmovdqu32 [{optr} + 5 * 64], zmm5", + "vmovdqu32 [{optr} + 6 * 64], zmm6", + "vmovdqu32 [{optr} + 7 * 64], zmm7", + "vmovdqu32 [{optr} + 8 * 64], zmm8", + "vmovdqu32 [{optr} + 9 * 64], zmm9", + "vmovdqu32 [{optr} + 10 * 64], zmm10", + "vmovdqu32 [{optr} + 11 * 64], zmm11", + "vmovdqu32 [{optr} + 12 * 64], zmm12", + "vmovdqu32 [{optr} + 13 * 64], zmm13", + "vmovdqu32 [{optr} + 14 * 64], zmm14", + + keys = in(reg) keys.as_ptr(), + optr = in(reg) v512.as_mut_ptr().cast::<__m512i>(), + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + out("zmm13") _, + out("zmm14") _, + + options(nostack, preserves_flags), + }; + core::mem::transmute(v512) +} + +#[inline] +pub(crate) unsafe fn encrypt64(keys: &Simd512RoundKeys<15>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! 
{ + // load keys + "vmovdqu32 zmm0 , [{keys} + 0 * 64]", + "vmovdqu32 zmm1 , [{keys} + 1 * 64]", + "vmovdqu32 zmm2 , [{keys} + 2 * 64]", + "vmovdqu32 zmm3 , [{keys} + 3 * 64]", + "vmovdqu32 zmm4 , [{keys} + 4 * 64]", + "vmovdqu32 zmm5 , [{keys} + 5 * 64]", + "vmovdqu32 zmm6 , [{keys} + 6 * 64]", + "vmovdqu32 zmm7 , [{keys} + 7 * 64]", + "vmovdqu32 zmm8 , [{keys} + 8 * 64]", + "vmovdqu32 zmm9 , [{keys} + 9 * 64]", + "vmovdqu32 zmm10, [{keys} + 10 * 64]", + "vmovdqu32 zmm11, [{keys} + 11 * 64]", + "vmovdqu32 zmm12, [{keys} + 12 * 64]", + "vmovdqu32 zmm13, [{keys} + 13 * 64]", + "vmovdqu32 zmm14, [{keys} + 14 * 64]", + // load plain-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-256 round 0 encrypt + "vpxord zmm16, zmm16, zmm0", + "vpxord zmm17, zmm17, zmm0", + "vpxord zmm18, zmm18, zmm0", + "vpxord zmm19, zmm19, zmm0", + "vpxord zmm20, zmm20, zmm0", + "vpxord zmm21, zmm21, zmm0", + "vpxord zmm22, zmm22, zmm0", + "vpxord zmm23, zmm23, zmm0", + "vpxord zmm24, zmm24, zmm0", + "vpxord zmm25, zmm25, zmm0", + "vpxord zmm26, zmm26, zmm0", + "vpxord zmm27, zmm27, zmm0", + "vpxord zmm28, zmm28, zmm0", + "vpxord zmm29, zmm29, zmm0", + "vpxord zmm30, zmm30, zmm0", + "vpxord zmm31, zmm31, zmm0", + // aes-256 round 1 encrypt + "vaesenc zmm16, zmm16, zmm1", + "vaesenc zmm17, zmm17, zmm1", + "vaesenc zmm18, zmm18, zmm1", + "vaesenc zmm19, zmm19, zmm1", + "vaesenc zmm20, zmm20, zmm1", + "vaesenc zmm21, zmm21, zmm1", + "vaesenc zmm22, zmm22, zmm1", + "vaesenc zmm23, zmm23, zmm1", + "vaesenc zmm24, zmm24, zmm1", + "vaesenc zmm25, zmm25, zmm1", + "vaesenc zmm26, zmm26, zmm1", + "vaesenc zmm27, zmm27, zmm1", + "vaesenc zmm28, zmm28, zmm1", + "vaesenc zmm29, zmm29, zmm1", + "vaesenc zmm30, zmm30, zmm1", + "vaesenc zmm31, zmm31, zmm1", + // aes-256 round 2 encrypt + "vaesenc zmm16, zmm16, zmm2", + "vaesenc zmm17, zmm17, zmm2", + "vaesenc zmm18, zmm18, zmm2", + "vaesenc zmm19, zmm19, zmm2", + "vaesenc zmm20, zmm20, zmm2", + "vaesenc zmm21, zmm21, zmm2", + "vaesenc zmm22, zmm22, zmm2", + "vaesenc zmm23, zmm23, zmm2", + "vaesenc zmm24, zmm24, zmm2", + "vaesenc zmm25, zmm25, zmm2", + "vaesenc zmm26, zmm26, zmm2", + "vaesenc zmm27, zmm27, zmm2", + "vaesenc zmm28, zmm28, zmm2", + "vaesenc zmm29, zmm29, zmm2", + "vaesenc zmm30, zmm30, zmm2", + "vaesenc zmm31, zmm31, zmm2", + // aes-256 round 3 encrypt + "vaesenc zmm16, zmm16, zmm3", + "vaesenc zmm17, zmm17, zmm3", + "vaesenc zmm18, zmm18, zmm3", + "vaesenc zmm19, zmm19, zmm3", + "vaesenc zmm20, zmm20, zmm3", + "vaesenc zmm21, zmm21, zmm3", + "vaesenc zmm22, zmm22, zmm3", + "vaesenc zmm23, zmm23, zmm3", + "vaesenc zmm24, zmm24, zmm3", + "vaesenc zmm25, zmm25, zmm3", + "vaesenc zmm26, zmm26, zmm3", + "vaesenc zmm27, zmm27, zmm3", + "vaesenc zmm28, zmm28, zmm3", + "vaesenc zmm29, zmm29, zmm3", + "vaesenc zmm30, zmm30, zmm3", + "vaesenc zmm31, zmm31, zmm3", + // aes-256 round 4 encrypt + "vaesenc zmm16, zmm16, zmm4", + "vaesenc zmm17, zmm17, zmm4", + "vaesenc zmm18, zmm18, 
zmm4", + "vaesenc zmm19, zmm19, zmm4", + "vaesenc zmm20, zmm20, zmm4", + "vaesenc zmm21, zmm21, zmm4", + "vaesenc zmm22, zmm22, zmm4", + "vaesenc zmm23, zmm23, zmm4", + "vaesenc zmm24, zmm24, zmm4", + "vaesenc zmm25, zmm25, zmm4", + "vaesenc zmm26, zmm26, zmm4", + "vaesenc zmm27, zmm27, zmm4", + "vaesenc zmm28, zmm28, zmm4", + "vaesenc zmm29, zmm29, zmm4", + "vaesenc zmm30, zmm30, zmm4", + "vaesenc zmm31, zmm31, zmm4", + // aes-256 round 5 encrypt + "vaesenc zmm16, zmm16, zmm5", + "vaesenc zmm17, zmm17, zmm5", + "vaesenc zmm18, zmm18, zmm5", + "vaesenc zmm19, zmm19, zmm5", + "vaesenc zmm20, zmm20, zmm5", + "vaesenc zmm21, zmm21, zmm5", + "vaesenc zmm22, zmm22, zmm5", + "vaesenc zmm23, zmm23, zmm5", + "vaesenc zmm24, zmm24, zmm5", + "vaesenc zmm25, zmm25, zmm5", + "vaesenc zmm26, zmm26, zmm5", + "vaesenc zmm27, zmm27, zmm5", + "vaesenc zmm28, zmm28, zmm5", + "vaesenc zmm29, zmm29, zmm5", + "vaesenc zmm30, zmm30, zmm5", + "vaesenc zmm31, zmm31, zmm5", + // aes-256 round 6 encrypt + "vaesenc zmm16, zmm16, zmm6", + "vaesenc zmm17, zmm17, zmm6", + "vaesenc zmm18, zmm18, zmm6", + "vaesenc zmm19, zmm19, zmm6", + "vaesenc zmm20, zmm20, zmm6", + "vaesenc zmm21, zmm21, zmm6", + "vaesenc zmm22, zmm22, zmm6", + "vaesenc zmm23, zmm23, zmm6", + "vaesenc zmm24, zmm24, zmm6", + "vaesenc zmm25, zmm25, zmm6", + "vaesenc zmm26, zmm26, zmm6", + "vaesenc zmm27, zmm27, zmm6", + "vaesenc zmm28, zmm28, zmm6", + "vaesenc zmm29, zmm29, zmm6", + "vaesenc zmm30, zmm30, zmm6", + "vaesenc zmm31, zmm31, zmm6", + // aes-256 round 7 encrypt + "vaesenc zmm16, zmm16, zmm7", + "vaesenc zmm17, zmm17, zmm7", + "vaesenc zmm18, zmm18, zmm7", + "vaesenc zmm19, zmm19, zmm7", + "vaesenc zmm20, zmm20, zmm7", + "vaesenc zmm21, zmm21, zmm7", + "vaesenc zmm22, zmm22, zmm7", + "vaesenc zmm23, zmm23, zmm7", + "vaesenc zmm24, zmm24, zmm7", + "vaesenc zmm25, zmm25, zmm7", + "vaesenc zmm26, zmm26, zmm7", + "vaesenc zmm27, zmm27, zmm7", + "vaesenc zmm28, zmm28, zmm7", + "vaesenc zmm29, zmm29, zmm7", + "vaesenc zmm30, zmm30, zmm7", + "vaesenc zmm31, zmm31, zmm7", + // aes-256 round 8 encrypt + "vaesenc zmm16, zmm16, zmm8", + "vaesenc zmm17, zmm17, zmm8", + "vaesenc zmm18, zmm18, zmm8", + "vaesenc zmm19, zmm19, zmm8", + "vaesenc zmm20, zmm20, zmm8", + "vaesenc zmm21, zmm21, zmm8", + "vaesenc zmm22, zmm22, zmm8", + "vaesenc zmm23, zmm23, zmm8", + "vaesenc zmm24, zmm24, zmm8", + "vaesenc zmm25, zmm25, zmm8", + "vaesenc zmm26, zmm26, zmm8", + "vaesenc zmm27, zmm27, zmm8", + "vaesenc zmm28, zmm28, zmm8", + "vaesenc zmm29, zmm29, zmm8", + "vaesenc zmm30, zmm30, zmm8", + "vaesenc zmm31, zmm31, zmm8", + // aes-256 round 9 encrypt + "vaesenc zmm16, zmm16, zmm9", + "vaesenc zmm17, zmm17, zmm9", + "vaesenc zmm18, zmm18, zmm9", + "vaesenc zmm19, zmm19, zmm9", + "vaesenc zmm20, zmm20, zmm9", + "vaesenc zmm21, zmm21, zmm9", + "vaesenc zmm22, zmm22, zmm9", + "vaesenc zmm23, zmm23, zmm9", + "vaesenc zmm24, zmm24, zmm9", + "vaesenc zmm25, zmm25, zmm9", + "vaesenc zmm26, zmm26, zmm9", + "vaesenc zmm27, zmm27, zmm9", + "vaesenc zmm28, zmm28, zmm9", + "vaesenc zmm29, zmm29, zmm9", + "vaesenc zmm30, zmm30, zmm9", + "vaesenc zmm31, zmm31, zmm9", + // aes-256 round 10 encrypt + "vaesenc zmm16, zmm16, zmm10", + "vaesenc zmm17, zmm17, zmm10", + "vaesenc zmm18, zmm18, zmm10", + "vaesenc zmm19, zmm19, zmm10", + "vaesenc zmm20, zmm20, zmm10", + "vaesenc zmm21, zmm21, zmm10", + "vaesenc zmm22, zmm22, zmm10", + "vaesenc zmm23, zmm23, zmm10", + "vaesenc zmm24, zmm24, zmm10", + "vaesenc zmm25, zmm25, zmm10", + "vaesenc zmm26, zmm26, zmm10", + "vaesenc zmm27, zmm27, zmm10", 
+ "vaesenc zmm28, zmm28, zmm10", + "vaesenc zmm29, zmm29, zmm10", + "vaesenc zmm30, zmm30, zmm10", + "vaesenc zmm31, zmm31, zmm10", + // aes-256 round 11 encrypt + "vaesenc zmm16, zmm16, zmm11", + "vaesenc zmm17, zmm17, zmm11", + "vaesenc zmm18, zmm18, zmm11", + "vaesenc zmm19, zmm19, zmm11", + "vaesenc zmm20, zmm20, zmm11", + "vaesenc zmm21, zmm21, zmm11", + "vaesenc zmm22, zmm22, zmm11", + "vaesenc zmm23, zmm23, zmm11", + "vaesenc zmm24, zmm24, zmm11", + "vaesenc zmm25, zmm25, zmm11", + "vaesenc zmm26, zmm26, zmm11", + "vaesenc zmm27, zmm27, zmm11", + "vaesenc zmm28, zmm28, zmm11", + "vaesenc zmm29, zmm29, zmm11", + "vaesenc zmm30, zmm30, zmm11", + "vaesenc zmm31, zmm31, zmm11", + // aes-256 round 12 encrypt + "vaesenc zmm16, zmm16, zmm12", + "vaesenc zmm17, zmm17, zmm12", + "vaesenc zmm18, zmm18, zmm12", + "vaesenc zmm19, zmm19, zmm12", + "vaesenc zmm20, zmm20, zmm12", + "vaesenc zmm21, zmm21, zmm12", + "vaesenc zmm22, zmm22, zmm12", + "vaesenc zmm23, zmm23, zmm12", + "vaesenc zmm24, zmm24, zmm12", + "vaesenc zmm25, zmm25, zmm12", + "vaesenc zmm26, zmm26, zmm12", + "vaesenc zmm27, zmm27, zmm12", + "vaesenc zmm28, zmm28, zmm12", + "vaesenc zmm29, zmm29, zmm12", + "vaesenc zmm30, zmm30, zmm12", + "vaesenc zmm31, zmm31, zmm12", + // aes-256 round 13 encrypt + "vaesenc zmm16, zmm16, zmm13", + "vaesenc zmm17, zmm17, zmm13", + "vaesenc zmm18, zmm18, zmm13", + "vaesenc zmm19, zmm19, zmm13", + "vaesenc zmm20, zmm20, zmm13", + "vaesenc zmm21, zmm21, zmm13", + "vaesenc zmm22, zmm22, zmm13", + "vaesenc zmm23, zmm23, zmm13", + "vaesenc zmm24, zmm24, zmm13", + "vaesenc zmm25, zmm25, zmm13", + "vaesenc zmm26, zmm26, zmm13", + "vaesenc zmm27, zmm27, zmm13", + "vaesenc zmm28, zmm28, zmm13", + "vaesenc zmm29, zmm29, zmm13", + "vaesenc zmm30, zmm30, zmm13", + "vaesenc zmm31, zmm31, zmm13", + // aes-256 round 14 encrypt + "vaesenclast zmm16, zmm16, zmm14", + "vaesenclast zmm17, zmm17, zmm14", + "vaesenclast zmm18, zmm18, zmm14", + "vaesenclast zmm19, zmm19, zmm14", + "vaesenclast zmm20, zmm20, zmm14", + "vaesenclast zmm21, zmm21, zmm14", + "vaesenclast zmm22, zmm22, zmm14", + "vaesenclast zmm23, zmm23, zmm14", + "vaesenclast zmm24, zmm24, zmm14", + "vaesenclast zmm25, zmm25, zmm14", + "vaesenclast zmm26, zmm26, zmm14", + "vaesenclast zmm27, zmm27, zmm14", + "vaesenclast zmm28, zmm28, zmm14", + "vaesenclast zmm29, zmm29, zmm14", + "vaesenclast zmm30, zmm30, zmm14", + "vaesenclast zmm31, zmm31, zmm14", + // save cipher-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + out("zmm13") _, + out("zmm14") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + 
out("zmm22") _, + out("zmm23") _, + out("zmm24") _, + out("zmm25") _, + out("zmm26") _, + out("zmm27") _, + out("zmm28") _, + out("zmm29") _, + out("zmm30") _, + out("zmm31") _, + + options(nostack, preserves_flags), + }; +} + +#[inline] +pub(crate) unsafe fn decrypt64(keys: &Simd512RoundKeys<15>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! { + // load keys + "vmovdqu32 zmm14, [{keys} + 0 * 64]", + "vmovdqu32 zmm13, [{keys} + 1 * 64]", + "vmovdqu32 zmm12, [{keys} + 2 * 64]", + "vmovdqu32 zmm11, [{keys} + 3 * 64]", + "vmovdqu32 zmm10, [{keys} + 4 * 64]", + "vmovdqu32 zmm9 , [{keys} + 5 * 64]", + "vmovdqu32 zmm8 , [{keys} + 6 * 64]", + "vmovdqu32 zmm7 , [{keys} + 7 * 64]", + "vmovdqu32 zmm6 , [{keys} + 8 * 64]", + "vmovdqu32 zmm5 , [{keys} + 9 * 64]", + "vmovdqu32 zmm4 , [{keys} + 10 * 64]", + "vmovdqu32 zmm3 , [{keys} + 11 * 64]", + "vmovdqu32 zmm2 , [{keys} + 12 * 64]", + "vmovdqu32 zmm1 , [{keys} + 13 * 64]", + "vmovdqu32 zmm0 , [{keys} + 14 * 64]", + // load cipher-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-256 round 14 encrypt + "vpxord zmm16, zmm16, zmm14", + "vpxord zmm17, zmm17, zmm14", + "vpxord zmm18, zmm18, zmm14", + "vpxord zmm19, zmm19, zmm14", + "vpxord zmm20, zmm20, zmm14", + "vpxord zmm21, zmm21, zmm14", + "vpxord zmm22, zmm22, zmm14", + "vpxord zmm23, zmm23, zmm14", + "vpxord zmm24, zmm24, zmm14", + "vpxord zmm25, zmm25, zmm14", + "vpxord zmm26, zmm26, zmm14", + "vpxord zmm27, zmm27, zmm14", + "vpxord zmm28, zmm28, zmm14", + "vpxord zmm29, zmm29, zmm14", + "vpxord zmm30, zmm30, zmm14", + "vpxord zmm31, zmm31, zmm14", + // aes-256 round 13 encrypt + "vaesdec zmm16, zmm16, zmm13", + "vaesdec zmm17, zmm17, zmm13", + "vaesdec zmm18, zmm18, zmm13", + "vaesdec zmm19, zmm19, zmm13", + "vaesdec zmm20, zmm20, zmm13", + "vaesdec zmm21, zmm21, zmm13", + "vaesdec zmm22, zmm22, zmm13", + "vaesdec zmm23, zmm23, zmm13", + "vaesdec zmm24, zmm24, zmm13", + "vaesdec zmm25, zmm25, zmm13", + "vaesdec zmm26, zmm26, zmm13", + "vaesdec zmm27, zmm27, zmm13", + "vaesdec zmm28, zmm28, zmm13", + "vaesdec zmm29, zmm29, zmm13", + "vaesdec zmm30, zmm30, zmm13", + "vaesdec zmm31, zmm31, zmm13", + // aes-256 round 12 encrypt + "vaesdec zmm16, zmm16, zmm12", + "vaesdec zmm17, zmm17, zmm12", + "vaesdec zmm18, zmm18, zmm12", + "vaesdec zmm19, zmm19, zmm12", + "vaesdec zmm20, zmm20, zmm12", + "vaesdec zmm21, zmm21, zmm12", + "vaesdec zmm22, zmm22, zmm12", + "vaesdec zmm23, zmm23, zmm12", + "vaesdec zmm24, zmm24, zmm12", + "vaesdec zmm25, zmm25, zmm12", + "vaesdec zmm26, zmm26, zmm12", + "vaesdec zmm27, zmm27, zmm12", + "vaesdec zmm28, zmm28, zmm12", + "vaesdec zmm29, zmm29, zmm12", + "vaesdec zmm30, zmm30, zmm12", + "vaesdec zmm31, zmm31, zmm12", + // aes-256 round 11 encrypt + "vaesdec zmm16, zmm16, zmm11", + "vaesdec zmm17, zmm17, zmm11", + "vaesdec zmm18, zmm18, zmm11", + "vaesdec zmm19, zmm19, zmm11", + "vaesdec zmm20, zmm20, zmm11", + "vaesdec zmm21, zmm21, 
zmm11", + "vaesdec zmm22, zmm22, zmm11", + "vaesdec zmm23, zmm23, zmm11", + "vaesdec zmm24, zmm24, zmm11", + "vaesdec zmm25, zmm25, zmm11", + "vaesdec zmm26, zmm26, zmm11", + "vaesdec zmm27, zmm27, zmm11", + "vaesdec zmm28, zmm28, zmm11", + "vaesdec zmm29, zmm29, zmm11", + "vaesdec zmm30, zmm30, zmm11", + "vaesdec zmm31, zmm31, zmm11", + // aes-256 round 10 encrypt + "vaesdec zmm16, zmm16, zmm10", + "vaesdec zmm17, zmm17, zmm10", + "vaesdec zmm18, zmm18, zmm10", + "vaesdec zmm19, zmm19, zmm10", + "vaesdec zmm20, zmm20, zmm10", + "vaesdec zmm21, zmm21, zmm10", + "vaesdec zmm22, zmm22, zmm10", + "vaesdec zmm23, zmm23, zmm10", + "vaesdec zmm24, zmm24, zmm10", + "vaesdec zmm25, zmm25, zmm10", + "vaesdec zmm26, zmm26, zmm10", + "vaesdec zmm27, zmm27, zmm10", + "vaesdec zmm28, zmm28, zmm10", + "vaesdec zmm29, zmm29, zmm10", + "vaesdec zmm30, zmm30, zmm10", + "vaesdec zmm31, zmm31, zmm10", + // aes-256 round 9 encrypt + "vaesdec zmm16, zmm16, zmm9", + "vaesdec zmm17, zmm17, zmm9", + "vaesdec zmm18, zmm18, zmm9", + "vaesdec zmm19, zmm19, zmm9", + "vaesdec zmm20, zmm20, zmm9", + "vaesdec zmm21, zmm21, zmm9", + "vaesdec zmm22, zmm22, zmm9", + "vaesdec zmm23, zmm23, zmm9", + "vaesdec zmm24, zmm24, zmm9", + "vaesdec zmm25, zmm25, zmm9", + "vaesdec zmm26, zmm26, zmm9", + "vaesdec zmm27, zmm27, zmm9", + "vaesdec zmm28, zmm28, zmm9", + "vaesdec zmm29, zmm29, zmm9", + "vaesdec zmm30, zmm30, zmm9", + "vaesdec zmm31, zmm31, zmm9", + // aes-256 round 8 encrypt + "vaesdec zmm16, zmm16, zmm8", + "vaesdec zmm17, zmm17, zmm8", + "vaesdec zmm18, zmm18, zmm8", + "vaesdec zmm19, zmm19, zmm8", + "vaesdec zmm20, zmm20, zmm8", + "vaesdec zmm21, zmm21, zmm8", + "vaesdec zmm22, zmm22, zmm8", + "vaesdec zmm23, zmm23, zmm8", + "vaesdec zmm24, zmm24, zmm8", + "vaesdec zmm25, zmm25, zmm8", + "vaesdec zmm26, zmm26, zmm8", + "vaesdec zmm27, zmm27, zmm8", + "vaesdec zmm28, zmm28, zmm8", + "vaesdec zmm29, zmm29, zmm8", + "vaesdec zmm30, zmm30, zmm8", + "vaesdec zmm31, zmm31, zmm8", + // aes-256 round 7 encrypt + "vaesdec zmm16, zmm16, zmm7", + "vaesdec zmm17, zmm17, zmm7", + "vaesdec zmm18, zmm18, zmm7", + "vaesdec zmm19, zmm19, zmm7", + "vaesdec zmm20, zmm20, zmm7", + "vaesdec zmm21, zmm21, zmm7", + "vaesdec zmm22, zmm22, zmm7", + "vaesdec zmm23, zmm23, zmm7", + "vaesdec zmm24, zmm24, zmm7", + "vaesdec zmm25, zmm25, zmm7", + "vaesdec zmm26, zmm26, zmm7", + "vaesdec zmm27, zmm27, zmm7", + "vaesdec zmm28, zmm28, zmm7", + "vaesdec zmm29, zmm29, zmm7", + "vaesdec zmm30, zmm30, zmm7", + "vaesdec zmm31, zmm31, zmm7", + // aes-256 round 6 encrypt + "vaesdec zmm16, zmm16, zmm6", + "vaesdec zmm17, zmm17, zmm6", + "vaesdec zmm18, zmm18, zmm6", + "vaesdec zmm19, zmm19, zmm6", + "vaesdec zmm20, zmm20, zmm6", + "vaesdec zmm21, zmm21, zmm6", + "vaesdec zmm22, zmm22, zmm6", + "vaesdec zmm23, zmm23, zmm6", + "vaesdec zmm24, zmm24, zmm6", + "vaesdec zmm25, zmm25, zmm6", + "vaesdec zmm26, zmm26, zmm6", + "vaesdec zmm27, zmm27, zmm6", + "vaesdec zmm28, zmm28, zmm6", + "vaesdec zmm29, zmm29, zmm6", + "vaesdec zmm30, zmm30, zmm6", + "vaesdec zmm31, zmm31, zmm6", + // aes-256 round 5 encrypt + "vaesdec zmm16, zmm16, zmm5", + "vaesdec zmm17, zmm17, zmm5", + "vaesdec zmm18, zmm18, zmm5", + "vaesdec zmm19, zmm19, zmm5", + "vaesdec zmm20, zmm20, zmm5", + "vaesdec zmm21, zmm21, zmm5", + "vaesdec zmm22, zmm22, zmm5", + "vaesdec zmm23, zmm23, zmm5", + "vaesdec zmm24, zmm24, zmm5", + "vaesdec zmm25, zmm25, zmm5", + "vaesdec zmm26, zmm26, zmm5", + "vaesdec zmm27, zmm27, zmm5", + "vaesdec zmm28, zmm28, zmm5", + "vaesdec zmm29, zmm29, zmm5", + "vaesdec zmm30, 
zmm30, zmm5", + "vaesdec zmm31, zmm31, zmm5", + // aes-256 round 4 encrypt + "vaesdec zmm16, zmm16, zmm4", + "vaesdec zmm17, zmm17, zmm4", + "vaesdec zmm18, zmm18, zmm4", + "vaesdec zmm19, zmm19, zmm4", + "vaesdec zmm20, zmm20, zmm4", + "vaesdec zmm21, zmm21, zmm4", + "vaesdec zmm22, zmm22, zmm4", + "vaesdec zmm23, zmm23, zmm4", + "vaesdec zmm24, zmm24, zmm4", + "vaesdec zmm25, zmm25, zmm4", + "vaesdec zmm26, zmm26, zmm4", + "vaesdec zmm27, zmm27, zmm4", + "vaesdec zmm28, zmm28, zmm4", + "vaesdec zmm29, zmm29, zmm4", + "vaesdec zmm30, zmm30, zmm4", + "vaesdec zmm31, zmm31, zmm4", + // aes-256 round 3 encrypt + "vaesdec zmm16, zmm16, zmm3", + "vaesdec zmm17, zmm17, zmm3", + "vaesdec zmm18, zmm18, zmm3", + "vaesdec zmm19, zmm19, zmm3", + "vaesdec zmm20, zmm20, zmm3", + "vaesdec zmm21, zmm21, zmm3", + "vaesdec zmm22, zmm22, zmm3", + "vaesdec zmm23, zmm23, zmm3", + "vaesdec zmm24, zmm24, zmm3", + "vaesdec zmm25, zmm25, zmm3", + "vaesdec zmm26, zmm26, zmm3", + "vaesdec zmm27, zmm27, zmm3", + "vaesdec zmm28, zmm28, zmm3", + "vaesdec zmm29, zmm29, zmm3", + "vaesdec zmm30, zmm30, zmm3", + "vaesdec zmm31, zmm31, zmm3", + // aes-256 round 2 encrypt + "vaesdec zmm16, zmm16, zmm2", + "vaesdec zmm17, zmm17, zmm2", + "vaesdec zmm18, zmm18, zmm2", + "vaesdec zmm19, zmm19, zmm2", + "vaesdec zmm20, zmm20, zmm2", + "vaesdec zmm21, zmm21, zmm2", + "vaesdec zmm22, zmm22, zmm2", + "vaesdec zmm23, zmm23, zmm2", + "vaesdec zmm24, zmm24, zmm2", + "vaesdec zmm25, zmm25, zmm2", + "vaesdec zmm26, zmm26, zmm2", + "vaesdec zmm27, zmm27, zmm2", + "vaesdec zmm28, zmm28, zmm2", + "vaesdec zmm29, zmm29, zmm2", + "vaesdec zmm30, zmm30, zmm2", + "vaesdec zmm31, zmm31, zmm2", + // aes-256 round 1 encrypt + "vaesdec zmm16, zmm16, zmm1", + "vaesdec zmm17, zmm17, zmm1", + "vaesdec zmm18, zmm18, zmm1", + "vaesdec zmm19, zmm19, zmm1", + "vaesdec zmm20, zmm20, zmm1", + "vaesdec zmm21, zmm21, zmm1", + "vaesdec zmm22, zmm22, zmm1", + "vaesdec zmm23, zmm23, zmm1", + "vaesdec zmm24, zmm24, zmm1", + "vaesdec zmm25, zmm25, zmm1", + "vaesdec zmm26, zmm26, zmm1", + "vaesdec zmm27, zmm27, zmm1", + "vaesdec zmm28, zmm28, zmm1", + "vaesdec zmm29, zmm29, zmm1", + "vaesdec zmm30, zmm30, zmm1", + "vaesdec zmm31, zmm31, zmm1", + // aes-256 round 0 encrypt + "vaesdeclast zmm16, zmm16, zmm0", + "vaesdeclast zmm17, zmm17, zmm0", + "vaesdeclast zmm18, zmm18, zmm0", + "vaesdeclast zmm19, zmm19, zmm0", + "vaesdeclast zmm20, zmm20, zmm0", + "vaesdeclast zmm21, zmm21, zmm0", + "vaesdeclast zmm22, zmm22, zmm0", + "vaesdeclast zmm23, zmm23, zmm0", + "vaesdeclast zmm24, zmm24, zmm0", + "vaesdeclast zmm25, zmm25, zmm0", + "vaesdeclast zmm26, zmm26, zmm0", + "vaesdeclast zmm27, zmm27, zmm0", + "vaesdeclast zmm28, zmm28, zmm0", + "vaesdeclast zmm29, zmm29, zmm0", + "vaesdeclast zmm30, zmm30, zmm0", + "vaesdeclast zmm31, zmm31, zmm0", + // save plain-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = 
+
+        out("zmm0") _,
+        out("zmm1") _,
+        out("zmm2") _,
+        out("zmm3") _,
+        out("zmm4") _,
+        out("zmm5") _,
+        out("zmm6") _,
+        out("zmm7") _,
+        out("zmm8") _,
+        out("zmm9") _,
+        out("zmm10") _,
+        out("zmm11") _,
+        out("zmm12") _,
+        out("zmm13") _,
+        out("zmm14") _,
+
+        out("zmm16") _,
+        out("zmm17") _,
+        out("zmm18") _,
+        out("zmm19") _,
+        out("zmm20") _,
+        out("zmm21") _,
+        out("zmm22") _,
+        out("zmm23") _,
+        out("zmm24") _,
+        out("zmm25") _,
+        out("zmm26") _,
+        out("zmm27") _,
+        out("zmm28") _,
+        out("zmm29") _,
+        out("zmm30") _,
+        out("zmm31") _,
+
+        options(nostack, preserves_flags),
+    };
+}
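
Reviewer note (illustrative, not part of this patch): a minimal round-trip sketch through the public `aes` API is below. It assumes the cipher 0.4-era names `KeyInit`, `BlockEncrypt`, `BlockDecrypt` and the `aes::Block` alias; the pre-release `cipher` this branch tracks spells some of these slightly differently (e.g. `BlockCipherEncrypt`). When the crate runs with VAES and AVX-512 available (for example under `sde64 -future` as in the CI jobs above) and `aes_avx512_disable` is not set, bulk `encrypt_blocks`/`decrypt_blocks` calls can be serviced 64 blocks at a time by paths such as `encrypt64`/`decrypt64`. The function name `roundtrip_64_blocks` is hypothetical.

use aes::{Aes256, Block};
use aes::cipher::{BlockDecrypt, BlockEncrypt, KeyInit};

// Sketch: encrypt 64 distinct blocks, then decrypt them and check that the
// round trip is the identity.
fn roundtrip_64_blocks() {
    let key = [0x42u8; 32];
    let cipher = Aes256::new(&key.into());

    let mut blocks = [Block::default(); 64];
    for (i, block) in blocks.iter_mut().enumerate() {
        block[0] = i as u8; // make every block unique
    }
    let original = blocks;

    cipher.encrypt_blocks(&mut blocks);
    assert_ne!(blocks, original);

    cipher.decrypt_blocks(&mut blocks);
    assert_eq!(blocks, original);
}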