#! /usr/bin/env perl
# Copyright 2025 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2025, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Implements AES-CFB128 encryption and decryption with Intel(R) VAES

# Command line: [flavour] [output-file]
# The output file is the last argument when it carries an extension,
# the flavour is the first argument when it contains no dot.
$output = undef;
$output = pop if (@ARGV && $ARGV[-1] =~ m|\.\w+$|);

$flavour = undef;
$flavour = shift if (@ARGV && $ARGV[0] !~ m|\.|);

# Windows x64 targets are recognised by the flavour or by an .asm output file.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

$avx512vaes=0; # will be non-zero if tooling supports Intel AVX-512 and VAES

# Locate the perlasm translator, either next to this script
# or in the shared perlasm directory.
($dir) = ($0 =~ m/(.*[\/\\])[^\/\\]+$/);

$xlate = "${dir}x86_64-xlate.pl";
$xlate = "${dir}../../perlasm/x86_64-xlate.pl" unless -f $xlate;
die "can't locate x86_64-xlate.pl" unless -f $xlate;

# Probe 1: GNU assembler reached through the C compiler driver.
my $gas_banner = `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`;
if ($gas_banner =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512vaes = ($1>=2.30);
}

# Probe 2: NASM, only consulted for Windows builds that ask for it.
if (!$avx512vaes && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)) {
    if (`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
        $avx512vaes = ($1==2.13 && $2>=3) + ($1>=2.14);
    }
}

# Probe 3: MASM (ml64), only consulted for Windows builds that ask for it.
if (!$avx512vaes && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/)) {
    if (`ml64 2>&1` =~ /Version ([0-9]+\.[0-9]+)\./) {
        $avx512vaes = ($1>=14.16);
    }
}

# Probe 4: clang/LLVM integrated assembler.
if (!$avx512vaes) {
    if (`$ENV{CC} -v 2>&1` =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
        my ($apple, $major, $minor, $patch) = ($1, $3, $4, $5);
        my $ver = $major + $minor/100.0 + $patch/10000.0; # 3.1.0->3.01, 3.10.1->3.1001

        # Apple conditions, they use a different version series, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512vaes = $apple ? ($ver>=10.0001) : ($ver>=7.0);
    }
}

# Everything printed to STDOUT from here on is piped through the translator.
my $xlate_pipe = "| \"$^X\" \"$xlate\" $flavour \"$output\"";
open OUT, $xlate_pipe
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
##################################################################

$code=".text\n";

if ($avx512vaes) {

$code.=<<___;
.extern OPENSSL_ia32cap_P

#################################################################
# Signature:
#
# int ossl_aes_cfb128_vaes_eligible(void);
#
# Detects if the underlying hardware supports all the features
# required to run the Intel AVX-512 implementations of AES-CFB128 algorithms.
#
# Returns: non zero if all the required features are detected, 0 otherwise
#################################################################

.globl ossl_aes_cfb128_vaes_eligible
.type ossl_aes_cfb128_vaes_eligible,\@abi-omnipotent
.balign 64

ossl_aes_cfb128_vaes_eligible:
.cfi_startproc
    endbranch

    mov OPENSSL_ia32cap_P+8(%rip),%ecx
    xor %eax,%eax

    # Check 3rd 32-bit word of OPENSSL_ia32cap_P for the feature bit(s):
    # AVX512BW (bit 30) + AVX512DQ (bit 17) + AVX512F (bit 16)

    and \$0x40030000,%ecx                          # mask is 1<<30|1<<17|1<<16
    cmp \$0x40030000,%ecx
    jne .Laes_cfb128_vaes_eligible_done

    mov OPENSSL_ia32cap_P+12(%rip),%ecx

    # Check 4th 32-bit word of OPENSSL_ia32cap_P for the feature bit(s):
    # AVX512VAES (bit 9)

    and \$0x200,%ecx                               # mask is 1<<9
    cmp \$0x200,%ecx
    cmove %ecx,%eax

.Laes_cfb128_vaes_eligible_done:
    ret
.cfi_endproc
.size ossl_aes_cfb128_vaes_eligible, .-ossl_aes_cfb128_vaes_eligible
___

#################################################################
#
# AES subroutines for:
# - preloading the AES key schedule into AVX registers
# - single-block AES encryption used by CFB encryption and decryption
# - multiple-block AES encryption used by CFB decryption
#
# The CFB mode only uses block cipher encryption.
#
# The AES encryption step is described in Section 5.1 Cipher() of
# FIPS 197 https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197-upd1.pdf
# and implemented with Intel(R) AES-NI and VAES instructions:
#
# - AESKEYGENASSIST for key expansion, elsewhere in aesni_set_encrypt_key()
# - VPXORD          for AES pre-whitening
# - VAESENC         for performing one AES encryption round
# - VAESENCLAST     for performing the last AES encryption round
#
# For more information please consult:
# - the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual,
#   Chapter 21: Cryptography & Finite Field Arithmetic Instructions
#   https://www.intel.com/content/www/us/en/developer/articles/technical/intel64-and-ia32-architectures-optimization.html
# - the Intel(R) Advanced Encryption Standard (AES) New Instructions Set Whitepaper
#   https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf
#
#################################################################
#
# Note on the helper subs below: they are declared WITHOUT Perl prototypes.
# Each helper appends text to the global $code and reads the register names
# ($key_original, $rounds, $temp, $temp_4x, ...) from globals at call time,
# so those globals must be assigned before a helper is invoked.
# The vaes_encrypt_block_* helpers take one argument, a label prefix that
# must be unique per call site because it is used to build local labels.

# Appends code loading the 15 possible round keys into xmm17..xmm31
# and the round count into $rounds.
# expects the key schedule address in $key_original
sub load_aes_key_schedule_1x {
$code.=<<___;
    vmovdqu8 0($key_original),%xmm17               # schedule 0 whitening
    vmovdqu8 16($key_original),%xmm18              #          1
    vmovdqu8 32($key_original),%xmm19              #          2
    vmovdqu8 48($key_original),%xmm20              #          3
    vmovdqu8 64($key_original),%xmm21              #          4
    vmovdqu8 80($key_original),%xmm22              #          5
    vmovdqu8 96($key_original),%xmm23              #          6
    vmovdqu8 112($key_original),%xmm24             #          7
    vmovdqu8 128($key_original),%xmm25             #          8
    vmovdqu8 144($key_original),%xmm26             #          9
    vmovdqu8 160($key_original),%xmm27             #          10 last for AES-128
    vmovdqu8 176($key_original),%xmm28             #          11
    vmovdqu8 192($key_original),%xmm29             #          12 last for AES-192
    vmovdqu8 208($key_original),%xmm30             #          13
    vmovdqu8 224($key_original),%xmm31             #          14 last for AES-256

    mov 240($key_original),$rounds                 # load AES rounds
                                                   # 240 is the byte-offset of the rounds field in AES_KEY
___
}

# Appends code broadcasting each of the 15 possible round keys to all four
# 128-bit lanes of zmm17..zmm31 and loading the round count into $rounds.
# expects the key schedule address in $key_original
sub load_aes_key_schedule_4x {
$code.=<<___;
    vbroadcasti32x4 0($key_original),%zmm17        # schedule 0 whitening
    vbroadcasti32x4 16($key_original),%zmm18       #          1
    vbroadcasti32x4 32($key_original),%zmm19       #          2
    vbroadcasti32x4 48($key_original),%zmm20       #          3
    vbroadcasti32x4 64($key_original),%zmm21       #          4
    vbroadcasti32x4 80($key_original),%zmm22       #          5
    vbroadcasti32x4 96($key_original),%zmm23       #          6
    vbroadcasti32x4 112($key_original),%zmm24      #          7
    vbroadcasti32x4 128($key_original),%zmm25      #          8
    vbroadcasti32x4 144($key_original),%zmm26      #          9
    vbroadcasti32x4 160($key_original),%zmm27      #          10 last for AES-128
    vbroadcasti32x4 176($key_original),%zmm28      #          11
    vbroadcasti32x4 192($key_original),%zmm29      #          12 last for AES-192
    vbroadcasti32x4 208($key_original),%zmm30      #          13
    vbroadcasti32x4 224($key_original),%zmm31      #          14 last for AES-256

    mov 240($key_original),$rounds                 # load AES rounds
                                                   # 240 is the byte-offset of the rounds field in AES_KEY
___
}

# Performs AES encryption of 1 128-bit block
# Expects iv in $temp, non-final AES rounds in $rounds and key schedule in xmm17..31
#
# Argument: $label_prefix - prefix for the local labels
#           (<prefix>_192_256, <prefix>_256, <prefix>_end)
sub vaes_encrypt_block_1x {
    my ($label_prefix)=@_;
$code.=<<___;
    vpxord %xmm17,$temp,$temp                      # AES pre-whitening
    vaesenc %xmm18,$temp,$temp
    vaesenc %xmm19,$temp,$temp
    vaesenc %xmm20,$temp,$temp
    vaesenc %xmm21,$temp,$temp
    vaesenc %xmm22,$temp,$temp
    vaesenc %xmm23,$temp,$temp
    vaesenc %xmm24,$temp,$temp
    vaesenc %xmm25,$temp,$temp
    vaesenc %xmm26,$temp,$temp

    cmp \$0x09,$rounds
    ja ${label_prefix}_192_256

    vaesenclast %xmm27,$temp,$temp                 # last AES-128 encryption round
    jmp ${label_prefix}_end

.balign 32
${label_prefix}_192_256:

    vaesenc %xmm27,$temp,$temp
    vaesenc %xmm28,$temp,$temp

    cmp \$0x0B,$rounds
    ja ${label_prefix}_256

    vaesenclast %xmm29,$temp,$temp                 # last AES-192 encryption round
    jmp ${label_prefix}_end

.balign 32
${label_prefix}_256:

    vaesenc %xmm29,$temp,$temp
    vaesenc %xmm30,$temp,$temp
    vaesenclast %xmm31,$temp,$temp                 # last AES-256 encryption round

.balign 32
${label_prefix}_end:
___
}

# Performs parallel AES encryption of 4 128-bit blocks
# Expects iv in $temp_4x, non-final AES rounds in $rounds and key schedule in zmm17..31
#
# Argument: $label_prefix - prefix for the local labels
#           (<prefix>_192_256, <prefix>_256, <prefix>_end)
sub vaes_encrypt_block_4x {
    my ($label_prefix)=@_;
$code.=<<___;
    vpxord %zmm17,$temp_4x,$temp_4x                # AES pre-whitening
    vaesenc %zmm18,$temp_4x,$temp_4x
    vaesenc %zmm19,$temp_4x,$temp_4x
    vaesenc %zmm20,$temp_4x,$temp_4x
    vaesenc %zmm21,$temp_4x,$temp_4x
    vaesenc %zmm22,$temp_4x,$temp_4x
    vaesenc %zmm23,$temp_4x,$temp_4x
    vaesenc %zmm24,$temp_4x,$temp_4x
    vaesenc %zmm25,$temp_4x,$temp_4x
    vaesenc %zmm26,$temp_4x,$temp_4x

    cmp \$0x09,$rounds
    ja ${label_prefix}_192_256

    vaesenclast %zmm27,$temp_4x,$temp_4x           # last AES-128 encryption round
    jmp ${label_prefix}_end

.balign 32
${label_prefix}_192_256:

    vaesenc %zmm27,$temp_4x,$temp_4x
    vaesenc %zmm28,$temp_4x,$temp_4x

    cmp \$0x0B,$rounds
    ja ${label_prefix}_256

    vaesenclast %zmm29,$temp_4x,$temp_4x           # last AES-192 encryption round
    jmp ${label_prefix}_end

.balign 32
${label_prefix}_256:

    vaesenc %zmm29,$temp_4x,$temp_4x
    vaesenc %zmm30,$temp_4x,$temp_4x
    vaesenclast %zmm31,$temp_4x,$temp_4x           # last AES-256 encryption round

.balign 32
${label_prefix}_end:
___
}

# Performs parallel AES encryption of 16 128-bit blocks
# Expects input in $temp_*x, non-final AES rounds in $rounds and key schedule in zmm17..31
#
# Argument: $label_prefix - prefix for the local labels
#           (<prefix>_192_256, <prefix>_256, <prefix>_end)
sub vaes_encrypt_block_16x {
    my ($label_prefix)=@_;
$code.=<<___;
    vpxord %zmm17,$temp_4x,  $temp_4x              # parallel AES pre-whitening
    vpxord %zmm17,$temp_8x,  $temp_8x
    vpxord %zmm17,$temp_12x,$temp_12x
    vpxord %zmm17,$temp_16x,$temp_16x

    vaesenc %zmm18,$temp_4x,  $temp_4x
    vaesenc %zmm18,$temp_8x,  $temp_8x
    vaesenc %zmm18,$temp_12x,$temp_12x
    vaesenc %zmm18,$temp_16x,$temp_16x

    vaesenc %zmm19,$temp_4x,  $temp_4x
    vaesenc %zmm19,$temp_8x,  $temp_8x
    vaesenc %zmm19,$temp_12x,$temp_12x
    vaesenc %zmm19,$temp_16x,$temp_16x

    vaesenc %zmm20,$temp_4x,  $temp_4x
    vaesenc %zmm20,$temp_8x,  $temp_8x
    vaesenc %zmm20,$temp_12x,$temp_12x
    vaesenc %zmm20,$temp_16x,$temp_16x

    vaesenc %zmm21,$temp_4x,  $temp_4x
    vaesenc %zmm21,$temp_8x,  $temp_8x
    vaesenc %zmm21,$temp_12x,$temp_12x
    vaesenc %zmm21,$temp_16x,$temp_16x

    vaesenc %zmm22,$temp_4x,  $temp_4x
    vaesenc %zmm22,$temp_8x,  $temp_8x
    vaesenc %zmm22,$temp_12x,$temp_12x
    vaesenc %zmm22,$temp_16x,$temp_16x

    vaesenc %zmm23,$temp_4x,  $temp_4x
    vaesenc %zmm23,$temp_8x,  $temp_8x
    vaesenc %zmm23,$temp_12x,$temp_12x
    vaesenc %zmm23,$temp_16x,$temp_16x

    vaesenc %zmm24,$temp_4x,  $temp_4x
    vaesenc %zmm24,$temp_8x,  $temp_8x
    vaesenc %zmm24,$temp_12x,$temp_12x
    vaesenc %zmm24,$temp_16x,$temp_16x

    vaesenc %zmm25,$temp_4x,  $temp_4x
    vaesenc %zmm25,$temp_8x,  $temp_8x
    vaesenc %zmm25,$temp_12x,$temp_12x
    vaesenc %zmm25,$temp_16x,$temp_16x

    vaesenc %zmm26,$temp_4x,  $temp_4x
    vaesenc %zmm26,$temp_8x,  $temp_8x
    vaesenc %zmm26,$temp_12x,$temp_12x
    vaesenc %zmm26,$temp_16x,$temp_16x

    cmp \$0x09,$rounds
    ja ${label_prefix}_192_256

    vaesenclast %zmm27,$temp_4x,  $temp_4x         # last AES-128 encryption round
    vaesenclast %zmm27,$temp_8x,  $temp_8x
    vaesenclast %zmm27,$temp_12x,$temp_12x
    vaesenclast %zmm27,$temp_16x,$temp_16x
    jmp ${label_prefix}_end

.balign 32
${label_prefix}_192_256:

    vaesenc %zmm27,$temp_4x,  $temp_4x
    vaesenc %zmm27,$temp_8x,  $temp_8x
    vaesenc %zmm27,$temp_12x,$temp_12x
    vaesenc %zmm27,$temp_16x,$temp_16x

    vaesenc %zmm28,$temp_4x,  $temp_4x
    vaesenc %zmm28,$temp_8x,  $temp_8x
    vaesenc %zmm28,$temp_12x,$temp_12x
    vaesenc %zmm28,$temp_16x,$temp_16x

    cmp \$0x0B,$rounds
    ja ${label_prefix}_256

    vaesenclast %zmm29,$temp_4x,  $temp_4x         # last AES-192 encryption round
    vaesenclast %zmm29,$temp_8x,  $temp_8x
    vaesenclast %zmm29,$temp_12x,$temp_12x
    vaesenclast %zmm29,$temp_16x,$temp_16x
    jmp ${label_prefix}_end

.balign 32
${label_prefix}_256:

    vaesenc %zmm29,$temp_4x,  $temp_4x
    vaesenc %zmm29,$temp_8x,  $temp_8x
    vaesenc %zmm29,$temp_12x,$temp_12x
    vaesenc %zmm29,$temp_16x,$temp_16x

    vaesenc %zmm30,$temp_4x,  $temp_4x
    vaesenc %zmm30,$temp_8x,  $temp_8x
    vaesenc %zmm30,$temp_12x,$temp_12x
    vaesenc %zmm30,$temp_16x,$temp_16x

    vaesenclast %zmm31,$temp_4x,  $temp_4x         # last AES-256 encryption round
    vaesenclast %zmm31,$temp_8x,  $temp_8x
    vaesenclast %zmm31,$temp_12x,$temp_12x
    vaesenclast %zmm31,$temp_16x,$temp_16x

.balign 32
${label_prefix}_end:
___
}
#################################################################
# Signature:
#
# void ossl_aes_cfb128_vaes_enc(
#     const unsigned char *in,
#     unsigned char *out,
#     size_t len,
#     const AES_KEY *ks,
#     const unsigned char ivec[16],
#     /*in-out*/ ossl_ssize_t *num);
#
# Preconditions:
# - all pointers are valid (not NULL...)
# - AES key schedule and rounds in `ks` are precomputed
#
# Invariants:
# - `*num` is between 0 and 15 (inclusive)
#
#################################################################
#
# The implementation follows closely the encryption half of CRYPTO_cfb128_encrypt:
# - "pre" step: processes the last bytes of a partial block
# - "mid" step: processes complete blocks
# - "post" step: processes the first bytes of a partial block
#
# To obtain the next ciphertext block `cipher N` from
# the plaintext block `plain N`, the previous ciphertext
# block `cipher N-1` is required as input.
#
# The dependency on previous encryption outputs (ciphertexts)
# makes CFB encryption inherently serial.
#
#                +----+                       +----------+
#                | iv |       +---------------> cipher 0 |
#                +--+-+       |               +----------+
#                   |         |                     |
#                   |         |                     |
#            +------v------+  |              +------v------+
#            | AES encrypt |  |              | AES encrypt |
#            |  with key   |  |              |  with key   |
#            +------+------+  |              +------+------+
#                   |         |                     |
#                   |         |                     |
# +---------+    +--v--+      |   +---------+    +--v--+
# | plain 0 +----> XOR |      |   | plain 1 +----> XOR |
# +---------+    +--+--+      |   +---------+    +--+--+
#                   |         |                     |
#                   |         |                     |
#             +-----v----+    |               +-----v----+
#             | cipher 0 +----+               | cipher 1 |
#             +----------+                    +----------+
#
#################################################################
#
# Memory safety note: in the "pre" and "post" steps only a part of a
# 16-byte block is valid in the `in`, `out` and (for "pre") `ivec+num`
# buffers. Every vector access touching those partial blocks is therefore
# performed under the %k1 byte mask so that no byte outside the valid
# range is read or written.

$code.=<<___;
.globl ossl_aes_cfb128_vaes_enc
.type ossl_aes_cfb128_vaes_enc,\@function,6
.balign 64
ossl_aes_cfb128_vaes_enc:
.cfi_startproc
    endbranch
___

$inp="%rdi";          # arg0
$out="%rsi";          # arg1
$len="%rdx";          # arg2

$key_original="%rcx"; # arg3
$key_backup="%r10";
$ivp="%r8";           # arg4
$nump="%r9";          # arg5

$num="%r11";
$left="%rcx";         # shares %rcx with $key_original: %cl is the variable shift counter
$mask="%rax";

$rounds="%r11d";      # shares %r11 with $num: $num is dead once the key schedule is loaded

$temp="%xmm2";
$plain="%xmm3";

$code.=<<___;

    mov ($nump),$num           # $num is the current byte index in the first partial block
                               # $num belongs to 0..15; non-zero means a partial first block

    test $len,$len             # return early if $len==0, unlikely to occur
    jz .Laes_cfb128_vaes_enc_done

    test $num,$num             # check if the first block is partial
    jz .Laes_cfb128_enc_mid    # if not, jump to processing full blocks

###########################################################
# first partial block pre-processing
###########################################################

    mov $key_original,$key_backup # make room for variable shl with cl

    mov \$0x10,$left           # first block is partial
    sub $num,$left             # calculate how many bytes $left to process in the block
    cmp $len,$left             #
    cmova $len,$left           # $left = min(16-$num,$len)

    mov \$1,$mask              # build a mask with the least significant $left bits set
    shlq %cl,$mask             # $left is left shift counter
    dec $mask                  # $mask is 2^$left-1
    kmovq $mask,%k1

    mov $num,%rax              # keep in-out $num in %al
    add $left,%rax             # advance $num
    and \$0x0F,%al             # wrap-around $num in a 16-byte block

    leaq ($num,$ivp),%r11      # process $left iv bytes

    # Only $left bytes are valid at iv+num and at the input pointer:
    # load them under the %k1 mask (zeroing the rest) so that nothing
    # is read past the end of the iv or of the input buffer.
    vmovdqu8 (%r11),%xmm0{%k1}{z}
    vmovdqu8 ($inp),%xmm1{%k1}{z} # process $left input bytes
    vpxor %xmm0,%xmm1,%xmm2    # CipherFeedBack XOR
    vmovdqu8 %xmm2,($out){%k1} # write $left output bytes
    vmovdqu8 %xmm2,(%r11){%k1} # blend $left output bytes into iv

    add $left,$inp             # advance pointers
    add $left,$out
    sub $left,$len
    jz .Laes_cfb128_enc_zero_pre # return early if no AES encryption required

    mov $key_backup,$key_original # restore "key_original" as arg3

.Laes_cfb128_enc_mid:
___

    &load_aes_key_schedule_1x();

$code.=<<___;

###########################################################
# inner full blocks processing
###########################################################

    vmovdqu ($ivp),$temp       # load iv

    cmp \$0x10,$len            # is there a full plaintext block left (128 bits) ?
    jb .Laes_cfb128_enc_post

.balign 32
.Loop_aes_cfb128_enc_main:
    sub \$0x10,$len

    vmovdqu ($inp),$plain      # load plaintext block
    lea 16($inp),$inp          # $inp points to next plaintext
___

    &vaes_encrypt_block_1x(".Laes_cfb128_enc_mid");

$code.=<<___;

    vpxor $plain,$temp,$temp   # CipherFeedBack XOR
    cmp \$0x10,$len            # flags are consumed by the jae below
    vmovdqu $temp,($out)       # write ciphertext
    lea 16($out),$out          # $out points to the next output block
    jae .Loop_aes_cfb128_enc_main

    xor %eax,%eax              # reset num when processing full blocks

    vmovdqu $temp,($ivp)       # latest ciphertext block is next encryption input

.Laes_cfb128_enc_post:

###########################################################
# last partial block post-processing
###########################################################

    test $len,$len             # check if the last block is partial
    jz .Laes_cfb128_enc_zero_all
___

    &vaes_encrypt_block_1x(".Laes_cfb128_enc_post");

$code.=<<___;

    mov $len,%rax              # num=$len
    mov \$1,%r11               # build a mask with the least significant $len bits set
    mov %dl,%cl                # $len is left shift counter less than 16
    shlq %cl,%r11
    dec %r11                   # mask is 2^$len-1
    kmovq %r11,%k1

    vmovdqu8 ($inp),%xmm1{%k1}{z} # read $len input bytes, zero the rest to not impact XOR
    vpxor $temp,%xmm1,%xmm0    # CipherFeedBack XOR
    vmovdqu8 %xmm0,($out){%k1} # write $len output bytes
    vmovdqu8 %xmm0,($ivp)      # write chained/streaming iv

    # clear registers

.Laes_cfb128_enc_zero_all:
    vpxord %xmm17,%xmm17,%xmm17 # clear the AES key schedule
    vpxord %xmm18,%xmm18,%xmm18
    vpxord %xmm19,%xmm19,%xmm19
    vpxord %xmm20,%xmm20,%xmm20
    vpxord %xmm21,%xmm21,%xmm21
    vpxord %xmm22,%xmm22,%xmm22
    vpxord %xmm23,%xmm23,%xmm23
    vpxord %xmm24,%xmm24,%xmm24
    vpxord %xmm25,%xmm25,%xmm25
    vpxord %xmm26,%xmm26,%xmm26
    vpxord %xmm27,%xmm27,%xmm27
    vpxord %xmm28,%xmm28,%xmm28
    vpxord %xmm29,%xmm29,%xmm29
    vpxord %xmm30,%xmm30,%xmm30
    vpxord %xmm31,%xmm31,%xmm31

    vpxor %xmm3,%xmm3,%xmm3    # clear registers used during AES encryption

.Laes_cfb128_enc_zero_pre:
    vpxor %xmm0,%xmm0,%xmm0    # clear the rest of the registers
    vpxor %xmm1,%xmm1,%xmm1
    vpxor %xmm2,%xmm2,%xmm2

    mov %rax,($nump)           # num is in/out, update for future/chained calls

    vzeroupper

.Laes_cfb128_vaes_enc_done:
    ret
.cfi_endproc
.size ossl_aes_cfb128_vaes_enc,.-ossl_aes_cfb128_vaes_enc
___

#################################################################
# Signature:
#
# void ossl_aes_cfb128_vaes_dec(
#     const unsigned char *in,
#     unsigned char *out,
#     size_t len,
#     const AES_KEY *ks,
#     const unsigned char ivec[16],
#     /*in-out*/ ossl_ssize_t *num);
#
# Preconditions:
# - all pointers are valid (not NULL...)
# - AES key schedule and rounds in `ks` are precomputed
#
# Invariants:
# - `*num` is between 0 and 15 (inclusive)
#
#################################################################
#
# The implementation follows closely the decryption half of CRYPTO_cfb128_encrypt:
#
# - "pre" step: processes the last bytes of a partial block
# - "mid" step: processes complete blocks using an unrolled approach:
#   - processes 16 blocks in parallel until fewer than 16 blocks remain
#   - processes  4 blocks in parallel until fewer than  4 blocks remain
#   - processes  1 block  in series   until none are left
# - "post" step: processes the first bytes of a partial block
#
# To obtain the next plaintext block `plain N` from
# its ciphertext block `cipher N`, the previous ciphertext
# block `cipher N-1` is required as input.
#
# Since CFB decryption for the current block only depends on
# iv and ciphertext blocks (already available as inputs)
# and not on plaintext blocks, it can be efficiently parallelized.
#
#     +----+                      +----------+                  +----------+                  +----------+
#     | iv |                      | cipher 0 |                  | cipher 1 |                  | cipher 2 |
#     +--+-+                      +----+-----+                  +----+-----+                  +----+-----+
#        |                             |                             |                             |
#        |                             |                             |                             |
# +------v------+               +------v------+               +------v------+               +------v------+
# | AES encrypt |               | AES encrypt |               | AES encrypt |               | AES encrypt |
# |  with key   |               |  with key   |               |  with key   |               |  with key   |
# +------+------+               +------+------+               +------+------+               +------+------+
#        |                             |                             |                             |
#        |                             |                             |                             |
#     +--v--+     +----------+      +--v--+     +----------+      +--v--+     +----------+      +--v--+     +----------+
#     | XOR <-----+ cipher 0 |      | XOR <-----+ cipher 1 |      | XOR <-----+ cipher 2 |      | XOR <-----+ cipher 3 |
#     +--+--+     +----------+      +--+--+     +----------+      +--+--+     +----------+      +--+--+     +----------+
#        |                             |                             |                             |
#        |                             |                             |                             |
#   +----v----+                   +----v----+                   +----v----+                   +----v----+
#   | plain 0 |                   | plain 1 |                   | plain 2 |                   | plain 3 |
#   +---------+                   +---------+                   +---------+                   +---------+
#
# To produce N (4 in the diagram above) output/plaintext blocks we require as inputs:
# - iv
# - N ciphertext blocks
# The N-th ciphertext block is not encrypted and becomes the next iv input.
#
#################################################################
#
# Memory safety note: in the "pre" and "post" steps only a part of a
# 16-byte block is valid in the `in`, `out` and (for "pre") `ivec+num`
# buffers. Every vector access touching those partial blocks is therefore
# performed under the %k1 byte mask so that no byte outside the valid
# range is read or written.

$code.=<<___;
.globl ossl_aes_cfb128_vaes_dec
.type ossl_aes_cfb128_vaes_dec,\@function,6
.balign 64
ossl_aes_cfb128_vaes_dec:
.cfi_startproc
    endbranch
___

$inp="%rdi";          # arg0
$out="%rsi";          # arg1
$len="%rdx";          # arg2

$key_original="%rcx"; # arg3
$key_backup="%r10";
$ivp="%r8";           # arg4
$nump="%r9";          # arg5

$num="%r11";
$left="%rcx";         # shares %rcx with $key_original: %cl is the variable shift counter
$mask="%rax";

$rounds="%r11d";      # shares %r11 with $num: $num is dead once the key schedule is loaded

$temp="%xmm2";        # low lane of $temp_4x
$temp_4x="%zmm2";
$temp_8x="%zmm4";
$temp_12x="%zmm0";
$temp_16x="%zmm6";

$cipher="%xmm3";      # low lane of $cipher_4x
$cipher_4x="%zmm3";
$cipher_8x="%zmm5";
$cipher_12x="%zmm1";
$cipher_16x="%zmm16";

$code.=<<___;

    mov ($nump),$num           # $num is the current byte index in the first partial block
                               # $num belongs to 0..15; non-zero means a partial first block

    test $len,$len             # return early if $len==0, unlikely to occur
    jz .Laes_cfb128_vaes_dec_done
___

$code.=<<___ if($win64);
    sub \$0x10,%rsp
.cfi_adjust_cfa_offset 16
    vmovdqu %xmm6,(%rsp)       # xmm6 needs to be maintained for Windows
___

$code.=<<___;

    test $num,$num             # check if the first block is partial
    jz .Laes_cfb128_dec_mid    # if not, jump to processing full blocks

###########################################################
# first partial block pre-processing
###########################################################

    mov $key_original,$key_backup # make room for variable shl with cl

    mov \$0x10,$left           # first block is partial
    sub $num,$left             # calculate how many bytes $left to process in the block
    cmp $len,$left             #
    cmova $len,$left           # $left = min(16-$num,$len)

    mov \$1,$mask              # build a mask with the least significant $left bits set
    shlq %cl,$mask             # $left is left shift counter
    dec $mask                  # $mask is 2^$left-1
    kmovq $mask,%k1

    lea ($num,$left),%rax      # keep in-out num in %al, advance by $left
    and \$0x0F,%al             # wrap-around in a 16-byte block

    leaq ($num,$ivp),%r11      # process $left iv bytes

    # Only $left bytes are valid at iv+num and at the input pointer:
    # load them under the %k1 mask (zeroing the rest) so that nothing
    # is read past the end of the iv or of the input buffer.
    vmovdqu8 (%r11),%xmm0{%k1}{z}
    vmovdqu8 ($inp),%xmm1{%k1}{z} # process $left input bytes
    vpxor %xmm0,%xmm1,%xmm2    # CipherFeedBack XOR
    vmovdqu8 %xmm2,($out){%k1} # write $left output bytes
    vmovdqu8 %xmm1,(%r11){%k1} # blend $left input bytes into iv

    add $left,$inp             # advance pointers
    add $left,$out
    sub $left,$len
    jz .Laes_cfb128_dec_zero_pre # return early if no AES encryption required

    mov $key_backup,$key_original # restore "key_original" as arg3

.Laes_cfb128_dec_mid:
___

    &load_aes_key_schedule_4x();

$code.=<<___;

###########################################################
# inner full blocks processing
###########################################################

    # $temp_4x is "iv | iv | iv | iv"
    vbroadcasti32x4 ($ivp),$temp_4x # load iv

    cmp \$0x100,$len           # are there 16 ciphertext blocks left (2048 bits) ?
    jb .Laes_cfb128_dec_check_4x

###########################################################
# decrypt groups of 16 128-bit blocks in parallel
# behaves as 16x loop unroll
###########################################################

.balign 32
.Loop_aes_cfb128_dec_mid_16x:
    sub \$0x100,$len

    # load 16 ciphertext blocks

    # $cipher_4x is "ciphertext 0 | ciphertext 1 | ciphertext 2 | ciphertext 3"
    vmovdqu32 ($inp),$cipher_4x
    # $cipher_8x is "ciphertext 4 | ciphertext 5 | ciphertext 6 | ciphertext 7"
    vmovdqu32 64($inp),$cipher_8x
    # $cipher_12x is "ciphertext 8 | ciphertext 9 | ciphertext 10 | ciphertext 11"
    vmovdqu32 128($inp),$cipher_12x
    # $cipher_16x is "ciphertext 12 | ciphertext 13 | ciphertext 14 | ciphertext 15"
    vmovdqu32 192($inp),$cipher_16x

    # $temp_4x is "iv | ciphertext 0 | ciphertext 1 | ciphertext 2"
    valignq \$6,$temp_4x,$cipher_4x,$temp_4x
    # $temp_8x is "ciphertext 3 | ciphertext 4 | ciphertext 5 | ciphertext 6"
    valignq \$6,$cipher_4x,$cipher_8x,$temp_8x
    # $temp_12x is "ciphertext 7 | ciphertext 8 | ciphertext 9 | ciphertext 10"
    valignq \$6,$cipher_8x,$cipher_12x,$temp_12x
    # $temp_16x is "ciphertext 11 | ciphertext 12 | ciphertext 13 | ciphertext 14"
    valignq \$6,$cipher_12x,$cipher_16x,$temp_16x

    lea 256($inp),$inp         # $inp points to next ciphertext
___

    &vaes_encrypt_block_16x(".Laes_cfb128_dec_mid_16x");

$code.=<<___;

    vpxord $cipher_4x,$temp_4x,$temp_4x    # CipherFeedBack XOR of 16 blocks
    vpxord $cipher_8x,$temp_8x,$temp_8x
    vpxord $cipher_12x,$temp_12x,$temp_12x
    vpxord $cipher_16x,$temp_16x,$temp_16x

    cmp \$0x100,$len           # flags are consumed by the jae below

    vmovdqu32 $temp_4x,($out)  # write 16 plaintext blocks
    vmovdqu32 $temp_8x,64($out)
    vmovdqu32 $temp_12x,128($out)
    vmovdqu32 $temp_16x,192($out)

    vmovdqu8 $cipher_16x,$temp_4x # ciphertext 15 lands in the top lane, where the next iteration expects the iv

    lea 256($out),$out         # $out points to the next output block

    jae .Loop_aes_cfb128_dec_mid_16x

    vextracti64x2 \$3,$cipher_16x,$temp      # latest ciphertext block is next decryption iv
    vinserti32x4 \$3,$temp,$temp_4x,$temp_4x # move ciphertext3 to positions 0 and 3 in preparation for next shuffle

    xor %eax,%eax              # reset $num when processing full blocks

    vmovdqu $temp,($ivp)       # latest ciphertext block is next decryption iv

.Laes_cfb128_dec_check_4x:
    cmp \$0x40,$len            # are there 4 ciphertext blocks left (512 bits) ?
    jb .Laes_cfb128_dec_check_1x

###########################################################
# decrypt groups of 4 128-bit blocks in parallel
# behaves as 4x loop unroll
###########################################################

# expects $temp_4x to contain "iv" in the 3rd (most significant) lane

.balign 32
.Loop_aes_cfb128_dec_mid_4x:
    sub \$0x40,$len

    # $cipher_4x is "ciphertext0 | ciphertext1 | ciphertext 2 | ciphertext 3"
    vmovdqu32 ($inp),$cipher_4x # load 4 ciphertext blocks

    # $temp_4x is "iv | ciphertext0 | ciphertext 1 | ciphertext 2"
    valignq \$6,$temp_4x,$cipher_4x,$temp_4x

    lea 64($inp),$inp          # $inp points to next ciphertext
___

    &vaes_encrypt_block_4x(".Laes_cfb128_dec_mid_4x");

$code.=<<___;

    vpxord $cipher_4x,$temp_4x,$temp_4x # CipherFeedBack XOR of 4 blocks
    cmp \$0x40,$len            # flags are consumed by the jae below
    vmovdqu32 $temp_4x,($out)  # write 4 plaintext blocks
    vmovdqu8 $cipher_4x,$temp_4x # ciphertext 3 lands in the top lane, where the next iteration expects the iv
    lea 64($out),$out          # $out points to the next output block

    jae .Loop_aes_cfb128_dec_mid_4x

    vextracti64x2 \$3,$temp_4x,$temp # latest ciphertext block is next decryption iv
                                     # move ciphertext3 to position 0 in preparation for next step

    xor %eax,%eax              # reset $num when processing full blocks

    vmovdqu $temp,($ivp)       # latest ciphertext block is next decryption iv

.Laes_cfb128_dec_check_1x:
    cmp \$0x10,$len            # are there full ciphertext blocks left (128 bits) ?
    jb .Laes_cfb128_dec_post

###########################################################
# decrypt the rest of full 128-bit blocks in series
###########################################################

# expects $temp to contain iv

.balign 32
.Loop_aes_cfb128_dec_mid_1x:
    sub \$0x10,$len

    vmovdqu ($inp),$cipher     # load ciphertext block
    lea 16($inp),$inp          # $inp points to next ciphertext
___

    &vaes_encrypt_block_1x(".Loop_aes_cfb128_dec_mid_1x_inner");

$code.=<<___;

    vpxor $cipher,$temp,$temp  # CipherFeedBack XOR
    cmp \$0x10,$len            # flags are consumed by the jae below
    vmovdqu $temp,($out)       # write plaintext
    vmovdqu8 $cipher,$temp     # this ciphertext block is the iv of the next block
    lea 16($out),$out          # $out points to the next output block
    jae .Loop_aes_cfb128_dec_mid_1x

    xor %eax,%eax              # reset $num when processing full blocks

    vmovdqu $temp,($ivp)       # latest ciphertext block is next decryption input

.Laes_cfb128_dec_post:

###########################################################
# last partial block post-processing
###########################################################

    test $len,$len             # check if the last block is partial
    jz .Laes_cfb128_dec_zero_all
___

    &vaes_encrypt_block_1x(".Loop_aes_cfb128_dec_post");

$code.=<<___;

    mov $len,%rax              # num=$len
    mov \$1,%r11               # build a mask with the least significant $len bits set
    mov %dl,%cl                # $len is left shift counter less than 16
    shlq %cl,%r11
    dec %r11                   # mask is 2^$len-1
    kmovq %r11,%k1

    vmovdqu8 ($inp),%xmm1{%k1}{z} # read $len input bytes, zero the rest to not impact XOR
    vpxor $temp,%xmm1,%xmm0    # CipherFeedBack XOR
    vmovdqu8 %xmm0,($out){%k1} # write $len output bytes
    vpblendmb %xmm1,$temp,${temp}{%k1} # blend $len input bytes into iv
    vmovdqu8 $temp,($ivp)      # write chained/streaming iv

    # clear registers

.Laes_cfb128_dec_zero_all:
    vpxord %xmm17,%xmm17,%xmm17 # clear the AES key schedule
    vpxord %xmm18,%xmm18,%xmm18 # zero the upper lanes of zmm registers
    vpxord %xmm19,%xmm19,%xmm19
    vpxord %xmm20,%xmm20,%xmm20
    vpxord %xmm21,%xmm21,%xmm21
    vpxord %xmm22,%xmm22,%xmm22
    vpxord %xmm23,%xmm23,%xmm23
    vpxord %xmm24,%xmm24,%xmm24
    vpxord %xmm25,%xmm25,%xmm25
    vpxord %xmm26,%xmm26,%xmm26
    vpxord %xmm27,%xmm27,%xmm27
    vpxord %xmm28,%xmm28,%xmm28
    vpxord %xmm29,%xmm29,%xmm29
    vpxord %xmm30,%xmm30,%xmm30
    vpxord %xmm31,%xmm31,%xmm31

    vpxord %xmm3,%xmm3,%xmm3   # clear registers used during AES encryption
    vpxord %xmm4,%xmm4,%xmm4
    vpxord %xmm5,%xmm5,%xmm5
    vpxord %xmm6,%xmm6,%xmm6
    vpxord %xmm16,%xmm16,%xmm16

.Laes_cfb128_dec_zero_pre:
    vpxord %xmm0,%xmm0,%xmm0   # clear the rest of the registers
    vpxord %xmm1,%xmm1,%xmm1
    vpxord %xmm2,%xmm2,%xmm2

    vzeroupper
___

$code.=<<___ if($win64);
    vmovdqu (%rsp),%xmm6       # xmm6 needs to be maintained for Windows
    add \$16,%rsp
.cfi_adjust_cfa_offset -16
___

$code.=<<___;
    mov %rax,($nump)           # num is in/out, update for future/chained calls

.Laes_cfb128_vaes_dec_done:
    ret
.cfi_endproc
.size ossl_aes_cfb128_vaes_dec,.-ossl_aes_cfb128_vaes_dec
___

} else {

# The assembler cannot encode AVX-512/VAES: emit stubs so that the symbols
# still resolve, and report "not eligible" so that they are never selected.
$code .= <<___;

.globl     ossl_aes_cfb128_vaes_enc
.globl     ossl_aes_cfb128_vaes_dec

# Mock implementations of AES-CFB128 encryption/decryption
# that always fail. Should not be executed under normal circumstances.

ossl_aes_cfb128_vaes_enc:
ossl_aes_cfb128_vaes_dec:
    .byte 0x0f,0x0b            # Undefined Instruction in the Intel architecture
                               # Raises the Invalid Opcode exception
    ret

#################################################################
# Signature:
#
# int ossl_aes_cfb128_vaes_eligible(void);
#
# Always returns 0 (not eligible), meaning that tooling does not support
# the Intel AVX-512 extensions. Signals higher level code to fallback
# to an alternative implementation.
#################################################################

.globl     ossl_aes_cfb128_vaes_eligible
.type      ossl_aes_cfb128_vaes_eligible,\@abi-omnipotent
ossl_aes_cfb128_vaes_eligible:
    xor    %eax,%eax
    ret
.size ossl_aes_cfb128_vaes_eligible, .-ossl_aes_cfb128_vaes_eligible
___
}

# Hand the generated text to the translator and make sure it was flushed.
print $code;

close STDOUT or die "error closing STDOUT: $!";