#! /usr/bin/env perl
# Copyright 2024-2025 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2024, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# This module implements support for Intel(R) SM3 instructions
# from Intel(R) Multi-Buffer Crypto for IPsec Library
# (https://github.com/intel/intel-ipsec-mb).
# Original author is Tomasz Kantecki

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of the script path (with trailing separator); empty when the
# script was invoked without any directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Check Intel(R) SM3 instructions support
#
# $avx2_sm3_ni        - the assembler can at least encode AVX2; SM3 instructions
#                       are then emitted as raw bytes (see sub sm3op)
# $avx2_sm3_ni_native - the assembler knows the SM3 mnemonics itself
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9])\.([0-9]+)/) {
    # Compare major/minor as integers: comparing "2.9" with 2.22 as floating
    # point numbers would order the versions incorrectly.
    my ($major, $minor) = ($1, $2);
    # minimal avx2 supported version, binary translation for SM3 instructions (sub sm3op) is used
    $avx2_sm3_ni = ($major > 2) || ($major == 2 && $minor >= 22);
    # support added at GNU asm 2.42
    $avx2_sm3_ni_native = ($major > 2) || ($major == 2 && $minor >= 42);
}

if (!$avx2_sm3_ni && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9])\.([0-9]+)\.([0-9]+)/) {
    my ($major, $minor, $patch) = ($1, $2, $3);
    # minimal avx2 supported version, binary translation for SM3 instructions (sub sm3op) is used
    $avx2_sm3_ni = ($major > 2) || ($major == 2 && $minor > 10);
    # support added at NASM 2.16.02
    $avx2_sm3_ni_native = ($major > 2) || ($major == 2 && $minor > 16) ||
                          ($major == 2 && $minor == 16 && $patch >= 2);
}

if (!$avx2_sm3_ni &&
    `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
    # minimal tested version, binary translation for SM3 instructions (sub sm3op) is used
    $avx2_sm3_ni = ($2>=7.0);
    # support added at LLVM 17.0.1
    $avx2_sm3_ni_native = ($2>=17.0);
}

# Everything printed to STDOUT from here on is piped through x86_64-xlate.pl.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($avx2_sm3_ni>0) {

# Create 4 x 32-bit new words of message schedule W[] using SM3-NI ISA
#
# Arguments are register names:
#   $W03_00 .. $W15_12  the 16 most recent schedule words (read only)
#   $W19_16             destination register for the 4 new words
#   $T1, $T2            scratch registers (clobbered)
# The generated instructions are appended to $code.
sub sm3msg {
    my ($W03_00, $W07_04, $W11_08, $W15_12, $W19_16, $T1, $T2) = @_;
    my $T3 = $W19_16;   # the result is accumulated directly in the destination
$code.=<<___;
    vpalignr \$12, $W07_04, $W11_08, $T3
    vpsrldq \$4, $W15_12, $T1
    vsm3msg1 $W03_00, $T1, $T3
    vpalignr \$12, $W03_00, $W07_04, $T1
    vpalignr \$8, $W11_08, $W15_12, $T2
    vsm3msg2 $T2, $T1, $T3
___
}

# Performs 4 rounds of SM3 algorithm
#  - consumes 4 words of message schedule W[]
#  - updates SM3 state registers: ABEF and CDGH
#
# Arguments: state registers $ABEF and $CDGH, schedule registers $W03_00 and
# $W07_04, scratch register $T1 (clobbered) and the number $R of the first of
# the four rounds.  $R and $R+2 become the immediates of the two vsm3rnds2
# instructions.  The generated instructions are appended to $code.
sub sm3rounds4 {
    my ($ABEF, $CDGH, $W03_00, $W07_04, $T1, $R) = @_;
    my $R2 = $R + 2;
$code.=<<___;
    vpunpcklqdq $W07_04, $W03_00, $T1
    vsm3rnds2 \$$R, $T1, $ABEF, $CDGH
    vpunpckhqdq $W07_04, $W03_00, $T1
    vsm3rnds2 \$$R2, $T1, $CDGH, $ABEF
___
}

$code.= ".data\n";

{
# input arguments aliases
my ($ctx,$p,$num) = ("%rdi","%rsi","%rdx");

# NOTE(review): .cfi_def_cfa_register %rbp is emitted below although no
# "mov %rsp,%rbp" is visible in this prologue - confirm the unwind
# information is what was intended.
$code.=<<___;
.align 16
SHUFF_MASK:
    .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.text

# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
#
# input: $ctx SM3 context
#        $p pointer to the data
#        $num number of blocks
#
.globl ossl_hwsm3_block_data_order
.type ossl_hwsm3_block_data_order,\@function,3
.align 32
ossl_hwsm3_block_data_order:
.cfi_startproc
    endbranch
# Prolog
    push %rbp
.cfi_push %rbp
.cfi_def_cfa_register %rbp
.Lossl_hwsm3_block_data_order_seh_setfp:
___

$code.=<<___ if($win64);
    # xmm6:xmm12 need to be maintained for Windows
    sub \$`7*16`,%rsp
.cfi_adjust_cfa_offset \$`7*16`
    vmovdqu %xmm6, 16*0(%rsp)
    vmovdqu %xmm7, 16*1(%rsp)
    vmovdqu %xmm8, 16*2(%rsp)
    vmovdqu %xmm9, 16*3(%rsp)
    vmovdqu %xmm10,16*4(%rsp)
    vmovdqu %xmm11,16*5(%rsp)
    vmovdqu %xmm12,16*6(%rsp)
___

$code .= <<___;
    # Prolog ends here.
.Lossl_hwsm3_block_data_order_seh_prolog_end:
    or $num, $num
    je .done_hash

    # xmm = D C B A
    # D - most significant word in an `xmm`
    # A - least significant word in an `xmm`
    vmovdqu ($ctx), %xmm6 # xmm6 = D C B A
    vmovdqu 16($ctx), %xmm7 # xmm7 = H G F E

    vpshufd \$0x1B, %xmm6, %xmm0
    vpshufd \$0x1B, %xmm7, %xmm1
    vpunpckhqdq %xmm0, %xmm1, %xmm6
    vpunpcklqdq %xmm0, %xmm1, %xmm7

    vpsrld \$9, %xmm7, %xmm2
    vpslld \$23, %xmm7, %xmm3
    vpxor %xmm3, %xmm2, %xmm1
    vpsrld \$19, %xmm7, %xmm4
    vpslld \$13, %xmm7, %xmm5
    vpxor %xmm5, %xmm4, %xmm0
    # xmm7 = ROL32(C, 23) ROL32(D, 23) ROL32(G, 13) ROL32(H, 13)
    vpblendd \$0x3, %xmm0, %xmm1, %xmm7

    vmovdqa SHUFF_MASK(%rip), %xmm12

.align 32
.block_loop:
    vmovdqa %xmm6, %xmm10
    vmovdqa %xmm7, %xmm11

    # prepare W[0..15] - read and shuffle the data
    vmovdqu ($p), %xmm2
    vmovdqu 16($p), %xmm3
    vmovdqu 32($p), %xmm4
    vmovdqu 48($p), %xmm5
    vpshufb %xmm12, %xmm2, %xmm2 # xmm2 = W03 W02 W01 W00
    vpshufb %xmm12, %xmm3, %xmm3 # xmm3 = W07 W06 W05 W04
    vpshufb %xmm12, %xmm4, %xmm4 # xmm4 = W11 W10 W09 W08
    vpshufb %xmm12, %xmm5, %xmm5 # xmm5 = W15 W14 W13 W12
___

# 64 rounds, four per iteration.  @w is a sliding window over the four
# registers holding the 16 most recent schedule words, oldest first.  For
# rounds 0..48 four new words are produced into %xmm8 before the rounds are
# executed and afterwards moved over the oldest (just consumed) register;
# rounds 52..60 only consume words that were already generated.
{
    my @w = ("%xmm2", "%xmm3", "%xmm4", "%xmm5");

    for (my $round = 0; $round < 64; $round += 4) {
        my $need_schedule = ($round <= 48);

        sm3msg(@w, "%xmm8", "%xmm9", "%xmm1") if ($need_schedule);
        sm3rounds4("%xmm6", "%xmm7", $w[0], $w[1], "%xmm1", $round);
        $code.="vmovdqa %xmm8, $w[0]\n" if ($need_schedule);

        push(@w, shift(@w));    # slide the window by four words
    }
}

$code.=<<___;
    # update hash value
    vpxor %xmm10, %xmm6, %xmm6
    vpxor %xmm11, %xmm7, %xmm7
    addq \$64, $p
    dec $num
    jnz .block_loop

    # store the hash value back in memory
    vpslld \$9, %xmm7, %xmm2
    vpsrld \$23, %xmm7, %xmm3
    vpxor %xmm3, %xmm2, %xmm1
    vpslld \$19, %xmm7, %xmm4
    vpsrld \$13, %xmm7, %xmm5
    vpxor %xmm5, %xmm4, %xmm0
    vpblendd \$0x3, %xmm0, %xmm1, %xmm7

    vpshufd \$0x1B, %xmm6, %xmm0
    vpshufd \$0x1B, %xmm7, %xmm1

    vpunpcklqdq %xmm1, %xmm0, %xmm6
    vpunpckhqdq %xmm1, %xmm0, %xmm7

    vmovdqu %xmm6, ($ctx)
    vmovdqu %xmm7, 16($ctx)
.done_hash:
    # Epilog
___

$code.=<<___ if($win64);
    # xmm6:xmm12 need to be maintained for Windows
    vmovdqu 16*0(%rsp),%xmm6
    vmovdqu 16*1(%rsp),%xmm7
    vmovdqu 16*2(%rsp),%xmm8
    vmovdqu 16*3(%rsp),%xmm9
    vmovdqu 16*4(%rsp),%xmm10
    vmovdqu 16*5(%rsp),%xmm11
    vmovdqu 16*6(%rsp),%xmm12
    add \$`7*16`,%rsp
.cfi_adjust_cfa_offset \$`-7*16`
___

# The .size directive mirrors the fallback stub below, so that the function
# is terminated the same way in both variants.
$code .= <<___;
    pop %rbp
.cfi_pop %rbp
    ret
.cfi_endproc
.size ossl_hwsm3_block_data_order, .-ossl_hwsm3_block_data_order
___
}
} else { # fallback
# The assembler cannot even encode AVX2: emit a stub that traps (ud2).
$code .= <<___;
.text

.globl ossl_hwsm3_block_data_order
.type ossl_hwsm3_block_data_order,\@abi-omnipotent
ossl_hwsm3_block_data_order:
    .byte 0x0f,0x0b # ud2
    ret
.size ossl_hwsm3_block_data_order, .-ossl_hwsm3_block_data_order
___
} # avx2_sm3_ni

if ($avx2_sm3_ni_native > 0) { # SM3 instructions are supported in asm
    # Evaluate the `...` compile-time expressions, then emit the code as is.
    $code =~ s/\`([^\`]*)\`/eval $1/gem;
    print $code;
} else { # binary translation for SM3 instructions

    # Build the 3-byte VEX prefix payload and the ModRM byte shared by all
    # SM3 instructions handled in sm3op (register-register forms only).
    #   $map   opcode map selector (low bits of the first payload byte)
    #   $pp    low bits of the second payload byte
    #   $rm    number of the register encoded in ModRM.rm
    #   $vvvv  number of the register encoded (inverted) in the prefix
    #   $reg   number of the register encoded in ModRM.reg
    # Returns the two prefix payload bytes and the ModRM byte as integers.
    sub _sm3_vex {
        my ($map, $pp, $rm, $vvvv, $reg) = @_;

        my $p1 = 0x40 | $map | ((1-int($rm/8))<<5) | ((1-int($reg/8))<<7);
        my $p2 = $pp | (((15 - $vvvv) & 15)<<3);
        my $modrm = 0xc0 | ($rm & 7) | (($reg & 7)<<3);

        return ($p1, $p2, $modrm);
    }

    # Translate one SM3 instruction into an equivalent ".byte" sequence.
    #   $instr  mnemonic (vsm3msg1, vsm3msg2 or vsm3rnds2)
    #   $args   operand text following the mnemonic; may end with a comment
    # Returns the ".byte ..." line, or "$instr\t$args" (comment removed) when
    # the mnemonic or its operands are not recognised.
    sub sm3op {
        my ($instr, $args) = @_;

        # Drop the comment and its leading whitespace.  The match has to be
        # non-greedy, otherwise the whitespace stays in the captured text and
        # the cut is made at the last '#' instead of the first one.
        if ($args =~ /^(.+?)\s*#/) {
            $args = $1;
        }

        my $xmm3 = qr/%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})\s*,\s*%xmm(\d{1,2})/;

        if (($instr eq "vsm3msg1" || $instr eq "vsm3msg2") && ($args =~ $xmm3)) {
            my ($rm, $vvvv, $reg) = ($1, $2, $3);
            my $pp = ($instr eq "vsm3msg1") ? 0x00 : 0x01;
            my ($p1, $p2, $modrm) = _sm3_vex(0x02, $pp, $rm, $vvvv, $reg);

            return sprintf(".byte 0xc4,0x%02x,0x%02x,0xda,0x%02x",
                           $p1, $p2, $modrm);
        }

        if (($instr eq "vsm3rnds2") &&
            ($args =~ /\$(0x[0-9a-fA-F]+|\d{1,2})\s*,\s*$xmm3/)) {
            # Copy the captures before running any other regex on them.
            my ($imm, $rm, $vvvv, $reg) = ($1, $2, $3, $4);
            # A "0x.." string numifies to 0 in Perl, so a hexadecimal
            # immediate has to be converted explicitly.
            $imm = hex($imm) if ($imm =~ /^0x/i);
            my ($p1, $p2, $modrm) = _sm3_vex(0x03, 0x01, $rm, $vvvv, $reg);

            return sprintf(".byte 0xc4,0x%02x,0x%02x,0xde,0x%02x,0x%02x",
                           $p1, $p2, $modrm, $imm);
        }

        return $instr."\t".$args;
    }

    foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        # Replace every SM3 instruction by its byte encoding.
        s/\b(vsm3[^\s]*)\s+(.*)/sm3op($1,$2)/ge;

        print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";