diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl index 10d267b7ad7f4336cd3e11276161691039ae194a..36af3e075b74404fbbadf1ed2d40e67d3c55a27c 100755 --- a/crypto/arm64cpuid.pl +++ b/crypto/arm64cpuid.pl @@ -80,6 +80,14 @@ _armv8_pmull_probe: ret .size _armv8_pmull_probe,.-_armv8_pmull_probe +.globl _armv8_sm4_probe +.type _armv8_sm4_probe,%function +_armv8_sm4_probe: + AARCH64_VALID_CALL_TARGET + .long 0xcec08400 // sm4e v0.4s, v0.4s + ret +.size _armv8_sm4_probe,.-_armv8_sm4_probe + .globl _armv8_sha512_probe .type _armv8_sha512_probe,%function _armv8_sha512_probe: diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index 8bd73ce8a7ede5c91bc374a0c588a85fb896f357..8522697776a1efbba0a9096ea0245bb52f54d495 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -80,6 +80,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; # define ARMV8_CPUID (1<<7) # define ARMV8_RNG (1<<8) # define ARMV8_SM3 (1<<9) +# define ARMV8_SM4 (1<<10) /* * MIDR_EL1 system register @@ -92,9 +93,13 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; */ # define ARM_CPU_IMP_ARM 0x41 +# define HISI_CPU_IMP 0x48 # define ARM_CPU_PART_CORTEX_A72 0xD08 # define ARM_CPU_PART_N1 0xD0C +# define ARM_CPU_PART_V1 0xD40 +# define ARM_CPU_PART_N2 0xD49 +# define HISI_CPU_PART_KP920 0xD01 # define MIDR_PARTNUM_SHIFT 4 # define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT) diff --git a/crypto/armcap.c b/crypto/armcap.c index 365a48df450477f893be58d7e89f056011efd835..c5aa062767d86dd8354535b46fb961d603928c73 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -53,6 +53,7 @@ void _armv8_sha256_probe(void); void _armv8_pmull_probe(void); # ifdef __aarch64__ void _armv8_sm3_probe(void); +void _armv8_sm4_probe(void); void _armv8_sha512_probe(void); unsigned int _armv8_cpuid_probe(void); # endif @@ -139,6 +140,7 @@ static unsigned long getauxval(unsigned long key) # define HWCAP_CE_SHA256 (1 << 6) # define HWCAP_CPUID (1 << 11) # define HWCAP_CE_SM3 (1 << 18) +# define HWCAP_CE_SM4 (1 << 19) # define HWCAP_CE_SHA512 (1 << 21) # endif @@ -207,6 +209,9 @@ void OPENSSL_cpuid_setup(void) OPENSSL_armcap_P |= ARMV8_SHA256; # ifdef __aarch64__ + if (hwcap & HWCAP_CE_SM4) + OPENSSL_armcap_P |= ARMV8_SM4; + if (hwcap & HWCAP_CE_SHA512) OPENSSL_armcap_P |= ARMV8_SHA512; @@ -254,6 +259,11 @@ void OPENSSL_cpuid_setup(void) OPENSSL_armcap_P |= ARMV8_SHA256; } # if defined(__aarch64__) && !defined(__APPLE__) + if (sigsetjmp(ill_jmp, 1) == 0) { + _armv8_sm4_probe(); + OPENSSL_armcap_P |= ARMV8_SM4; + } + if (sigsetjmp(ill_jmp, 1) == 0) { _armv8_sha512_probe(); OPENSSL_armcap_P |= ARMV8_SHA512; diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c index abd603015c714ce96251ce22e3c2f59d462d5907..c8e8cfe9c9a27de8a7ba5f82e097d87a1c979f56 100644 --- a/crypto/evp/e_sm4.c +++ b/crypto/evp/e_sm4.c @@ -17,92 +17,211 @@ # include # include "crypto/sm4.h" # include "crypto/evp.h" +# include "crypto/sm4_platform.h" # include "evp_local.h" typedef struct { - SM4_KEY ks; + union { + OSSL_UNION_ALIGN; + SM4_KEY ks; + } ks; + block128_f block; + union { + ecb128_f ecb; + cbc128_f cbc; + ctr128_f ctr; + } stream; } EVP_SM4_KEY; +# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \ +static const EVP_CIPHER sm4_##mode = { \ + nid##_##nmode,blocksize,128/8,ivlen, \ + flags|EVP_CIPH_##MODE##_MODE, \ + EVP_ORIG_GLOBAL, \ + sm4_init_key, \ + sm4_##mode##_cipher, \ + NULL, \ + sizeof(EVP_SM4_KEY), \ + NULL,NULL,NULL,NULL }; \ +const EVP_CIPHER *EVP_sm4_##mode(void) \ +{ return &sm4_##mode; } + +#define DEFINE_BLOCK_CIPHERS(nid,flags) \ + 
BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags) + static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv, int enc) { - ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + int mode; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + mode = EVP_CIPHER_CTX_get_mode(ctx); + if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) + && !enc) { +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + HWSM4_set_decrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) HWSM4_decrypt; + dat->stream.cbc = NULL; +# ifdef HWSM4_cbc_encrypt + if (mode == EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; +# endif +# ifdef HWSM4_ecb_encrypt + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; +# endif + } else +#endif +#ifdef VPSM4_CAPABLE + if (VPSM4_CAPABLE) { + vpsm4_set_decrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) vpsm4_decrypt; + dat->stream.cbc = NULL; + if (mode == EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt; + else if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt; + } else +#endif + { + dat->block = (block128_f) ossl_sm4_decrypt; + ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + } + } else +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + HWSM4_set_encrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) HWSM4_encrypt; + dat->stream.cbc = NULL; +# ifdef HWSM4_cbc_encrypt + if (mode == EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; + else +# endif +# ifdef HWSM4_ecb_encrypt + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; + else +# endif +# ifdef HWSM4_ctr32_encrypt_blocks + if (mode == EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks; + else +# endif + (void)0; /* terminate potentially open 'else' */ + } else +#endif +#ifdef VPSM4_CAPABLE + if (VPSM4_CAPABLE) { + vpsm4_set_encrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) vpsm4_encrypt; + dat->stream.cbc = NULL; + if (mode == EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt; + else if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt; + else if (mode == EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks; + } else +#endif + { + dat->block = (block128_f) ossl_sm4_encrypt; + ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + } return 1; } -static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out, - size_t len, const SM4_KEY *key, - unsigned char *ivec, const int enc) +static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - if (enc) - CRYPTO_cbc128_encrypt(in, out, len, key, ivec, - (block128_f)ossl_sm4_encrypt); + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (dat->stream.cbc) + (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv, + EVP_CIPHER_CTX_is_encrypting(ctx)); + else if (EVP_CIPHER_CTX_is_encrypting(ctx)) + CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv, + dat->block); else - CRYPTO_cbc128_decrypt(in, out, len, key, ivec, - (block128_f)ossl_sm4_decrypt); + CRYPTO_cbc128_decrypt(in, 
out, len, &dat->ks, + ctx->iv, dat->block); + return 1; } -static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out, - size_t length, const SM4_KEY *key, - unsigned char *ivec, int *num, const int enc) +static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc, - (block128_f)ossl_sm4_encrypt); + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + int num = EVP_CIPHER_CTX_get_num(ctx); + + CRYPTO_cfb128_encrypt(in, out, len, &dat->ks, + ctx->iv, &num, + EVP_CIPHER_CTX_is_encrypting(ctx), dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + return 1; } -static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, - const SM4_KEY *key, const int enc) +static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - if (enc) - ossl_sm4_encrypt(in, out, key); + size_t bl = EVP_CIPHER_CTX_get_block_size(ctx); + size_t i; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (len < bl) + return 1; + + if (dat->stream.ecb != NULL) + (*dat->stream.ecb) (in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_is_encrypting(ctx)); else - ossl_sm4_decrypt(in, out, key); + for (i = 0, len -= bl; i <= len; i += bl) + (*dat->block) (in + i, out + i, &dat->ks); + + return 1; } -static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out, - size_t length, const SM4_KEY *key, - unsigned char *ivec, int *num) +static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num, - (block128_f)ossl_sm4_encrypt); -} + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + int num = EVP_CIPHER_CTX_get_num(ctx); -IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4, - 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1, - sm4_init_key, 0, 0, 0, 0) + CRYPTO_ofb128_encrypt(in, out, len, &dat->ks, + ctx->iv, &num, dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + return 1; +} static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t len) { int n = EVP_CIPHER_CTX_get_num(ctx); unsigned int num; - EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx); + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); if (n < 0) return 0; num = (unsigned int)n; - CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ctx->iv, - EVP_CIPHER_CTX_buf_noconst(ctx), &num, - (block128_f)ossl_sm4_encrypt); + if (dat->stream.ctr) + CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), + &num, dat->stream.ctr); + else + CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), &num, + dat->block); EVP_CIPHER_CTX_set_num(ctx, num); return 1; } -static const EVP_CIPHER sm4_ctr_mode = { - NID_sm4_ctr, 1, 16, 16, - EVP_CIPH_CTR_MODE, - EVP_ORIG_GLOBAL, - sm4_init_key, - sm4_ctr_cipher, - NULL, - sizeof(EVP_SM4_KEY), - NULL, NULL, NULL, NULL -}; - -const EVP_CIPHER *EVP_sm4_ctr(void) -{ - return &sm4_ctr_mode; -} - +DEFINE_BLOCK_CIPHERS(NID_sm4, 0) #endif diff --git a/crypto/modes/build.info b/crypto/modes/build.info index f3558fa1a4658ed203083afdefda424c43f31d2b..0ee297ced845f80a07a50efc0ddff97f747f001d 100644 --- a/crypto/modes/build.info +++ b/crypto/modes/build.info @@ -49,7 +49,7 @@ IF[{- !$disabled{asm} -}] ENDIF $COMMON=cbc128.c ctr128.c cfb128.c ofb128.c gcm128.c ccm128.c xts128.c \ - wrap128.c $MODESASM + wrap128.c xts128gb.c $MODESASM SOURCE[../../libcrypto]=$COMMON 
\ cts128.c ocb128.c siv128.c SOURCE[../../providers/libfips.a]=$COMMON diff --git a/crypto/modes/xts128gb.c b/crypto/modes/xts128gb.c new file mode 100644 index 0000000000000000000000000000000000000000..021c0597e4bb7bbf1263c6ea957e575e7c60b518 --- /dev/null +++ b/crypto/modes/xts128gb.c @@ -0,0 +1,199 @@ +/* + * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include +#include "internal/endian.h" +#include "crypto/modes.h" + +#ifndef STRICT_ALIGNMENT +# ifdef __GNUC__ +typedef u64 u64_a1 __attribute((__aligned__(1))); +# else +typedef u64 u64_a1; +# endif +#endif + +int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx, + const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc) +{ + DECLARE_IS_ENDIAN; + union { + u64 u[2]; + u32 d[4]; + u8 c[16]; + } tweak, scratch; + unsigned int i; + + if (len < 16) + return -1; + + memcpy(tweak.c, iv, 16); + + (*ctx->block2) (tweak.c, tweak.c, ctx->key2); + + if (!enc && (len % 16)) + len -= 16; + + while (len >= 16) { +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c, inp, 16); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; +#else + scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0]; + scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1]; +#endif + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out, scratch.c, 16); +#else + ((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0]; + ((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1]; +#endif + inp += 16; + out += 16; + len -= 16; + + if (len == 0) + return 0; + + if (IS_LITTLE_ENDIAN) { + u8 res; + u64 hi, lo; +#ifdef BSWAP8 + hi = BSWAP8(tweak.u[0]); + lo = BSWAP8(tweak.u[1]); +#else + u8 *p = tweak.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + res = (u8)lo & 1; + tweak.u[0] = (lo >> 1) | (hi << 63); + tweak.u[1] = hi >> 1; + if (res) + tweak.c[15] ^= 0xe1; +#ifdef BSWAP8 + hi = BSWAP8(tweak.u[0]); + lo = BSWAP8(tweak.u[1]); +#else + p = tweak.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + tweak.u[0] = lo; + tweak.u[1] = hi; + } else { + u8 carry, res; + carry = 0; + for (i = 0; i < 16; ++i) { + res = (tweak.c[i] << 7) & 0x80; + tweak.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff; + carry = res; + } + if (res) + tweak.c[0] ^= 0xe1; + } + } + if (enc) { + for (i = 0; i < len; ++i) { + u8 c = inp[i]; + out[i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out - 16, scratch.c, 16); + } else { + union { + u64 u[2]; + u8 c[16]; + } tweak1; + + if (IS_LITTLE_ENDIAN) { + u8 res; + u64 hi, lo; +#ifdef BSWAP8 + hi = BSWAP8(tweak.u[0]); + lo = BSWAP8(tweak.u[1]); +#else + u8 *p = tweak.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + res = (u8)lo & 1; + tweak1.u[0] = (lo >> 1) | (hi << 63); + tweak1.u[1] = hi >> 1; + if (res) + tweak1.c[15] ^= 0xe1; +#ifdef BSWAP8 + hi = BSWAP8(tweak1.u[0]); + lo = BSWAP8(tweak1.u[1]); +#else + p = 
tweak1.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + tweak1.u[0] = lo; + tweak1.u[1] = hi; + } else { + u8 carry, res; + carry = 0; + for (i = 0; i < 16; ++i) { + res = (tweak.c[i] << 7) & 0x80; + tweak1.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff; + carry = res; + } + if (res) + tweak1.c[0] ^= 0xe1; + } +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c, inp, 16); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; +#else + scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0]; + scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1]; +#endif + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; + + for (i = 0; i < len; ++i) { + u8 c = inp[16 + i]; + out[16 + i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out, scratch.c, 16); +#else + ((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0]; + ((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1]; +#endif + } + + return 0; +} diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl new file mode 100755 index 0000000000000000000000000000000000000000..7358a6e6a2cffc544501000a7fef75c807731d43 --- /dev/null +++ b/crypto/sm4/asm/sm4-armv8.pl @@ -0,0 +1,635 @@ +#! /usr/bin/env perl +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# This module implements support for SM4 hw support on aarch64 +# Oct 2021 +# + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$prefix="sm4_v8"; +my @rks=map("v$_",(0..7)); + +sub rev32() { +my $dst = shift; +my $src = shift; +$code.=<<___; +#ifndef __ARMEB__ + rev32 $dst.16b,$src.16b +#endif +___ +} + +sub enc_blk () { +my $data = shift; +$code.=<<___; + sm4e $data.4s,@rks[0].4s + sm4e $data.4s,@rks[1].4s + sm4e $data.4s,@rks[2].4s + sm4e $data.4s,@rks[3].4s + sm4e $data.4s,@rks[4].4s + sm4e $data.4s,@rks[5].4s + sm4e $data.4s,@rks[6].4s + sm4e $data.4s,@rks[7].4s + rev64 $data.4S,$data.4S + ext $data.16b,$data.16b,$data.16b,#8 +___ +} + +sub enc_4blks () { +my $data0 = shift; +my $data1 = shift; +my $data2 = shift; +my $data3 = shift; +$code.=<<___; + sm4e $data0.4s,@rks[0].4s + sm4e $data1.4s,@rks[0].4s + sm4e $data2.4s,@rks[0].4s + sm4e $data3.4s,@rks[0].4s + + sm4e $data0.4s,@rks[1].4s + sm4e $data1.4s,@rks[1].4s + sm4e $data2.4s,@rks[1].4s + sm4e $data3.4s,@rks[1].4s + + sm4e $data0.4s,@rks[2].4s + sm4e $data1.4s,@rks[2].4s + sm4e $data2.4s,@rks[2].4s + sm4e $data3.4s,@rks[2].4s + + sm4e $data0.4s,@rks[3].4s + sm4e $data1.4s,@rks[3].4s + sm4e $data2.4s,@rks[3].4s + sm4e $data3.4s,@rks[3].4s + + sm4e $data0.4s,@rks[4].4s + sm4e $data1.4s,@rks[4].4s + sm4e $data2.4s,@rks[4].4s + sm4e $data3.4s,@rks[4].4s + + sm4e $data0.4s,@rks[5].4s + sm4e $data1.4s,@rks[5].4s + sm4e $data2.4s,@rks[5].4s + sm4e $data3.4s,@rks[5].4s + + sm4e $data0.4s,@rks[6].4s + sm4e $data1.4s,@rks[6].4s + sm4e $data2.4s,@rks[6].4s + sm4e $data3.4s,@rks[6].4s + + sm4e $data0.4s,@rks[7].4s + rev64 $data0.4S,$data0.4S + sm4e $data1.4s,@rks[7].4s + ext $data0.16b,$data0.16b,$data0.16b,#8 + rev64 $data1.4S,$data1.4S + sm4e $data2.4s,@rks[7].4s + ext $data1.16b,$data1.16b,$data1.16b,#8 + rev64 $data2.4S,$data2.4S + sm4e $data3.4s,@rks[7].4s + ext $data2.16b,$data2.16b,$data2.16b,#8 + rev64 $data3.4S,$data3.4S + ext $data3.16b,$data3.16b,$data3.16b,#8 +___ +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a+crypto +.text +___ + +{{{ +$code.=<<___; +.align 6 +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +my ($tmp)=("x2"); +my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7)); +my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); +my ($fkconst) = ("v24"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + AARCH64_VALID_CALL_TARGET + ld1 {$key0.4s},[$key] + adr $tmp,.Lfk + ld1 {$fkconst.4s},[$tmp] + adr $tmp,.Lck + ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 +___ + &rev32($key0, $key0); +$code.=<<___; + ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] + eor $key0.16b,$key0.16b,$fkconst.16b; + sm4ekey $key0.4S,$key0.4S,$const0.4S + sm4ekey $key1.4S,$key0.4S,$const1.4S + sm4ekey 
$key2.4S,$key1.4S,$const2.4S + sm4ekey $key3.4S,$key2.4S,$const3.4S + sm4ekey $key4.4S,$key3.4S,$const4.4S + st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64 + sm4ekey $key5.4S,$key4.4S,$const5.4S + sm4ekey $key6.4S,$key5.4S,$const6.4S + sm4ekey $key7.4S,$key6.4S,$const7.4S + st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys] + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +my ($tmp)=("x2"); +my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7)); +my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); +my ($fkconst) = ("v24"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + AARCH64_VALID_CALL_TARGET + ld1 {$key0.4s},[$key] + adr $tmp,.Lfk + ld1 {$fkconst.4s},[$tmp] + adr $tmp, .Lck + ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 +___ + &rev32($key0, $key0); +$code.=<<___; + ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] + eor $key0.16b, $key0.16b,$fkconst.16b; + sm4ekey $key0.4S,$key0.4S,$const0.4S + sm4ekey $key1.4S,$key0.4S,$const1.4S + sm4ekey $key2.4S,$key1.4S,$const2.4S + rev64 $key0.4s,$key0.4s + rev64 $key1.4s,$key1.4s + ext $key0.16b,$key0.16b,$key0.16b,#8 + ext $key1.16b,$key1.16b,$key1.16b,#8 + sm4ekey $key3.4S,$key2.4S,$const3.4S + sm4ekey $key4.4S,$key3.4S,$const4.4S + rev64 $key2.4s,$key2.4s + rev64 $key3.4s,$key3.4s + ext $key2.16b,$key2.16b,$key2.16b,#8 + ext $key3.16b,$key3.16b,$key3.16b,#8 + sm4ekey $key5.4S,$key4.4S,$const5.4S + sm4ekey $key6.4S,$key5.4S,$const6.4S + rev64 $key4.4s,$key4.4s + rev64 $key5.4s,$key5.4s + ext $key4.16b,$key4.16b,$key4.16b,#8 + ext $key5.16b,$key5.16b,$key5.16b,#8 + sm4ekey $key7.4S,$key6.4S,$const7.4S + rev64 $key6.4s, $key6.4s + rev64 $key7.4s, $key7.4s + ext $key6.16b,$key6.16b,$key6.16b,#8 + ext $key7.16b,$key7.16b,$key7.16b,#8 + st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64 + st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys] + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + +{{{ +sub gen_block () { +my $dir = shift; +my ($inp,$out,$rk)=map("x$_",(0..2)); +my ($data)=("v16"); +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + AARCH64_VALID_CALL_TARGET + ld1 {$data.4s},[$inp] + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +___ + &rev32($data,$data); + &enc_blk($data); + &rev32($data,$data); +$code.=<<___; + st1 {$data.4s},[$out] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} + +&gen_block("en"); +&gen_block("de"); +}}} + +{{{ +my ($inp,$out,$len,$rk)=map("x$_",(0..3)); +my ($enc) = ("w4"); +my @dat=map("v$_",(16..23)); +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + AARCH64_VALID_CALL_TARGET + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +1: + cmp $len,#64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 + cmp $len,#128 + b.lt 2f + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 + // 8 blocks +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); + 
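# sm4e consumes 32-bit words in SM4's native big-endian order, so the
+ # rev32 bracketing around the rounds is byte-order glue; rev32() emits
+ # nothing on big-endian (__ARMEB__) builds.
+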
&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); +$code.=<<___; + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 +___ + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,#128 + b.gt 1b + ret + // 4 blocks +2: +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.gt 1b +1: + subs $len,$len,#16 + b.lt 1f + ld1 {@dat[0].4s},[$inp],#16 +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + st1 {@dat[0].4s},[$out],#16 + b.ne 1b +1: + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); +my ($enc) = ("w5"); +my @dat=map("v$_",(16..23)); +my @in=map("v$_",(24..31)); +my ($ivec) = ("v8"); +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! + + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] + ld1 {$ivec.4s},[$ivp] + cmp $enc,#0 + b.eq .Ldec +1: + cmp $len, #64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 + eor @dat[0].16b,@dat[0].16b,$ivec.16b +___ + &rev32(@dat[1],@dat[1]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &enc_blk(@dat[0]); +$code.=<<___; + eor @dat[1].16b,@dat[1].16b,@dat[0].16b +___ + &enc_blk(@dat[1]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[2].16b,@dat[2].16b,@dat[1].16b +___ + &enc_blk(@dat[2]); + &rev32(@dat[1],@dat[1]); +$code.=<<___; + eor @dat[3].16b,@dat[3].16b,@dat[2].16b +___ + &enc_blk(@dat[3]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + mov $ivec.16b,@dat[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.ne 1b +1: + subs $len,$len,#16 + b.lt 3f + ld1 {@dat[0].4s},[$inp],#16 + eor $ivec.16b,$ivec.16b,@dat[0].16b +___ + &rev32($ivec,$ivec); + &enc_blk($ivec); + &rev32($ivec,$ivec); +$code.=<<___; + st1 {$ivec.16b},[$out],#16 + b.ne 1b + b 3f +.Ldec: +1: + cmp $len, #64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp] + ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 + cmp $len,#128 + b.lt 2f + // 8 blocks mode + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp] + ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],$dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],$dat[7]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor 
@dat[0].16b,@dat[0].16b,$ivec.16b + eor @dat[1].16b,@dat[1].16b,@in[0].16b + eor @dat[2].16b,@dat[2].16b,@in[1].16b + mov $ivec.16b,@in[7].16b + eor @dat[3].16b,$dat[3].16b,@in[2].16b + eor @dat[4].16b,$dat[4].16b,@in[3].16b + eor @dat[5].16b,$dat[5].16b,@in[4].16b + eor @dat[6].16b,$dat[6].16b,@in[5].16b + eor @dat[7].16b,$dat[7].16b,@in[6].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,128 + b.gt 1b + b 3f + // 4 blocks mode +2: +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],$dat[3]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + eor @dat[1].16b,@dat[1].16b,@in[0].16b + mov $ivec.16b,@in[3].16b + eor @dat[2].16b,@dat[2].16b,@in[1].16b + eor @dat[3].16b,$dat[3].16b,@in[2].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.gt 1b +1: + subs $len,$len,#16 + b.lt 3f + ld1 {@dat[0].4s},[$inp],#16 + mov @in[0].16b,@dat[0].16b +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + mov $ivec.16b,@in[0].16b + st1 {@dat[0].16b},[$out],#16 + b.ne 1b +3: + // save back IV + st1 {$ivec.16b},[$ivp] + ldp d8,d9,[sp],#16 + ret +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} + +{{{ +my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); +my ($ctr)=("w5"); +my @dat=map("v$_",(16..23)); +my @in=map("v$_",(24..31)); +my ($ivec)=("v8"); +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! 
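+ // v8 holds the counter vector and overlaps callee-saved d8 (saved above);
+ // only the low 32 bits (lane 3) of the counter are incremented here,
+ // matching the CRYPTO_ctr128_encrypt_ctr32() contract at the EVP layer.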
+ + ld1 {$ivec.4s},[$ivp] + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +___ + &rev32($ivec,$ivec); +$code.=<<___; + mov $ctr,$ivec.s[3] +1: + cmp $len,#4 + b.lt 1f + ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 + mov @dat[0].16b,$ivec.16b + mov @dat[1].16b,$ivec.16b + mov @dat[2].16b,$ivec.16b + mov @dat[3].16b,$ivec.16b + add $ctr,$ctr,#1 + mov $dat[1].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[2].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[3].s[3],$ctr + cmp $len,#8 + b.lt 2f + ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 + mov @dat[4].16b,$ivec.16b + mov @dat[5].16b,$ivec.16b + mov @dat[6].16b,$ivec.16b + mov @dat[7].16b,$ivec.16b + add $ctr,$ctr,#1 + mov $dat[4].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[5].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[6].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[7].s[3],$ctr +___ + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,@in[0].16b + eor @dat[1].16b,@dat[1].16b,@in[1].16b + eor @dat[2].16b,@dat[2].16b,@in[2].16b + eor @dat[3].16b,@dat[3].16b,@in[3].16b + eor @dat[4].16b,@dat[4].16b,@in[4].16b + eor @dat[5].16b,@dat[5].16b,@in[5].16b + eor @dat[6].16b,@dat[6].16b,@in[6].16b + eor @dat[7].16b,@dat[7].16b,@in[7].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,#8 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +2: +___ + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,@in[0].16b + eor @dat[1].16b,@dat[1].16b,@in[1].16b + eor @dat[2].16b,@dat[2].16b,@in[2].16b + eor @dat[3].16b,@dat[3].16b,@in[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#4 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +1: + subs $len,$len,#1 + b.lt 3f + mov $dat[0].16b,$ivec.16b + ld1 {@in[0].4s},[$inp],#16 +___ + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor $dat[0].16b,$dat[0].16b,@in[0].16b + st1 {$dat[0].4s},[$out],#16 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +3: + ldp d8,d9,[sp],#16 + ret +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +######################################## +{ my %opcode = ( + "sm4e" => 0xcec08400, + "sm4ekey" => 0xce60c800); + + sub unsm4 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge; + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl new file mode 100755 index 0000000000000000000000000000000000000000..2bacf9cae06a02f19f74ab359305af46094116f6 --- /dev/null +++ b/crypto/sm4/asm/vpsm4-armv8.pl @@ -0,0 +1,1578 @@ +#! 
/usr/bin/env perl +# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# This module implements SM4 with ASIMD on aarch64 +# +# Feb 2022 +# + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$prefix="vpsm4"; +my @vtmp=map("v$_",(0..3)); +my @qtmp=map("q$_",(0..3)); +my @data=map("v$_",(4..7)); +my @datax=map("v$_",(8..11)); +my ($rk0,$rk1)=("v12","v13"); +my ($rka,$rkb)=("v14","v15"); +my @vtmpx=map("v$_",(12..15)); +my @sbox=map("v$_",(16..31)); +my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); +my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); +my ($xtmp1,$xtmp2)=("x8","x9"); +my ($ptr,$counter)=("x10","w11"); +my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); + +sub rev32() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifndef __AARCH64EB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifndef __AARCH64EB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rev32_armeb() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifdef __AARCH64EB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifdef __AARCH64EB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rbit() { + my $dst = shift; + my $src = shift; + my $std = shift; + + if ($src and ("$src" ne "$dst")) { + if ($std eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } else { +$code.=<<___; + mov $dst.16b,$src.16b +___ + } + } else { + if ($std eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } + } +} + +sub transpose() { + my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; + +$code.=<<___; + zip1 $vt0.4s,$dat0.4s,$dat1.4s + zip2 $vt1.4s,$dat0.4s,$dat1.4s + zip1 $vt2.4s,$dat2.4s,$dat3.4s + zip2 $vt3.4s,$dat2.4s,$dat3.4s + zip1 $dat0.2d,$vt0.2d,$vt2.2d + zip2 $dat1.2d,$vt0.2d,$vt2.2d + zip1 $dat2.2d,$vt1.2d,$vt3.2d + zip2 $dat3.2d,$vt1.2d,$vt3.2d +___ +} + +# sbox operations for 4-lane of words +sub sbox() { + my $dat = shift; + +$code.=<<___; + movi @vtmp[0].16b,#64 + movi @vtmp[1].16b,#128 + movi @vtmp[2].16b,#192 + sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b + sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b + sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b + tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b + tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b + tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b + tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b + add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d + add @vtmp[2].2d,@vtmp[2].2d,$dat.2d + add $dat.2d,@vtmp[0].2d,@vtmp[2].2d + + ushr 
@vtmp[0].4s,$dat.4s,32-2 + sli @vtmp[0].4s,$dat.4s,2 + ushr @vtmp[2].4s,$dat.4s,32-10 + eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b + sli @vtmp[2].4s,$dat.4s,10 + eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b + ushr @vtmp[0].4s,$dat.4s,32-18 + sli @vtmp[0].4s,$dat.4s,18 + ushr @vtmp[2].4s,$dat.4s,32-24 + eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b + sli @vtmp[2].4s,$dat.4s,24 + eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b +___ +} + +# sbox operation for 8-lane of words +sub sbox_double() { + my $dat = shift; + my $datx = shift; + +$code.=<<___; + movi @vtmp[3].16b,#64 + sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b + sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b + sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b + tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b + tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b + tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b + tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b + add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d + add $dat.2d,@vtmp[2].2d,$dat.2d + add $dat.2d,@vtmp[1].2d,$dat.2d + + sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b + sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b + sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b + tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b + tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b + tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b + tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b + add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d + add $datx.2d,@vtmp[2].2d,$datx.2d + add $datx.2d,@vtmp[1].2d,$datx.2d + + ushr @vtmp[0].4s,$dat.4s,32-2 + sli @vtmp[0].4s,$dat.4s,2 + ushr @vtmp[2].4s,$datx.4s,32-2 + eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b + sli @vtmp[2].4s,$datx.4s,2 + + ushr @vtmp[0].4s,$dat.4s,32-10 + eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b + sli @vtmp[0].4s,$dat.4s,10 + ushr @vtmp[2].4s,$datx.4s,32-10 + eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b + sli @vtmp[2].4s,$datx.4s,10 + + ushr @vtmp[0].4s,$dat.4s,32-18 + eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b + sli @vtmp[0].4s,$dat.4s,18 + ushr @vtmp[2].4s,$datx.4s,32-18 + eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b + sli @vtmp[2].4s,$datx.4s,18 + + ushr @vtmp[0].4s,$dat.4s,32-24 + eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b + sli @vtmp[0].4s,$dat.4s,24 + ushr @vtmp[2].4s,$datx.4s,32-24 + eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b + sli @vtmp[2].4s,$datx.4s,24 + eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b +___ +} + +# sbox operation for one single word +sub sbox_1word () { + my $word = shift; + +$code.=<<___; + movi @vtmp[1].16b,#64 + movi @vtmp[2].16b,#128 + movi @vtmp[3].16b,#192 + mov @vtmp[0].s[0],$word + + sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b + sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b + sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b + + tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b + tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b + tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b + tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b + + mov $word,@vtmp[0].s[0] + mov $wtmp0,@vtmp[1].s[0] + mov $wtmp2,@vtmp[2].s[0] + add $wtmp0,$word,$wtmp0 + mov $word,@vtmp[3].s[0] + add $wtmp0,$wtmp0,$wtmp2 + add $wtmp0,$wtmp0,$word + + eor $word,$wtmp0,$wtmp0,ror #32-2 + eor $word,$word,$wtmp0,ror #32-10 + eor 
$word,$word,$wtmp0,ror #32-18 + eor $word,$word,$wtmp0,ror #32-24 +___ +} + +# sm4 for one block of data, in scalar registers word0/word1/word2/word3 +sub sm4_1blk () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor $tmpw,$word2,$word3 + eor $wtmp2,$wtmp0,$word1 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + eor $word0,$word0,$tmpw + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor $tmpw,$word2,$word3 + eor $wtmp2,$word0,$wtmp1 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor $word1,$word1,$tmpw + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor $tmpw,$word0,$word1 + eor $wtmp2,$wtmp0,$word3 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + eor $word2,$word2,$tmpw + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor $tmpw,$word0,$word1 + eor $wtmp2,$word2,$wtmp1 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + eor $word3,$word3,$tmpw +___ +} + +# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 +sub sm4_4blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rk0.16b,@data[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor $rka.16b,$rka.16b,@data[0].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk1.16b + + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rk0.16b,@data[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor $rka.16b,$rka.16b,@data[2].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk1.16b +___ +} + +# sm4 for 8 lanes of data, in neon registers +# data0/data1/data2/data3 datax0/datax1/datax2/datax3 +sub sm4_8blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rkb.16b,@datax[2].16b,@datax[3].16b + eor @vtmp[0].16b,@data[1].16b,$rk0.16b + eor @vtmp[1].16b,@datax[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + eor @datax[0].16b,@datax[0].16b,$rk1.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[0].16b + eor $rkb.16b,$rkb.16b,@datax[0].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk0.16b + eor @datax[1].16b,@datax[1].16b,$rk1.16b + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rkb.16b,@datax[0].16b,@datax[1].16b + eor @vtmp[0].16b,@data[3].16b,$rk0.16b + eor @vtmp[1].16b,@datax[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + eor @datax[2].16b,@datax[2].16b,$rk1.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + 
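// rka/rkb still hold B0^B1 for each four-lane half from the RK2 step;
+ // the eors below fold in B2 to form B0^B1^B2 before the S-box
+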
dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[2].16b + eor $rkb.16b,$rkb.16b,@datax[2].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk0.16b + eor @datax[3].16b,@datax[3].16b,$rk1.16b +___ +} + +sub encrypt_1blk_norev() { + my $dat = shift; + +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 + mov $word0,$dat.s[0] + mov $word1,$dat.s[1] + mov $word2,$dat.s[2] + mov $word3,$dat.s[3] +10: +___ + &sm4_1blk($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b + mov $dat.s[0],$word3 + mov $dat.s[1],$word2 + mov $dat.s[2],$word1 + mov $dat.s[3],$word0 +___ +} + +sub encrypt_1blk() { + my $dat = shift; + + &encrypt_1blk_norev($dat); + &rev32($dat,$dat); +} + +sub encrypt_4blks() { +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 +10: +___ + &sm4_4blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); +} + +sub encrypt_8blks() { +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 +10: +___ + &sm4_8blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); + &rev32(@data[3],@datax[0]); + &rev32(@data[2],@datax[1]); + &rev32(@data[1],@datax[2]); + &rev32(@data[0],@datax[3]); +} + +sub load_sbox () { + my $data = shift; + +$code.=<<___; + adr $ptr,.Lsbox + ld1 {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64 + ld1 {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64 + ld1 {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64 + ld1 {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr] +___ +} + + +sub mov_reg_to_vec() { + my $src0 = shift; + my $src1 = shift; + my $desv = shift; +$code.=<<___; + mov $desv.d[0],$src0 + mov $desv.d[1],$src1 +___ + &rev32_armeb($desv,$desv); +} + +sub mov_vec_to_reg() { + my $srcv = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $des0,$srcv.d[0] + mov $des1,$srcv.d[1] +___ +} + +sub compute_tweak() { + my $src0 = shift; + my $src1 = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $wtmp0,0x87 + extr $xtmp2,$src1,$src1,#32 + extr $des1,$src1,$src0,#63 + and $wtmp1,$wtmp0,$wtmp2,asr#31 + eor $des0,$xtmp1,$src0,lsl#1 +___ +} + +sub compute_tweak_vec() { + my $src = shift; + my $des = shift; + my $std = shift; + &rbit(@vtmp[2],$src,$std); +$code.=<<___; + ldr @qtmp[0], .Lxts_magic + shl $des.16b, @vtmp[2].16b, #1 + ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 + ushr @vtmp[1].16b, @vtmp[1].16b, #7 + mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b + eor $des.16b, $des.16b, @vtmp[1].16b +___ + &rbit($des,$des,$std); +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a +.text + +.type _vpsm4_consts,%object +.align 7 +_vpsm4_consts: +.Lsbox: + .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05 + .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99 + .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62 + .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6 + .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8 + .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35 + .byte 
0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87 + .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E + .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1 + .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3 + .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F + .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51 + .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8 + .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0 + .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84 + .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48 +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197 +.Lshuffles: + .dword 0x0B0A090807060504,0x030201000F0E0D0C +.Lxts_magic: + .dword 0x0101010101010187,0x0101010101010101 + +.size _vpsm4_consts,.-_vpsm4_consts +___ + +{{{ +my ($key,$keys,$enc)=("x0","x1","w2"); +my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); +my ($vkey,$vfk,$vmap)=("v5","v6","v7"); +$code.=<<___; +.type _vpsm4_set_key,%function +.align 4 +_vpsm4_set_key: + AARCH64_VALID_CALL_TARGET + ld1 {$vkey.4s},[$key] +___ + &load_sbox(); + &rev32($vkey,$vkey); +$code.=<<___; + adr $pointer,.Lshuffles + ld1 {$vmap.2d},[$pointer] + adr $pointer,.Lfk + ld1 {$vfk.2d},[$pointer] + eor $vkey.16b,$vkey.16b,$vfk.16b + mov $schedules,#32 + adr $pointer,.Lck + movi @vtmp[0].16b,#64 + cbnz $enc,1f + add $keys,$keys,124 +1: + mov $wtmp,$vkey.s[1] + ldr $roundkey,[$pointer],#4 + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[2] + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[3] + eor $roundkey,$roundkey,$wtmp + // sbox lookup + mov @data[0].s[0],$roundkey + tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b + sub @data[0].16b,@data[0].16b,@vtmp[0].16b + tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b + sub @data[0].16b,@data[0].16b,@vtmp[0].16b + tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b + sub @data[0].16b,@data[0].16b,@vtmp[0].16b + tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b + mov $wtmp,@vtmp[1].s[0] + eor $roundkey,$wtmp,$wtmp,ror #19 + eor $roundkey,$roundkey,$wtmp,ror #9 + mov $wtmp,$vkey.s[0] + eor $roundkey,$roundkey,$wtmp + mov $vkey.s[0],$roundkey + cbz $enc,2f + str $roundkey,[$keys],#4 + b 3f +2: + str $roundkey,[$keys],#-4 +3: + tbl $vkey.16b,{$vkey.16b},$vmap.16b + subs $schedules,$schedules,#1 + b.ne 1b + ret +.size _vpsm4_set_key,.-_vpsm4_set_key +___ +}}} + + +{{{ +$code.=<<___; +.type _vpsm4_enc_4blks,%function +.align 4 +_vpsm4_enc_4blks: + AARCH64_VALID_CALL_TARGET +___ + &encrypt_4blks(); +$code.=<<___; + ret +.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks +___ +}}} + +{{{ +$code.=<<___; +.type _vpsm4_enc_8blks,%function +.align 4 +_vpsm4_enc_8blks: + 
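// in: 8 blocks in v4-v7/v8-v11 (data/datax), round keys at x3 (rks)
+ // out: v0-v3/v4-v7 (vtmp/data); encrypt_8blks does the final rev32
+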
AARCH64_VALID_CALL_TARGET +___ + &encrypt_8blks(); +$code.=<<___; + ret +.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks +___ +}}} + + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + mov w2,1 + bl _vpsm4_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + mov w2,0 + bl _vpsm4_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + +{{{ +sub gen_block () { + my $dir = shift; + my ($inp,$outp,$rk)=map("x$_",(0..2)); + +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + AARCH64_VALID_CALL_TARGET + ld1 {@data[0].4s},[$inp] +___ + &load_sbox(); + &rev32(@data[0],@data[0]); +$code.=<<___; + mov $rks,x2 +___ + &encrypt_1blk(@data[0]); +$code.=<<___; + st1 {@data[0].4s},[$outp] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} + +{{{ +my ($enc) = ("w4"); +my @dat=map("v$_",(16..23)); + +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + AARCH64_SIGN_LINK_REGISTER + // convert length into blocks + lsr x2,x2,4 + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] +___ + &load_sbox(); +$code.=<<___; +.Lecb_8_blocks_process: + cmp $blocks,#8 + b.lt .Lecb_4_blocks_process + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 + ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); +$code.=<<___; + bl _vpsm4_enc_8blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lecb_8_blocks_process + b 100f +.Lecb_4_blocks_process: + cmp $blocks,#4 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _vpsm4_enc_4blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp] +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + st1 {@data[0].4s},[$outp] + b 100f +1: // process last 2 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 + cmp $blocks,#2 + b.gt 1f +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _vpsm4_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] + b 100f +1: // process 
last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _vpsm4_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +my ($len,$ivp,$enc)=("x2","x4","w5"); +my $ivec0=("v3"); +my $ivec1=("v15"); + +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + lsr $len,$len,4 +___ + &load_sbox(); +$code.=<<___; + cbz $enc,.Ldec + ld1 {$ivec0.4s},[$ivp] +.Lcbc_4_blocks_enc: + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 + eor @data[0].16b,@data[0].16b,$ivec0.16b +___ + &rev32(@data[1],@data[1]); + &rev32(@data[0],@data[0]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &encrypt_1blk_norev(@data[0]); +$code.=<<___; + eor @data[1].16b,@data[1].16b,@data[0].16b +___ + &encrypt_1blk_norev(@data[1]); + &rev32(@data[0],@data[0]); + +$code.=<<___; + eor @data[2].16b,@data[2].16b,@data[1].16b +___ + &encrypt_1blk_norev(@data[2]); + &rev32(@data[1],@data[1]); +$code.=<<___; + eor @data[3].16b,@data[3].16b,@data[2].16b +___ + &encrypt_1blk_norev(@data[3]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + orr $ivec0.16b,@data[3].16b,@data[3].16b + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#4 + b.ne .Lcbc_4_blocks_enc + b 2f +1: + subs $blocks,$blocks,#1 + b.lt 2f + ld1 {@data[0].4s},[$inp],#16 + eor $ivec0.16b,$ivec0.16b,@data[0].16b +___ + &rev32($ivec0,$ivec0); + &encrypt_1blk($ivec0); +$code.=<<___; + st1 {$ivec0.4s},[$outp],#16 + b 1b +2: + // save back IV + st1 {$ivec0.4s},[$ivp] + ret + +.Ldec: + // decryption mode starts + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! 
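+ // the 8-block decrypt path uses v8-v15 (datax/vtmpx), which overlap
+ // the callee-saved d8-d15 registers, hence this full save/restore pair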
+ stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] +.Lcbc_8_blocks_dec: + cmp $blocks,#8 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] + add $ptr,$inp,#64 + ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr] +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],$data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],$datax[3]); +$code.=<<___; + bl _vpsm4_enc_8blks +___ + &transpose(@vtmp,@datax); + &transpose(@data,@datax); +$code.=<<___; + ld1 {$ivec1.4s},[$ivp] + ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 + // note ivec1 and vtmpx[3] are resuing the same register + // care needs to be taken to avoid conflict + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 + eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b + eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b + eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b + // save back IV + st1 {$vtmpx[3].4s}, [$ivp] + eor @data[0].16b,@data[0].16b,$datax[3].16b + eor @data[1].16b,@data[1].16b,@vtmpx[0].16b + eor @data[2].16b,@data[2].16b,@vtmpx[1].16b + eor @data[3].16b,$data[3].16b,@vtmpx[2].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lcbc_8_blocks_dec + b.eq 100f +1: + ld1 {$ivec1.4s},[$ivp] +.Lcbc_4_blocks_dec: + cmp $blocks,#4 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],$data[3]); +$code.=<<___; + bl _vpsm4_enc_4blks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &transpose(@vtmp,@datax); +$code.=<<___; + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b + orr $ivec1.16b,@data[3].16b,@data[3].16b + eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b + eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + subs $blocks,$blocks,#4 + b.gt .Lcbc_4_blocks_dec + // save back IV + st1 {@data[3].4s}, [$ivp] + b 100f +1: // last block + subs $blocks,$blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 + // save back IV + st1 {$data[0].4s}, [$ivp] +___ + &rev32(@datax[0],@data[0]); + &encrypt_1blk(@datax[0]); +$code.=<<___; + eor @datax[0].16b,@datax[0].16b,$ivec1.16b + st1 {@datax[0].4s},[$outp],#16 + b 100f +1: // last two blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp] + add $ptr,$inp,#16 + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16 + subs $blocks,$blocks,1 + b.gt 1f +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _vpsm4_enc_4blks + ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &transpose(@vtmp,@datax); +$code.=<<___; + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save back IV + st1 {@data[1].4s}, [$ivp] + b 100f +1: // last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr] +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _vpsm4_enc_4blks + ld1 
{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 +___ + &transpose(@vtmp,@datax); +$code.=<<___; + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b + eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 + // save back IV + st1 {@data[2].4s}, [$ivp] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} + +{{{ +my ($ivp)=("x4"); +my ($ctr)=("w5"); +my $ivec=("v3"); + +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + ld1 {$ivec.4s},[$ivp] +___ + &rev32($ivec,$ivec); + &load_sbox(); +$code.=<<___; + cmp $blocks,#1 + b.ne 1f + // fast processing for one single block without + // context saving overhead +___ + &encrypt_1blk($ivec); +$code.=<<___; + ld1 {@data[0].4s},[$inp] + eor @data[0].16b,@data[0].16b,$ivec.16b + st1 {@data[0].4s},[$outp] + ret +1: + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] + mov $word0,$ivec.s[0] + mov $word1,$ivec.s[1] + mov $word2,$ivec.s[2] + mov $ctr,$ivec.s[3] +.Lctr32_4_blocks_process: + cmp $blocks,#4 + b.lt 1f + dup @data[0].4s,$word0 + dup @data[1].4s,$word1 + dup @data[2].4s,$word2 + mov @data[3].s[0],$ctr + add $ctr,$ctr,#1 + mov $data[3].s[1],$ctr + add $ctr,$ctr,#1 + mov @data[3].s[2],$ctr + add $ctr,$ctr,#1 + mov @data[3].s[3],$ctr + add $ctr,$ctr,#1 + cmp $blocks,#8 + b.ge .Lctr32_8_blocks_process + bl _vpsm4_enc_4blks + ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + subs $blocks,$blocks,#4 + b.ne .Lctr32_4_blocks_process + b 100f +.Lctr32_8_blocks_process: + dup @datax[0].4s,$word0 + dup @datax[1].4s,$word1 + dup @datax[2].4s,$word2 + mov @datax[3].s[0],$ctr + add $ctr,$ctr,#1 + mov $datax[3].s[1],$ctr + add $ctr,$ctr,#1 + mov @datax[3].s[2],$ctr + add $ctr,$ctr,#1 + mov @datax[3].s[3],$ctr + add $ctr,$ctr,#1 + bl _vpsm4_enc_8blks + ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 + ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + eor @data[0].16b,@data[0].16b,@datax[0].16b + eor @data[1].16b,@data[1].16b,@datax[1].16b + eor @data[2].16b,@data[2].16b,@datax[2].16b + eor @data[3].16b,@data[3].16b,@datax[3].16b + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.ne .Lctr32_4_blocks_process + b 100f +1: // last block processing + subs $blocks,$blocks,#1 + b.lt 100f + b.gt 1f + mov $ivec.s[0],$word0 + mov $ivec.s[1],$word1 + mov $ivec.s[2],$word2 + mov $ivec.s[3],$ctr +___ + &encrypt_1blk($ivec); +$code.=<<___; + ld1 {@data[0].4s},[$inp] + eor @data[0].16b,@data[0].16b,$ivec.16b + st1 {@data[0].4s},[$outp] + b 100f +1: // last 2 blocks processing + dup @data[0].4s,$word0 + dup 
@data[1].4s,$word1 + dup @data[2].4s,$word2 + mov @data[3].s[0],$ctr + add $ctr,$ctr,#1 + mov @data[3].s[1],$ctr + subs $blocks,$blocks,#1 + b.ne 1f + bl _vpsm4_enc_4blks + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 + b 100f +1: // last 3 blocks processing + add $ctr,$ctr,#1 + mov @data[3].s[2],$ctr + bl _vpsm4_enc_4blks + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16 +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} + +{{{ +my ($blocks,$len)=("x2","x2"); +my $ivp=("x5"); +my @twx=map("x$_",(12..27)); +my ($rks1,$rks2)=("x26","x27"); +my $lastBlk=("x26"); +my $enc=("w28"); +my $remain=("x29"); + +my @tweak=@datax; + +sub gen_xts_cipher() { + my $std = shift; +$code.=<<___; +.globl ${prefix}_xts_encrypt${std} +.type ${prefix}_xts_encrypt${std},%function +.align 5 +${prefix}_xts_encrypt${std}: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! 
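+	// note: the tweak chain below is kept in the GPR pairs x12-x27; each compute_tweak step is a multiply-by-x in GF(2^128) with the XTS feedback constant 0x87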
+	mov	$rks1,x3 +	mov	$rks2,x4 +	mov	$enc,w6 +	ld1	{@tweak[0].4s}, [$ivp] +	mov	$rks,$rks2 +___ +	&load_sbox(); +	&rev32(@tweak[0],@tweak[0]); +	&encrypt_1blk(@tweak[0]); +$code.=<<___; +	mov	$rks,$rks1 +	and	$remain,$len,#0x0F +	// convert length into blocks +	lsr	$blocks,$len,4 +	cmp	$blocks,#1 +	b.lt	.return${std} + +	cmp	$remain,0 +	// If the encryption/decryption length is a multiple of 16, +	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std} +	b.eq	.xts_encrypt_blocks${std} + +	// If the encryption/decryption length is not a multiple of 16, +	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std} +	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std} +	subs	$blocks,$blocks,#1 +	b.eq	.only_2blks_tweak${std} +.xts_encrypt_blocks${std}: +___ +	&rbit(@tweak[0],@tweak[0],$std); +	&rev32_armeb(@tweak[0],@tweak[0]); +	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); +	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); +	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); +	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); +	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); +	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); +	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); +	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; +.Lxts_8_blocks_process${std}: +	cmp	$blocks,#8 +	b.lt	.Lxts_4_blocks_process${std} +___ +	&mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]); +	&mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]); +	&mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]); +	&mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]); +	&mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]); +	&mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]); +	&mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]); +	&mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]); +$code.=<<___; +	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ +	&rbit(@vtmp[0],@vtmp[0],$std); +	&rbit(@vtmp[1],@vtmp[1],$std); +	&rbit(@vtmp[2],@vtmp[2],$std); +	&rbit(@vtmp[3],@vtmp[3],$std); +$code.=<<___; +	eor	@data[0].16b, @data[0].16b, @vtmp[0].16b +	eor	@data[1].16b, @data[1].16b, @vtmp[1].16b +	eor	@data[2].16b, @data[2].16b, @vtmp[2].16b +	eor	@data[3].16b, @data[3].16b, @vtmp[3].16b +	ld1	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ +	&rbit(@vtmpx[0],@vtmpx[0],$std); +	&rbit(@vtmpx[1],@vtmpx[1],$std); +	&rbit(@vtmpx[2],@vtmpx[2],$std); +	&rbit(@vtmpx[3],@vtmpx[3],$std); +$code.=<<___; +	eor	@datax[0].16b, @datax[0].16b, @vtmpx[0].16b +	eor	@datax[1].16b, @datax[1].16b, @vtmpx[1].16b +	eor	@datax[2].16b, @datax[2].16b, @vtmpx[2].16b +	eor	@datax[3].16b, @datax[3].16b, @vtmpx[3].16b +___ +	&rev32(@data[0],@data[0]); +	&rev32(@data[1],@data[1]); +	&rev32(@data[2],@data[2]); +	&rev32(@data[3],@data[3]); +	&rev32(@datax[0],@datax[0]); +	&rev32(@datax[1],@datax[1]); +	&rev32(@datax[2],@datax[2]); +	&rev32(@datax[3],@datax[3]); +	&transpose(@data,@vtmp); +	&transpose(@datax,@vtmp); +$code.=<<___; +	bl	_${prefix}_enc_8blks +___ +	&transpose(@vtmp,@datax); +	&transpose(@data,@datax); + +	&mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]); +	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); +	&mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]); +	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); +	&mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]); +	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); +	&mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]); +	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); +	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]); +	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); +
&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); + &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); + &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b + + // save the last tweak + st1 {@tweak[3].4s},[$ivp] + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lxts_8_blocks_process${std} + b 100f +.Lxts_4_blocks_process${std}: +___ + &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); + &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); + &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); + &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); +$code.=<<___; + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); + &rbit(@tweak[2],@tweak[2],$std); + &rbit(@tweak[3],@tweak[3],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 +___ + &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]); + &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]); + &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]); +$code.=<<___; + // save the last tweak + st1 {@tweak[3].4s},[$ivp] +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 +___ + &rbit(@tweak[0],@tweak[0],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + st1 {@data[0].4s},[$outp],#16 + // save the last tweak + st1 {@tweak[0].4s},[$ivp] + b 100f +1: // process last 2 blocks + cmp $blocks,#2 + b.gt 1f + ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save the last tweak + st1 {@tweak[1].4s},[$ivp] + b 100f +1: // process last 3 blocks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 
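+	// only three input blocks are loaded here; the fourth lane that the 4-block helper produces is simply never stored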
+___ +	&rbit(@tweak[0],@tweak[0],$std); +	&rbit(@tweak[1],@tweak[1],$std); +	&rbit(@tweak[2],@tweak[2],$std); +$code.=<<___; +	eor	@data[0].16b, @data[0].16b, @tweak[0].16b +	eor	@data[1].16b, @data[1].16b, @tweak[1].16b +	eor	@data[2].16b, @data[2].16b, @tweak[2].16b +___ +	&rev32(@data[0],@data[0]); +	&rev32(@data[1],@data[1]); +	&rev32(@data[2],@data[2]); +	&transpose(@data,@vtmp); +$code.=<<___; +	bl	_${prefix}_enc_4blks +___ +	&transpose(@vtmp,@data); +$code.=<<___; +	eor	@vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b +	eor	@vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b +	eor	@vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b +	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 +	// save the last tweak +	st1	{@tweak[2].4s},[$ivp] +100: +	cmp	$remain,0 +	b.eq	.return${std} + +// This branch calculates the last two tweaks, +// when the encryption/decryption length is larger than 32 +.last_2blks_tweak${std}: +	ld1	{@tweak[0].4s},[$ivp] +___ +	&rev32_armeb(@tweak[0],@tweak[0]); +	&compute_tweak_vec(@tweak[0],@tweak[1],$std); +	&compute_tweak_vec(@tweak[1],@tweak[2],$std); +$code.=<<___; +	b	.check_dec${std} + + +// This branch calculates the last two tweaks, +// when the encryption/decryption length is equal to 32, which only needs two tweaks +.only_2blks_tweak${std}: +	mov	@tweak[1].16b,@tweak[0].16b +___ +	&rev32_armeb(@tweak[1],@tweak[1]); +	&compute_tweak_vec(@tweak[1],@tweak[2],$std); +$code.=<<___; +	b	.check_dec${std} + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. +.check_dec${std}: +	// encryption:1 decryption:0 +	cmp	$enc,1 +	b.eq	.process_last_2blks${std} +	mov	@vtmp[0].16B,@tweak[1].16b +	mov	@tweak[1].16B,@tweak[2].16b +	mov	@tweak[2].16B,@vtmp[0].16b + +.process_last_2blks${std}: +___ +	&rev32_armeb(@tweak[1],@tweak[1]); +	&rev32_armeb(@tweak[2],@tweak[2]); +$code.=<<___; +	ld1	{@data[0].4s},[$inp],#16 +	eor	@data[0].16b, @data[0].16b, @tweak[1].16b +___ +	&rev32(@data[0],@data[0]); +	&encrypt_1blk(@data[0]); +$code.=<<___; +	eor	@data[0].16b, @data[0].16b, @tweak[1].16b +	st1	{@data[0].4s},[$outp],#16 + +	sub	$lastBlk,$outp,16 +	.loop${std}: +	subs	$remain,$remain,1 +	ldrb	$wtmp0,[$lastBlk,$remain] +	ldrb	$wtmp1,[$inp,$remain] +	strb	$wtmp1,[$lastBlk,$remain] +	strb	$wtmp0,[$outp,$remain] +	b.gt	.loop${std} +	ld1	{@data[0].4s}, [$lastBlk] +	eor	@data[0].16b, @data[0].16b, @tweak[2].16b +___ +	&rev32(@data[0],@data[0]); +	&encrypt_1blk(@data[0]); +$code.=<<___; +	eor	@data[0].16b, @data[0].16b, @tweak[2].16b +	st1	{@data[0].4s}, [$lastBlk] +.return${std}: +	ldp	d14, d15, [sp], #0x10 +	ldp	d12, d13, [sp], #0x10 +	ldp	d10, d11, [sp], #0x10 +	ldp	d8, d9, [sp], #0x10 +	ldp	x29, x30, [sp], #0x10 +	ldp	x27, x28, [sp], #0x10 +	ldp	x25, x26, [sp], #0x10 +	ldp	x23, x24, [sp], #0x10 +	ldp	x21, x22, [sp], #0x10 +	ldp	x19, x20, [sp], #0x10 +	ldp	x17, x18, [sp], #0x10 +	ldp	x15, x16, [sp], #0x10 +	AARCH64_VALIDATE_LINK_REGISTER +	ret +.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std} +___ +} # end of gen_xts_cipher +&gen_xts_cipher("_gb"); +&gen_xts_cipher(""); +}}} +######################################## +open SELF,$0; +while(<SELF>) { +	next if (/^#!/); +	last if (!s/^#/\/\// and !/^$/); +	print; +} +close SELF; + +foreach(split("\n",$code)) { +	s/\`([^\`]*)\`/eval($1)/ge; +	print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl new file mode 100644 index
0000000000000000000000000000000000000000..727e0f24e600a0402243075efb4b6d7bef9d0920 --- /dev/null +++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl @@ -0,0 +1,1553 @@ +#! /usr/bin/env perl +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# This module implements SM4 with ASIMD and AESE on AARCH64 +# +# Dec 2022 +# + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$prefix="vpsm4_ex"; +my @vtmp=map("v$_",(0..3)); +my @qtmp=map("q$_",(0..3)); +my @data=map("v$_",(4..7)); +my @datax=map("v$_",(8..11)); +my ($rk0,$rk1)=("v12","v13"); +my ($rka,$rkb)=("v14","v15"); +my @vtmpx=map("v$_",(12..15)); +my ($vtmp4,$vtmp5)=("v24","v25"); +my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31"); +my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31"); + +my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); +my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); +my ($xtmp1,$xtmp2)=("x8","x9"); +my ($ptr,$counter)=("x10","w11"); +my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); + +sub rev32() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifndef __AARCH64EB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifndef __AARCH64EB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rev32_armeb() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifdef __AARCH64EB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifdef __AARCH64EB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rbit() { + my $dst = shift; + my $src = shift; + my $std = shift; + + if ($src and ("$src" ne "$dst")) { + if ($std eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } else { +$code.=<<___; + mov $dst.16b,$src.16b +___ + } + } else { + if ($std eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } + } +} + +sub transpose() { + my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; + +$code.=<<___; + zip1 $vt0.4s,$dat0.4s,$dat1.4s + zip2 $vt1.4s,$dat0.4s,$dat1.4s + zip1 $vt2.4s,$dat2.4s,$dat3.4s + zip2 $vt3.4s,$dat2.4s,$dat3.4s + zip1 $dat0.2d,$vt0.2d,$vt2.2d + zip2 $dat1.2d,$vt0.2d,$vt2.2d + zip1 $dat2.2d,$vt1.2d,$vt3.2d + zip2 $dat3.2d,$vt1.2d,$vt3.2d +___ +} + +# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) +sub mul_matrix() { + my $x = shift; + my $higherMat = shift; + my $lowerMat = shift; + my $tmp = shift; +$code.=<<___; + ushr $tmp.16b, $x.16b, 4 + and $x.16b, $x.16b, $ANDMaskV.16b + tbl $x.16b, {$lowerMat.16b}, $x.16b + tbl $tmp.16b, {$higherMat.16b}, $tmp.16b + eor $x.16b, $x.16b, $tmp.16b +___ +} + +# sbox operations for 
4-lane of words +sub sbox() { +	my $dat = shift; + +$code.=<<___; +	// optimize sbox using AESE instruction +	tbl	@vtmp[0].16b, {$dat.16b}, $MaskV.16b +___ +	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; +	eor	@vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b +	aese	@vtmp[0].16b,@vtmp[1].16b +___ +	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4); +$code.=<<___; +	mov	$dat.16b,@vtmp[0].16b + +	// linear transformation +	ushr	@vtmp[0].4s,$dat.4s,32-2 +	ushr	@vtmp[1].4s,$dat.4s,32-10 +	ushr	@vtmp[2].4s,$dat.4s,32-18 +	ushr	@vtmp[3].4s,$dat.4s,32-24 +	sli	@vtmp[0].4s,$dat.4s,2 +	sli	@vtmp[1].4s,$dat.4s,10 +	sli	@vtmp[2].4s,$dat.4s,18 +	sli	@vtmp[3].4s,$dat.4s,24 +	eor	$vtmp4.16b,@vtmp[0].16b,$dat.16b +	eor	$vtmp4.16b,$vtmp4.16b,$vtmp[1].16b +	eor	$dat.16b,@vtmp[2].16b,@vtmp[3].16b +	eor	$dat.16b,$dat.16b,$vtmp4.16b +___ +} + +# sbox operation for 8-lane of words +sub sbox_double() { +	my $dat = shift; +	my $datx = shift; + +$code.=<<___; +	// optimize sbox using AESE instruction +	tbl	@vtmp[0].16b, {$dat.16b}, $MaskV.16b +	tbl	@vtmp[1].16b, {$datx.16b}, $MaskV.16b +___ +	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); +	&mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; +	eor	$vtmp5.16b, $vtmp5.16b, $vtmp5.16b +	aese	@vtmp[0].16b,$vtmp5.16b +	aese	@vtmp[1].16b,$vtmp5.16b +___ +	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4); +	&mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4); +$code.=<<___; +	mov	$dat.16b,@vtmp[0].16b +	mov	$datx.16b,@vtmp[1].16b + +	// linear transformation +	ushr	@vtmp[0].4s,$dat.4s,32-2 +	ushr	$vtmp5.4s,$datx.4s,32-2 +	ushr	@vtmp[1].4s,$dat.4s,32-10 +	ushr	@vtmp[2].4s,$dat.4s,32-18 +	ushr	@vtmp[3].4s,$dat.4s,32-24 +	sli	@vtmp[0].4s,$dat.4s,2 +	sli	$vtmp5.4s,$datx.4s,2 +	sli	@vtmp[1].4s,$dat.4s,10 +	sli	@vtmp[2].4s,$dat.4s,18 +	sli	@vtmp[3].4s,$dat.4s,24 +	eor	$vtmp4.16b,@vtmp[0].16b,$dat.16b +	eor	$vtmp4.16b,$vtmp4.16b,@vtmp[1].16b +	eor	$dat.16b,@vtmp[2].16b,@vtmp[3].16b +	eor	$dat.16b,$dat.16b,$vtmp4.16b +	ushr	@vtmp[1].4s,$datx.4s,32-10 +	ushr	@vtmp[2].4s,$datx.4s,32-18 +	ushr	@vtmp[3].4s,$datx.4s,32-24 +	sli	@vtmp[1].4s,$datx.4s,10 +	sli	@vtmp[2].4s,$datx.4s,18 +	sli	@vtmp[3].4s,$datx.4s,24 +	eor	$vtmp4.16b,$vtmp5.16b,$datx.16b +	eor	$vtmp4.16b,$vtmp4.16b,@vtmp[1].16b +	eor	$datx.16b,@vtmp[2].16b,@vtmp[3].16b +	eor	$datx.16b,$datx.16b,$vtmp4.16b +___ +} + +# sbox operation for one single word +sub sbox_1word () { +	my $word = shift; + +$code.=<<___; +	mov	@vtmp[3].s[0],$word +	// optimize sbox using AESE instruction +	tbl	@vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b +___ +	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); +$code.=<<___; +	eor	@vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b +	aese	@vtmp[0].16b,@vtmp[1].16b +___ +	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + +	mov	$wtmp0,@vtmp[0].s[0] +	eor	$word,$wtmp0,$wtmp0,ror #32-2 +	eor	$word,$word,$wtmp0,ror #32-10 +	eor	$word,$word,$wtmp0,ror #32-18 +	eor	$word,$word,$wtmp0,ror #32-24 +___ +} + +# sm4 for one block of data, in scalar registers word0/word1/word2/word3 +sub sm4_1blk () { +	my $kptr = shift; + +$code.=<<___; +	ldp	$wtmp0,$wtmp1,[$kptr],8 +	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) +	eor	$tmpw,$word2,$word3 +	eor	$wtmp2,$wtmp0,$word1 +	eor	$tmpw,$tmpw,$wtmp2 +___ +	&sbox_1word($tmpw); +$code.=<<___; +	eor	$word0,$word0,$tmpw +	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) +	eor	$tmpw,$word2,$word3 +	eor	$wtmp2,$word0,$wtmp1 +	eor	$tmpw,$tmpw,$wtmp2 +___ +	&sbox_1word($tmpw); +$code.=<<___; +	ldp	$wtmp0,$wtmp1,[$kptr],8 +	eor
$word1,$word1,$tmpw + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor $tmpw,$word0,$word1 + eor $wtmp2,$wtmp0,$word3 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + eor $word2,$word2,$tmpw + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor $tmpw,$word0,$word1 + eor $wtmp2,$word2,$wtmp1 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + eor $word3,$word3,$tmpw +___ +} + +# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 +sub sm4_4blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rk0.16b,@data[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor $rka.16b,$rka.16b,@data[0].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk1.16b + + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rk0.16b,@data[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor $rka.16b,$rka.16b,@data[2].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk1.16b +___ +} + +# sm4 for 8 lanes of data, in neon registers +# data0/data1/data2/data3 datax0/datax1/datax2/datax3 +sub sm4_8blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rkb.16b,@datax[2].16b,@datax[3].16b + eor @vtmp[0].16b,@data[1].16b,$rk0.16b + eor @vtmp[1].16b,@datax[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + eor @datax[0].16b,@datax[0].16b,$rk1.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[0].16b + eor $rkb.16b,$rkb.16b,@datax[0].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk0.16b + eor @datax[1].16b,@datax[1].16b,$rk1.16b + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rkb.16b,@datax[0].16b,@datax[1].16b + eor @vtmp[0].16b,@data[3].16b,$rk0.16b + eor @vtmp[1].16b,@datax[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + eor @datax[2].16b,@datax[2].16b,$rk1.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[2].16b + eor $rkb.16b,$rkb.16b,@datax[2].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk0.16b + eor @datax[3].16b,@datax[3].16b,$rk1.16b +___ +} + +sub encrypt_1blk_norev() { + my $dat = shift; + +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 + mov $word0,$dat.s[0] + mov $word1,$dat.s[1] + mov $word2,$dat.s[2] + mov $word3,$dat.s[3] +10: +___ + &sm4_1blk($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b + mov $dat.s[0],$word3 + mov 
$dat.s[1],$word2 + mov $dat.s[2],$word1 + mov $dat.s[3],$word0 +___ +} + +sub encrypt_1blk() { + my $dat = shift; + + &encrypt_1blk_norev($dat); + &rev32($dat,$dat); +} + +sub encrypt_4blks() { +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 +10: +___ + &sm4_4blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); +} + +sub encrypt_8blks() { +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 +10: +___ + &sm4_8blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); + &rev32(@data[3],@datax[0]); + &rev32(@data[2],@datax[1]); + &rev32(@data[1],@datax[2]); + &rev32(@data[0],@datax[3]); +} + +sub load_sbox () { + my $data = shift; + +$code.=<<___; + ldr $MaskQ, .Lsbox_magic + ldr $TAHMatQ, .Lsbox_magic+16 + ldr $TALMatQ, .Lsbox_magic+32 + ldr $ATAHMatQ, .Lsbox_magic+48 + ldr $ATALMatQ, .Lsbox_magic+64 + ldr $ANDMaskQ, .Lsbox_magic+80 +___ +} + +sub mov_reg_to_vec() { + my $src0 = shift; + my $src1 = shift; + my $desv = shift; +$code.=<<___; + mov $desv.d[0],$src0 + mov $desv.d[1],$src1 +___ + &rev32_armeb($desv,$desv); +} + +sub mov_vec_to_reg() { + my $srcv = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $des0,$srcv.d[0] + mov $des1,$srcv.d[1] +___ +} + +sub compute_tweak() { + my $src0 = shift; + my $src1 = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $wtmp0,0x87 + extr $xtmp2,$src1,$src1,#32 + extr $des1,$src1,$src0,#63 + and $wtmp1,$wtmp0,$wtmp2,asr#31 + eor $des0,$xtmp1,$src0,lsl#1 +___ +} + +sub compute_tweak_vec() { + my $src = shift; + my $des = shift; + my $std = shift; + &rbit(@vtmp[2],$src,$std); +$code.=<<___; + ldr @qtmp[0], .Lxts_magic + shl $des.16b, @vtmp[2].16b, #1 + ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 + ushr @vtmp[1].16b, @vtmp[1].16b, #7 + mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b + eor $des.16b, $des.16b, @vtmp[1].16b +___ + &rbit($des,$des,$std); +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a+crypto +.text + +.type _${prefix}_consts,%object +.align 7 +_${prefix}_consts: +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197 +.Lshuffles: + .dword 0x0B0A090807060504,0x030201000F0E0D0C +.Lxts_magic: + .dword 0x0101010101010187,0x0101010101010101 +.Lsbox_magic: + .dword 0x0b0e0104070a0d00,0x0306090c0f020508 + .dword 0x62185a2042387a00,0x22581a6002783a40 + .dword 0x15df62a89e54e923,0xc10bb67c4a803df7 + .dword 0xb9aa6b78c1d21300,0x1407c6d56c7fbead + .dword 0x6404462679195b3b,0xe383c1a1fe9edcbc + .dword 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f + +.size _${prefix}_consts,.-_${prefix}_consts +___ + +{{{ +my ($key,$keys,$enc)=("x0","x1","w2"); +my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); +my ($vkey,$vfk,$vmap)=("v5","v6","v7"); +$code.=<<___; +.type _${prefix}_set_key,%function +.align 4 +_${prefix}_set_key: + AARCH64_VALID_CALL_TARGET + ld1 {$vkey.4s},[$key] +___ + &load_sbox(); + &rev32($vkey,$vkey); 
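+# The key is first whitened with the FK constants, then fed through 32 rounds of the SM4 key schedule below; for decryption the round keys are stored in reverse order (starting at byte offset 124 and stepping by -4), since SM4 decryption runs the same rounds with a reversed schedule.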
+$code.=<<___; + adr $pointer,.Lshuffles + ld1 {$vmap.2d},[$pointer] + adr $pointer,.Lfk + ld1 {$vfk.2d},[$pointer] + eor $vkey.16b,$vkey.16b,$vfk.16b + mov $schedules,#32 + adr $pointer,.Lck + movi @vtmp[0].16b,#64 + cbnz $enc,1f + add $keys,$keys,124 +1: + mov $wtmp,$vkey.s[1] + ldr $roundkey,[$pointer],#4 + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[2] + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[3] + eor $roundkey,$roundkey,$wtmp + // optimize sbox using AESE instruction + mov @data[0].s[0],$roundkey + tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + mov $wtmp,@vtmp[0].s[0] + eor $roundkey,$wtmp,$wtmp,ror #19 + eor $roundkey,$roundkey,$wtmp,ror #9 + mov $wtmp,$vkey.s[0] + eor $roundkey,$roundkey,$wtmp + mov $vkey.s[0],$roundkey + cbz $enc,2f + str $roundkey,[$keys],#4 + b 3f +2: + str $roundkey,[$keys],#-4 +3: + tbl $vkey.16b,{$vkey.16b},$vmap.16b + subs $schedules,$schedules,#1 + b.ne 1b + ret +.size _${prefix}_set_key,.-_${prefix}_set_key +___ +}}} + + +{{{ +$code.=<<___; +.type _${prefix}_enc_4blks,%function +.align 4 +_${prefix}_enc_4blks: + AARCH64_VALID_CALL_TARGET +___ + &encrypt_4blks(); +$code.=<<___; + ret +.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks +___ +}}} + +{{{ +$code.=<<___; +.type _${prefix}_enc_8blks,%function +.align 4 +_${prefix}_enc_8blks: + AARCH64_VALID_CALL_TARGET +___ + &encrypt_8blks(); +$code.=<<___; + ret +.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks +___ +}}} + + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + mov w2,1 + bl _${prefix}_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + mov w2,0 + bl _${prefix}_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + +{{{ +sub gen_block () { + my $dir = shift; + my ($inp,$outp,$rk)=map("x$_",(0..2)); + +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + AARCH64_VALID_CALL_TARGET + ld1 {@data[0].4s},[$inp] +___ + &load_sbox(); + &rev32(@data[0],@data[0]); +$code.=<<___; + mov $rks,$rk +___ + &encrypt_1blk(@data[0]); +$code.=<<___; + st1 {@data[0].4s},[$outp] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} + +{{{ +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + AARCH64_SIGN_LINK_REGISTER + // convert length into blocks + lsr x2,x2,4 + stp d8,d9,[sp,#-80]! 
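+	// AAPCS64 callee-saved d8-d15 are spilled because the 8-block helper keeps the second block group in v8-v11 and round-key broadcasts in v12-v15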
+ stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] +___ + &load_sbox(); +$code.=<<___; +.Lecb_8_blocks_process: + cmp $blocks,#8 + b.lt .Lecb_4_blocks_process + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 + ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); +$code.=<<___; + bl _${prefix}_enc_8blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lecb_8_blocks_process + b 100f +.Lecb_4_blocks_process: + cmp $blocks,#4 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp] +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + st1 {@data[0].4s},[$outp] + b 100f +1: // process last 2 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 + cmp $blocks,#2 + b.gt 1f +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] + b 100f +1: // process last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +my ($len,$ivp,$enc)=("x2","x4","w5"); +my $ivec0=("v3"); +my $ivec1=("v15"); + +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + lsr $len,$len,4 +___ + &load_sbox(); +$code.=<<___; + cbz $enc,.Ldec + ld1 {$ivec0.4s},[$ivp] +.Lcbc_4_blocks_enc: + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 + eor @data[0].16b,@data[0].16b,$ivec0.16b +___ + &rev32(@data[1],@data[1]); + &rev32(@data[0],@data[0]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &encrypt_1blk_norev(@data[0]); +$code.=<<___; + eor @data[1].16b,@data[1].16b,@data[0].16b +___ + &encrypt_1blk_norev(@data[1]); + &rev32(@data[0],@data[0]); + +$code.=<<___; + eor @data[2].16b,@data[2].16b,@data[1].16b +___ + &encrypt_1blk_norev(@data[2]); + &rev32(@data[1],@data[1]); +$code.=<<___; + eor @data[3].16b,@data[3].16b,@data[2].16b +___ + &encrypt_1blk_norev(@data[3]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + orr 
$ivec0.16b,@data[3].16b,@data[3].16b +	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 +	subs	$blocks,$blocks,#4 +	b.ne	.Lcbc_4_blocks_enc +	b	2f +1: +	subs	$blocks,$blocks,#1 +	b.lt	2f +	ld1	{@data[0].4s},[$inp],#16 +	eor	$ivec0.16b,$ivec0.16b,@data[0].16b +___ +	&rev32($ivec0,$ivec0); +	&encrypt_1blk($ivec0); +$code.=<<___; +	st1	{$ivec0.4s},[$outp],#16 +	b	1b +2: +	// save back IV +	st1	{$ivec0.4s},[$ivp] +	ret + +.Ldec: +	// decryption mode starts +	AARCH64_SIGN_LINK_REGISTER +	stp	d8,d9,[sp,#-80]! +	stp	d10,d11,[sp,#16] +	stp	d12,d13,[sp,#32] +	stp	d14,d15,[sp,#48] +	stp	x29,x30,[sp,#64] +.Lcbc_8_blocks_dec: +	cmp	$blocks,#8 +	b.lt	1f +	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] +	add	$ptr,$inp,#64 +	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr] +___ +	&rev32(@data[0],@data[0]); +	&rev32(@data[1],@data[1]); +	&rev32(@data[2],@data[2]); +	&rev32(@data[3],$data[3]); +	&rev32(@datax[0],@datax[0]); +	&rev32(@datax[1],@datax[1]); +	&rev32(@datax[2],@datax[2]); +	&rev32(@datax[3],$datax[3]); +$code.=<<___; +	bl	_${prefix}_enc_8blks +___ +	&transpose(@vtmp,@datax); +	&transpose(@data,@datax); +$code.=<<___; +	ld1	{$ivec1.4s},[$ivp] +	ld1	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +	// note ivec1 and vtmpx[3] are reusing the same register +	// care needs to be taken to avoid conflict +	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b +	ld1	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 +	eor	@vtmp[1].16b,@vtmp[1].16b,@datax[0].16b +	eor	@vtmp[2].16b,@vtmp[2].16b,@datax[1].16b +	eor	@vtmp[3].16b,$vtmp[3].16b,@datax[2].16b +	// save back IV +	st1	{$vtmpx[3].4s}, [$ivp] +	eor	@data[0].16b,@data[0].16b,$datax[3].16b +	eor	@data[1].16b,@data[1].16b,@vtmpx[0].16b +	eor	@data[2].16b,@data[2].16b,@vtmpx[1].16b +	eor	@data[3].16b,$data[3].16b,@vtmpx[2].16b +	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 +	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 +	subs	$blocks,$blocks,#8 +	b.gt	.Lcbc_8_blocks_dec +	b.eq	100f +1: +	ld1	{$ivec1.4s},[$ivp] +.Lcbc_4_blocks_dec: +	cmp	$blocks,#4 +	b.lt	1f +	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] +___ +	&rev32(@data[0],@data[0]); +	&rev32(@data[1],@data[1]); +	&rev32(@data[2],@data[2]); +	&rev32(@data[3],$data[3]); +$code.=<<___; +	bl	_${prefix}_enc_4blks +	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ +	&transpose(@vtmp,@datax); +$code.=<<___; +	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b +	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b +	orr	$ivec1.16b,@data[3].16b,@data[3].16b +	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b +	eor	@vtmp[3].16b,$vtmp[3].16b,@data[2].16b +	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 +	subs	$blocks,$blocks,#4 +	b.gt	.Lcbc_4_blocks_dec +	// save back IV +	st1	{@data[3].4s}, [$ivp] +	b	100f +1:	// last block +	subs	$blocks,$blocks,#1 +	b.lt	100f +	b.gt	1f +	ld1	{@data[0].4s},[$inp],#16 +	// save back IV +	st1	{$data[0].4s}, [$ivp] +___ +	&rev32(@datax[0],@data[0]); +	&encrypt_1blk(@datax[0]); +$code.=<<___; +	eor	@datax[0].16b,@datax[0].16b,$ivec1.16b +	st1	{@datax[0].4s},[$outp],#16 +	b	100f +1:	// last two blocks +	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp] +	add	$ptr,$inp,#16 +	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16 +	subs	$blocks,$blocks,1 +	b.gt	1f +___ +	&rev32(@data[0],@data[0]); +	&rev32(@data[1],@data[1]); +	&rev32(@data[2],@data[2]); +	&rev32(@data[3],@data[3]); +$code.=<<___; +	bl	_${prefix}_enc_4blks +
ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &transpose(@vtmp,@datax); +$code.=<<___; + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save back IV + st1 {@data[1].4s}, [$ivp] + b 100f +1: // last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr] +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 +___ + &transpose(@vtmp,@datax); +$code.=<<___; + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b + eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 + // save back IV + st1 {@data[2].4s}, [$ivp] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} + +{{{ +my ($ivp)=("x4"); +my ($ctr)=("w5"); +my $ivec=("v3"); + +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + ld1 {$ivec.4s},[$ivp] +___ + &rev32($ivec,$ivec); + &load_sbox(); +$code.=<<___; + cmp $blocks,#1 + b.ne 1f + // fast processing for one single block without + // context saving overhead +___ + &encrypt_1blk($ivec); +$code.=<<___; + ld1 {@data[0].4s},[$inp] + eor @data[0].16b,@data[0].16b,$ivec.16b + st1 {@data[0].4s},[$outp] + ret +1: + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] + mov $word0,$ivec.s[0] + mov $word1,$ivec.s[1] + mov $word2,$ivec.s[2] + mov $ctr,$ivec.s[3] +.Lctr32_4_blocks_process: + cmp $blocks,#4 + b.lt 1f + dup @data[0].4s,$word0 + dup @data[1].4s,$word1 + dup @data[2].4s,$word2 + mov @data[3].s[0],$ctr + add $ctr,$ctr,#1 + mov $data[3].s[1],$ctr + add $ctr,$ctr,#1 + mov @data[3].s[2],$ctr + add $ctr,$ctr,#1 + mov @data[3].s[3],$ctr + add $ctr,$ctr,#1 + cmp $blocks,#8 + b.ge .Lctr32_8_blocks_process + bl _${prefix}_enc_4blks + ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + subs $blocks,$blocks,#4 + b.ne .Lctr32_4_blocks_process + b 100f +.Lctr32_8_blocks_process: + dup @datax[0].4s,$word0 + dup @datax[1].4s,$word1 + dup @datax[2].4s,$word2 + mov @datax[3].s[0],$ctr + add $ctr,$ctr,#1 + mov $datax[3].s[1],$ctr + add $ctr,$ctr,#1 + mov @datax[3].s[2],$ctr + add $ctr,$ctr,#1 + mov @datax[3].s[3],$ctr + add $ctr,$ctr,#1 + bl _${prefix}_enc_8blks + ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 + ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + eor @data[0].16b,@data[0].16b,@datax[0].16b + eor @data[1].16b,@data[1].16b,@datax[1].16b + eor @data[2].16b,@data[2].16b,@datax[2].16b + eor @data[3].16b,@data[3].16b,@datax[3].16b + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 
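+	// the second st4 writes blocks 4-7, which were XORed with their keystream words just above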
+ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.ne .Lctr32_4_blocks_process + b 100f +1: // last block processing + subs $blocks,$blocks,#1 + b.lt 100f + b.gt 1f + mov $ivec.s[0],$word0 + mov $ivec.s[1],$word1 + mov $ivec.s[2],$word2 + mov $ivec.s[3],$ctr +___ + &encrypt_1blk($ivec); +$code.=<<___; + ld1 {@data[0].4s},[$inp] + eor @data[0].16b,@data[0].16b,$ivec.16b + st1 {@data[0].4s},[$outp] + b 100f +1: // last 2 blocks processing + dup @data[0].4s,$word0 + dup @data[1].4s,$word1 + dup @data[2].4s,$word2 + mov @data[3].s[0],$ctr + add $ctr,$ctr,#1 + mov @data[3].s[1],$ctr + subs $blocks,$blocks,#1 + b.ne 1f + bl _${prefix}_enc_4blks + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 + b 100f +1: // last 3 blocks processing + add $ctr,$ctr,#1 + mov @data[3].s[2],$ctr + bl _${prefix}_enc_4blks + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16 +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} + + +{{{ +my ($blocks,$len)=("x2","x2"); +my $ivp=("x5"); +my @twx=map("x$_",(12..27)); +my ($rks1,$rks2)=("x26","x27"); +my $lastBlk=("x26"); +my $enc=("w28"); +my $remain=("x29"); + +my @tweak=map("v$_",(16..23)); +my $lastTweak=("v25"); + +sub gen_xts_cipher() { + my $std = shift; +$code.=<<___; +.globl ${prefix}_xts_encrypt${std} +.type ${prefix}_xts_encrypt${std},%function +.align 5 +${prefix}_xts_encrypt${std}: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! 
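+	// same frame and register layout as the vpsm4 XTS path: tweak halves in x12-x27, enc/dec flag in w28, tail length in x29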
+	mov	$rks1,x3 +	mov	$rks2,x4 +	mov	$enc,w6 +	ld1	{@tweak[0].4s}, [$ivp] +	mov	$rks,$rks2 +___ +	&load_sbox(); +	&rev32(@tweak[0],@tweak[0]); +	&encrypt_1blk(@tweak[0]); +$code.=<<___; +	mov	$rks,$rks1 +	and	$remain,$len,#0x0F +	// convert length into blocks +	lsr	$blocks,$len,4 +	cmp	$blocks,#1 +	b.lt	.return${std} + +	cmp	$remain,0 +	// If the encryption/decryption length is a multiple of 16, +	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std} +	b.eq	.xts_encrypt_blocks${std} + +	// If the encryption/decryption length is not a multiple of 16, +	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std} +	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std} +	subs	$blocks,$blocks,#1 +	b.eq	.only_2blks_tweak${std} +.xts_encrypt_blocks${std}: +___ +	&rbit(@tweak[0],@tweak[0],$std); +	&rev32_armeb(@tweak[0],@tweak[0]); +	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); +	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); +	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); +	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); +	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); +	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); +	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); +	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; +.Lxts_8_blocks_process${std}: +	cmp	$blocks,#8 +___ +	&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); +	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); +	&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); +	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); +	&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); +	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); +	&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); +	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); +	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); +	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); +	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); +	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); +	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); +	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); +	&mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); +	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; +	b.lt	.Lxts_4_blocks_process${std} +	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ +	&rbit(@tweak[0],@tweak[0],$std); +	&rbit(@tweak[1],@tweak[1],$std); +	&rbit(@tweak[2],@tweak[2],$std); +	&rbit(@tweak[3],@tweak[3],$std); +$code.=<<___; +	eor	@data[0].16b, @data[0].16b, @tweak[0].16b +	eor	@data[1].16b, @data[1].16b, @tweak[1].16b +	eor	@data[2].16b, @data[2].16b, @tweak[2].16b +	eor	@data[3].16b, @data[3].16b, @tweak[3].16b +	ld1	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ +	&rbit(@tweak[4],@tweak[4],$std); +	&rbit(@tweak[5],@tweak[5],$std); +	&rbit(@tweak[6],@tweak[6],$std); +	&rbit(@tweak[7],@tweak[7],$std); +$code.=<<___; +	eor	@datax[0].16b, @datax[0].16b, @tweak[4].16b +	eor	@datax[1].16b, @datax[1].16b, @tweak[5].16b +	eor	@datax[2].16b, @datax[2].16b, @tweak[6].16b +	eor	@datax[3].16b, @datax[3].16b, @tweak[7].16b +___ +	&rev32(@data[0],@data[0]); +	&rev32(@data[1],@data[1]); +	&rev32(@data[2],@data[2]); +	&rev32(@data[3],@data[3]); +	&rev32(@datax[0],@datax[0]); +	&rev32(@datax[1],@datax[1]); +	&rev32(@datax[2],@datax[2]); +	&rev32(@datax[3],@datax[3]); +	&transpose(@data,@vtmp); +	&transpose(@datax,@vtmp); +$code.=<<___; +	bl	_${prefix}_enc_8blks +___ +	&transpose(@vtmp,@datax); +	&transpose(@data,@datax); +$code.=<<___; +	eor	@vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b +	eor
@vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + eor @data[0].16b, @data[0].16b, @tweak[4].16b + eor @data[1].16b, @data[1].16b, @tweak[5].16b + eor @data[2].16b, @data[2].16b, @tweak[6].16b + eor @data[3].16b, @data[3].16b, @tweak[7].16b + + // save the last tweak + mov $lastTweak.16b,@tweak[7].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lxts_8_blocks_process${std} + b 100f +.Lxts_4_blocks_process${std}: + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); + &rbit(@tweak[2],@tweak[2],$std); + &rbit(@tweak[3],@tweak[3],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 + mov @tweak[0].16b,@tweak[4].16b + mov @tweak[1].16b,@tweak[5].16b + mov @tweak[2].16b,@tweak[6].16b + // save the last tweak + mov $lastTweak.16b,@tweak[3].16b +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 +___ + &rbit(@tweak[0],@tweak[0],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + st1 {@data[0].4s},[$outp],#16 + // save the last tweak + mov $lastTweak.16b,@tweak[0].16b + b 100f +1: // process last 2 blocks + cmp $blocks,#2 + b.gt 1f + ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save the last tweak + mov $lastTweak.16b,@tweak[1].16b + b 100f +1: // process last 3 blocks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); + &rbit(@tweak[2],@tweak[2],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, 
@tweak[2].16b +	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 +	// save the last tweak +	mov	$lastTweak.16b,@tweak[2].16b +100: +	cmp	$remain,0 +	b.eq	.return${std} + +// This branch calculates the last two tweaks, +// when the encryption/decryption length is larger than 32 +.last_2blks_tweak${std}: +___ +	&rev32_armeb($lastTweak,$lastTweak); +	&compute_tweak_vec($lastTweak,@tweak[1],$std); +	&compute_tweak_vec(@tweak[1],@tweak[2],$std); +$code.=<<___; +	b	.check_dec${std} + + +// This branch calculates the last two tweaks, +// when the encryption/decryption length is equal to 32, which only needs two tweaks +.only_2blks_tweak${std}: +	mov	@tweak[1].16b,@tweak[0].16b +___ +	&rev32_armeb(@tweak[1],@tweak[1]); +	&compute_tweak_vec(@tweak[1],@tweak[2],$std); +$code.=<<___; +	b	.check_dec${std} + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. +.check_dec${std}: +	// encryption:1 decryption:0 +	cmp	$enc,1 +	b.eq	.process_last_2blks${std} +	mov	@vtmp[0].16B,@tweak[1].16b +	mov	@tweak[1].16B,@tweak[2].16b +	mov	@tweak[2].16B,@vtmp[0].16b + +.process_last_2blks${std}: +___ +	&rev32_armeb(@tweak[1],@tweak[1]); +	&rev32_armeb(@tweak[2],@tweak[2]); +$code.=<<___; +	ld1	{@data[0].4s},[$inp],#16 +	eor	@data[0].16b, @data[0].16b, @tweak[1].16b +___ +	&rev32(@data[0],@data[0]); +	&encrypt_1blk(@data[0]); +$code.=<<___; +	eor	@data[0].16b, @data[0].16b, @tweak[1].16b +	st1	{@data[0].4s},[$outp],#16 + +	sub	$lastBlk,$outp,16 +	.loop${std}: +	subs	$remain,$remain,1 +	ldrb	$wtmp0,[$lastBlk,$remain] +	ldrb	$wtmp1,[$inp,$remain] +	strb	$wtmp1,[$lastBlk,$remain] +	strb	$wtmp0,[$outp,$remain] +	b.gt	.loop${std} +	ld1	{@data[0].4s}, [$lastBlk] +	eor	@data[0].16b, @data[0].16b, @tweak[2].16b +___ +	&rev32(@data[0],@data[0]); +	&encrypt_1blk(@data[0]); +$code.=<<___; +	eor	@data[0].16b, @data[0].16b, @tweak[2].16b +	st1	{@data[0].4s}, [$lastBlk] +.return${std}: +	ldp	d14, d15, [sp], #0x10 +	ldp	d12, d13, [sp], #0x10 +	ldp	d10, d11, [sp], #0x10 +	ldp	d8, d9, [sp], #0x10 +	ldp	x29, x30, [sp], #0x10 +	ldp	x27, x28, [sp], #0x10 +	ldp	x25, x26, [sp], #0x10 +	ldp	x23, x24, [sp], #0x10 +	ldp	x21, x22, [sp], #0x10 +	ldp	x19, x20, [sp], #0x10 +	ldp	x17, x18, [sp], #0x10 +	ldp	x15, x16, [sp], #0x10 +	AARCH64_VALIDATE_LINK_REGISTER +	ret +.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std} +___ +} # end of gen_xts_cipher +&gen_xts_cipher("_gb"); +&gen_xts_cipher(""); +}}} + +######################################## +open SELF,$0; +while(<SELF>) { +	next if (/^#!/); +	last if (!s/^#/\/\// and !/^$/); +	print; +} +close SELF; + +foreach(split("\n",$code)) { +	s/\`([^\`]*)\`/eval($1)/ge; +	print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info index b65a7d149e5860964c4bb7dc46547b3e0d25edff..73ffe5ea0915faf56246ebbb6e882e0155853bac 100644 --- a/crypto/sm4/build.info +++ b/crypto/sm4/build.info @@ -1,4 +1,36 @@ LIBS=../../libcrypto -SOURCE[../../libcrypto]=\ -        sm4.c +IF[{- !$disabled{asm} -}] +  $SM4DEF_aarch64=SM4_ASM VPSM4_ASM +  $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S vpsm4_ex-armv8.S + +  # Now that we have defined all the arch specific variables, use the +  # appropriate one, and define the appropriate macros +  IF[$SM4ASM_{- $target{asm_arch} -}] +    $SM4ASM=$SM4ASM_{- $target{asm_arch} -} +    $SM4DEF=$SM4DEF_{- $target{asm_arch} -} +  ENDIF +ENDIF + +SOURCE[../../libcrypto]= $SM4ASM sm4.c + + +# Implementations are now spread across several libraries, so the defines +# need
diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info index b65a7d149e5860964c4bb7dc46547b3e0d25edff..73ffe5ea0915faf56246ebbb6e882e0155853bac 100644 --- a/crypto/sm4/build.info +++ b/crypto/sm4/build.info @@ -1,4 +1,36 @@ LIBS=../../libcrypto -SOURCE[../../libcrypto]=\ - sm4.c +IF[{- !$disabled{asm} -}] + $SM4DEF_aarch64=SM4_ASM VPSM4_ASM + $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S vpsm4_ex-armv8.S + + # Now that we have defined all the arch specific variables, use the + # appropriate one, and define the appropriate macros + IF[$SM4ASM_{- $target{asm_arch} -}] + $SM4ASM=$SM4ASM_{- $target{asm_arch} -} + $SM4DEF=$SM4DEF_{- $target{asm_arch} -} + ENDIF +ENDIF + +SOURCE[../../libcrypto]=$SM4ASM sm4.c + +# Implementations are now spread across several libraries, so the defines +# need to be applied to all affected libraries and modules. +DEFINE[../../libcrypto]=$SM4DEF +DEFINE[../../providers/libfips.a]=$SM4DEF +DEFINE[../../providers/libdefault.a]=$SM4DEF +# We only need to include the SM4DEF stuff in the legacy provider when it's a +# separate module and it's dynamically linked with libcrypto. Otherwise, it +# already gets everything that the static libcrypto.a has, and doesn't need it +# added again. +IF[{- !$disabled{module} && !$disabled{shared} -}] + DEFINE[../../providers/liblegacy.a]=$SM4DEF +ENDIF + +GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl +GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl +GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl +INCLUDE[sm4-armv8.o]=.. +INCLUDE[vpsm4-armv8.o]=.. +INCLUDE[vpsm4_ex-armv8.o]=.. diff --git a/include/crypto/modes.h b/include/crypto/modes.h index 19f9d85959c59cfce37a015cda5317e795e6cb6d..475b77f9256654517eaa02431e42e686ffc4ac17 100644 --- a/include/crypto/modes.h +++ b/include/crypto/modes.h @@ -148,6 +148,12 @@ struct xts128_context { block128_f block1, block2; }; +/* XTS mode for the SM4 algorithm, as specified by GB/T 17964-2021 */ +int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx, + const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc); + struct ccm128_context { union { u64 u[2]; diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h new file mode 100644 index 0000000000000000000000000000000000000000..a37dc5f6d37ba0a12ab3d44dd1fbb6781077d7bc --- /dev/null +++ b/include/crypto/sm4_platform.h @@ -0,0 +1,111 @@ +/* + * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef OSSL_SM4_PLATFORM_H + # define OSSL_SM4_PLATFORM_H + # pragma once + +# if defined(OPENSSL_CPUID_OBJ) +# if defined(__aarch64__) +# include "arm_arch.h" +extern unsigned int OPENSSL_arm_midr; +static inline int vpsm4_capable(void) +{ + return (OPENSSL_armcap_P & ARMV8_CPUID) && + (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) || + MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)); +} +static inline int vpsm4_ex_capable(void) +{ + return (OPENSSL_armcap_P & ARMV8_CPUID) && + (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920)); +} +# if defined(VPSM4_ASM) +# define VPSM4_CAPABLE vpsm4_capable() +# define VPSM4_EX_CAPABLE vpsm4_ex_capable() +# endif +# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) +# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key +# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key +# define HWSM4_encrypt sm4_v8_encrypt +# define HWSM4_decrypt sm4_v8_decrypt +# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt +# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt +# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks +# endif +# endif /* OPENSSL_CPUID_OBJ */ + +# if defined(HWSM4_CAPABLE) +int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +void HWSM4_encrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void HWSM4_decrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + unsigned char *ivec, 
const int enc); +void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + const int enc); +void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); +# endif /* HWSM4_CAPABLE */ + +# ifdef VPSM4_CAPABLE +int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +void vpsm4_encrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void vpsm4_decrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void vpsm4_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + unsigned char *ivec, const int enc); +void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + const int enc); +void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); +void vpsm4_xts_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char ivec[16], const int enc); +void vpsm4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, + size_t len, const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char ivec[16], const int enc); +# endif /* VPSM4_CAPABLE */ + +# ifdef VPSM4_EX_CAPABLE +int vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +int vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +void vpsm4_ex_encrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void vpsm4_ex_decrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void vpsm4_ex_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + unsigned char *ivec, const int enc); +void vpsm4_ex_ecb_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + const int enc); +void vpsm4_ex_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); +void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char ivec[16], const int enc); +void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, + size_t len, const SM4_KEY *key1, + const SM4_KEY *key2, const unsigned char ivec[16], + const int enc); +# endif /* VPSM4_EX_CAPABLE */ + +#endif /* OSSL_SM4_PLATFORM_H */ diff --git a/include/openssl/core_names.h b/include/openssl/core_names.h index 6bed5a8a670fddd4fc6f23f77e0316b5224582fa..a90971099df082db6d1c87feb5b246b2a5062e2f 100644 --- a/include/openssl/core_names.h +++ b/include/openssl/core_names.h @@ -97,6 +97,7 @@ extern "C" { #define OSSL_CIPHER_PARAM_CTS_MODE "cts_mode" /* utf8_string */ /* For passing the AlgorithmIdentifier parameter in DER form */ #define OSSL_CIPHER_PARAM_ALGORITHM_ID_PARAMS "alg_id_param" /* octet_string */ +#define OSSL_CIPHER_PARAM_XTS_STANDARD "xts_standard" /* utf8_string */ #define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_SEND_FRAGMENT \ "tls1multi_maxsndfrag" /* uint */ diff --git a/providers/defltprov.c b/providers/defltprov.c index ed3f4799e7b4979b91bdd6edefe56532fb0b3f87..ab898d3f44b3410fcb72430d707387f962f61161 100644 --- a/providers/defltprov.c +++ b/providers/defltprov.c @@ -289,11 +289,14 @@ static const 
OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = { ALG(PROV_NAMES_DES_EDE_CFB, ossl_tdes_ede2_cfb_functions), #endif /* OPENSSL_NO_DES */ #ifndef OPENSSL_NO_SM4 + ALG(PROV_NAMES_SM4_GCM, ossl_sm4128gcm_functions), + ALG(PROV_NAMES_SM4_CCM, ossl_sm4128ccm_functions), ALG(PROV_NAMES_SM4_ECB, ossl_sm4128ecb_functions), ALG(PROV_NAMES_SM4_CBC, ossl_sm4128cbc_functions), ALG(PROV_NAMES_SM4_CTR, ossl_sm4128ctr_functions), ALG(PROV_NAMES_SM4_OFB, ossl_sm4128ofb128_functions), ALG(PROV_NAMES_SM4_CFB, ossl_sm4128cfb128_functions), + ALG(PROV_NAMES_SM4_XTS, ossl_sm4128xts_functions), #endif /* OPENSSL_NO_SM4 */ #ifndef OPENSSL_NO_CHACHA ALG(PROV_NAMES_ChaCha20, ossl_chacha20_functions), diff --git a/providers/implementations/ciphers/build.info b/providers/implementations/ciphers/build.info index e4c5f4f051b6f767748303fb8cbd2065c511c976..9f6eacf5e3cc999cbeaa4a8d56dc7d5a62265d27 100644 --- a/providers/implementations/ciphers/build.info +++ b/providers/implementations/ciphers/build.info @@ -105,7 +105,11 @@ ENDIF IF[{- !$disabled{sm4} -}] SOURCE[$SM4_GOAL]=\ - cipher_sm4.c cipher_sm4_hw.c + cipher_sm4.c cipher_sm4_hw.c \ + cipher_sm4_gcm.c cipher_sm4_gcm_hw.c \ + cipher_sm4_ccm.c cipher_sm4_ccm_hw.c \ + cipher_sm4_xts.c cipher_sm4_xts_hw.c + ENDIF IF[{- !$disabled{ocb} -}] diff --git a/providers/implementations/ciphers/cipher_sm4.h b/providers/implementations/ciphers/cipher_sm4.h index f7f833fcb4cf86bd607868970fd34757c8b3015f..01a031a74d799d509b5c84f7dbc09fb0d37a5787 100644 --- a/providers/implementations/ciphers/cipher_sm4.h +++ b/providers/implementations/ciphers/cipher_sm4.h @@ -9,6 +9,7 @@ #include "prov/ciphercommon.h" #include "crypto/sm4.h" +#include "crypto/sm4_platform.h" typedef struct prov_cast_ctx_st { PROV_CIPHER_CTX base; /* Must be first */ diff --git a/providers/implementations/ciphers/cipher_sm4_ccm.c b/providers/implementations/ciphers/cipher_sm4_ccm.c new file mode 100644 index 0000000000000000000000000000000000000000..f0295a5ca28ad8bde3bf589c648cdb179e1a4e15 --- /dev/null +++ b/providers/implementations/ciphers/cipher_sm4_ccm.c @@ -0,0 +1,39 @@ +/* + * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* Dispatch functions for SM4 CCM mode */ + +#include "cipher_sm4_ccm.h" +#include "prov/implementations.h" +#include "prov/providercommon.h" + +static OSSL_FUNC_cipher_freectx_fn sm4_ccm_freectx; + +static void *sm4_ccm_newctx(void *provctx, size_t keybits) +{ + PROV_SM4_CCM_CTX *ctx; + + if (!ossl_prov_is_running()) + return NULL; + + ctx = OPENSSL_zalloc(sizeof(*ctx)); + if (ctx != NULL) + ossl_ccm_initctx(&ctx->base, keybits, ossl_prov_sm4_hw_ccm(keybits)); + return ctx; +} + +static void sm4_ccm_freectx(void *vctx) +{ + PROV_SM4_CCM_CTX *ctx = (PROV_SM4_CCM_CTX *)vctx; + + OPENSSL_clear_free(ctx, sizeof(*ctx)); +} + +/* sm4128ccm functions */ +IMPLEMENT_aead_cipher(sm4, ccm, CCM, AEAD_FLAGS, 128, 8, 96); diff --git a/providers/implementations/ciphers/cipher_sm4_ccm.h b/providers/implementations/ciphers/cipher_sm4_ccm.h new file mode 100644 index 0000000000000000000000000000000000000000..189e71e9e46abaeb7529d5a057011b44b0a1aff6 --- /dev/null +++ b/providers/implementations/ciphers/cipher_sm4_ccm.h @@ -0,0 +1,22 @@ +/* + * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include "crypto/sm4.h" +#include "prov/ciphercommon.h" +#include "prov/ciphercommon_ccm.h" + +typedef struct prov_sm4_ccm_ctx_st { + PROV_CCM_CTX base; /* Must be first */ + union { + OSSL_UNION_ALIGN; + SM4_KEY ks; + } ks; /* SM4 key schedule to use */ +} PROV_SM4_CCM_CTX; + +const PROV_CCM_HW *ossl_prov_sm4_hw_ccm(size_t keylen); diff --git a/providers/implementations/ciphers/cipher_sm4_ccm_hw.c b/providers/implementations/ciphers/cipher_sm4_ccm_hw.c new file mode 100644 index 0000000000000000000000000000000000000000..791daf3e46f3c72950172e7cf15a04fabba63604 --- /dev/null +++ b/providers/implementations/ciphers/cipher_sm4_ccm_hw.c @@ -0,0 +1,41 @@ +/* + * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/*- + * Generic support for SM4 CCM. + */ + +#include "cipher_sm4_ccm.h" + +static int ccm_sm4_initkey(PROV_CCM_CTX *ctx, + const unsigned char *key, size_t keylen) +{ + PROV_SM4_CCM_CTX *actx = (PROV_SM4_CCM_CTX *)ctx; + + ossl_sm4_set_key(key, &actx->ks.ks); + CRYPTO_ccm128_init(&ctx->ccm_ctx, ctx->m, ctx->l, &actx->ks.ks, + (block128_f)ossl_sm4_encrypt); + ctx->str = NULL; + ctx->key_set = 1; + return 1; +} + +static const PROV_CCM_HW ccm_sm4 = { + ccm_sm4_initkey, + ossl_ccm_generic_setiv, + ossl_ccm_generic_setaad, + ossl_ccm_generic_auth_encrypt, + ossl_ccm_generic_auth_decrypt, + ossl_ccm_generic_gettag +}; + +const PROV_CCM_HW *ossl_prov_sm4_hw_ccm(size_t keybits) +{ + return &ccm_sm4; +} diff --git a/providers/implementations/ciphers/cipher_sm4_gcm.c b/providers/implementations/ciphers/cipher_sm4_gcm.c new file mode 100644 index 0000000000000000000000000000000000000000..7a936f00ee55aedfaeef27c22730c26a701821b5 --- /dev/null +++ b/providers/implementations/ciphers/cipher_sm4_gcm.c @@ -0,0 +1,40 @@ +/* + * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* Dispatch functions for SM4 GCM mode */ + +#include "cipher_sm4_gcm.h" +#include "prov/implementations.h" +#include "prov/providercommon.h" + +static OSSL_FUNC_cipher_freectx_fn sm4_gcm_freectx; + +static void *sm4_gcm_newctx(void *provctx, size_t keybits) +{ + PROV_SM4_GCM_CTX *ctx; + + if (!ossl_prov_is_running()) + return NULL; + + ctx = OPENSSL_zalloc(sizeof(*ctx)); + if (ctx != NULL) + ossl_gcm_initctx(provctx, &ctx->base, keybits, + ossl_prov_sm4_hw_gcm(keybits)); + return ctx; +} + +static void sm4_gcm_freectx(void *vctx) +{ + PROV_SM4_GCM_CTX *ctx = (PROV_SM4_GCM_CTX *)vctx; + + OPENSSL_clear_free(ctx, sizeof(*ctx)); +} + +/* ossl_sm4128gcm_functions */ +IMPLEMENT_aead_cipher(sm4, gcm, GCM, AEAD_FLAGS, 128, 8, 96); diff --git a/providers/implementations/ciphers/cipher_sm4_gcm.h b/providers/implementations/ciphers/cipher_sm4_gcm.h new file mode 100644 index 0000000000000000000000000000000000000000..2b6b5f3ece74e87a303ece4689331b26f57a7403 --- /dev/null +++ b/providers/implementations/ciphers/cipher_sm4_gcm.h @@ -0,0 +1,22 @@ +/* + * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include "crypto/sm4.h" +#include "prov/ciphercommon.h" +#include "prov/ciphercommon_gcm.h" + +typedef struct prov_sm4_gcm_ctx_st { + PROV_GCM_CTX base; /* must be first entry in struct */ + union { + OSSL_UNION_ALIGN; + SM4_KEY ks; + } ks; +} PROV_SM4_GCM_CTX; + +const PROV_GCM_HW *ossl_prov_sm4_hw_gcm(size_t keybits); diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c new file mode 100644 index 0000000000000000000000000000000000000000..db7fe0fe2f96b04b7ac0d02dbd78dd4a25673de5 --- /dev/null +++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c @@ -0,0 +1,87 @@ +/* + * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/*- + * Generic support for SM4 GCM. 
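+ *
+ * Key setup below prefers, in order, the SM4 Crypto Extension routines
+ * (HWSM4_*), then the vector SM4 routines (vpsm4_*), and finally the
+ * generic C implementation; when a hardware ctr32 routine exists it is
+ * plugged into ctx->ctr so GCM can use the faster counter path.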
+ */ + +#include "cipher_sm4_gcm.h" +#include "crypto/sm4_platform.h" + +static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key, + size_t keylen) +{ + PROV_SM4_GCM_CTX *actx = (PROV_SM4_GCM_CTX *)ctx; + SM4_KEY *ks = &actx->ks.ks; + + ctx->ks = ks; +# ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + HWSM4_set_encrypt_key(key, ks); + CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) HWSM4_encrypt); +# ifdef HWSM4_ctr32_encrypt_blocks + ctx->ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks; +# else /* HWSM4_ctr32_encrypt_blocks */ + ctx->ctr = (ctr128_f)NULL; +# endif + } else +# endif /* HWSM4_CAPABLE */ +# ifdef VPSM4_CAPABLE + if (VPSM4_CAPABLE) { + vpsm4_set_encrypt_key(key, ks); + CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) vpsm4_encrypt); + ctx->ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks; + } else +# endif /* VPSM4_CAPABLE */ + { + ossl_sm4_set_key(key, ks); + CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt); + ctx->ctr = (ctr128_f)NULL; + } + ctx->key_set = 1; + + return 1; +} + +static int hw_gcm_cipher_update(PROV_GCM_CTX *ctx, const unsigned char *in, + size_t len, unsigned char *out) +{ + if (ctx->enc) { + if (ctx->ctr != NULL) { + if (CRYPTO_gcm128_encrypt_ctr32(&ctx->gcm, in, out, len, ctx->ctr)) + return 0; + } else { + if (CRYPTO_gcm128_encrypt(&ctx->gcm, in, out, len)) + return 0; + } + } else { + if (ctx->ctr != NULL) { + if (CRYPTO_gcm128_decrypt_ctr32(&ctx->gcm, in, out, len, ctx->ctr)) + return 0; + } else { + if (CRYPTO_gcm128_decrypt(&ctx->gcm, in, out, len)) + return 0; + } + } + return 1; +} + +static const PROV_GCM_HW sm4_gcm = { + sm4_gcm_initkey, + ossl_gcm_setiv, + ossl_gcm_aad_update, + hw_gcm_cipher_update, + ossl_gcm_cipher_final, + ossl_gcm_one_shot +}; + +const PROV_GCM_HW *ossl_prov_sm4_hw_gcm(size_t keybits) +{ + return &sm4_gcm; +} diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c index 0db04b1a743b3be8f0e1410369e025e829284ae7..8cabd7826621999b194573c2d9c6e5999315379e 100644 --- a/providers/implementations/ciphers/cipher_sm4_hw.c +++ b/providers/implementations/ciphers/cipher_sm4_hw.c @@ -15,14 +15,107 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, PROV_SM4_CTX *sctx = (PROV_SM4_CTX *)ctx; SM4_KEY *ks = &sctx->ks.ks; - ossl_sm4_set_key(key, ks); ctx->ks = ks; if (ctx->enc || (ctx->mode != EVP_CIPH_ECB_MODE - && ctx->mode != EVP_CIPH_CBC_MODE)) - ctx->block = (block128_f)ossl_sm4_encrypt; - else - ctx->block = (block128_f)ossl_sm4_decrypt; + && ctx->mode != EVP_CIPH_CBC_MODE)) { +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + HWSM4_set_encrypt_key(key, ks); + ctx->block = (block128_f)HWSM4_encrypt; + ctx->stream.cbc = NULL; +#ifdef HWSM4_cbc_encrypt + if (ctx->mode == EVP_CIPH_CBC_MODE) + ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt; + else +#endif +#ifdef HWSM4_ecb_encrypt + if (ctx->mode == EVP_CIPH_ECB_MODE) + ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt; + else +#endif +#ifdef HWSM4_ctr32_encrypt_blocks + if (ctx->mode == EVP_CIPH_CTR_MODE) + ctx->stream.ctr = (ctr128_f)HWSM4_ctr32_encrypt_blocks; + else +#endif + (void)0; /* terminate potentially open 'else' */ + } else +#endif +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_encrypt_key(key, ks); + ctx->block = (block128_f)vpsm4_ex_encrypt; + ctx->stream.cbc = NULL; + if (ctx->mode == EVP_CIPH_CBC_MODE) + ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt; + else if (ctx->mode == EVP_CIPH_ECB_MODE) + ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt; + else if (ctx->mode == 
EVP_CIPH_CTR_MODE) + ctx->stream.ctr = (ctr128_f)vpsm4_ex_ctr32_encrypt_blocks; + } else +#endif +#ifdef VPSM4_CAPABLE + if (VPSM4_CAPABLE) { + vpsm4_set_encrypt_key(key, ks); + ctx->block = (block128_f)vpsm4_encrypt; + ctx->stream.cbc = NULL; + if (ctx->mode == EVP_CIPH_CBC_MODE) + ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt; + else if (ctx->mode == EVP_CIPH_ECB_MODE) + ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt; + else if (ctx->mode == EVP_CIPH_CTR_MODE) + ctx->stream.ctr = (ctr128_f)vpsm4_ctr32_encrypt_blocks; + } else +#endif + { + ossl_sm4_set_key(key, ks); + ctx->block = (block128_f)ossl_sm4_encrypt; + } + } else { +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + HWSM4_set_decrypt_key(key, ks); + ctx->block = (block128_f)HWSM4_decrypt; + ctx->stream.cbc = NULL; +#ifdef HWSM4_cbc_encrypt + if (ctx->mode == EVP_CIPH_CBC_MODE) + ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt; +#endif +#ifdef HWSM4_ecb_encrypt + if (ctx->mode == EVP_CIPH_ECB_MODE) + ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt; +#endif + } else +#endif +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_decrypt_key(key, ks); + ctx->block = (block128_f)vpsm4_ex_decrypt; + ctx->stream.cbc = NULL; + if (ctx->mode == EVP_CIPH_CBC_MODE) + ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt; + else if (ctx->mode == EVP_CIPH_ECB_MODE) + ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt; + } else +#endif +#ifdef VPSM4_CAPABLE + if (VPSM4_CAPABLE) { + vpsm4_set_decrypt_key(key, ks); + ctx->block = (block128_f)vpsm4_decrypt; + ctx->stream.cbc = NULL; + if (ctx->mode == EVP_CIPH_CBC_MODE) + ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt; + else if (ctx->mode == EVP_CIPH_ECB_MODE) + ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt; + } else +#endif + { + ossl_sm4_set_key(key, ks); + ctx->block = (block128_f)ossl_sm4_decrypt; + } + } + return 1; } @@ -31,7 +124,7 @@ IMPLEMENT_CIPHER_HW_COPYCTX(cipher_hw_sm4_copyctx, PROV_SM4_CTX) # define PROV_CIPHER_HW_sm4_mode(mode) \ static const PROV_CIPHER_HW sm4_##mode = { \ cipher_hw_sm4_initkey, \ - ossl_cipher_hw_chunked_##mode, \ + ossl_cipher_hw_generic_##mode, \ cipher_hw_sm4_copyctx \ }; \ const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_##mode(size_t keybits) \ diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c new file mode 100644 index 0000000000000000000000000000000000000000..037055fce8453b2e6ab3b063dfb40e4bb68d943b --- /dev/null +++ b/providers/implementations/ciphers/cipher_sm4_xts.c @@ -0,0 +1,281 @@ + +/* + * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +/* Dispatch functions for SM4 XTS mode */ + +#include <openssl/proverr.h> +#include "cipher_sm4_xts.h" +#include "prov/implementations.h" +#include "prov/providercommon.h" + +#define SM4_XTS_FLAGS PROV_CIPHER_FLAG_CUSTOM_IV +#define SM4_XTS_IV_BITS 128 +#define SM4_XTS_BLOCK_BITS 8 + +/* forward declarations */ +static OSSL_FUNC_cipher_encrypt_init_fn sm4_xts_einit; +static OSSL_FUNC_cipher_decrypt_init_fn sm4_xts_dinit; +static OSSL_FUNC_cipher_update_fn sm4_xts_stream_update; +static OSSL_FUNC_cipher_final_fn sm4_xts_stream_final; +static OSSL_FUNC_cipher_cipher_fn sm4_xts_cipher; +static OSSL_FUNC_cipher_freectx_fn sm4_xts_freectx; +static OSSL_FUNC_cipher_dupctx_fn sm4_xts_dupctx; +static OSSL_FUNC_cipher_set_ctx_params_fn sm4_xts_set_ctx_params; +static OSSL_FUNC_cipher_settable_ctx_params_fn sm4_xts_settable_ctx_params; + +/*- + * Provider dispatch functions + */ +static int sm4_xts_init(void *vctx, const unsigned char *key, size_t keylen, + const unsigned char *iv, size_t ivlen, + const OSSL_PARAM params[], int enc) +{ + PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)vctx; + PROV_CIPHER_CTX *ctx = &xctx->base; + + if (!ossl_prov_is_running()) + return 0; + + ctx->enc = enc; + + if (iv != NULL) { + if (!ossl_cipher_generic_initiv(vctx, iv, ivlen)) + return 0; + } + if (key != NULL) { + if (keylen != ctx->keylen) { + ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_KEY_LENGTH); + return 0; + } + if (!ctx->hw->init(ctx, key, keylen)) + return 0; + } + return sm4_xts_set_ctx_params(xctx, params); +} + +static int sm4_xts_einit(void *vctx, const unsigned char *key, size_t keylen, + const unsigned char *iv, size_t ivlen, + const OSSL_PARAM params[]) +{ + return sm4_xts_init(vctx, key, keylen, iv, ivlen, params, 1); +} + +static int sm4_xts_dinit(void *vctx, const unsigned char *key, size_t keylen, + const unsigned char *iv, size_t ivlen, + const OSSL_PARAM params[]) +{ + return sm4_xts_init(vctx, key, keylen, iv, ivlen, params, 0); +} + +static void *sm4_xts_newctx(void *provctx, unsigned int mode, uint64_t flags, + size_t kbits, size_t blkbits, size_t ivbits) +{ + PROV_SM4_XTS_CTX *ctx = OPENSSL_zalloc(sizeof(*ctx)); + + if (ctx != NULL) { + ossl_cipher_generic_initkey(&ctx->base, kbits, blkbits, ivbits, mode, + flags, ossl_prov_cipher_hw_sm4_xts(kbits), + NULL); + } + return ctx; +} + +static void sm4_xts_freectx(void *vctx) +{ + PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx; + + ossl_cipher_generic_reset_ctx((PROV_CIPHER_CTX *)vctx); + OPENSSL_clear_free(ctx, sizeof(*ctx)); +} + +static void *sm4_xts_dupctx(void *vctx) +{ + PROV_SM4_XTS_CTX *in = (PROV_SM4_XTS_CTX *)vctx; + PROV_SM4_XTS_CTX *ret = NULL; + + if (!ossl_prov_is_running()) + return NULL; + + if (in->xts.key1 != NULL) { + if (in->xts.key1 != &in->ks1) + return NULL; + } + if (in->xts.key2 != NULL) { + if (in->xts.key2 != &in->ks2) + return NULL; + } + ret = OPENSSL_malloc(sizeof(*ret)); + if (ret == NULL) + return NULL; + in->base.hw->copyctx(&ret->base, &in->base); + return ret; +} + +static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl, + size_t outsize, const unsigned char *in, size_t inl) +{ + PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx; + + if (!ossl_prov_is_running() + || ctx->xts.key1 == NULL + || ctx->xts.key2 == NULL + || !ctx->base.iv_set + || out == NULL + || in == NULL + || inl < SM4_BLOCK_SIZE) + return 0; + + /* + * Impose a limit of 2^20 blocks per data unit as specified by + * IEEE 
Std 1619-2018. The earlier and obsolete IEEE Std 1619-2007 + * indicated that this was a SHOULD NOT rather than a MUST NOT. + * NIST SP 800-38E mandates the same limit. + */ + if (inl > XTS_MAX_BLOCKS_PER_DATA_UNIT * SM4_BLOCK_SIZE) { + ERR_raise(ERR_LIB_PROV, PROV_R_XTS_DATA_UNIT_IS_TOO_LARGE); + return 0; + } + if (ctx->xts_standard) { + if (ctx->stream != NULL) + (*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2, + ctx->base.iv, ctx->base.enc); + else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl, + ctx->base.enc)) + return 0; + } else { + if (ctx->stream_gb != NULL) + (*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2, + ctx->base.iv, ctx->base.enc); + else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out, + inl, ctx->base.enc)) + return 0; + } + *outl = inl; + return 1; +} + +static int sm4_xts_stream_update(void *vctx, unsigned char *out, size_t *outl, + size_t outsize, const unsigned char *in, + size_t inl) +{ + PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx; + + if (outsize < inl) { + ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL); + return 0; + } + + if (!sm4_xts_cipher(ctx, out, outl, outsize, in, inl)) { + ERR_raise(ERR_LIB_PROV, PROV_R_CIPHER_OPERATION_FAILED); + return 0; + } + + return 1; +} + +static int sm4_xts_stream_final(void *vctx, unsigned char *out, size_t *outl, + size_t outsize) +{ + if (!ossl_prov_is_running()) + return 0; + *outl = 0; + return 1; +} + +static const OSSL_PARAM sm4_xts_known_settable_ctx_params[] = { + OSSL_PARAM_utf8_string(OSSL_CIPHER_PARAM_XTS_STANDARD, NULL, 0), + OSSL_PARAM_END +}; + +static const OSSL_PARAM *sm4_xts_settable_ctx_params(ossl_unused void *cctx, + ossl_unused void *provctx) +{ + return sm4_xts_known_settable_ctx_params; +} + +static int sm4_xts_set_ctx_params(void *vxctx, const OSSL_PARAM params[]) +{ + PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)vxctx; + const OSSL_PARAM *p; + + if (params == NULL) + return 1; + + /*- + * Sets the XTS standard to use with SM4-XTS algorithm. 
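+ *
+ * A caller-side sketch (illustrative, not part of this patch; cctx is
+ * an EVP_CIPHER_CTX already set up with SM4-XTS):
+ *
+ *   char std_name[] = "GB";
+ *   OSSL_PARAM p[2];
+ *
+ *   p[0] = OSSL_PARAM_construct_utf8_string(OSSL_CIPHER_PARAM_XTS_STANDARD,
+ *                                           std_name, 0);
+ *   p[1] = OSSL_PARAM_construct_end();
+ *   EVP_CIPHER_CTX_set_params(cctx, p);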
+ * + * Must be utf8 string "GB" or "IEEE", + * "GB" means the GB/T 17964-2021 standard + * "IEEE" means the IEEE Std 1619-2007 standard + */ + p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_PARAM_XTS_STANDARD); + + if (p != NULL) { + const char *xts_standard = NULL; + + if (p->data_type != OSSL_PARAM_UTF8_STRING) + return 0; + + if (!OSSL_PARAM_get_utf8_string_ptr(p, &xts_standard)) { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER); + return 0; + } + if (OPENSSL_strcasecmp(xts_standard, "GB") == 0) { + xctx->xts_standard = 0; + } else if (OPENSSL_strcasecmp(xts_standard, "IEEE") == 0) { + xctx->xts_standard = 1; + } else { + ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER); + return 0; + } + } + + return 1; +} + +#define IMPLEMENT_cipher(lcmode, UCMODE, kbits, flags) \ +static OSSL_FUNC_cipher_get_params_fn sm4_##kbits##_##lcmode##_get_params; \ +static int sm4_##kbits##_##lcmode##_get_params(OSSL_PARAM params[]) \ +{ \ + return ossl_cipher_generic_get_params(params, EVP_CIPH_##UCMODE##_MODE, \ + flags, 2 * kbits, SM4_XTS_BLOCK_BITS,\ + SM4_XTS_IV_BITS); \ +} \ +static OSSL_FUNC_cipher_newctx_fn sm4_##kbits##_xts_newctx; \ +static void *sm4_##kbits##_xts_newctx(void *provctx) \ +{ \ + return sm4_xts_newctx(provctx, EVP_CIPH_##UCMODE##_MODE, flags, 2 * kbits, \ + SM4_XTS_BLOCK_BITS, SM4_XTS_IV_BITS); \ +} \ +const OSSL_DISPATCH ossl_sm4##kbits##xts_functions[] = { \ + { OSSL_FUNC_CIPHER_NEWCTX, (void (*)(void))sm4_##kbits##_xts_newctx }, \ + { OSSL_FUNC_CIPHER_ENCRYPT_INIT, (void (*)(void))sm4_xts_einit }, \ + { OSSL_FUNC_CIPHER_DECRYPT_INIT, (void (*)(void))sm4_xts_dinit }, \ + { OSSL_FUNC_CIPHER_UPDATE, (void (*)(void))sm4_xts_stream_update }, \ + { OSSL_FUNC_CIPHER_FINAL, (void (*)(void))sm4_xts_stream_final }, \ + { OSSL_FUNC_CIPHER_CIPHER, (void (*)(void))sm4_xts_cipher }, \ + { OSSL_FUNC_CIPHER_FREECTX, (void (*)(void))sm4_xts_freectx }, \ + { OSSL_FUNC_CIPHER_DUPCTX, (void (*)(void))sm4_xts_dupctx }, \ + { OSSL_FUNC_CIPHER_GET_PARAMS, \ + (void (*)(void))sm4_##kbits##_##lcmode##_get_params }, \ + { OSSL_FUNC_CIPHER_GETTABLE_PARAMS, \ + (void (*)(void))ossl_cipher_generic_gettable_params }, \ + { OSSL_FUNC_CIPHER_GET_CTX_PARAMS, \ + (void (*)(void))ossl_cipher_generic_get_ctx_params }, \ + { OSSL_FUNC_CIPHER_GETTABLE_CTX_PARAMS, \ + (void (*)(void))ossl_cipher_generic_gettable_ctx_params }, \ + { OSSL_FUNC_CIPHER_SET_CTX_PARAMS, \ + (void (*)(void))sm4_xts_set_ctx_params }, \ + { OSSL_FUNC_CIPHER_SETTABLE_CTX_PARAMS, \ + (void (*)(void))sm4_xts_settable_ctx_params }, \ + { 0, NULL } \ +} +/* ossl_sm4128xts_functions */ +IMPLEMENT_cipher(xts, XTS, 128, SM4_XTS_FLAGS); diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h new file mode 100644 index 0000000000000000000000000000000000000000..cfca596979cc81f9929806ddc664598a442a8d8b --- /dev/null +++ b/providers/implementations/ciphers/cipher_sm4_xts.h @@ -0,0 +1,46 @@ +/* + * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include +#include "prov/ciphercommon.h" +#include "crypto/sm4_platform.h" + +PROV_CIPHER_FUNC(void, xts_stream, + (const unsigned char *in, unsigned char *out, size_t len, + const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char iv[16], const int enc)); + +typedef struct prov_sm4_xts_ctx_st { + /* Must be first */ + PROV_CIPHER_CTX base; + + /* SM4 key schedules to use */ + union { + OSSL_UNION_ALIGN; + SM4_KEY ks; + } ks1, ks2; + + /*- + * XTS standard to use with SM4-XTS algorithm + * + * Must be 0 or 1, + * 0 for XTS mode specified by GB/T 17964-2021 + * 1 for XTS mode specified by IEEE Std 1619-2007 + */ + int xts_standard; + + XTS128_CONTEXT xts; + + /* Stream function for XTS mode specified by GB/T 17964-2021 */ + OSSL_xts_stream_fn stream_gb; + /* Stream function for XTS mode specified by IEEE Std 1619-2007 */ + OSSL_xts_stream_fn stream; +} PROV_SM4_XTS_CTX; + +const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_xts(size_t keybits); diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c new file mode 100644 index 0000000000000000000000000000000000000000..67a9923d94084b4d266c4ee6e775b0f7c9c744de --- /dev/null +++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c @@ -0,0 +1,94 @@ +/* + * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include "cipher_sm4_xts.h" + +#define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \ + fn_block_enc, fn_block_dec, \ + fn_stream, fn_stream_gb) { \ + size_t bytes = keylen / 2; \ + \ + if (ctx->enc) { \ + fn_set_enc_key(key, &xctx->ks1.ks); \ + xctx->xts.block1 = (block128_f)fn_block_enc; \ + } else { \ + fn_set_dec_key(key, &xctx->ks1.ks); \ + xctx->xts.block1 = (block128_f)fn_block_dec; \ + } \ + fn_set_enc_key(key + bytes, &xctx->ks2.ks); \ + xctx->xts.block2 = (block128_f)fn_block_enc; \ + xctx->xts.key1 = &xctx->ks1; \ + xctx->xts.key2 = &xctx->ks2; \ + xctx->stream = fn_stream; \ + xctx->stream_gb = fn_stream_gb; \ +} + +static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx, + const unsigned char *key, + size_t keylen) +{ + PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx; + OSSL_xts_stream_fn stream = NULL; + OSSL_xts_stream_fn stream_gb = NULL; +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key, + HWSM4_encrypt, HWSM4_decrypt, stream, stream_gb); + return 1; + } else +#endif /* HWSM4_CAPABLE */ +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + stream = vpsm4_ex_xts_encrypt; + stream_gb = vpsm4_ex_xts_encrypt_gb; + XTS_SET_KEY_FN(vpsm4_ex_set_encrypt_key, vpsm4_ex_set_decrypt_key, + vpsm4_ex_encrypt, vpsm4_ex_decrypt, stream, stream_gb); + return 1; + } else +#endif /* VPSM4_EX_CAPABLE */ +#ifdef VPSM4_CAPABLE + if (VPSM4_CAPABLE) { + stream = vpsm4_xts_encrypt; + stream_gb = vpsm4_xts_encrypt_gb; + XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key, + vpsm4_encrypt, vpsm4_decrypt, stream, stream_gb); + return 1; + } else +#endif /* VPSM4_CAPABLE */ + { + (void)0; + } + { + XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt, + ossl_sm4_decrypt, stream, 
stream_gb); + } + return 1; +} + +static void cipher_hw_sm4_xts_copyctx(PROV_CIPHER_CTX *dst, + const PROV_CIPHER_CTX *src) +{ + PROV_SM4_XTS_CTX *sctx = (PROV_SM4_XTS_CTX *)src; + PROV_SM4_XTS_CTX *dctx = (PROV_SM4_XTS_CTX *)dst; + + *dctx = *sctx; + dctx->xts.key1 = &dctx->ks1.ks; + dctx->xts.key2 = &dctx->ks2.ks; +} + + +static const PROV_CIPHER_HW sm4_generic_xts = { + cipher_hw_sm4_xts_generic_initkey, + NULL, + cipher_hw_sm4_xts_copyctx +}; +const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_xts(size_t keybits) +{ + return &sm4_generic_xts; +} diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h index 3f6dd7ee16b6f4de7041196b321c010fc63c7c71..cfa32ea3cab4db0df280c4aa088261f9f051eed6 100644 --- a/providers/implementations/include/prov/implementations.h +++ b/providers/implementations/include/prov/implementations.h @@ -174,11 +174,14 @@ extern const OSSL_DISPATCH ossl_seed128ofb128_functions[]; extern const OSSL_DISPATCH ossl_seed128cfb128_functions[]; #endif /* OPENSSL_NO_SEED */ #ifndef OPENSSL_NO_SM4 +extern const OSSL_DISPATCH ossl_sm4128gcm_functions[]; +extern const OSSL_DISPATCH ossl_sm4128ccm_functions[]; extern const OSSL_DISPATCH ossl_sm4128ecb_functions[]; extern const OSSL_DISPATCH ossl_sm4128cbc_functions[]; extern const OSSL_DISPATCH ossl_sm4128ctr_functions[]; extern const OSSL_DISPATCH ossl_sm4128ofb128_functions[]; extern const OSSL_DISPATCH ossl_sm4128cfb128_functions[]; +extern const OSSL_DISPATCH ossl_sm4128xts_functions[]; #endif /* OPENSSL_NO_SM4 */ #ifndef OPENSSL_NO_RC5 extern const OSSL_DISPATCH ossl_rc5128ecb_functions[]; diff --git a/providers/implementations/include/prov/names.h b/providers/implementations/include/prov/names.h index e0dbb69a9d8c13579380f0ebe807710deb369798..5192f4f47132848f4cee0906718cde8cdd665f67 100644 --- a/providers/implementations/include/prov/names.h +++ b/providers/implementations/include/prov/names.h @@ -162,6 +162,9 @@ #define PROV_NAMES_SM4_CTR "SM4-CTR:1.2.156.10197.1.104.7" #define PROV_NAMES_SM4_OFB "SM4-OFB:SM4-OFB128:1.2.156.10197.1.104.3" #define PROV_NAMES_SM4_CFB "SM4-CFB:SM4-CFB128:1.2.156.10197.1.104.4" +#define PROV_NAMES_SM4_GCM "SM4-GCM:1.2.156.10197.1.104.8" +#define PROV_NAMES_SM4_CCM "SM4-CCM:1.2.156.10197.1.104.9" +#define PROV_NAMES_SM4_XTS "SM4-XTS:1.2.156.10197.1.104.10" #define PROV_NAMES_ChaCha20 "ChaCha20" #define PROV_NAMES_ChaCha20_Poly1305 "ChaCha20-Poly1305" #define PROV_NAMES_CAST5_ECB "CAST5-ECB" diff --git a/test/recipes/30-test_evp_data/evpciph_sm4.txt b/test/recipes/30-test_evp_data/evpciph_sm4.txt index ec8a45bd3f847d987c28ccce0e0ec0bb24a0df6d..e9a98c9898ecb1bd65c499953d436b8f77ace8c7 100644 --- a/test/recipes/30-test_evp_data/evpciph_sm4.txt +++ b/test/recipes/30-test_evp_data/evpciph_sm4.txt @@ -19,6 +19,18 @@ IV = 0123456789ABCDEFFEDCBA9876543210 Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210 Ciphertext = 2677F46B09C122CC975533105BD4A22AF6125F7275CE552C3A2BBCF533DE8A3B +Cipher = SM4-CBC +Key = 0123456789ABCDEFFEDCBA9876543210 +IV = 0123456789ABCDEFFEDCBA9876543210 +Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210 +Ciphertext = 2677F46B09C122CC975533105BD4A22AF6125F7275CE552C3A2BBCF533DE8A3BFFF5A4F208092C0901BA02D5772977369915E3FA2356C9F4EB6460ECC457E7f8E3CFA3DEEBFE9883E3A48BCF7C4A11AA3EC9E0D317C5D319BE72A5CDDDEC640C + +Cipher = SM4-CBC +Key = 
0123456789ABCDEFFEDCBA9876543210 +IV = 0123456789ABCDEFFEDCBA9876543210 +Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210 +Ciphertext = 2677f46b09c122cc975533105bd4a22af6125f7275ce552c3a2bbcf533de8a3bfff5a4f208092c0901ba02d5772977369915e3fa2356c9f4eb6460ecc457e7f8e3cfa3deebfe9883e3a48bcf7c4a11aa3ec9e0d317c5d319be72a5cdddec640c6fc70bfa3ddaafffdd7c09b2774dcb2cec29f0c6f0b6773e985b3e395e924238505a8f120d9ca84de5c3cf7e45f097b14b3a46c5b1068669982a5c1f5f61be291b984f331d44ffb2758f771672448fc957fa1416c446427a41e25d5524a2418b9d96b2f17582f0f1aa9c204c6807f54f7b6833c5f00856659ddabc245936868c + Cipher = SM4-OFB Key = 0123456789ABCDEFFEDCBA9876543210 IV = 0123456789ABCDEFFEDCBA9876543210 @@ -36,3 +48,23 @@ Key = 0123456789ABCDEFFEDCBA9876543210 IV = 0123456789ABCDEFFEDCBA9876543210 Plaintext = AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFEEEEEEEEEEEEEEEEAAAAAAAAAAAAAAAA Ciphertext = C2B4759E78AC3CF43D0852F4E8D5F9FD7256E8A5FCB65A350EE00630912E44492A0B17E1B85B060D0FBA612D8A95831638B361FD5FFACD942F081485A83CA35D + +Title = SM4 GCM test vectors from RFC8998 + +Cipher = SM4-GCM +Key = 0123456789abcdeffedcba9876543210 +IV = 00001234567800000000abcd +AAD = feedfacedeadbeeffeedfacedeadbeefabaddad2 +Tag = 83de3541e4c2b58177e065a9bf7b62ec +Plaintext = aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbccccccccccccccccddddddddddddddddeeeeeeeeeeeeeeeeffffffffffffffffeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaa +Ciphertext = 17f399f08c67d5ee19d0dc9969c4bb7d5fd46fd3756489069157b282bb200735d82710ca5c22f0ccfa7cbf93d496ac15a56834cbcf98c397b4024a2691233b8d + +Title = SM4 CCM test vectors from RFC8998 + +Cipher = SM4-CCM +Key = 0123456789abcdeffedcba9876543210 +IV = 00001234567800000000abcd +AAD = feedfacedeadbeeffeedfacedeadbeefabaddad2 +Tag = 16842d4fa186f56ab33256971fa110f4 +Plaintext = aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbccccccccccccccccddddddddddddddddeeeeeeeeeeeeeeeeffffffffffffffffeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaa +Ciphertext = 48af93501fa62adbcd414cce6034d895dda1bf8f132f042098661572e7483094fd12e518ce062c98acee28d95df4416bed31a2f04476c18bb40c84a74b97dc5b
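
Taken together, the pieces above give EVP a fetchable "SM4-XTS" cipher whose standard ("GB" or "IEEE") is runtime-selectable through the new OSSL_CIPHER_PARAM_XTS_STANDARD parameter. The fragment below sketches a caller, assuming it is built against a tree with this patch applied; the key, IV and plaintext are arbitrary demonstration bytes, not vectors from this patch, and error handling is reduced to an exit code:

    #include <string.h>
    #include <openssl/evp.h>
    #include <openssl/params.h>
    #include <openssl/core_names.h>

    int main(void)
    {
        unsigned char key[32];           /* SM4-XTS: two 128-bit key halves */
        unsigned char iv[16];            /* the XTS tweak / data-unit number */
        unsigned char pt[32], ct[32];
        int outl = 0, finl = 0, ret = 1;
        char std_name[] = "GB";          /* GB/T 17964-2021 tweak handling */
        OSSL_PARAM params[2];
        EVP_CIPHER *xts = EVP_CIPHER_fetch(NULL, "SM4-XTS", NULL);
        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();

        memset(key, 0x11, 16);           /* demo bytes only; use distinct */
        memset(key + 16, 0x22, 16);      /* halves for the two sub-keys   */
        memset(iv, 0x33, sizeof(iv));
        memset(pt, 0x44, sizeof(pt));

        if (xts == NULL || ctx == NULL)
            goto end;

        /* Select which XTS standard the context follows */
        params[0] = OSSL_PARAM_construct_utf8_string(
                        OSSL_CIPHER_PARAM_XTS_STANDARD, std_name, 0);
        params[1] = OSSL_PARAM_construct_end();

        if (!EVP_EncryptInit_ex2(ctx, xts, key, iv, params))
            goto end;
        if (!EVP_EncryptUpdate(ctx, ct, &outl, pt, sizeof(pt)))
            goto end;
        if (!EVP_EncryptFinal_ex(ctx, ct + outl, &finl))
            goto end;
        ret = 0;                         /* ct holds outl + finl bytes */
     end:
        EVP_CIPHER_CTX_free(ctx);
        EVP_CIPHER_free(xts);
        return ret;
    }

Switching std_name to "IEEE" selects the IEEE Std 1619-2007 tweak handling on the same context; everything else is unchanged.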