From: Wenkai Lin <linwenkai6@hisilicon.com>
This patch implements the SM4 CBC and CTR modes using the CE (Cryptographic Extension) instructions, and includes the necessary logic for mode-specific operations, such as maintaining the initialization vector (IV) and handling the chaining and counter values.
Signed-off-by: Wenkai Lin <linwenkai6@hisilicon.com>
---
 Makefile.am            |   5 +-
 drv/isa_ce_sm4.c       | 232 ++++++
 drv/isa_ce_sm4.h       |  33 ++
 drv/isa_ce_sm4_armv8.S | 774 +++++++++++++++++++++++++++++++++++++++++
 wd_cipher.c            |   4 +-
 5 files changed, 1044 insertions(+), 4 deletions(-)
 create mode 100644 drv/isa_ce_sm4.c
 create mode 100644 drv/isa_ce_sm4.h
 create mode 100644 drv/isa_ce_sm4_armv8.S
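
[Note, not part of the patch itself: as a quick reference for reviewers, below is a minimal caller-side sketch of how an application could exercise the new "cbc(sm4)" path through the public wd_cipher API once this driver is registered. The helper name sm4_cbc_encrypt_once is hypothetical, UADK environment/context initialization (e.g. wd_cipher_init()) is omitted, and the request fields should be double-checked against wd_cipher.h.]

/*
 * Hypothetical caller-side sketch, not part of this patch: drives the
 * cbc(sm4) algorithm through the public wd_cipher API.  UADK context
 * and scheduler setup is assumed to have been done already.
 */
#include "wd_cipher.h"

static int sm4_cbc_encrypt_once(const __u8 key[16], __u8 iv[16],
				void *in, void *out, __u32 len)
{
	struct wd_cipher_sess_setup setup = {
		.alg = WD_CIPHER_SM4,
		.mode = WD_CIPHER_CBC,
	};
	struct wd_cipher_req req = {0};
	handle_t h_sess;
	int ret;

	h_sess = wd_cipher_alloc_sess(&setup);
	if (!h_sess)
		return -WD_EINVAL;

	ret = wd_cipher_set_key(h_sess, key, 16);
	if (ret)
		goto out;

	req.op_type = WD_CIPHER_ENCRYPTION;
	req.src = in;
	req.dst = out;
	req.in_bytes = len;	/* CBC needs a multiple of the 16-byte block */
	req.out_bytes = len;
	req.iv = iv;		/* chaining value is written back here */
	req.iv_bytes = 16;

	ret = wd_do_cipher_sync(h_sess, &req);
out:
	wd_cipher_free_sess(h_sess);
	return ret;
}
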
diff --git a/Makefile.am b/Makefile.am
index 19eab30..5102a93 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -77,6 +77,7 @@ libwd_crypto_la_SOURCES=wd_cipher.c wd_cipher.h wd_cipher_drv.h \
 		wd_rsa.c wd_rsa.h wd_rsa_drv.h \
 		wd_dh.c wd_dh.h wd_dh_drv.h \
 		wd_ecc.c wd_ecc.h wd_ecc_drv.h \
+		arm_arch_ce.h isa_ce_sm3.h isa_ce_sm4.h \
 		wd_digest.c wd_digest.h wd_digest_drv.h \
 		wd_util.c wd_util.h \
 		wd_sched.c wd_sched.h \
@@ -89,8 +90,8 @@ libhisi_sec_la_SOURCES=drv/hisi_sec.c drv/hisi_qm_udrv.c \
 libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \
 		hisi_qm_udrv.h
 
-libisa_ce_la_SOURCES=drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S arm_arch_ce.h \
-		drv/isa_ce_sm3.h
+libisa_ce_la_SOURCES=arm_arch_ce.h drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S isa_ce_sm3.h \
+		drv/isa_ce_sm4.c drv/isa_ce_sm4_armv8.S drv/isa_ce_sm4.h
if WD_STATIC_DRV AM_CFLAGS += -DWD_STATIC_DRV -fPIC diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c new file mode 100644 index 0000000..d48db1d --- /dev/null +++ b/drv/isa_ce_sm4.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +/* + * Copyright 2023 Huawei Technologies Co.,Ltd. All rights reserved. + */ + +#include "drv/wd_cipher_drv.h" +#include "isa_ce_sm4.h" +#include "wd_cipher.h" + +#define SM4_ENCRYPT 1 +#define SM4_DECRYPT 0 +#define MSG_Q_DEPTH 1024 +#define INCREASE_BITS 96 +#define BYTE_BITS 8 +#define SM4_BLOCK_SIZE 16 +#define MAX_BLOCK_NUM (1U << 28) + +#define GETU32(p) \ + ((__u32)(p)[0] << 24 | (__u32)(p)[1] << 16 | (__u32)(p)[2] << 8 | (__u32)(p)[3]) +#define PUTU32(p, v) \ + ((p)[0] = (__u8)((v) >> 24), (p)[1] = (__u8)((v) >> 16), \ + (p)[2] = (__u8)((v) >> 8), (p)[3] = (__u8)(v)) + +static int isa_ce_init(void *conf, void *priv) +{ + return 0; +} + +static void isa_ce_exit(void *priv) +{ +} + +/* increment upper 96 bits of 128-bit counter by 1 */ +static void ctr96_inc(__u8 *counter) +{ + __u32 n = INCREASE_BITS / BYTE_BITS; + __u32 c = 1; + + do { + --n; + c += counter[n]; + counter[n] = (__u8)c; + c >>= BYTE_BITS; + } while (n); +} + +static void sm4_v8_ctr32_encrypt(__u8 *in, __u8 *out, + __u64 len, const struct SM4_KEY *key, __u8 *iv) +{ + __u8 ecount_buf[SM4_BLOCK_SIZE] = {0}; + __u64 blocks, offset; + __u32 ctr32; + __u32 n = 0; + + while (n && len) { + *(out++) = *(in++) ^ ecount_buf[n]; + --len; + n = (n + 1) % SM4_BLOCK_SIZE; + } + + ctr32 = GETU32(iv + INCREASE_BITS / BYTE_BITS); + while (len >= SM4_BLOCK_SIZE) { + blocks = len / SM4_BLOCK_SIZE; + /* + * 1<<28 is just a not-so-small yet not-so-large number... + * Below condition is practically never met, but it has to + * be checked for code correctness. + */ + if (blocks > MAX_BLOCK_NUM) + blocks = MAX_BLOCK_NUM; + /* + * As (*func) operates on 32-bit counter, caller + * has to handle overflow. 'if' below detects the + * overflow, which is then handled by limiting the + * amount of blocks to the exact overflow point... + */ + ctr32 += (__u32)blocks; + if (ctr32 < blocks) { + blocks -= ctr32; + ctr32 = 0; + } + sm4_v8_ctr32_encrypt_blocks(in, out, blocks, key, iv); + /* (*ctr) does not update iv, caller does: */ + PUTU32(iv + INCREASE_BITS / BYTE_BITS, ctr32); + /* ... overflow was detected, propagate carry. 
*/ + if (ctr32 == 0) + ctr96_inc(iv); + offset = blocks * SM4_BLOCK_SIZE; + len -= offset; + out += offset; + in += offset; + } + if (len) { + memset(ecount_buf, 0, SM4_BLOCK_SIZE); + sm4_v8_ctr32_encrypt_blocks(ecount_buf, ecount_buf, 1, key, iv); + ++ctr32; + PUTU32(iv + INCREASE_BITS / BYTE_BITS, ctr32); + if (ctr32 == 0) + ctr96_inc(iv); + while (len--) { + out[n] = in[n] ^ ecount_buf[n]; + ++n; + } + } +} + +static void sm4_ctr_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) +{ + sm4_v8_ctr32_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, msg->iv); +} + +static void sm4_cbc_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) +{ + sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, msg->iv, SM4_ENCRYPT); +} + +static void sm4_cbc_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_dec) +{ + sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, msg->iv, SM4_DECRYPT); +} + +void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) +{ + sm4_v8_set_encrypt_key(userKey, key); +} + +void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) +{ + sm4_v8_set_decrypt_key(userKey, key); +} + +static int isa_ce_cipher_send(handle_t ctx, void *wd_msg) +{ + struct wd_cipher_msg *msg = wd_msg; + struct SM4_KEY rkey; + + if (!msg) { + WD_ERR("invalid: input sm4 msg is NULL!\n"); + return -WD_EINVAL; + } + + if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR) + sm4_set_encrypt_key(msg->key, &rkey); + else + sm4_set_decrypt_key(msg->key, &rkey); + + switch (msg->mode) { + case WD_CIPHER_CBC: + if (msg->op_type == WD_CIPHER_ENCRYPTION) + sm4_cbc_encrypt(msg, &rkey); + else + sm4_cbc_decrypt(msg, &rkey); + break; + case WD_CIPHER_CTR: + sm4_ctr_encrypt(msg, &rkey); + break; + default: + WD_ERR("The current block cipher mode is not supported!\n"); + return -WD_EINVAL; + } + + return 0; +} + +static int isa_ce_cipher_recv(handle_t ctx, void *wd_msg) +{ + return 0; +} + +static int cipher_send(handle_t ctx, void *msg) +{ + return isa_ce_cipher_send(ctx, msg); +} + +static int cipher_recv(handle_t ctx, void *msg) +{ + return isa_ce_cipher_recv(ctx, msg); +} + +#define GEN_CE_ALG_DRIVER(ce_alg_name, alg_type) \ +{\ + .drv_name = "isa_ce_sm4",\ + .alg_name = (ce_alg_name),\ + .calc_type = UADK_ALG_CE_INSTR,\ + .priority = 200,\ + .priv_size = 0,\ + .op_type_num = 1,\ + .fallback = 0,\ + .init = isa_ce_init,\ + .exit = isa_ce_exit,\ + .send = alg_type##_send,\ + .recv = alg_type##_recv,\ +} + +static struct wd_alg_driver cipher_alg_driver[] = { + GEN_CE_ALG_DRIVER("cbc(sm4)", cipher), + GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), +}; + +static void __attribute__((constructor)) isa_ce_probe(void) +{ + __u32 alg_num, i; + int ret; + + WD_INFO("Info: register SM4 CE alg drivers!\n"); + + alg_num = ARRAY_SIZE(cipher_alg_driver); + for (i = 0; i < alg_num; i++) { + ret = wd_alg_driver_register(&cipher_alg_driver[i]); + if (ret && ret != -WD_ENODEV) + WD_ERR("Error: register SM4 CE %s failed!\n", + cipher_alg_driver[i].alg_name); + } +} + +static void __attribute__((destructor)) isa_ce_remove(void) +{ + __u32 alg_num, i; + + WD_INFO("Info: unregister SM4 CE alg drivers!\n"); + alg_num = ARRAY_SIZE(cipher_alg_driver); + for (i = 0; i < alg_num; i++) + wd_alg_driver_unregister(&cipher_alg_driver[i]); +} diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h new file mode 100644 index 0000000..d91c864 --- /dev/null +++ b/drv/isa_ce_sm4.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: Apache-2.0 */ +/* Copyright 2023 Huawei 
Technologies Co.,Ltd. All rights reserved. */ + +#ifndef __SM4_CE_DRV_H +#define __SM4_CE_DRV_H + +#pragma once +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define SM4_KEY_SCHEDULE 32 + +struct SM4_KEY { + __u32 rk[SM4_KEY_SCHEDULE]; +}; + + +void sm4_v8_set_encrypt_key(const unsigned char *userKey, struct SM4_KEY *key); +void sm4_v8_set_decrypt_key(const unsigned char *userKey, struct SM4_KEY *key); +void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const struct SM4_KEY *key, + unsigned char *ivec, const int enc); +void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, const unsigned char ivec[16]); + +#ifdef __cplusplus +} +#endif + +#endif /* __SM4_CE_DRV_H */ diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S new file mode 100644 index 0000000..d7d172a --- /dev/null +++ b/drv/isa_ce_sm4_armv8.S @@ -0,0 +1,774 @@ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +#include "../include/drv/arm_arch_ce.h" + +.arch armv8-a+crypto + +.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23 24, 25, 26, 27, 28, 29, 30, 31 + .set .Lv\b().4s, \b +.endr + +.macro sm4e, vd, vn + .inst 0xcec08400 | (.L\vn << 5) | .L\vd +.endm + +.macro sm4ekey, vd, vn, vm + .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd +.endm + +.text +.align 6 +.Lck: +.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 +.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 +.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 +.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 +.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 +.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 +.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 +.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: +.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +.globl sm4_v8_set_encrypt_key +.type sm4_v8_set_encrypt_key,%function +.align 5 +sm4_v8_set_encrypt_key: + AARCH64_VALID_CALL_TARGET + ld1 {v0.4s},[x0] + adr x2,.Lfk + ld1 {v24.4s},[x2] + adr x2,.Lck + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64 +#ifndef __ARMEB__ + rev32 v0.16b,v0.16b +#endif + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2] + eor v0.16b,v0.16b,v24.16b; + sm4ekey v0.4s,v0.4s,v16.4s; + sm4ekey v1.4s,v0.4s,v17.4s; + sm4ekey v2.4s,v1.4s,v18.4s; + sm4ekey v3.4s,v2.4s,v19.4s; + sm4ekey v4.4s,v3.4s,v20.4s; + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64 + sm4ekey v5.4s,v4.4s,v21.4s; + sm4ekey v6.4s,v5.4s,v22.4s; + sm4ekey v7.4s,v6.4s,v23.4s; + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1] + ret +.size sm4_v8_set_encrypt_key,.-sm4_v8_set_encrypt_key +.globl sm4_v8_set_decrypt_key +.type sm4_v8_set_decrypt_key,%function +.align 5 +sm4_v8_set_decrypt_key: + AARCH64_VALID_CALL_TARGET + ld1 {v7.4s},[x0] + adr x2,.Lfk + ld1 {v24.4s},[x2] + adr x2, .Lck + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64 +#ifndef __ARMEB__ + rev32 v7.16b,v7.16b +#endif + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2] + eor v7.16b, v7.16b,v24.16b; + sm4ekey v7.4s,v7.4s,v16.4s; + sm4ekey v6.4s,v7.4s,v17.4s; + sm4ekey v5.4s,v6.4s,v18.4s; + rev64 v7.4s,v7.4s + rev64 v6.4s,v6.4s + ext v7.16b,v7.16b,v7.16b,#8 + ext v6.16b,v6.16b,v6.16b,#8 + 
sm4ekey v4.4s,v5.4s,v19.4s; + sm4ekey v3.4s,v4.4s,v20.4s; + rev64 v5.4s,v5.4s + rev64 v4.4s,v4.4s + ext v5.16b,v5.16b,v5.16b,#8 + ext v4.16b,v4.16b,v4.16b,#8 + sm4ekey v2.4s,v3.4s,v21.4s; + sm4ekey v1.4s,v2.4s,v22.4s; + rev64 v3.4s,v3.4s + rev64 v2.4s,v2.4s + ext v3.16b,v3.16b,v3.16b,#8 + ext v2.16b,v2.16b,v2.16b,#8 + sm4ekey v0.4s,v1.4s,v23.4s; + rev64 v1.4s, v1.4s + rev64 v0.4s, v0.4s + ext v1.16b,v1.16b,v1.16b,#8 + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1] + ret +.size sm4_v8_set_decrypt_key,.-sm4_v8_set_decrypt_key +.globl sm4_v8_cbc_encrypt +.type sm4_v8_cbc_encrypt,%function +.align 5 +sm4_v8_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] + ld1 {v8.4s},[x4] + cmp w5,#0 + b.eq .Ldec +1: + cmp x2, #64 + b.lt 1f + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 + eor v16.16b,v16.16b,v8.16b +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + sm4e v16.4s,v0.4s; + sm4e v16.4s,v1.4s; + sm4e v16.4s,v2.4s; + sm4e v16.4s,v3.4s; + sm4e v16.4s,v4.4s; + sm4e v16.4s,v5.4s; + sm4e v16.4s,v6.4s; + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + ext v16.16b,v16.16b,v16.16b,#8 + eor v17.16b,v17.16b,v16.16b + sm4e v17.4s,v0.4s; + sm4e v17.4s,v1.4s; + sm4e v17.4s,v2.4s; + sm4e v17.4s,v3.4s; + sm4e v17.4s,v4.4s; + sm4e v17.4s,v5.4s; + sm4e v17.4s,v6.4s; + sm4e v17.4s,v7.4s; + rev64 v17.4s,v17.4s + ext v17.16b,v17.16b,v17.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + eor v18.16b,v18.16b,v17.16b + sm4e v18.4s,v0.4s; + sm4e v18.4s,v1.4s; + sm4e v18.4s,v2.4s; + sm4e v18.4s,v3.4s; + sm4e v18.4s,v4.4s; + sm4e v18.4s,v5.4s; + sm4e v18.4s,v6.4s; + sm4e v18.4s,v7.4s; + rev64 v18.4s,v18.4s + ext v18.16b,v18.16b,v18.16b,#8 +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif + eor v19.16b,v19.16b,v18.16b + sm4e v19.4s,v0.4s; + sm4e v19.4s,v1.4s; + sm4e v19.4s,v2.4s; + sm4e v19.4s,v3.4s; + sm4e v19.4s,v4.4s; + sm4e v19.4s,v5.4s; + sm4e v19.4s,v6.4s; + sm4e v19.4s,v7.4s; + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + mov v8.16b,v19.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#64 + b.ne 1b +1: + subs x2,x2,#16 + b.lt 3f + ld1 {v16.4s},[x0],#16 + eor v8.16b,v8.16b,v16.16b +#ifndef __ARMEB__ + rev32 v8.16b,v8.16b +#endif + sm4e v8.4s,v0.4s; + sm4e v8.4s,v1.4s; + sm4e v8.4s,v2.4s; + sm4e v8.4s,v3.4s; + sm4e v8.4s,v4.4s; + sm4e v8.4s,v5.4s; + sm4e v8.4s,v6.4s; + sm4e v8.4s,v7.4s; + rev64 v8.4s,v8.4s + ext v8.16b,v8.16b,v8.16b,#8 +#ifndef __ARMEB__ + rev32 v8.16b,v8.16b +#endif + st1 {v8.16b},[x1],#16 + b.ne 1b + b 3f +.Ldec: +1: + cmp x2, #64 + b.lt 1f + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0] + ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64 + cmp x2,#128 + b.lt 2f + // 8 blocks mode + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0] + ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __ARMEB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __ARMEB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __ARMEB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __ARMEB__ + rev32 v23.16b,v23.16b +#endif + sm4e v16.4s,v0.4s; 
+ sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4s,v17.4s + sm4e v18.4s,v7.4s; + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4s,v18.4s + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 + sm4e v20.4s,v0.4s; + sm4e v21.4s,v0.4s; + sm4e v22.4s,v0.4s; + sm4e v23.4s,v0.4s; + + sm4e v20.4s,v1.4s; + sm4e v21.4s,v1.4s; + sm4e v22.4s,v1.4s; + sm4e v23.4s,v1.4s; + + sm4e v20.4s,v2.4s; + sm4e v21.4s,v2.4s; + sm4e v22.4s,v2.4s; + sm4e v23.4s,v2.4s; + + sm4e v20.4s,v3.4s; + sm4e v21.4s,v3.4s; + sm4e v22.4s,v3.4s; + sm4e v23.4s,v3.4s; + + sm4e v20.4s,v4.4s; + sm4e v21.4s,v4.4s; + sm4e v22.4s,v4.4s; + sm4e v23.4s,v4.4s; + + sm4e v20.4s,v5.4s; + sm4e v21.4s,v5.4s; + sm4e v22.4s,v5.4s; + sm4e v23.4s,v5.4s; + + sm4e v20.4s,v6.4s; + sm4e v21.4s,v6.4s; + sm4e v22.4s,v6.4s; + sm4e v23.4s,v6.4s; + + sm4e v20.4s,v7.4s; + rev64 v20.4s,v20.4s + sm4e v21.4s,v7.4s; + ext v20.16b,v20.16b,v20.16b,#8 + rev64 v21.4s,v21.4s + sm4e v22.4s,v7.4s; + ext v21.16b,v21.16b,v21.16b,#8 + rev64 v22.4s,v22.4s + sm4e v23.4s,v7.4s; + ext v22.16b,v22.16b,v22.16b,#8 + rev64 v23.4s,v23.4s + ext v23.16b,v23.16b,v23.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __ARMEB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __ARMEB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __ARMEB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __ARMEB__ + rev32 v23.16b,v23.16b +#endif + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v24.16b + eor v18.16b,v18.16b,v25.16b + mov v8.16b,v31.16b + eor v19.16b,v19.16b,v26.16b + eor v20.16b,v20.16b,v27.16b + eor v21.16b,v21.16b,v28.16b + eor v22.16b,v22.16b,v29.16b + eor v23.16b,v23.16b,v30.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 + subs x2,x2,128 + b.gt 1b + b 3f + // 4 blocks mode +2: +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + sm4e v16.4s,v0.4s; + sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4s,v17.4s + sm4e v18.4s,v7.4s; + 
ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4s,v18.4s + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v24.16b + mov v8.16b,v27.16b + eor v18.16b,v18.16b,v25.16b + eor v19.16b,v19.16b,v26.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#64 + b.gt 1b +1: + subs x2,x2,#16 + b.lt 3f + ld1 {v16.4s},[x0],#16 + mov v24.16b,v16.16b +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + sm4e v16.4s,v0.4s; + sm4e v16.4s,v1.4s; + sm4e v16.4s,v2.4s; + sm4e v16.4s,v3.4s; + sm4e v16.4s,v4.4s; + sm4e v16.4s,v5.4s; + sm4e v16.4s,v6.4s; + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + eor v16.16b,v16.16b,v8.16b + mov v8.16b,v24.16b + st1 {v16.16b},[x1],#16 + b.ne 1b +3: + // save back IV + st1 {v8.16b},[x4] + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_cbc_encrypt,.-sm4_v8_cbc_encrypt +.globl sm4_v8_ctr32_encrypt_blocks +.type sm4_v8_ctr32_encrypt_blocks,%function +.align 5 +sm4_v8_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! + + ld1 {v8.4s},[x4] + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] +#ifndef __ARMEB__ + rev32 v8.16b,v8.16b +#endif + mov w5,v8.s[3] +1: + cmp x2,#4 + b.lt 1f + ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64 + mov v16.16b,v8.16b + mov v17.16b,v8.16b + mov v18.16b,v8.16b + mov v19.16b,v8.16b + add w5,w5,#1 + mov v17.s[3],w5 + add w5,w5,#1 + mov v18.s[3],w5 + add w5,w5,#1 + mov v19.s[3],w5 + cmp x2,#8 + b.lt 2f + ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64 + mov v20.16b,v8.16b + mov v21.16b,v8.16b + mov v22.16b,v8.16b + mov v23.16b,v8.16b + add w5,w5,#1 + mov v20.s[3],w5 + add w5,w5,#1 + mov v21.s[3],w5 + add w5,w5,#1 + mov v22.s[3],w5 + add w5,w5,#1 + mov v23.s[3],w5 + sm4e v16.4s,v0.4s; + sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4s,v17.4s + sm4e v18.4s,v7.4s; + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4s,v18.4s + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 + sm4e v20.4s,v0.4s; + sm4e v21.4s,v0.4s; + sm4e v22.4s,v0.4s; + sm4e v23.4s,v0.4s; + + sm4e v20.4s,v1.4s; + sm4e v21.4s,v1.4s; + sm4e v22.4s,v1.4s; + sm4e v23.4s,v1.4s; + + sm4e v20.4s,v2.4s; + sm4e v21.4s,v2.4s; + sm4e v22.4s,v2.4s; + sm4e v23.4s,v2.4s; + + sm4e v20.4s,v3.4s; + sm4e v21.4s,v3.4s; + sm4e v22.4s,v3.4s; + sm4e v23.4s,v3.4s; + + sm4e v20.4s,v4.4s; + sm4e v21.4s,v4.4s; + sm4e v22.4s,v4.4s; + sm4e v23.4s,v4.4s; + + sm4e v20.4s,v5.4s; + sm4e v21.4s,v5.4s; + sm4e v22.4s,v5.4s; + sm4e v23.4s,v5.4s; + + sm4e v20.4s,v6.4s; + sm4e v21.4s,v6.4s; + sm4e v22.4s,v6.4s; + sm4e 
v23.4s,v6.4s; + + sm4e v20.4s,v7.4s; + rev64 v20.4s,v20.4s + sm4e v21.4s,v7.4s; + ext v20.16b,v20.16b,v20.16b,#8 + rev64 v21.4s,v21.4s + sm4e v22.4s,v7.4s; + ext v21.16b,v21.16b,v21.16b,#8 + rev64 v22.4s,v22.4s + sm4e v23.4s,v7.4s; + ext v22.16b,v22.16b,v22.16b,#8 + rev64 v23.4s,v23.4s + ext v23.16b,v23.16b,v23.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __ARMEB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __ARMEB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __ARMEB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __ARMEB__ + rev32 v23.16b,v23.16b +#endif + eor v16.16b,v16.16b,v24.16b + eor v17.16b,v17.16b,v25.16b + eor v18.16b,v18.16b,v26.16b + eor v19.16b,v19.16b,v27.16b + eor v20.16b,v20.16b,v28.16b + eor v21.16b,v21.16b,v29.16b + eor v22.16b,v22.16b,v30.16b + eor v23.16b,v23.16b,v31.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 + subs x2,x2,#8 + b.eq 3f + add w5,w5,#1 + mov v8.s[3],w5 + b 1b +2: + sm4e v16.4s,v0.4s; + sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4s,v17.4s + sm4e v18.4s,v7.4s; + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4s,v18.4s + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + eor v16.16b,v16.16b,v24.16b + eor v17.16b,v17.16b,v25.16b + eor v18.16b,v18.16b,v26.16b + eor v19.16b,v19.16b,v27.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#4 + b.eq 3f + add w5,w5,#1 + mov v8.s[3],w5 + b 1b +1: + subs x2,x2,#1 + b.lt 3f + mov v16.16b,v8.16b + ld1 {v24.4s},[x0],#16 + sm4e v16.4s,v0.4s; + sm4e v16.4s,v1.4s; + sm4e v16.4s,v2.4s; + sm4e v16.4s,v3.4s; + sm4e v16.4s,v4.4s; + sm4e v16.4s,v5.4s; + sm4e v16.4s,v6.4s; + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + eor v16.16b,v16.16b,v24.16b + st1 {v16.4s},[x1],#16 + b.eq 3f + add w5,w5,#1 + mov v8.s[3],w5 + b 1b +3: + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_ctr32_encrypt_blocks,.-sm4_v8_ctr32_encrypt_blocks diff --git a/wd_cipher.c b/wd_cipher.c index f35ce6f..63ec362 100644 --- a/wd_cipher.c +++ b/wd_cipher.c @@ -622,10 +622,10 @@ static int send_recv_sync(struct wd_ctx_internal *ctx, msg_handle.send = wd_cipher_setting.driver->send; msg_handle.recv = wd_cipher_setting.driver->recv;
-	pthread_spin_lock(&ctx->lock);
+	wd_ctx_spin_lock(ctx, wd_cipher_setting.driver->calc_type);
 	ret = wd_handle_msg_sync(wd_cipher_setting.driver, &msg_handle, ctx->ctx,
 				 msg, NULL, wd_cipher_setting.config.epoll_en);
-	pthread_spin_unlock(&ctx->lock);
+	wd_ctx_spin_unlock(ctx, wd_cipher_setting.driver->calc_type);
 
 	return ret;
 }