This series adds SM4 support to the UADK isa_ce driver, using the Arm Crypto Extension (CE) instructions to accelerate the ECB, CBC, CBC-CTS, CTR, CFB and XTS modes. It also carries follow-up fixes: an SM4 CE code defect, a calloc zero-size problem, a memcpy overwrite and code-style cleanups.
Qi Tao (2):
      cipher: add support for SM4(ECB) algorithm in CE instruction
      cipher: fix sm4 ce code defect

Wenkai Lin (2):
      cipher: add support for SM4 CBC and CTR modes in CE instruction
      uadk: fix calloc zero size problem

Yang Shen (3):
      uadk/cipher: isa_ce - support SM4 cbc_cts mode
      uadk/cipher: isa_ce - fix memcpy overwrite
      uadk/cipher: isa - fix code style

Yuzeng Zhuang (1):
      cipher: add support for SM4 CFB and XTS modes in CE instruction
 Makefile.am            |    5 +-
 drv/isa_ce_sm4.c       |  473 ++++++
 drv/isa_ce_sm4.h       |   62 +
 drv/isa_ce_sm4_armv8.S | 2296 ++++++++++++++++++++++++++++++++++++++++
 wd_cipher.c            |    4 +-
 wd_util.c              |    5 +
 6 files changed, 2841 insertions(+), 4 deletions(-)
 create mode 100644 drv/isa_ce_sm4.c
 create mode 100644 drv/isa_ce_sm4.h
 create mode 100644 drv/isa_ce_sm4_armv8.S
From: Wenkai Lin <linwenkai6@hisilicon.com>
This patch implements the SM4 CBC and CTR modes using CE instructions, and includes the mode-specific logic each one needs, such as initialization vector (IV) handling, block chaining, and counter updates.
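For reference, here is a minimal plain-C sketch (an illustration only, not the driver code) of the counter convention the CTR path relies on: the IV's low 32 bits act as the per-block counter, and when that word wraps a carry is propagated into the upper 96 bits, which is what the driver's ctr96_inc helper does.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Increment the upper 96 bits of a big-endian 128-bit counter by one. */
static void ctr96_increment(uint8_t counter[16])
{
	int i;

	for (i = 11; i >= 0; i--) {
		if (++counter[i] != 0)
			break;	/* no carry left to propagate */
	}
}

/* Increment the full 128-bit counter, carrying out of the low 32 bits. */
static void ctr128_increment(uint8_t counter[16])
{
	uint32_t low = ((uint32_t)counter[12] << 24) | ((uint32_t)counter[13] << 16) |
		       ((uint32_t)counter[14] << 8) | (uint32_t)counter[15];

	low++;
	counter[12] = (uint8_t)(low >> 24);
	counter[13] = (uint8_t)(low >> 16);
	counter[14] = (uint8_t)(low >> 8);
	counter[15] = (uint8_t)low;
	if (low == 0)
		ctr96_increment(counter);	/* low 32-bit word wrapped */
}

int main(void)
{
	uint8_t iv[16];
	int i;

	/* start just below a 32-bit wrap to show the carry */
	memset(iv, 0, sizeof(iv));
	memset(iv + 12, 0xff, 4);

	ctr128_increment(iv);
	for (i = 0; i < 16; i++)
		printf("%02x", iv[i]);
	printf("\n");	/* prints 00000000000000000000000100000000 */
	return 0;
}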
Signed-off-by: Wenkai Lin <linwenkai6@hisilicon.com>
---
 Makefile.am            |   5 +-
 drv/isa_ce_sm4.c       | 232 ++++++++++++
 drv/isa_ce_sm4.h       |  33 ++
 drv/isa_ce_sm4_armv8.S | 774 +++++++++++++++++++++++++++++++++++++++++
 wd_cipher.c            |   4 +-
 5 files changed, 1044 insertions(+), 4 deletions(-)
 create mode 100644 drv/isa_ce_sm4.c
 create mode 100644 drv/isa_ce_sm4.h
 create mode 100644 drv/isa_ce_sm4_armv8.S
diff --git a/Makefile.am b/Makefile.am index 19eab30..5102a93 100644 --- a/Makefile.am +++ b/Makefile.am @@ -77,6 +77,7 @@ libwd_crypto_la_SOURCES=wd_cipher.c wd_cipher.h wd_cipher_drv.h \ wd_rsa.c wd_rsa.h wd_rsa_drv.h \ wd_dh.c wd_dh.h wd_dh_drv.h \ wd_ecc.c wd_ecc.h wd_ecc_drv.h \ + arm_arch_ce.h isa_ce_sm3.h isa_ce_sm4.h \ wd_digest.c wd_digest.h wd_digest_drv.h \ wd_util.c wd_util.h \ wd_sched.c wd_sched.h \ @@ -89,8 +90,8 @@ libhisi_sec_la_SOURCES=drv/hisi_sec.c drv/hisi_qm_udrv.c \ libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \ hisi_qm_udrv.h
-libisa_ce_la_SOURCES=drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S arm_arch_ce.h \ - drv/isa_ce_sm3.h +libisa_ce_la_SOURCES=arm_arch_ce.h drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S isa_ce_sm3.h \ + drv/isa_ce_sm4.c drv/isa_ce_sm4_armv8.S drv/isa_ce_sm4.h
if WD_STATIC_DRV AM_CFLAGS += -DWD_STATIC_DRV -fPIC diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c new file mode 100644 index 0000000..d48db1d --- /dev/null +++ b/drv/isa_ce_sm4.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +/* + * Copyright 2023 Huawei Technologies Co.,Ltd. All rights reserved. + */ + +#include "drv/wd_cipher_drv.h" +#include "isa_ce_sm4.h" +#include "wd_cipher.h" + +#define SM4_ENCRYPT 1 +#define SM4_DECRYPT 0 +#define MSG_Q_DEPTH 1024 +#define INCREASE_BITS 96 +#define BYTE_BITS 8 +#define SM4_BLOCK_SIZE 16 +#define MAX_BLOCK_NUM (1U << 28) + +#define GETU32(p) \ + ((__u32)(p)[0] << 24 | (__u32)(p)[1] << 16 | (__u32)(p)[2] << 8 | (__u32)(p)[3]) +#define PUTU32(p, v) \ + ((p)[0] = (__u8)((v) >> 24), (p)[1] = (__u8)((v) >> 16), \ + (p)[2] = (__u8)((v) >> 8), (p)[3] = (__u8)(v)) + +static int isa_ce_init(void *conf, void *priv) +{ + return 0; +} + +static void isa_ce_exit(void *priv) +{ +} + +/* increment upper 96 bits of 128-bit counter by 1 */ +static void ctr96_inc(__u8 *counter) +{ + __u32 n = INCREASE_BITS / BYTE_BITS; + __u32 c = 1; + + do { + --n; + c += counter[n]; + counter[n] = (__u8)c; + c >>= BYTE_BITS; + } while (n); +} + +static void sm4_v8_ctr32_encrypt(__u8 *in, __u8 *out, + __u64 len, const struct SM4_KEY *key, __u8 *iv) +{ + __u8 ecount_buf[SM4_BLOCK_SIZE] = {0}; + __u64 blocks, offset; + __u32 ctr32; + __u32 n = 0; + + while (n && len) { + *(out++) = *(in++) ^ ecount_buf[n]; + --len; + n = (n + 1) % SM4_BLOCK_SIZE; + } + + ctr32 = GETU32(iv + INCREASE_BITS / BYTE_BITS); + while (len >= SM4_BLOCK_SIZE) { + blocks = len / SM4_BLOCK_SIZE; + /* + * 1<<28 is just a not-so-small yet not-so-large number... + * Below condition is practically never met, but it has to + * be checked for code correctness. + */ + if (blocks > MAX_BLOCK_NUM) + blocks = MAX_BLOCK_NUM; + /* + * As (*func) operates on 32-bit counter, caller + * has to handle overflow. 'if' below detects the + * overflow, which is then handled by limiting the + * amount of blocks to the exact overflow point... + */ + ctr32 += (__u32)blocks; + if (ctr32 < blocks) { + blocks -= ctr32; + ctr32 = 0; + } + sm4_v8_ctr32_encrypt_blocks(in, out, blocks, key, iv); + /* (*ctr) does not update iv, caller does: */ + PUTU32(iv + INCREASE_BITS / BYTE_BITS, ctr32); + /* ... overflow was detected, propagate carry. 
*/ + if (ctr32 == 0) + ctr96_inc(iv); + offset = blocks * SM4_BLOCK_SIZE; + len -= offset; + out += offset; + in += offset; + } + if (len) { + memset(ecount_buf, 0, SM4_BLOCK_SIZE); + sm4_v8_ctr32_encrypt_blocks(ecount_buf, ecount_buf, 1, key, iv); + ++ctr32; + PUTU32(iv + INCREASE_BITS / BYTE_BITS, ctr32); + if (ctr32 == 0) + ctr96_inc(iv); + while (len--) { + out[n] = in[n] ^ ecount_buf[n]; + ++n; + } + } +} + +static void sm4_ctr_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) +{ + sm4_v8_ctr32_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, msg->iv); +} + +static void sm4_cbc_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) +{ + sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, msg->iv, SM4_ENCRYPT); +} + +static void sm4_cbc_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_dec) +{ + sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, msg->iv, SM4_DECRYPT); +} + +void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) +{ + sm4_v8_set_encrypt_key(userKey, key); +} + +void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) +{ + sm4_v8_set_decrypt_key(userKey, key); +} + +static int isa_ce_cipher_send(handle_t ctx, void *wd_msg) +{ + struct wd_cipher_msg *msg = wd_msg; + struct SM4_KEY rkey; + + if (!msg) { + WD_ERR("invalid: input sm4 msg is NULL!\n"); + return -WD_EINVAL; + } + + if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR) + sm4_set_encrypt_key(msg->key, &rkey); + else + sm4_set_decrypt_key(msg->key, &rkey); + + switch (msg->mode) { + case WD_CIPHER_CBC: + if (msg->op_type == WD_CIPHER_ENCRYPTION) + sm4_cbc_encrypt(msg, &rkey); + else + sm4_cbc_decrypt(msg, &rkey); + break; + case WD_CIPHER_CTR: + sm4_ctr_encrypt(msg, &rkey); + break; + default: + WD_ERR("The current block cipher mode is not supported!\n"); + return -WD_EINVAL; + } + + return 0; +} + +static int isa_ce_cipher_recv(handle_t ctx, void *wd_msg) +{ + return 0; +} + +static int cipher_send(handle_t ctx, void *msg) +{ + return isa_ce_cipher_send(ctx, msg); +} + +static int cipher_recv(handle_t ctx, void *msg) +{ + return isa_ce_cipher_recv(ctx, msg); +} + +#define GEN_CE_ALG_DRIVER(ce_alg_name, alg_type) \ +{\ + .drv_name = "isa_ce_sm4",\ + .alg_name = (ce_alg_name),\ + .calc_type = UADK_ALG_CE_INSTR,\ + .priority = 200,\ + .priv_size = 0,\ + .op_type_num = 1,\ + .fallback = 0,\ + .init = isa_ce_init,\ + .exit = isa_ce_exit,\ + .send = alg_type##_send,\ + .recv = alg_type##_recv,\ +} + +static struct wd_alg_driver cipher_alg_driver[] = { + GEN_CE_ALG_DRIVER("cbc(sm4)", cipher), + GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), +}; + +static void __attribute__((constructor)) isa_ce_probe(void) +{ + __u32 alg_num, i; + int ret; + + WD_INFO("Info: register SM4 CE alg drivers!\n"); + + alg_num = ARRAY_SIZE(cipher_alg_driver); + for (i = 0; i < alg_num; i++) { + ret = wd_alg_driver_register(&cipher_alg_driver[i]); + if (ret && ret != -WD_ENODEV) + WD_ERR("Error: register SM4 CE %s failed!\n", + cipher_alg_driver[i].alg_name); + } +} + +static void __attribute__((destructor)) isa_ce_remove(void) +{ + __u32 alg_num, i; + + WD_INFO("Info: unregister SM4 CE alg drivers!\n"); + alg_num = ARRAY_SIZE(cipher_alg_driver); + for (i = 0; i < alg_num; i++) + wd_alg_driver_unregister(&cipher_alg_driver[i]); +} diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h new file mode 100644 index 0000000..d91c864 --- /dev/null +++ b/drv/isa_ce_sm4.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: Apache-2.0 */ +/* Copyright 2023 Huawei 
Technologies Co.,Ltd. All rights reserved. */ + +#ifndef __SM4_CE_DRV_H +#define __SM4_CE_DRV_H + +#pragma once +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define SM4_KEY_SCHEDULE 32 + +struct SM4_KEY { + __u32 rk[SM4_KEY_SCHEDULE]; +}; + + +void sm4_v8_set_encrypt_key(const unsigned char *userKey, struct SM4_KEY *key); +void sm4_v8_set_decrypt_key(const unsigned char *userKey, struct SM4_KEY *key); +void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const struct SM4_KEY *key, + unsigned char *ivec, const int enc); +void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, const unsigned char ivec[16]); + +#ifdef __cplusplus +} +#endif + +#endif /* __SM4_CE_DRV_H */ diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S new file mode 100644 index 0000000..d7d172a --- /dev/null +++ b/drv/isa_ce_sm4_armv8.S @@ -0,0 +1,774 @@ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +#include "../include/drv/arm_arch_ce.h" + +.arch armv8-a+crypto + +.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23 24, 25, 26, 27, 28, 29, 30, 31 + .set .Lv\b().4s, \b +.endr + +.macro sm4e, vd, vn + .inst 0xcec08400 | (.L\vn << 5) | .L\vd +.endm + +.macro sm4ekey, vd, vn, vm + .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd +.endm + +.text +.align 6 +.Lck: +.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 +.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 +.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 +.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 +.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 +.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 +.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 +.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: +.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +.globl sm4_v8_set_encrypt_key +.type sm4_v8_set_encrypt_key,%function +.align 5 +sm4_v8_set_encrypt_key: + AARCH64_VALID_CALL_TARGET + ld1 {v0.4s},[x0] + adr x2,.Lfk + ld1 {v24.4s},[x2] + adr x2,.Lck + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64 +#ifndef __ARMEB__ + rev32 v0.16b,v0.16b +#endif + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2] + eor v0.16b,v0.16b,v24.16b; + sm4ekey v0.4s,v0.4s,v16.4s; + sm4ekey v1.4s,v0.4s,v17.4s; + sm4ekey v2.4s,v1.4s,v18.4s; + sm4ekey v3.4s,v2.4s,v19.4s; + sm4ekey v4.4s,v3.4s,v20.4s; + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64 + sm4ekey v5.4s,v4.4s,v21.4s; + sm4ekey v6.4s,v5.4s,v22.4s; + sm4ekey v7.4s,v6.4s,v23.4s; + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1] + ret +.size sm4_v8_set_encrypt_key,.-sm4_v8_set_encrypt_key +.globl sm4_v8_set_decrypt_key +.type sm4_v8_set_decrypt_key,%function +.align 5 +sm4_v8_set_decrypt_key: + AARCH64_VALID_CALL_TARGET + ld1 {v7.4s},[x0] + adr x2,.Lfk + ld1 {v24.4s},[x2] + adr x2, .Lck + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64 +#ifndef __ARMEB__ + rev32 v7.16b,v7.16b +#endif + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2] + eor v7.16b, v7.16b,v24.16b; + sm4ekey v7.4s,v7.4s,v16.4s; + sm4ekey v6.4s,v7.4s,v17.4s; + sm4ekey v5.4s,v6.4s,v18.4s; + rev64 v7.4s,v7.4s + rev64 v6.4s,v6.4s + ext v7.16b,v7.16b,v7.16b,#8 + ext v6.16b,v6.16b,v6.16b,#8 + 
sm4ekey v4.4s,v5.4s,v19.4s; + sm4ekey v3.4s,v4.4s,v20.4s; + rev64 v5.4s,v5.4s + rev64 v4.4s,v4.4s + ext v5.16b,v5.16b,v5.16b,#8 + ext v4.16b,v4.16b,v4.16b,#8 + sm4ekey v2.4s,v3.4s,v21.4s; + sm4ekey v1.4s,v2.4s,v22.4s; + rev64 v3.4s,v3.4s + rev64 v2.4s,v2.4s + ext v3.16b,v3.16b,v3.16b,#8 + ext v2.16b,v2.16b,v2.16b,#8 + sm4ekey v0.4s,v1.4s,v23.4s; + rev64 v1.4s, v1.4s + rev64 v0.4s, v0.4s + ext v1.16b,v1.16b,v1.16b,#8 + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1] + ret +.size sm4_v8_set_decrypt_key,.-sm4_v8_set_decrypt_key +.globl sm4_v8_cbc_encrypt +.type sm4_v8_cbc_encrypt,%function +.align 5 +sm4_v8_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] + ld1 {v8.4s},[x4] + cmp w5,#0 + b.eq .Ldec +1: + cmp x2, #64 + b.lt 1f + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 + eor v16.16b,v16.16b,v8.16b +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + sm4e v16.4s,v0.4s; + sm4e v16.4s,v1.4s; + sm4e v16.4s,v2.4s; + sm4e v16.4s,v3.4s; + sm4e v16.4s,v4.4s; + sm4e v16.4s,v5.4s; + sm4e v16.4s,v6.4s; + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + ext v16.16b,v16.16b,v16.16b,#8 + eor v17.16b,v17.16b,v16.16b + sm4e v17.4s,v0.4s; + sm4e v17.4s,v1.4s; + sm4e v17.4s,v2.4s; + sm4e v17.4s,v3.4s; + sm4e v17.4s,v4.4s; + sm4e v17.4s,v5.4s; + sm4e v17.4s,v6.4s; + sm4e v17.4s,v7.4s; + rev64 v17.4s,v17.4s + ext v17.16b,v17.16b,v17.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + eor v18.16b,v18.16b,v17.16b + sm4e v18.4s,v0.4s; + sm4e v18.4s,v1.4s; + sm4e v18.4s,v2.4s; + sm4e v18.4s,v3.4s; + sm4e v18.4s,v4.4s; + sm4e v18.4s,v5.4s; + sm4e v18.4s,v6.4s; + sm4e v18.4s,v7.4s; + rev64 v18.4s,v18.4s + ext v18.16b,v18.16b,v18.16b,#8 +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif + eor v19.16b,v19.16b,v18.16b + sm4e v19.4s,v0.4s; + sm4e v19.4s,v1.4s; + sm4e v19.4s,v2.4s; + sm4e v19.4s,v3.4s; + sm4e v19.4s,v4.4s; + sm4e v19.4s,v5.4s; + sm4e v19.4s,v6.4s; + sm4e v19.4s,v7.4s; + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + mov v8.16b,v19.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#64 + b.ne 1b +1: + subs x2,x2,#16 + b.lt 3f + ld1 {v16.4s},[x0],#16 + eor v8.16b,v8.16b,v16.16b +#ifndef __ARMEB__ + rev32 v8.16b,v8.16b +#endif + sm4e v8.4s,v0.4s; + sm4e v8.4s,v1.4s; + sm4e v8.4s,v2.4s; + sm4e v8.4s,v3.4s; + sm4e v8.4s,v4.4s; + sm4e v8.4s,v5.4s; + sm4e v8.4s,v6.4s; + sm4e v8.4s,v7.4s; + rev64 v8.4s,v8.4s + ext v8.16b,v8.16b,v8.16b,#8 +#ifndef __ARMEB__ + rev32 v8.16b,v8.16b +#endif + st1 {v8.16b},[x1],#16 + b.ne 1b + b 3f +.Ldec: +1: + cmp x2, #64 + b.lt 1f + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0] + ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64 + cmp x2,#128 + b.lt 2f + // 8 blocks mode + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0] + ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __ARMEB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __ARMEB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __ARMEB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __ARMEB__ + rev32 v23.16b,v23.16b +#endif + sm4e v16.4s,v0.4s; 
+ sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4s,v17.4s + sm4e v18.4s,v7.4s; + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4s,v18.4s + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 + sm4e v20.4s,v0.4s; + sm4e v21.4s,v0.4s; + sm4e v22.4s,v0.4s; + sm4e v23.4s,v0.4s; + + sm4e v20.4s,v1.4s; + sm4e v21.4s,v1.4s; + sm4e v22.4s,v1.4s; + sm4e v23.4s,v1.4s; + + sm4e v20.4s,v2.4s; + sm4e v21.4s,v2.4s; + sm4e v22.4s,v2.4s; + sm4e v23.4s,v2.4s; + + sm4e v20.4s,v3.4s; + sm4e v21.4s,v3.4s; + sm4e v22.4s,v3.4s; + sm4e v23.4s,v3.4s; + + sm4e v20.4s,v4.4s; + sm4e v21.4s,v4.4s; + sm4e v22.4s,v4.4s; + sm4e v23.4s,v4.4s; + + sm4e v20.4s,v5.4s; + sm4e v21.4s,v5.4s; + sm4e v22.4s,v5.4s; + sm4e v23.4s,v5.4s; + + sm4e v20.4s,v6.4s; + sm4e v21.4s,v6.4s; + sm4e v22.4s,v6.4s; + sm4e v23.4s,v6.4s; + + sm4e v20.4s,v7.4s; + rev64 v20.4s,v20.4s + sm4e v21.4s,v7.4s; + ext v20.16b,v20.16b,v20.16b,#8 + rev64 v21.4s,v21.4s + sm4e v22.4s,v7.4s; + ext v21.16b,v21.16b,v21.16b,#8 + rev64 v22.4s,v22.4s + sm4e v23.4s,v7.4s; + ext v22.16b,v22.16b,v22.16b,#8 + rev64 v23.4s,v23.4s + ext v23.16b,v23.16b,v23.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __ARMEB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __ARMEB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __ARMEB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __ARMEB__ + rev32 v23.16b,v23.16b +#endif + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v24.16b + eor v18.16b,v18.16b,v25.16b + mov v8.16b,v31.16b + eor v19.16b,v19.16b,v26.16b + eor v20.16b,v20.16b,v27.16b + eor v21.16b,v21.16b,v28.16b + eor v22.16b,v22.16b,v29.16b + eor v23.16b,v23.16b,v30.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 + subs x2,x2,128 + b.gt 1b + b 3f + // 4 blocks mode +2: +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + sm4e v16.4s,v0.4s; + sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4s,v17.4s + sm4e v18.4s,v7.4s; + 
ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4s,v18.4s + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v24.16b + mov v8.16b,v27.16b + eor v18.16b,v18.16b,v25.16b + eor v19.16b,v19.16b,v26.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#64 + b.gt 1b +1: + subs x2,x2,#16 + b.lt 3f + ld1 {v16.4s},[x0],#16 + mov v24.16b,v16.16b +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + sm4e v16.4s,v0.4s; + sm4e v16.4s,v1.4s; + sm4e v16.4s,v2.4s; + sm4e v16.4s,v3.4s; + sm4e v16.4s,v4.4s; + sm4e v16.4s,v5.4s; + sm4e v16.4s,v6.4s; + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + eor v16.16b,v16.16b,v8.16b + mov v8.16b,v24.16b + st1 {v16.16b},[x1],#16 + b.ne 1b +3: + // save back IV + st1 {v8.16b},[x4] + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_cbc_encrypt,.-sm4_v8_cbc_encrypt +.globl sm4_v8_ctr32_encrypt_blocks +.type sm4_v8_ctr32_encrypt_blocks,%function +.align 5 +sm4_v8_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! + + ld1 {v8.4s},[x4] + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] +#ifndef __ARMEB__ + rev32 v8.16b,v8.16b +#endif + mov w5,v8.s[3] +1: + cmp x2,#4 + b.lt 1f + ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64 + mov v16.16b,v8.16b + mov v17.16b,v8.16b + mov v18.16b,v8.16b + mov v19.16b,v8.16b + add w5,w5,#1 + mov v17.s[3],w5 + add w5,w5,#1 + mov v18.s[3],w5 + add w5,w5,#1 + mov v19.s[3],w5 + cmp x2,#8 + b.lt 2f + ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64 + mov v20.16b,v8.16b + mov v21.16b,v8.16b + mov v22.16b,v8.16b + mov v23.16b,v8.16b + add w5,w5,#1 + mov v20.s[3],w5 + add w5,w5,#1 + mov v21.s[3],w5 + add w5,w5,#1 + mov v22.s[3],w5 + add w5,w5,#1 + mov v23.s[3],w5 + sm4e v16.4s,v0.4s; + sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4s,v17.4s + sm4e v18.4s,v7.4s; + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4s,v18.4s + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 + sm4e v20.4s,v0.4s; + sm4e v21.4s,v0.4s; + sm4e v22.4s,v0.4s; + sm4e v23.4s,v0.4s; + + sm4e v20.4s,v1.4s; + sm4e v21.4s,v1.4s; + sm4e v22.4s,v1.4s; + sm4e v23.4s,v1.4s; + + sm4e v20.4s,v2.4s; + sm4e v21.4s,v2.4s; + sm4e v22.4s,v2.4s; + sm4e v23.4s,v2.4s; + + sm4e v20.4s,v3.4s; + sm4e v21.4s,v3.4s; + sm4e v22.4s,v3.4s; + sm4e v23.4s,v3.4s; + + sm4e v20.4s,v4.4s; + sm4e v21.4s,v4.4s; + sm4e v22.4s,v4.4s; + sm4e v23.4s,v4.4s; + + sm4e v20.4s,v5.4s; + sm4e v21.4s,v5.4s; + sm4e v22.4s,v5.4s; + sm4e v23.4s,v5.4s; + + sm4e v20.4s,v6.4s; + sm4e v21.4s,v6.4s; + sm4e v22.4s,v6.4s; + sm4e 
v23.4s,v6.4s; + + sm4e v20.4s,v7.4s; + rev64 v20.4s,v20.4s + sm4e v21.4s,v7.4s; + ext v20.16b,v20.16b,v20.16b,#8 + rev64 v21.4s,v21.4s + sm4e v22.4s,v7.4s; + ext v21.16b,v21.16b,v21.16b,#8 + rev64 v22.4s,v22.4s + sm4e v23.4s,v7.4s; + ext v22.16b,v22.16b,v22.16b,#8 + rev64 v23.4s,v23.4s + ext v23.16b,v23.16b,v23.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __ARMEB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __ARMEB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __ARMEB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __ARMEB__ + rev32 v23.16b,v23.16b +#endif + eor v16.16b,v16.16b,v24.16b + eor v17.16b,v17.16b,v25.16b + eor v18.16b,v18.16b,v26.16b + eor v19.16b,v19.16b,v27.16b + eor v20.16b,v20.16b,v28.16b + eor v21.16b,v21.16b,v29.16b + eor v22.16b,v22.16b,v30.16b + eor v23.16b,v23.16b,v31.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 + subs x2,x2,#8 + b.eq 3f + add w5,w5,#1 + mov v8.s[3],w5 + b 1b +2: + sm4e v16.4s,v0.4s; + sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4s,v17.4s + sm4e v18.4s,v7.4s; + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4s,v18.4s + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4s,v19.4s + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + eor v16.16b,v16.16b,v24.16b + eor v17.16b,v17.16b,v25.16b + eor v18.16b,v18.16b,v26.16b + eor v19.16b,v19.16b,v27.16b + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#4 + b.eq 3f + add w5,w5,#1 + mov v8.s[3],w5 + b 1b +1: + subs x2,x2,#1 + b.lt 3f + mov v16.16b,v8.16b + ld1 {v24.4s},[x0],#16 + sm4e v16.4s,v0.4s; + sm4e v16.4s,v1.4s; + sm4e v16.4s,v2.4s; + sm4e v16.4s,v3.4s; + sm4e v16.4s,v4.4s; + sm4e v16.4s,v5.4s; + sm4e v16.4s,v6.4s; + sm4e v16.4s,v7.4s; + rev64 v16.4s,v16.4s + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + eor v16.16b,v16.16b,v24.16b + st1 {v16.4s},[x1],#16 + b.eq 3f + add w5,w5,#1 + mov v8.s[3],w5 + b 1b +3: + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_ctr32_encrypt_blocks,.-sm4_v8_ctr32_encrypt_blocks diff --git a/wd_cipher.c b/wd_cipher.c index f35ce6f..63ec362 100644 --- a/wd_cipher.c +++ b/wd_cipher.c @@ -622,10 +622,10 @@ static int send_recv_sync(struct wd_ctx_internal *ctx, msg_handle.send = wd_cipher_setting.driver->send; msg_handle.recv = wd_cipher_setting.driver->recv;
-	pthread_spin_lock(&ctx->lock);
+	wd_ctx_spin_lock(ctx, wd_cipher_setting.driver->calc_type);
 	ret = wd_handle_msg_sync(wd_cipher_setting.driver, &msg_handle, ctx->ctx,
 				 msg, NULL, wd_cipher_setting.config.epoll_en);
-	pthread_spin_unlock(&ctx->lock);
+	wd_ctx_spin_unlock(ctx, wd_cipher_setting.driver->calc_type);
 	return ret;
 }
From: Yuzeng Zhuang <yisen.zhuang@huawei.com>
This patch implements the SM4 CFB and XTS modes using CE instructions.
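As background, here is a minimal plain-C sketch (an assumption-level illustration in the common little-endian XTS convention, not the CE assembly) of the tweak update that the tweak_calc macro performs with NEON: each block's tweak is the previous one multiplied by alpha in GF(2^128), reduced with the polynomial x^128 + x^7 + x^2 + x + 1.

#include <stdint.h>

/* Advance a little-endian 128-bit XTS tweak: t = t * alpha in GF(2^128). */
static void xts_tweak_next(uint8_t tweak[16])
{
	uint8_t carry_in = 0, carry_out;
	int i;

	for (i = 0; i < 16; i++) {
		carry_out = tweak[i] >> 7;		/* bit shifted out of this byte */
		tweak[i] = (uint8_t)((tweak[i] << 1) | carry_in);
		carry_in = carry_out;
	}
	if (carry_in)
		tweak[0] ^= 0x87;	/* reduction constant, same 0x87 loaded into the mask */
}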
Signed-off-by: Yuzeng Zhuang <yisen.zhuang@huawei.com>
---
 drv/isa_ce_sm4.c       |  130 ++++-
 drv/isa_ce_sm4.h       |   14 +
 drv/isa_ce_sm4_armv8.S | 1126 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1268 insertions(+), 2 deletions(-)
diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c index d48db1d..fe8fbdc 100644 --- a/drv/isa_ce_sm4.c +++ b/drv/isa_ce_sm4.c @@ -22,6 +22,8 @@ #define BYTE_BITS 8 #define SM4_BLOCK_SIZE 16 #define MAX_BLOCK_NUM (1U << 28) +#define SM4_BYTES2BLKS(nbytes) ((nbytes) >> 4) +#define SM4_KEY_SIZE 16
#define GETU32(p) \ ((__u32)(p)[0] << 24 | (__u32)(p)[1] << 16 | (__u32)(p)[2] << 8 | (__u32)(p)[3]) @@ -137,17 +139,127 @@ void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) sm4_v8_set_decrypt_key(userKey, key); }
+static void sm4_cfb_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) +{ + unsigned int nbytes = msg->in_bytes; + const unsigned char *src = msg->in; + unsigned char *dst = msg->out; + unsigned int blocks; + unsigned int bbytes; + + blocks = SM4_BYTES2BLKS(nbytes); + if (blocks) { + sm4_v8_cfb_encrypt_blocks(src, dst, blocks, rkey_enc, msg->iv); + bbytes = blocks * SM4_BLOCK_SIZE; + dst += bbytes; + src += bbytes; + nbytes -= bbytes; + } + + if (nbytes > 0) { + unsigned char keydata[SM4_BLOCK_SIZE]; + unsigned int i = 0; + + sm4_v8_crypt_block(msg->iv, keydata, rkey_enc); + while (nbytes > 0) { + *dst++ = *src++ ^ keydata[i++]; + nbytes--; + } + + /* store new IV */ + if (msg->out_bytes >= msg->iv_bytes) + memcpy(msg->iv, msg->out + msg->out_bytes - + msg->iv_bytes, msg->iv_bytes); + else + memcpy(msg->iv, msg->out, msg->out_bytes); + } +} + +static void sm4_cfb_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_dec) +{ + unsigned int nbytes = msg->in_bytes; + const unsigned char *src = msg->in; + unsigned char *dst = msg->out; + unsigned int blocks; + unsigned int bbytes; + + blocks = SM4_BYTES2BLKS(nbytes); + if (blocks) { + sm4_v8_cfb_decrypt_blocks(src, dst, blocks, rkey_dec, msg->iv); + bbytes = blocks * SM4_BLOCK_SIZE; + dst += bbytes; + src += bbytes; + nbytes -= bbytes; + } + + if (nbytes > 0) { + unsigned char keydata[SM4_BLOCK_SIZE]; + unsigned int i = 0; + + sm4_v8_crypt_block(msg->iv, keydata, rkey_dec); + while (nbytes > 0) { + *dst++ = *src++ ^ keydata[i++]; + nbytes--; + } + + /* store new IV */ + if (msg->in_bytes >= msg->iv_bytes) + memcpy(msg->iv, msg->in + msg->in_bytes - + msg->iv_bytes, msg->iv_bytes); + else + memcpy(msg->iv, msg->in, msg->in_bytes); + } +} + +static int sm4_xts_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey) +{ + struct SM4_KEY rkey2; + + if (msg->in_bytes < SM4_BLOCK_SIZE) { + WD_ERR("invalid: cipher input length is wrong!\n"); + return -WD_EINVAL; + } + + /* set key for tweak */ + sm4_set_encrypt_key(msg->key + SM4_KEY_SIZE, &rkey2); + + sm4_v8_xts_encrypt(msg->in, msg->out, msg->in_bytes, + rkey, msg->iv, &rkey2); + + return 0; +} + +static int sm4_xts_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey) +{ + struct SM4_KEY rkey2; + + if (msg->in_bytes < SM4_BLOCK_SIZE) { + WD_ERR("invalid: cipher input length is wrong!\n"); + return -WD_EINVAL; + } + + /* set key for tweak */ + sm4_set_encrypt_key(msg->key + SM4_KEY_SIZE, &rkey2); + + sm4_v8_xts_decrypt(msg->in, msg->out, msg->in_bytes, + rkey, msg->iv, &rkey2); + + return 0; +} + static int isa_ce_cipher_send(handle_t ctx, void *wd_msg) { struct wd_cipher_msg *msg = wd_msg; struct SM4_KEY rkey; + int ret = 0;
if (!msg) { WD_ERR("invalid: input sm4 msg is NULL!\n"); return -WD_EINVAL; }
- if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR) + if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR + || msg->mode == WD_CIPHER_CFB) sm4_set_encrypt_key(msg->key, &rkey); else sm4_set_decrypt_key(msg->key, &rkey); @@ -162,12 +274,24 @@ static int isa_ce_cipher_send(handle_t ctx, void *wd_msg) case WD_CIPHER_CTR: sm4_ctr_encrypt(msg, &rkey); break; + case WD_CIPHER_CFB: + if (msg->op_type == WD_CIPHER_ENCRYPTION) + sm4_cfb_encrypt(msg, &rkey); + else + sm4_cfb_decrypt(msg, &rkey); + break; + case WD_CIPHER_XTS: + if (msg->op_type == WD_CIPHER_ENCRYPTION) + ret = sm4_xts_encrypt(msg, &rkey); + else + ret = sm4_xts_decrypt(msg, &rkey); + break; default: WD_ERR("The current block cipher mode is not supported!\n"); return -WD_EINVAL; }
- return 0; + return ret; }
static int isa_ce_cipher_recv(handle_t ctx, void *wd_msg) @@ -203,6 +327,8 @@ static int cipher_recv(handle_t ctx, void *msg) static struct wd_alg_driver cipher_alg_driver[] = { GEN_CE_ALG_DRIVER("cbc(sm4)", cipher), GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), + GEN_CE_ALG_DRIVER("cfb(sm4)", cipher), + GEN_CE_ALG_DRIVER("xts(sm4)", cipher), };
static void __attribute__((constructor)) isa_ce_probe(void) diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h index d91c864..eba1c9e 100644 --- a/drv/isa_ce_sm4.h +++ b/drv/isa_ce_sm4.h @@ -26,6 +26,20 @@ void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, size_t len, const void *key, const unsigned char ivec[16]);
+void sm4_v8_cfb_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t length, const struct SM4_KEY *key, unsigned char *ivec); +void sm4_v8_cfb_decrypt_blocks(const unsigned char *in, unsigned char *out, + size_t length, const struct SM4_KEY *key, unsigned char *ivec); +void sm4_v8_crypt_block(const unsigned char *in, unsigned char *out, + const struct SM4_KEY *key); + +int sm4_v8_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, + const struct SM4_KEY *key, unsigned char *ivec, + const struct SM4_KEY *key2); +int sm4_v8_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, + const struct SM4_KEY *key, unsigned char *ivec, + const struct SM4_KEY *key2); + #ifdef __cplusplus } #endif diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S index d7d172a..342dfa5 100644 --- a/drv/isa_ce_sm4_armv8.S +++ b/drv/isa_ce_sm4_armv8.S @@ -37,6 +37,14 @@ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 .Lfk: .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +.align 4 +.cts_permute_table: +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 +.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .globl sm4_v8_set_encrypt_key .type sm4_v8_set_encrypt_key,%function .align 5 @@ -772,3 +780,1121 @@ sm4_v8_ctr32_encrypt_blocks: ldp d8,d9,[sp],#16 ret .size sm4_v8_ctr32_encrypt_blocks,.-sm4_v8_ctr32_encrypt_blocks + +.globl sm4_v8_crypt_block +.type sm4_v8_crypt_block,%function +.align 5 +sm4_v8_crypt_block: + /* parameters: + * x0: src + * x1: dst + * x2: key + */ + AARCH64_VALID_CALL_TARGET + + ld1 {v0.16b-v3.16b}, [x2], #64 + ld1 {v4.16b-v7.16b}, [x2] + + ld1 {v16.4s},[x0] + + rev32 v16.16b, v16.16b + sm4e v16.4s, v0.4s + sm4e v16.4s, v1.4s + sm4e v16.4s, v2.4s + sm4e v16.4s, v3.4s + sm4e v16.4s, v4.4s + sm4e v16.4s, v5.4s + sm4e v16.4s, v6.4s + sm4e v16.4s, v7.4s + rev64 v16.4s, v16.4s + ext v16.16b, v16.16b, v16.16b, #8 + rev32 v16.16b, v16.16b + + st1 {v16.16b}, [x1]; + + ret +.size sm4_v8_crypt_block,.-sm4_v8_crypt_block + +.globl sm4_v8_cfb_encrypt_blocks +.type sm4_v8_cfb_encrypt_blocks,%function +.align 5 +sm4_v8_cfb_encrypt_blocks: + /* parameters: + * x0: src + * x1: dst + * w2: nblocks + * x3: key + * x4: iv + */ + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! 
+ + ld1 {v0.4s-v3.4s}, [x3], #64 + ld1 {v4.4s-v7.4s}, [x3] + + ld1 {v8.4s},[x4] + +.loop_cfb_enc_4block: + cmp w2, #4 + blt .loob_cfb_enc_1block + + sub w2, w2, #4 + + ld1 {v16.4s-v19.4s}, [x0], #64 + + rev32 v8.16b, v8.16b + sm4e v8.4s, v0.4s + sm4e v8.4s, v1.4s + sm4e v8.4s, v2.4s + sm4e v8.4s, v3.4s + sm4e v8.4s, v4.4s + sm4e v8.4s, v5.4s + sm4e v8.4s, v6.4s + sm4e v8.4s, v7.4s + rev64 v8.4s, v8.4s + ext v8.16b, v8.16b, v8.16b, #8 + rev32 v8.16b, v8.16b + eor v16.16b, v16.16b, v8.16b + + rev32 v8.16b, v16.16b + sm4e v8.4s, v0.4s + sm4e v8.4s, v1.4s + sm4e v8.4s, v2.4s + sm4e v8.4s, v3.4s + sm4e v8.4s, v4.4s + sm4e v8.4s, v5.4s + sm4e v8.4s, v6.4s + sm4e v8.4s, v7.4s + rev64 v8.4s, v8.4s + ext v8.16b, v8.16b, v8.16b, #8 + rev32 v8.16b, v8.16b + eor v17.16b, v17.16b, v8.16b + + rev32 v8.16b, v17.16b + sm4e v8.4s, v0.4s + sm4e v8.4s, v1.4s + sm4e v8.4s, v2.4s + sm4e v8.4s, v3.4s + sm4e v8.4s, v4.4s + sm4e v8.4s, v5.4s + sm4e v8.4s, v6.4s + sm4e v8.4s, v7.4s + rev64 v8.4s, v8.4s + ext v8.16b, v8.16b, v8.16b, #8 + rev32 v8.16b, v8.16b + eor v18.16b, v18.16b, v8.16b + + rev32 v8.16b, v18.16b + sm4e v8.4s, v0.4s + sm4e v8.4s, v1.4s + sm4e v8.4s, v2.4s + sm4e v8.4s, v3.4s + sm4e v8.4s, v4.4s + sm4e v8.4s, v5.4s + sm4e v8.4s, v6.4s + sm4e v8.4s, v7.4s + rev64 v8.4s, v8.4s + ext v8.16b, v8.16b, v8.16b, #8 + rev32 v8.16b, v8.16b + eor v19.16b, v19.16b, v8.16b + + st1 {v16.4s-v19.4s}, [x1], #64 + mov v8.16b, v19.16b + + cbz w2, .end_cfb_enc + b .loop_cfb_enc_4block + +.loob_cfb_enc_1block: + sub w2, w2, #1 + + ld1 {v16.4s}, [x0], #16 + + rev32 v8.16b, v8.16b + sm4e v8.4s, v0.4s + sm4e v8.4s, v1.4s + sm4e v8.4s, v2.4s + sm4e v8.4s, v3.4s + sm4e v8.4s, v4.4s + sm4e v8.4s, v5.4s + sm4e v8.4s, v6.4s + sm4e v8.4s, v7.4s + rev64 v8.4s, v8.4s + ext v8.16b, v8.16b, v8.16b, #8 + rev32 v8.16b, v8.16b + eor v8.16b, v8.16b, v16.16b + + st1 {v8.4s}, [x1], #16 + + cbnz w2, .loob_cfb_enc_1block + +.end_cfb_enc: + st1 {v8.4s}, [x4] + + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_cfb_encrypt_blocks,.-sm4_v8_cfb_encrypt_blocks + +.globl sm4_v8_cfb_decrypt_blocks +.type sm4_v8_cfb_decrypt_blocks,%function +.align 5 +sm4_v8_cfb_decrypt_blocks: + /* parameters: + * x0: src + * x1: dst + * w2: nblocks + * x3: key + * x4: iv + */ + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! 
+ + ld1 {v0.4s-v3.4s}, [x3], #64 + ld1 {v4.4s-v7.4s}, [x3] + + ld1 {v8.4s},[x4] + +.loop_cfb_dec_8block: + cmp w2, #8 + blt .cfb_dec_4block + + sub w2, w2, #8 + + ld1 {v12.4s-v15.4s}, [x0], #64 + ld1 {v16.4s-v19.4s}, [x0], #64 + + rev32 v20.16b, v8.16b + rev32 v21.16b, v12.16b + rev32 v22.16b, v13.16b + rev32 v23.16b, v14.16b + rev32 v24.16b, v15.16b + rev32 v25.16b, v16.16b + rev32 v26.16b, v17.16b + rev32 v27.16b, v18.16b + sm4e v20.4s, v0.4s + sm4e v21.4s, v0.4s + sm4e v22.4s, v0.4s + sm4e v23.4s, v0.4s + sm4e v24.4s, v0.4s + sm4e v25.4s, v0.4s + sm4e v26.4s, v0.4s + sm4e v27.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v21.4s, v1.4s + sm4e v22.4s, v1.4s + sm4e v23.4s, v1.4s + sm4e v24.4s, v1.4s + sm4e v25.4s, v1.4s + sm4e v26.4s, v1.4s + sm4e v27.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v21.4s, v2.4s + sm4e v22.4s, v2.4s + sm4e v23.4s, v2.4s + sm4e v24.4s, v2.4s + sm4e v25.4s, v2.4s + sm4e v26.4s, v2.4s + sm4e v27.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v21.4s, v3.4s + sm4e v22.4s, v3.4s + sm4e v23.4s, v3.4s + sm4e v24.4s, v3.4s + sm4e v25.4s, v3.4s + sm4e v26.4s, v3.4s + sm4e v27.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v21.4s, v4.4s + sm4e v22.4s, v4.4s + sm4e v23.4s, v4.4s + sm4e v24.4s, v4.4s + sm4e v25.4s, v4.4s + sm4e v26.4s, v4.4s + sm4e v27.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v21.4s, v5.4s + sm4e v22.4s, v5.4s + sm4e v23.4s, v5.4s + sm4e v24.4s, v5.4s + sm4e v25.4s, v5.4s + sm4e v26.4s, v5.4s + sm4e v27.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v21.4s, v6.4s + sm4e v22.4s, v6.4s + sm4e v23.4s, v6.4s + sm4e v24.4s, v6.4s + sm4e v25.4s, v6.4s + sm4e v26.4s, v6.4s + sm4e v27.4s, v6.4s + sm4e v20.4s, v7.4s + sm4e v21.4s, v7.4s + sm4e v22.4s, v7.4s + sm4e v23.4s, v7.4s + sm4e v24.4s, v7.4s + sm4e v25.4s, v7.4s + sm4e v26.4s, v7.4s + sm4e v27.4s, v7.4s + rev64 v20.4s, v20.4s + rev64 v21.4s, v21.4s + rev64 v22.4s, v22.4s + rev64 v23.4s, v23.4s + rev64 v24.4s, v24.4s + rev64 v25.4s, v25.4s + rev64 v26.4s, v26.4s + rev64 v27.4s, v27.4s + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v21.16b, v21.16b, #8 + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + ext v24.16b, v24.16b, v24.16b, #8 + ext v25.16b, v25.16b, v25.16b, #8 + ext v26.16b, v26.16b, v26.16b, #8 + ext v27.16b, v27.16b, v27.16b, #8 + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + rev32 v26.16b, v26.16b + rev32 v27.16b, v27.16b + + mov v8.16b, v19.16b //Modify IV + + eor v20.16b, v20.16b, v12.16b + eor v21.16b, v21.16b, v13.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v15.16b + eor v24.16b, v24.16b, v16.16b + eor v25.16b, v25.16b, v17.16b + eor v26.16b, v26.16b, v18.16b + eor v27.16b, v27.16b, v19.16b + + st1 {v20.4s-v23.4s}, [x1], #64 + st1 {v24.4s-v27.4s}, [x1], #64 + + cbz w2, .end_cfb_dec + b .loop_cfb_dec_8block + +.cfb_dec_4block: + cmp w2, #4 + blt .loop_cfb_dec_1block + + sub w2, w2, #4 + + ld1 {v12.4s-v15.4s}, [x0], #64 + + rev32 v20.16b, v8.16b + rev32 v21.16b, v12.16b + rev32 v22.16b, v13.16b + rev32 v23.16b, v14.16b + sm4e v20.4s, v0.4s + sm4e v21.4s, v0.4s + sm4e v22.4s, v0.4s + sm4e v23.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v21.4s, v1.4s + sm4e v22.4s, v1.4s + sm4e v23.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v21.4s, v2.4s + sm4e v22.4s, v2.4s + sm4e v23.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v21.4s, v3.4s + sm4e v22.4s, v3.4s + sm4e v23.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v21.4s, v4.4s + sm4e v22.4s, v4.4s + sm4e v23.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v21.4s, v5.4s + sm4e v22.4s, v5.4s 
+ sm4e v23.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v21.4s, v6.4s + sm4e v22.4s, v6.4s + sm4e v23.4s, v6.4s + sm4e v20.4s, v7.4s + sm4e v21.4s, v7.4s + sm4e v22.4s, v7.4s + sm4e v23.4s, v7.4s + rev64 v20.4s, v20.4s + rev64 v21.4s, v21.4s + rev64 v22.4s, v22.4s + rev64 v23.4s, v23.4s + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v21.16b, v21.16b, #8 + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + + mov v8.16b, v15.16b //Modify IV + + eor v20.16b, v20.16b, v12.16b + eor v21.16b, v21.16b, v13.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.4s-v23.4s}, [x1], #64 + + cbz w2, .end_cfb_dec + +.loop_cfb_dec_1block: + sub w2, w2, #1 + + ld1 {v12.4s}, [x0], #16 + + rev32 v20.16b, v8.16b + sm4e v20.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v20.4s, v7.4s + rev64 v20.4s, v20.4s + ext v20.16b, v20.16b, v20.16b, #8 + rev32 v20.16b, v20.16b + + eor v20.16b, v20.16b, v12.16b + st1 {v20.4s}, [x1], #16 + + mov v8.16b, v12.16b //Modify IV + + cbnz w2, .loop_cfb_dec_1block + +.end_cfb_dec: + /* store new IV */ + st1 {v8.4s}, [x4] + + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_cfb_decrypt_blocks,.-sm4_v8_cfb_decrypt_blocks + +#define tweak_calc(out, in, MSK, TMP) \ + sshr TMP.2d, in.2d, #63; \ + and TMP.16b, TMP.16b, MSK.16b; \ + add out.2d, in.2d, in.2d; \ + ext TMP.16b, TMP.16b, TMP.16b, #8; \ + eor out.16b, out.16b, TMP.16b; + +.globl sm4_v8_xts_encrypt +.type sm4_v8_xts_encrypt,%function +.align 5 +sm4_v8_xts_encrypt: + /* parameters: + * x0: src + * x1: dst + * w2: nbytes + * x3: key + * x4: tweak + * x5: key array for tweak + */ + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! 
+ + ld1 {v8.16b}, [x4] + + cbz x5, .enc_xts_nokey2 + + /* load round key array for tweak */ + ld1 {v0.16b-v3.16b}, [x5], #64 + ld1 {v4.16b-v7.16b}, [x5] + + /* first tweak */ + rev32 v8.16b, v8.16b + sm4e v8.4s, v0.4s + sm4e v8.4s, v1.4s + sm4e v8.4s, v2.4s + sm4e v8.4s, v3.4s + sm4e v8.4s, v4.4s + sm4e v8.4s, v5.4s + sm4e v8.4s, v6.4s + sm4e v8.4s, v7.4s + rev64 v8.4s, v8.4s + ext v8.16b, v8.16b, v8.16b, #8 + rev32 v8.16b, v8.16b + +.enc_xts_nokey2: + /* load key array */ + ld1 {v0.16b-v3.16b}, [x3], #64 + ld1 {v4.16b-v7.16b}, [x3] + + and w5, w2, #15 + lsr w2, w2, #4 + cbz w5, .enc_xts_mask + /* leave the last block for tail */ + sub w2, w2, #1 + +.enc_xts_mask: + /* init mask */ + movi v31.2s, #0x1 + movi v16.2s, #0x87 + uzp1 v31.4s, v31.4s, v16.4s + + cbz w2, .enc_xts_tail + +.enc_xts_8block: + sub w2, w2, #8 + tbnz w2, #31, .enc_xts_4block + + tweak_calc(v9, v8, v31, v16) + tweak_calc(v10, v9, v31, v17) + tweak_calc(v11, v10, v31, v18) + tweak_calc(v12, v11, v31, v19) + tweak_calc(v13, v12, v31, v16) + tweak_calc(v14, v13, v31, v17) + tweak_calc(v15, v14, v31, v18) + + ld1 {v20.16b-v23.16b}, [x0], #64 + ld1 {v24.16b-v27.16b}, [x0], #64 + eor v20.16b, v20.16b, v8.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v11.16b + eor v24.16b, v24.16b, v12.16b + eor v25.16b, v25.16b, v13.16b + eor v26.16b, v26.16b, v14.16b + eor v27.16b, v27.16b, v15.16b + + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + rev32 v26.16b, v26.16b + rev32 v27.16b, v27.16b + sm4e v20.4s, v0.4s + sm4e v21.4s, v0.4s + sm4e v22.4s, v0.4s + sm4e v23.4s, v0.4s + sm4e v24.4s, v0.4s + sm4e v25.4s, v0.4s + sm4e v26.4s, v0.4s + sm4e v27.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v21.4s, v1.4s + sm4e v22.4s, v1.4s + sm4e v23.4s, v1.4s + sm4e v24.4s, v1.4s + sm4e v25.4s, v1.4s + sm4e v26.4s, v1.4s + sm4e v27.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v21.4s, v2.4s + sm4e v22.4s, v2.4s + sm4e v23.4s, v2.4s + sm4e v24.4s, v2.4s + sm4e v25.4s, v2.4s + sm4e v26.4s, v2.4s + sm4e v27.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v21.4s, v3.4s + sm4e v22.4s, v3.4s + sm4e v23.4s, v3.4s + sm4e v24.4s, v3.4s + sm4e v25.4s, v3.4s + sm4e v26.4s, v3.4s + sm4e v27.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v21.4s, v4.4s + sm4e v22.4s, v4.4s + sm4e v23.4s, v4.4s + sm4e v24.4s, v4.4s + sm4e v25.4s, v4.4s + sm4e v26.4s, v4.4s + sm4e v27.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v21.4s, v5.4s + sm4e v22.4s, v5.4s + sm4e v23.4s, v5.4s + sm4e v24.4s, v5.4s + sm4e v25.4s, v5.4s + sm4e v26.4s, v5.4s + sm4e v27.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v21.4s, v6.4s + sm4e v22.4s, v6.4s + sm4e v23.4s, v6.4s + sm4e v24.4s, v6.4s + sm4e v25.4s, v6.4s + sm4e v26.4s, v6.4s + sm4e v27.4s, v6.4s + sm4e v20.4s, v7.4s + sm4e v21.4s, v7.4s + sm4e v22.4s, v7.4s + sm4e v23.4s, v7.4s + sm4e v24.4s, v7.4s + sm4e v25.4s, v7.4s + sm4e v26.4s, v7.4s + sm4e v27.4s, v7.4s + rev64 v20.4s, v20.4s + rev64 v21.4s, v21.4s + rev64 v22.4s, v22.4s + rev64 v23.4s, v23.4s + rev64 v24.4s, v24.4s + rev64 v25.4s, v25.4s + rev64 v26.4s, v26.4s + rev64 v27.4s, v27.4s + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v21.16b, v21.16b, #8 + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + ext v24.16b, v24.16b, v24.16b, #8 + ext v25.16b, v25.16b, v25.16b, #8 + ext v26.16b, v26.16b, v26.16b, #8 + ext v27.16b, v27.16b, v27.16b, #8 + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + rev32 v24.16b, 
v24.16b + rev32 v25.16b, v25.16b + rev32 v26.16b, v26.16b + rev32 v27.16b, v27.16b + + eor v20.16b, v20.16b, v8.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v11.16b + eor v24.16b, v24.16b, v12.16b + eor v25.16b, v25.16b, v13.16b + eor v26.16b, v26.16b, v14.16b + eor v27.16b, v27.16b, v15.16b + st1 {v20.16b-v23.16b}, [x1], #64 + st1 {v24.16b-v27.16b}, [x1], #64 + + tweak_calc(v8, v15, v31, v19) + + cbz w2, .enc_xts_tail + b .enc_xts_8block + +.enc_xts_4block: + add w2, w2, #8 + cmp w2, #4 + blt .enc_xts_1block + + sub w2, w2, #4 + + tweak_calc(v9, v8, v31, v16) + tweak_calc(v10, v9, v31, v17) + tweak_calc(v11, v10, v31, v18) + + ld1 {v20.16b-v23.16b}, [x0], #64 + eor v20.16b, v20.16b, v8.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v11.16b + + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + sm4e v20.4s, v0.4s + sm4e v21.4s, v0.4s + sm4e v22.4s, v0.4s + sm4e v23.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v21.4s, v1.4s + sm4e v22.4s, v1.4s + sm4e v23.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v21.4s, v2.4s + sm4e v22.4s, v2.4s + sm4e v23.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v21.4s, v3.4s + sm4e v22.4s, v3.4s + sm4e v23.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v21.4s, v4.4s + sm4e v22.4s, v4.4s + sm4e v23.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v21.4s, v5.4s + sm4e v22.4s, v5.4s + sm4e v23.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v21.4s, v6.4s + sm4e v22.4s, v6.4s + sm4e v23.4s, v6.4s + sm4e v20.4s, v7.4s + sm4e v21.4s, v7.4s + sm4e v22.4s, v7.4s + sm4e v23.4s, v7.4s + rev64 v20.4s, v20.4s + rev64 v21.4s, v21.4s + rev64 v22.4s, v22.4s + rev64 v23.4s, v23.4s + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v21.16b, v21.16b, #8 + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + + eor v20.16b, v20.16b, v8.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v11.16b + st1 {v20.16b-v23.16b}, [x1], #64 + + tweak_calc(v8, v11, v31, v19) + + cbz w2, .enc_xts_tail + +.enc_xts_1block: + sub w2, w2, #1 + + ld1 {v20.16b}, [x0], #16 + eor v20.16b, v20.16b, v8.16b + + rev32 v20.16b, v20.16b + sm4e v20.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v20.4s, v7.4s + rev64 v20.4s, v20.4s + ext v20.16b, v20.16b, v20.16b, #8 + rev32 v20.16b, v20.16b + + eor v20.16b, v20.16b, v8.16b + st1 {v20.16b}, [x1], #16 + + tweak_calc(v8, v8, v31, v16) + + cbnz w2, .enc_xts_1block + +.enc_xts_tail: + uxtw x5, w5 + cbz x5, .enc_xts_end + + tweak_calc(v9, v8, v31, v16) + ld1 {v20.16b}, [x0] + eor v20.16b, v20.16b, v8.16b + rev32 v20.16b, v20.16b + sm4e v20.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v20.4s, v7.4s + rev64 v20.4s, v20.4s + ext v20.16b, v20.16b, v20.16b, #8 + rev32 v20.16b, v20.16b + eor v20.16b, v20.16b, v8.16b + + adr x6, .cts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v23.16b}, [x6] + ld1 {v24.16b}, [x7] + + add x0, x0, x5 + ld1 {v21.16b}, [x0] + + tbl v22.16b, {v20.16b}, v23.16b + tbx v20.16b, {v21.16b}, v24.16b + + eor v20.16b, v20.16b, v9.16b + rev32 v20.16b, v20.16b + sm4e v20.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e 
v20.4s, v6.4s + sm4e v20.4s, v7.4s + rev64 v20.4s, v20.4s + ext v20.16b, v20.16b, v20.16b, #8 + rev32 v20.16b, v20.16b + eor v20.16b, v20.16b, v9.16b + + add x5, x1, x5 + st1 {v22.16b}, [x5] + st1 {v20.16b}, [x1] + + b .enc_xts_ret + +.enc_xts_end: + /* new tweak */ + st1 {v8.16b}, [x4] + +.enc_xts_ret: + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_xts_encrypt,.-sm4_v8_xts_encrypt + +.globl sm4_v8_xts_decrypt +.type sm4_v8_xts_decrypt,%function +.align 5 +sm4_v8_xts_decrypt: + /* parameters: + * x0: src + * x1: dst + * w2: nbytes + * x3: key + * x4: tweak + * x5: key array for tweak + */ + AARCH64_VALID_CALL_TARGET + stp d8,d9,[sp, #-16]! + + ld1 {v8.16b}, [x4] + + cbz x5, .dec_xts_nokey2 + + /* load round key array for tweak */ + ld1 {v0.16b-v3.16b}, [x5], #64 + ld1 {v4.16b-v7.16b}, [x5] + + /* first tweak */ + rev32 v8.16b, v8.16b + sm4e v8.4s, v0.4s + sm4e v8.4s, v1.4s + sm4e v8.4s, v2.4s + sm4e v8.4s, v3.4s + sm4e v8.4s, v4.4s + sm4e v8.4s, v5.4s + sm4e v8.4s, v6.4s + sm4e v8.4s, v7.4s + rev64 v8.4s, v8.4s + ext v8.16b, v8.16b, v8.16b, #8 + rev32 v8.16b, v8.16b + +.dec_xts_nokey2: + ld1 {v0.16b-v3.16b}, [x3], #64 + ld1 {v4.16b-v7.16b}, [x3] + + and w5, w2, #15 + lsr w2, w2, #4 + cbz w5, .dec_xts_mask + /* leave the last block for tail */ + sub w2, w2, #1 + +.dec_xts_mask: + /* init mask */ + movi v31.2s, #0x1 + movi v16.2s, #0x87 + uzp1 v31.4s, v31.4s, v16.4s + + cbz w2, .dec_xts_tail + +.dec_xts_8block: + sub w2, w2, #8 + tbnz w2, #31, .dec_xts_4block + + tweak_calc(v9, v8, v31, v16) + tweak_calc(v10, v9, v31, v17) + tweak_calc(v11, v10, v31, v18) + tweak_calc(v12, v11, v31, v19) + tweak_calc(v13, v12, v31, v16) + tweak_calc(v14, v13, v31, v17) + tweak_calc(v15, v14, v31, v18) + + ld1 {v20.16b-v23.16b}, [x0], #64 + ld1 {v24.16b-v27.16b}, [x0], #64 + eor v20.16b, v20.16b, v8.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v11.16b + eor v24.16b, v24.16b, v12.16b + eor v25.16b, v25.16b, v13.16b + eor v26.16b, v26.16b, v14.16b + eor v27.16b, v27.16b, v15.16b + + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + rev32 v26.16b, v26.16b + rev32 v27.16b, v27.16b + sm4e v20.4s, v0.4s + sm4e v21.4s, v0.4s + sm4e v22.4s, v0.4s + sm4e v23.4s, v0.4s + sm4e v24.4s, v0.4s + sm4e v25.4s, v0.4s + sm4e v26.4s, v0.4s + sm4e v27.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v21.4s, v1.4s + sm4e v22.4s, v1.4s + sm4e v23.4s, v1.4s + sm4e v24.4s, v1.4s + sm4e v25.4s, v1.4s + sm4e v26.4s, v1.4s + sm4e v27.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v21.4s, v2.4s + sm4e v22.4s, v2.4s + sm4e v23.4s, v2.4s + sm4e v24.4s, v2.4s + sm4e v25.4s, v2.4s + sm4e v26.4s, v2.4s + sm4e v27.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v21.4s, v3.4s + sm4e v22.4s, v3.4s + sm4e v23.4s, v3.4s + sm4e v24.4s, v3.4s + sm4e v25.4s, v3.4s + sm4e v26.4s, v3.4s + sm4e v27.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v21.4s, v4.4s + sm4e v22.4s, v4.4s + sm4e v23.4s, v4.4s + sm4e v24.4s, v4.4s + sm4e v25.4s, v4.4s + sm4e v26.4s, v4.4s + sm4e v27.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v21.4s, v5.4s + sm4e v22.4s, v5.4s + sm4e v23.4s, v5.4s + sm4e v24.4s, v5.4s + sm4e v25.4s, v5.4s + sm4e v26.4s, v5.4s + sm4e v27.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v21.4s, v6.4s + sm4e v22.4s, v6.4s + sm4e v23.4s, v6.4s + sm4e v24.4s, v6.4s + sm4e v25.4s, v6.4s + sm4e v26.4s, v6.4s + sm4e v27.4s, v6.4s + sm4e v20.4s, v7.4s + sm4e v21.4s, v7.4s + sm4e v22.4s, v7.4s + sm4e v23.4s, v7.4s + sm4e v24.4s, v7.4s + sm4e v25.4s, v7.4s + sm4e v26.4s, 
v7.4s + sm4e v27.4s, v7.4s + rev64 v20.4s, v20.4s + rev64 v21.4s, v21.4s + rev64 v22.4s, v22.4s + rev64 v23.4s, v23.4s + rev64 v24.4s, v24.4s + rev64 v25.4s, v25.4s + rev64 v26.4s, v26.4s + rev64 v27.4s, v27.4s + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v21.16b, v21.16b, #8 + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + ext v24.16b, v24.16b, v24.16b, #8 + ext v25.16b, v25.16b, v25.16b, #8 + ext v26.16b, v26.16b, v26.16b, #8 + ext v27.16b, v27.16b, v27.16b, #8 + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + rev32 v26.16b, v26.16b + rev32 v27.16b, v27.16b + + eor v20.16b, v20.16b, v8.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v11.16b + eor v24.16b, v24.16b, v12.16b + eor v25.16b, v25.16b, v13.16b + eor v26.16b, v26.16b, v14.16b + eor v27.16b, v27.16b, v15.16b + st1 {v20.16b-v23.16b}, [x1], #64 + st1 {v24.16b-v27.16b}, [x1], #64 + + tweak_calc(v8, v15, v31, v19) + + cbz w2, .dec_xts_tail + b .dec_xts_8block + +.dec_xts_4block: + add w2, w2, #8 + cmp w2, #4 + blt .dec_xts_1block + + sub w2, w2, #4 + + tweak_calc(v9, v8, v31, v16) + tweak_calc(v10, v9, v31, v17) + tweak_calc(v11, v10, v31, v18) + + ld1 {v20.16b-v23.16b}, [x0], #64 + eor v20.16b, v20.16b, v8.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v11.16b + + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + sm4e v20.4s, v0.4s + sm4e v21.4s, v0.4s + sm4e v22.4s, v0.4s + sm4e v23.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v21.4s, v1.4s + sm4e v22.4s, v1.4s + sm4e v23.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v21.4s, v2.4s + sm4e v22.4s, v2.4s + sm4e v23.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v21.4s, v3.4s + sm4e v22.4s, v3.4s + sm4e v23.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v21.4s, v4.4s + sm4e v22.4s, v4.4s + sm4e v23.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v21.4s, v5.4s + sm4e v22.4s, v5.4s + sm4e v23.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v21.4s, v6.4s + sm4e v22.4s, v6.4s + sm4e v23.4s, v6.4s + sm4e v20.4s, v7.4s + sm4e v21.4s, v7.4s + sm4e v22.4s, v7.4s + sm4e v23.4s, v7.4s + rev64 v20.4s, v20.4s + rev64 v21.4s, v21.4s + rev64 v22.4s, v22.4s + rev64 v23.4s, v23.4s + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v21.16b, v21.16b, #8 + ext v22.16b, v22.16b, v22.16b, #8 + ext v23.16b, v23.16b, v23.16b, #8 + rev32 v20.16b, v20.16b + rev32 v21.16b, v21.16b + rev32 v22.16b, v22.16b + rev32 v23.16b, v23.16b + + eor v20.16b, v20.16b, v8.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v11.16b + st1 {v20.16b-v23.16b}, [x1], #64 + + tweak_calc(v8, v11, v31, v19) + + cbz w2, .dec_xts_tail + +.dec_xts_1block: + sub w2, w2, #1 + + ld1 {v20.16b}, [x0], #16 + eor v20.16b, v20.16b, v8.16b + + rev32 v20.16b, v20.16b + sm4e v20.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v20.4s, v7.4s + rev64 v20.4s, v20.4s + ext v20.16b, v20.16b, v20.16b, #8 + rev32 v20.16b, v20.16b + + eor v20.16b, v20.16b, v8.16b + st1 {v20.16b}, [x1], #16 + + tweak_calc(v8, v8, v31, v16) + + cbnz w2, .dec_xts_1block + +.dec_xts_tail: + uxtw x5, w5 + cbz x5, .dec_xts_end + + tweak_calc(v9, v8, v31, v16) + ld1 {v20.16b}, [x0] + eor v20.16b, v20.16b, v9.16b + rev32 v20.16b, v20.16b + sm4e v20.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v20.4s, 
v4.4s + sm4e v20.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v20.4s, v7.4s + rev64 v20.4s, v20.4s + ext v20.16b, v20.16b, v20.16b, #8 + rev32 v20.16b, v20.16b + eor v20.16b, v20.16b, v9.16b + + adr x6, .cts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v23.16b}, [x6] + ld1 {v24.16b}, [x7] + + add x0, x0, x5 + ld1 {v21.16b}, [x0] + + tbl v22.16b, {v20.16b}, v23.16b + tbx v20.16b, {v21.16b}, v24.16b + + eor v20.16b, v20.16b, v8.16b + rev32 v20.16b, v20.16b + sm4e v20.4s, v0.4s + sm4e v20.4s, v1.4s + sm4e v20.4s, v2.4s + sm4e v20.4s, v3.4s + sm4e v20.4s, v4.4s + sm4e v20.4s, v5.4s + sm4e v20.4s, v6.4s + sm4e v20.4s, v7.4s + rev64 v20.4s, v20.4s + ext v20.16b, v20.16b, v20.16b, #8 + rev32 v20.16b, v20.16b + eor v20.16b, v20.16b, v8.16b + + add x5, x1, x5 + st1 {v22.16b}, [x5] + st1 {v20.16b}, [x1] + + b .dec_xts_ret + +.dec_xts_end: + /* new tweak */ + st1 {v8.16b}, [x4] + +.dec_xts_ret: + ldp d8,d9,[sp],#16 + ret +.size sm4_v8_xts_decrypt,.-sm4_v8_xts_decrypt
Provide the CE (Crypto Extension) instructions to accelerate execution of the SM4 ECB algorithm.
Signed-off-by: Qi Tao taoqi10@huawei.com --- drv/isa_ce_sm4.c | 17 +++ drv/isa_ce_sm4.h | 2 + drv/isa_ce_sm4_armv8.S | 263 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 282 insertions(+)
diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c index fe8fbdc..bbc2c07 100644 --- a/drv/isa_ce_sm4.c +++ b/drv/isa_ce_sm4.c @@ -129,6 +129,16 @@ static void sm4_cbc_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rke sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, msg->iv, SM4_DECRYPT); }
+static void sm4_ecb_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) +{ + sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, SM4_ENCRYPT); +} + +static void sm4_ecb_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_dec) +{ + sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, SM4_DECRYPT); +} + void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) { sm4_v8_set_encrypt_key(userKey, key); @@ -265,6 +275,12 @@ static int isa_ce_cipher_send(handle_t ctx, void *wd_msg) sm4_set_decrypt_key(msg->key, &rkey);
switch (msg->mode) { + case WD_CIPHER_ECB: + if (msg->op_type == WD_CIPHER_ENCRYPTION) + sm4_ecb_encrypt(msg, &rkey); + else + sm4_ecb_decrypt(msg, &rkey); + break; case WD_CIPHER_CBC: if (msg->op_type == WD_CIPHER_ENCRYPTION) sm4_cbc_encrypt(msg, &rkey); @@ -329,6 +345,7 @@ static struct wd_alg_driver cipher_alg_driver[] = { GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), GEN_CE_ALG_DRIVER("cfb(sm4)", cipher), GEN_CE_ALG_DRIVER("xts(sm4)", cipher), + GEN_CE_ALG_DRIVER("ecb(sm4)", cipher), };
static void __attribute__((constructor)) isa_ce_probe(void) diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h index eba1c9e..89e882e 100644 --- a/drv/isa_ce_sm4.h +++ b/drv/isa_ce_sm4.h @@ -23,6 +23,8 @@ void sm4_v8_set_decrypt_key(const unsigned char *userKey, struct SM4_KEY *key); void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, size_t length, const struct SM4_KEY *key, unsigned char *ivec, const int enc); +void sm4_v8_ecb_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const struct SM4_KEY *key, const int enc); void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, size_t len, const void *key, const unsigned char ivec[16]);
diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S index 342dfa5..7d84496 100644 --- a/drv/isa_ce_sm4_armv8.S +++ b/drv/isa_ce_sm4_armv8.S @@ -506,6 +506,269 @@ sm4_v8_cbc_encrypt: ldp d8,d9,[sp],#16 ret .size sm4_v8_cbc_encrypt,.-sm4_v8_cbc_encrypt +.globl sm4_v8_ecb_encrypt +.type sm4_v8_ecb_encrypt,%function +.align 5 +sm4_v8_ecb_encrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] +1: + cmp x2,#64 + b.lt 1f + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 + cmp x2,#128 + b.lt 2f + ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 + // 8 blocks +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __ARMEB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __ARMEB__ + rev32 v21.16b,v21.16b +#endif +#ifndef __ARMEB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __ARMEB__ + rev32 v23.16b,v23.16b +#endif + sm4e v16.4s,v0.4s; + sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4S,v16.4S + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4S,v17.4S + sm4e v18.4s,v7.4s; + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4S,v18.4S + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4S,v19.4S + ext v19.16b,v19.16b,v19.16b,#8 + sm4e v20.4s,v0.4s; + sm4e v21.4s,v0.4s; + sm4e v22.4s,v0.4s; + sm4e v23.4s,v0.4s; + + sm4e v20.4s,v1.4s; + sm4e v21.4s,v1.4s; + sm4e v22.4s,v1.4s; + sm4e v23.4s,v1.4s; + + sm4e v20.4s,v2.4s; + sm4e v21.4s,v2.4s; + sm4e v22.4s,v2.4s; + sm4e v23.4s,v2.4s; + + sm4e v20.4s,v3.4s; + sm4e v21.4s,v3.4s; + sm4e v22.4s,v3.4s; + sm4e v23.4s,v3.4s; + + sm4e v20.4s,v4.4s; + sm4e v21.4s,v4.4s; + sm4e v22.4s,v4.4s; + sm4e v23.4s,v4.4s; + + sm4e v20.4s,v5.4s; + sm4e v21.4s,v5.4s; + sm4e v22.4s,v5.4s; + sm4e v23.4s,v5.4s; + + sm4e v20.4s,v6.4s; + sm4e v21.4s,v6.4s; + sm4e v22.4s,v6.4s; + sm4e v23.4s,v6.4s; + + sm4e v20.4s,v7.4s; + rev64 v20.4S,v20.4S + sm4e v21.4s,v7.4s; + ext v20.16b,v20.16b,v20.16b,#8 + rev64 v21.4S,v21.4S + sm4e v22.4s,v7.4s; + ext v21.16b,v21.16b,v21.16b,#8 + rev64 v22.4S,v22.4S + sm4e v23.4s,v7.4s; + ext v22.16b,v22.16b,v22.16b,#8 + rev64 v23.4S,v23.4S + ext v23.16b,v23.16b,v23.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif +#ifndef __ARMEB__ + rev32 v20.16b,v20.16b +#endif +#ifndef __ARMEB__ + rev32 v21.16b,v21.16b +#endif + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 +#ifndef __ARMEB__ + rev32 v22.16b,v22.16b +#endif +#ifndef __ARMEB__ + rev32 v23.16b,v23.16b +#endif + st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 + subs x2,x2,#128 + b.gt 1b + ret + // 4 blocks +2: +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef 
__ARMEB__ + rev32 v19.16b,v19.16b +#endif + sm4e v16.4s,v0.4s; + sm4e v17.4s,v0.4s; + sm4e v18.4s,v0.4s; + sm4e v19.4s,v0.4s; + + sm4e v16.4s,v1.4s; + sm4e v17.4s,v1.4s; + sm4e v18.4s,v1.4s; + sm4e v19.4s,v1.4s; + + sm4e v16.4s,v2.4s; + sm4e v17.4s,v2.4s; + sm4e v18.4s,v2.4s; + sm4e v19.4s,v2.4s; + + sm4e v16.4s,v3.4s; + sm4e v17.4s,v3.4s; + sm4e v18.4s,v3.4s; + sm4e v19.4s,v3.4s; + + sm4e v16.4s,v4.4s; + sm4e v17.4s,v4.4s; + sm4e v18.4s,v4.4s; + sm4e v19.4s,v4.4s; + + sm4e v16.4s,v5.4s; + sm4e v17.4s,v5.4s; + sm4e v18.4s,v5.4s; + sm4e v19.4s,v5.4s; + + sm4e v16.4s,v6.4s; + sm4e v17.4s,v6.4s; + sm4e v18.4s,v6.4s; + sm4e v19.4s,v6.4s; + + sm4e v16.4s,v7.4s; + rev64 v16.4S,v16.4S + sm4e v17.4s,v7.4s; + ext v16.16b,v16.16b,v16.16b,#8 + rev64 v17.4S,v17.4S + sm4e v18.4s,v7.4s; + ext v17.16b,v17.16b,v17.16b,#8 + rev64 v18.4S,v18.4S + sm4e v19.4s,v7.4s; + ext v18.16b,v18.16b,v18.16b,#8 + rev64 v19.4S,v19.4S + ext v19.16b,v19.16b,v19.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif +#ifndef __ARMEB__ + rev32 v17.16b,v17.16b +#endif +#ifndef __ARMEB__ + rev32 v18.16b,v18.16b +#endif +#ifndef __ARMEB__ + rev32 v19.16b,v19.16b +#endif + st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 + subs x2,x2,#64 + b.gt 1b +1: + subs x2,x2,#16 + b.lt 1f + ld1 {v16.4s},[x0],#16 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + sm4e v16.4s,v0.4s; + sm4e v16.4s,v1.4s; + sm4e v16.4s,v2.4s; + sm4e v16.4s,v3.4s; + sm4e v16.4s,v4.4s; + sm4e v16.4s,v5.4s; + sm4e v16.4s,v6.4s; + sm4e v16.4s,v7.4s; + rev64 v16.4S,v16.4S + ext v16.16b,v16.16b,v16.16b,#8 +#ifndef __ARMEB__ + rev32 v16.16b,v16.16b +#endif + st1 {v16.4s},[x1],#16 + b.ne 1b +1: + ret +.size sm4_v8_ecb_encrypt,.-sm4_v8_ecb_encrypt .globl sm4_v8_ctr32_encrypt_blocks .type sm4_v8_ctr32_encrypt_blocks,%function .align 5
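For orientation, here is a minimal sketch of how the new ECB entry points can be driven from C, using only the prototypes this patch declares in drv/isa_ce_sm4.h. The key and data are placeholders, and the sketch assumes the armv8 assembly above is assembled and linked in; it is not part of the patch.

	#include <string.h>
	#include "drv/isa_ce_sm4.h"

	/* Round-trip one 32-byte buffer through the CE-accelerated ECB path. */
	static int sm4_ecb_roundtrip(const unsigned char key[16])
	{
		struct SM4_KEY rkey_enc, rkey_dec;
		unsigned char in[32], enc[32], dec[32];

		memset(in, 0x5a, sizeof(in));	/* arbitrary block-aligned test data */

		sm4_v8_set_encrypt_key(key, &rkey_enc);
		sm4_v8_set_decrypt_key(key, &rkey_dec);

		/* One routine serves both directions; the last argument selects it,
		 * matching SM4_ENCRYPT (1) and SM4_DECRYPT (0) in isa_ce_sm4.c. */
		sm4_v8_ecb_encrypt(in, enc, sizeof(in), &rkey_enc, 1);
		sm4_v8_ecb_encrypt(enc, dec, sizeof(enc), &rkey_dec, 0);

		return memcmp(in, dec, sizeof(in));	/* 0 on success */
	}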
From: Yang Shen shenyang39@huawei.com
This patch implements the SM4 CBC_CTS modes using CE instructions.
Signed-off-by: Yang Shen shenyang39@huawei.com --- drv/isa_ce_sm4.c | 91 ++++++++++++++++++++++++- drv/isa_ce_sm4.h | 24 ++++--- drv/isa_ce_sm4_armv8.S | 146 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+), 10 deletions(-)
diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c index bbc2c07..3a6b477 100644 --- a/drv/isa_ce_sm4.c +++ b/drv/isa_ce_sm4.c @@ -129,6 +129,82 @@ static void sm4_cbc_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rke sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, msg->iv, SM4_DECRYPT); }
+/* + * In some situations, the cts mode can use cbc mode instead to imporve performance. + */ +static int sm4_cts_cbc_instead(struct wd_cipher_msg *msg) +{ + if (msg->in_bytes == SM4_BLOCK_SIZE) + return true; + + if (!(msg->in_bytes % SM4_BLOCK_SIZE) && msg->mode != WD_CIPHER_CBC_CS3) + return true; + + return false; +} + +static void sm4_cts_cs1_mode_adapt(__u8 *cts_in, __u8 *cts_out, + const __u32 cts_bytes, const int enc) +{ + __u32 rsv_bytes = cts_bytes % SM4_BLOCK_SIZE; + __u8 blocks[SM4_BLOCK_SIZE] = {0}; + + if (enc == SM4_ENCRYPT) { + memcpy(blocks, cts_out + SM4_BLOCK_SIZE, rsv_bytes); + memcpy(cts_out + rsv_bytes, cts_out, SM4_BLOCK_SIZE); + memcpy(cts_out, blocks, rsv_bytes); + } else { + memcpy(blocks, cts_in + rsv_bytes, SM4_BLOCK_SIZE); + memcpy(cts_in + SM4_BLOCK_SIZE, cts_in, rsv_bytes); + memcpy(cts_in, blocks, SM4_BLOCK_SIZE); + } +} + +static void sm4_cts_cbc_crypt(struct wd_cipher_msg *msg, + const struct SM4_KEY *rkey_enc, const int enc) +{ + enum wd_cipher_mode mode = msg->mode; + __u32 in_bytes = msg->in_bytes; + __u8 *cts_in, *cts_out; + __u32 cts_bytes; + + if (sm4_cts_cbc_instead(msg)) + return sm4_v8_cbc_encrypt(msg->in, msg->out, in_bytes, rkey_enc, msg->iv, enc); + + cts_bytes = in_bytes % SM4_BLOCK_SIZE + SM4_BLOCK_SIZE; + if (cts_bytes == SM4_BLOCK_SIZE) + cts_bytes += SM4_BLOCK_SIZE; + + in_bytes -= cts_bytes; + if (in_bytes) + sm4_v8_cbc_encrypt(msg->in, msg->out, in_bytes, rkey_enc, msg->iv, enc); + + cts_in = msg->in + in_bytes; + cts_out = msg->out + in_bytes; + + if (enc == SM4_ENCRYPT) { + sm4_v8_cbc_cts_encrypt(cts_in, cts_out, cts_bytes, rkey_enc, msg->iv); + + if (mode == WD_CIPHER_CBC_CS1) + sm4_cts_cs1_mode_adapt(cts_in, cts_out, cts_bytes, enc); + } else { + if (mode == WD_CIPHER_CBC_CS1) + sm4_cts_cs1_mode_adapt(cts_in, cts_out, cts_bytes, enc); + + sm4_v8_cbc_cts_decrypt(cts_in, cts_out, cts_bytes, rkey_enc, msg->iv); + } +} + +static void sm4_cbc_cts_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) +{ + sm4_cts_cbc_crypt(msg, rkey_enc, SM4_ENCRYPT); +} + +static void sm4_cbc_cts_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) +{ + sm4_cts_cbc_crypt(msg, rkey_enc, SM4_DECRYPT); +} + static void sm4_ecb_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) { sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, SM4_ENCRYPT); @@ -139,12 +215,12 @@ static void sm4_ecb_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rke sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, SM4_DECRYPT); }
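To make the tail arithmetic in sm4_cts_cbc_crypt() above concrete, the split between the plain CBC head and the CTS tail can be modelled as below; the helper name is only for illustration.

	/* Mirror of the cts_bytes computation in sm4_cts_cbc_crypt():
	 * the CTS tail is the final partial block plus one full block, or two
	 * full blocks when the input is block-aligned (only CBC-CS3 reaches
	 * here in that case, since aligned CS1/CS2 fall back to plain CBC). */
	static void sm4_cts_split(unsigned int in_bytes,
				  unsigned int *cbc_bytes, unsigned int *cts_bytes)
	{
		unsigned int tail = in_bytes % 16 + 16;	/* SM4_BLOCK_SIZE == 16 */

		if (tail == 16)
			tail += 16;

		*cts_bytes = tail;
		*cbc_bytes = in_bytes - tail;
	}

	/* Example: 50 bytes in -> 32 bytes via CBC, then an 18-byte CTS tail. */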
-void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) +static void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) { sm4_v8_set_encrypt_key(userKey, key); }
-void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) +static void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) { sm4_v8_set_decrypt_key(userKey, key); } @@ -287,6 +363,14 @@ static int isa_ce_cipher_send(handle_t ctx, void *wd_msg) else sm4_cbc_decrypt(msg, &rkey); break; + case WD_CIPHER_CBC_CS1: + case WD_CIPHER_CBC_CS2: + case WD_CIPHER_CBC_CS3: + if (msg->op_type == WD_CIPHER_ENCRYPTION) + sm4_cbc_cts_encrypt(msg, &rkey); + else + sm4_cbc_cts_decrypt(msg, &rkey); + break; case WD_CIPHER_CTR: sm4_ctr_encrypt(msg, &rkey); break; @@ -342,6 +426,9 @@ static int cipher_recv(handle_t ctx, void *msg)
static struct wd_alg_driver cipher_alg_driver[] = { GEN_CE_ALG_DRIVER("cbc(sm4)", cipher), + GEN_CE_ALG_DRIVER("cbc-cs1(sm4)", cipher), + GEN_CE_ALG_DRIVER("cbc-cs2(sm4)", cipher), + GEN_CE_ALG_DRIVER("cbc-cs3(sm4)", cipher), GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), GEN_CE_ALG_DRIVER("cfb(sm4)", cipher), GEN_CE_ALG_DRIVER("xts(sm4)", cipher), diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h index 89e882e..9ae027d 100644 --- a/drv/isa_ce_sm4.h +++ b/drv/isa_ce_sm4.h @@ -20,27 +20,35 @@ struct SM4_KEY {
void sm4_v8_set_encrypt_key(const unsigned char *userKey, struct SM4_KEY *key); void sm4_v8_set_decrypt_key(const unsigned char *userKey, struct SM4_KEY *key); + void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, size_t length, const struct SM4_KEY *key, unsigned char *ivec, const int enc); +void sm4_v8_cbc_cts_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, const unsigned char ivec[16]); +void sm4_v8_cbc_cts_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, const unsigned char ivec[16]); + void sm4_v8_ecb_encrypt(const unsigned char *in, unsigned char *out, size_t length, const struct SM4_KEY *key, const int enc); + void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, - size_t len, const void *key, const unsigned char ivec[16]); + size_t len, const void *key, const unsigned char ivec[16]);
void sm4_v8_cfb_encrypt_blocks(const unsigned char *in, unsigned char *out, - size_t length, const struct SM4_KEY *key, unsigned char *ivec); + size_t length, const struct SM4_KEY *key, unsigned char *ivec); void sm4_v8_cfb_decrypt_blocks(const unsigned char *in, unsigned char *out, - size_t length, const struct SM4_KEY *key, unsigned char *ivec); + size_t length, const struct SM4_KEY *key, unsigned char *ivec); + void sm4_v8_crypt_block(const unsigned char *in, unsigned char *out, - const struct SM4_KEY *key); + const struct SM4_KEY *key);
int sm4_v8_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, - const struct SM4_KEY *key, unsigned char *ivec, - const struct SM4_KEY *key2); + const struct SM4_KEY *key, unsigned char *ivec, + const struct SM4_KEY *key2); int sm4_v8_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, - const struct SM4_KEY *key, unsigned char *ivec, - const struct SM4_KEY *key2); + const struct SM4_KEY *key, unsigned char *ivec, + const struct SM4_KEY *key2);
#ifdef __cplusplus } diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S index 7d84496..2f9477a 100644 --- a/drv/isa_ce_sm4_armv8.S +++ b/drv/isa_ce_sm4_armv8.S @@ -24,6 +24,11 @@ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd .endm
+.macro adr_l, dst, sym + adrp \dst, \sym + add \dst, \dst, :lo12:\sym +.endm + .text .align 6 .Lck: @@ -506,6 +511,147 @@ sm4_v8_cbc_encrypt: ldp d8,d9,[sp],#16 ret .size sm4_v8_cbc_encrypt,.-sm4_v8_cbc_encrypt + +.globl sm4_v8_cbc_cts_encrypt +.type sm4_v8_cbc_cts_encrypt,%function +.align 5 +sm4_v8_cbc_cts_encrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v24.16b-v27.16b}, [x3], #64; + ld1 {v28.16b-v31.16b}, [x3]; + + sub x5, x2, #16 + + ld1 {v20.16b}, [x4] + + ld1 {v0.16b}, [x0] + eor v20.16b, v20.16b, v0.16b + rev32 v20.16b, v20.16b; + sm4e v20.4s, v24.4s; + sm4e v20.4s, v25.4s; + sm4e v20.4s, v26.4s; + sm4e v20.4s, v27.4s; + sm4e v20.4s, v28.4s; + sm4e v20.4s, v29.4s; + sm4e v20.4s, v30.4s; + sm4e v20.4s, v31.4s; + rev64 v20.4s, v20.4s; + ext v20.16b, v20.16b, v20.16b, #8; + rev32 v20.16b, v20.16b; + + /* load permute table */ + adr x6, .Lcts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v3.16b}, [x6] + ld1 {v4.16b}, [x7] + + /* overlapping loads */ + add x0, x0, x5 + ld1 {v1.16b}, [x0] + + /* create Cn from En-1 */ + tbl v0.16b, {v20.16b}, v3.16b + /* padding Pn with zeros */ + tbl v1.16b, {v1.16b}, v4.16b + + eor v1.16b, v1.16b, v20.16b + rev32 v1.16b, v1.16b; + sm4e v1.4s, v24.4s; + sm4e v1.4s, v25.4s; + sm4e v1.4s, v26.4s; + sm4e v1.4s, v27.4s; + sm4e v1.4s, v28.4s; + sm4e v1.4s, v29.4s; + sm4e v1.4s, v30.4s; + sm4e v1.4s, v31.4s; + rev64 v1.4s, v1.4s; + ext v1.16b, v1.16b, v1.16b, #8; + rev32 v1.16b, v1.16b; + + /* overlapping stores */ + add x5, x1, x5 + st1 {v0.16b}, [x5] + st1 {v1.16b}, [x1] + + ret +.size sm4_v8_cbc_cts_encrypt,.-sm4_v8_cbc_cts_encrypt + +.globl sm4_v8_cbc_cts_decrypt +.type sm4_v8_cbc_cts_decrypt,%function +.align 5 +sm4_v8_cbc_cts_decrypt: + ld1 {v24.16b-v27.16b}, [x3], #64; + ld1 {v28.16b-v31.16b}, [x3]; + + sub x5, x2, #16 + + ld1 {v20.16b}, [x4] + + /* load permute table */ + adr_l x6, .Lcts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v3.16b}, [x6] + ld1 {v4.16b}, [x7] + + /* overlapping loads */ + ld1 {v0.16b}, [x0], x5 + ld1 {v1.16b}, [x0] + + rev32 v0.16b, v0.16b; + sm4e v0.4s, v24.4s; + sm4e v0.4s, v25.4s; + sm4e v0.4s, v26.4s; + sm4e v0.4s, v27.4s; + sm4e v0.4s, v28.4s; + sm4e v0.4s, v29.4s; + sm4e v0.4s, v30.4s; + sm4e v0.4s, v31.4s; + rev64 v0.4s, v0.4s; + ext v0.16b, v0.16b, v0.16b, #8; + rev32 v0.16b, v0.16b; + + /* select the first Ln bytes of Xn to create Pn */ + tbl v2.16b, {v0.16b}, v3.16b + eor v2.16b, v2.16b, v1.16b + + /* overwrite the first Ln bytes with Cn to create En-1 */ + tbx v0.16b, {v1.16b}, v4.16b + + rev32 v0.16b, v0.16b; + sm4e v0.4s, v24.4s; + sm4e v0.4s, v25.4s; + sm4e v0.4s, v26.4s; + sm4e v0.4s, v27.4s; + sm4e v0.4s, v28.4s; + sm4e v0.4s, v29.4s; + sm4e v0.4s, v30.4s; + sm4e v0.4s, v31.4s; + rev64 v0.4s, v0.4s; + ext v0.16b, v0.16b, v0.16b, #8; + rev32 v0.16b, v0.16b; + + eor v0.16b, v0.16b, v20.16b + + /* overlapping stores */ + add x5, x1, x5 + st1 {v2.16b}, [x5] + st1 {v0.16b}, [x1] + + ret +.size sm4_v8_cbc_cts_decrypt,.-sm4_v8_cbc_cts_decrypt + +.Lcts_permute_table: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .globl sm4_v8_ecb_encrypt .type sm4_v8_ecb_encrypt,%function .align 5
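The .Lcts_permute_table trick relies on the tbl/tbx semantics for out-of-range index bytes: tbl writes zero for an index outside 0-15, while tbx leaves the destination byte untouched. Offsetting into the 48-byte table by the tail length therefore produces, without branches, the shift-and-zero-pad masks that ciphertext stealing needs. A rough C model of the two lookups, for illustration only:

	/* C model of single-register NEON tbl/tbx as used with .Lcts_permute_table. */
	static void tbl16(unsigned char dst[16], const unsigned char src[16],
			  const unsigned char idx[16])
	{
		for (int i = 0; i < 16; i++)
			dst[i] = idx[i] < 16 ? src[idx[i]] : 0;	/* tbl: out of range -> 0 */
	}

	static void tbx16(unsigned char dst[16], const unsigned char src[16],
			  const unsigned char idx[16])
	{
		for (int i = 0; i < 16; i++)
			if (idx[i] < 16)	/* tbx: out of range -> keep old byte */
				dst[i] = src[idx[i]];
	}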
From: Yang Shen shenyang39@huawei.com
Fix a possible memcpy overwrite: the CBC-CS1 encrypt-side block swap copied between overlapping regions; save the full block into a temporary buffer before moving the partial tail.
Signed-off-by: Yang Shen shenyang39@huawei.com --- drv/isa_ce_sm4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c index 3a6b477..16e0ae7 100644 --- a/drv/isa_ce_sm4.c +++ b/drv/isa_ce_sm4.c @@ -150,9 +150,9 @@ static void sm4_cts_cs1_mode_adapt(__u8 *cts_in, __u8 *cts_out, __u8 blocks[SM4_BLOCK_SIZE] = {0};
if (enc == SM4_ENCRYPT) { - memcpy(blocks, cts_out + SM4_BLOCK_SIZE, rsv_bytes); - memcpy(cts_out + rsv_bytes, cts_out, SM4_BLOCK_SIZE); - memcpy(cts_out, blocks, rsv_bytes); + memcpy(blocks, cts_out, SM4_BLOCK_SIZE); + memcpy(cts_out, cts_out + SM4_BLOCK_SIZE, rsv_bytes); + memcpy(cts_out + rsv_bytes, blocks, SM4_BLOCK_SIZE); } else { memcpy(blocks, cts_in + rsv_bytes, SM4_BLOCK_SIZE); memcpy(cts_in + SM4_BLOCK_SIZE, cts_in, rsv_bytes);
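The reordering matters because memcpy between overlapping regions is undefined behaviour; the fixed sequence stages the leading block in a temporary so that source and destination never overlap. Modelled on a plain byte buffer (helper name and layout labels are illustrative):

	/* Fixed CS1 encrypt-side swap: [ X (16 bytes) | Y (rsv bytes) ]
	 * becomes [ Y (rsv bytes) | X (16 bytes) ], with rsv_bytes < 16. */
	static void cs1_swap(unsigned char *cts_out, unsigned int rsv_bytes)
	{
		unsigned char blocks[16];

		memcpy(blocks, cts_out, 16);			/* save leading block */
		memcpy(cts_out, cts_out + 16, rsv_bytes);	/* disjoint: source starts at +16 */
		memcpy(cts_out + rsv_bytes, blocks, 16);	/* write back from the copy */
	}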
The following code defects are rectified: 1. SM4 CE does not support epoll mode; therefore, disable epoll mode during init.
2. SM4 CE does not support the SGL data format; therefore, reject requests that use it.
Signed-off-by: Qi Tao taoqi10@huawei.com --- drv/isa_ce_sm4.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c index 16e0ae7..fbbb2b7 100644 --- a/drv/isa_ce_sm4.c +++ b/drv/isa_ce_sm4.c @@ -33,6 +33,9 @@
static int isa_ce_init(void *conf, void *priv) { + struct wd_ctx_config_internal *config = conf; + + config->epoll_en = 0; return 0; }
@@ -344,6 +347,11 @@ static int isa_ce_cipher_send(handle_t ctx, void *wd_msg) return -WD_EINVAL; }
+ if (msg->data_fmt == WD_SGL_BUF) { + WD_ERR("invalid: SM4 CE driver do not support sgl data format!\n"); + return -WD_EINVAL; + } + if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR || msg->mode == WD_CIPHER_CFB) sm4_set_encrypt_key(msg->key, &rkey);
From: Wenkai Lin linwenkai6@hisilicon.com
The SM4 CE driver set its priv size to 0, and a zero-size memory allocation is not allowed, so set a proper priv size and add a size check.
Signed-off-by: Wenkai Lin linwenkai6@hisilicon.com --- drv/isa_ce_sm4.c | 5 ++++- drv/isa_ce_sm4.h | 5 +++++ wd_util.c | 5 +++++ 3 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c index fbbb2b7..7c6d983 100644 --- a/drv/isa_ce_sm4.c +++ b/drv/isa_ce_sm4.c @@ -34,8 +34,11 @@ static int isa_ce_init(void *conf, void *priv) { struct wd_ctx_config_internal *config = conf; + struct sm4_ce_drv_ctx *sctx = priv;
config->epoll_en = 0; + memcpy(&sctx->config, config, sizeof(struct wd_ctx_config_internal)); + return 0; }
@@ -423,7 +426,7 @@ static int cipher_recv(handle_t ctx, void *msg) .alg_name = (ce_alg_name),\ .calc_type = UADK_ALG_CE_INSTR,\ .priority = 200,\ - .priv_size = 0,\ + .priv_size = sizeof(struct sm4_ce_drv_ctx),\ .op_type_num = 1,\ .fallback = 0,\ .init = isa_ce_init,\ diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h index 9ae027d..a80224f 100644 --- a/drv/isa_ce_sm4.h +++ b/drv/isa_ce_sm4.h @@ -6,6 +6,7 @@
#pragma once #include <stdint.h> +#include "wd_alg_common.h"
#ifdef __cplusplus extern "C" { @@ -17,6 +18,10 @@ struct SM4_KEY { __u32 rk[SM4_KEY_SCHEDULE]; };
+struct sm4_ce_drv_ctx { + struct wd_ctx_config_internal config; +}; +
void sm4_v8_set_encrypt_key(const unsigned char *userKey, struct SM4_KEY *key); void sm4_v8_set_decrypt_key(const unsigned char *userKey, struct SM4_KEY *key); diff --git a/wd_util.c b/wd_util.c index fb58167..dc627d8 100644 --- a/wd_util.c +++ b/wd_util.c @@ -1935,6 +1935,11 @@ int wd_alg_init_driver(struct wd_ctx_config_internal *config, { int ret;
+ if (!driver->priv_size) { + WD_ERR("invalid: driver priv ctx size is zero!\n"); + return -WD_EINVAL; + } + if (!driver->init) { driver->fallback = 0; WD_ERR("driver have no init interface.\n");
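Context for the new check: the core allocates the driver private context from priv_size (the "calloc zero size problem" in the cover letter), and a zero-size calloc is implementation-defined; it may return NULL even though nothing went wrong, which then reads as an allocation failure. A standalone illustration:

	#include <stdio.h>
	#include <stdlib.h>

	/* calloc with a zero size is implementation-defined: it may return NULL
	 * or a unique pointer, so a NULL check cannot tell "no memory needed"
	 * apart from a real allocation failure. */
	int main(void)
	{
		void *p = calloc(1, 0);

		printf("calloc(1, 0) = %p\n", p);	/* may legitimately print (nil) */
		free(p);
		return 0;
	}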
From: Yang Shen shenyang39@huawei.com
Rework the cbc_cts mode assembly to follow the OpenSSL code style.
Signed-off-by: Yang Shen shenyang39@huawei.com --- drv/isa_ce_sm4_armv8.S | 201 +++++++++++++++++++---------------------- 1 file changed, 94 insertions(+), 107 deletions(-)
diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S index 2f9477a..6ebf39b 100644 --- a/drv/isa_ce_sm4_armv8.S +++ b/drv/isa_ce_sm4_armv8.S @@ -24,11 +24,6 @@ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd .endm
-.macro adr_l, dst, sym - adrp \dst, \sym - add \dst, \dst, :lo12:\sym -.endm - .text .align 6 .Lck: @@ -517,63 +512,62 @@ sm4_v8_cbc_encrypt: .align 5 sm4_v8_cbc_cts_encrypt: AARCH64_VALID_CALL_TARGET - ld1 {v24.16b-v27.16b}, [x3], #64; - ld1 {v28.16b-v31.16b}, [x3]; - - sub x5, x2, #16 - - ld1 {v20.16b}, [x4] - - ld1 {v0.16b}, [x0] - eor v20.16b, v20.16b, v0.16b - rev32 v20.16b, v20.16b; - sm4e v20.4s, v24.4s; - sm4e v20.4s, v25.4s; - sm4e v20.4s, v26.4s; - sm4e v20.4s, v27.4s; - sm4e v20.4s, v28.4s; - sm4e v20.4s, v29.4s; - sm4e v20.4s, v30.4s; - sm4e v20.4s, v31.4s; - rev64 v20.4s, v20.4s; - ext v20.16b, v20.16b, v20.16b, #8; - rev32 v20.16b, v20.16b; + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x3], #64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x3] + sub x5, x2, #16 + + ld1 {v8.4s}, [x4] + + ld1 {v10.4s}, [x0] + eor v8.16b, v8.16b, v10.16b + rev32 v8.16b, v8.16b; + sm4e v8.4s, v0.4s; + sm4e v8.4s, v1.4s; + sm4e v8.4s, v2.4s; + sm4e v8.4s, v3.4s; + sm4e v8.4s, v4.4s; + sm4e v8.4s, v5.4s; + sm4e v8.4s, v6.4s; + sm4e v8.4s, v7.4s; + rev64 v8.4s, v8.4s; + ext v8.16b, v8.16b, v8.16b, #8; + rev32 v8.16b, v8.16b;
/* load permute table */ - adr x6, .Lcts_permute_table - add x7, x6, #32 - add x6, x6, x5 - sub x7, x7, x5 - ld1 {v3.16b}, [x6] - ld1 {v4.16b}, [x7] + adr x6, .cts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v13.4s}, [x6] + ld1 {v14.4s}, [x7]
/* overlapping loads */ - add x0, x0, x5 - ld1 {v1.16b}, [x0] + add x0, x0, x5 + ld1 {v11.4s}, [x0]
/* create Cn from En-1 */ - tbl v0.16b, {v20.16b}, v3.16b + tbl v10.16b, {v8.16b}, v13.16b /* padding Pn with zeros */ - tbl v1.16b, {v1.16b}, v4.16b - - eor v1.16b, v1.16b, v20.16b - rev32 v1.16b, v1.16b; - sm4e v1.4s, v24.4s; - sm4e v1.4s, v25.4s; - sm4e v1.4s, v26.4s; - sm4e v1.4s, v27.4s; - sm4e v1.4s, v28.4s; - sm4e v1.4s, v29.4s; - sm4e v1.4s, v30.4s; - sm4e v1.4s, v31.4s; - rev64 v1.4s, v1.4s; - ext v1.16b, v1.16b, v1.16b, #8; - rev32 v1.16b, v1.16b; + tbl v11.16b, {v11.16b}, v14.16b + + eor v11.16b, v11.16b, v8.16b + rev32 v11.16b, v11.16b; + sm4e v11.4s, v0.4s; + sm4e v11.4s, v1.4s; + sm4e v11.4s, v2.4s; + sm4e v11.4s, v3.4s; + sm4e v11.4s, v4.4s; + sm4e v11.4s, v5.4s; + sm4e v11.4s, v6.4s; + sm4e v11.4s, v7.4s; + rev64 v11.4s, v11.4s; + ext v11.16b, v11.16b, v11.16b, #8; + rev32 v11.16b, v11.16b;
/* overlapping stores */ - add x5, x1, x5 - st1 {v0.16b}, [x5] - st1 {v1.16b}, [x1] + add x5, x1, x5 + st1 {v10.16b}, [x5] + st1 {v11.16b}, [x1]
ret .size sm4_v8_cbc_cts_encrypt,.-sm4_v8_cbc_cts_encrypt @@ -582,76 +576,69 @@ sm4_v8_cbc_cts_encrypt: .type sm4_v8_cbc_cts_decrypt,%function .align 5 sm4_v8_cbc_cts_decrypt: - ld1 {v24.16b-v27.16b}, [x3], #64; - ld1 {v28.16b-v31.16b}, [x3]; + AARCH64_VALID_CALL_TARGET + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x3], #64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x3]
- sub x5, x2, #16 + sub x5, x2, #16
- ld1 {v20.16b}, [x4] + ld1 {v8.4s}, [x4]
/* load permute table */ - adr_l x6, .Lcts_permute_table - add x7, x6, #32 - add x6, x6, x5 - sub x7, x7, x5 - ld1 {v3.16b}, [x6] - ld1 {v4.16b}, [x7] + adr x6, .cts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v13.4s}, [x6] + ld1 {v14.4s}, [x7]
/* overlapping loads */ - ld1 {v0.16b}, [x0], x5 - ld1 {v1.16b}, [x0] - - rev32 v0.16b, v0.16b; - sm4e v0.4s, v24.4s; - sm4e v0.4s, v25.4s; - sm4e v0.4s, v26.4s; - sm4e v0.4s, v27.4s; - sm4e v0.4s, v28.4s; - sm4e v0.4s, v29.4s; - sm4e v0.4s, v30.4s; - sm4e v0.4s, v31.4s; - rev64 v0.4s, v0.4s; - ext v0.16b, v0.16b, v0.16b, #8; - rev32 v0.16b, v0.16b; + ld1 {v10.16b}, [x0], x5 + ld1 {v11.16b}, [x0] + + rev32 v10.16b, v10.16b; + sm4e v10.4s, v0.4s; + sm4e v10.4s, v1.4s; + sm4e v10.4s, v2.4s; + sm4e v10.4s, v3.4s; + sm4e v10.4s, v4.4s; + sm4e v10.4s, v5.4s; + sm4e v10.4s, v6.4s; + sm4e v10.4s, v7.4s; + rev64 v10.4s, v10.4s; + ext v10.16b, v10.16b, v10.16b, #8; + rev32 v10.16b, v10.16b;
/* select the first Ln bytes of Xn to create Pn */ - tbl v2.16b, {v0.16b}, v3.16b - eor v2.16b, v2.16b, v1.16b + tbl v12.16b, {v10.16b}, v13.16b + eor v12.16b, v12.16b, v11.16b
/* overwrite the first Ln bytes with Cn to create En-1 */ - tbx v0.16b, {v1.16b}, v4.16b - - rev32 v0.16b, v0.16b; - sm4e v0.4s, v24.4s; - sm4e v0.4s, v25.4s; - sm4e v0.4s, v26.4s; - sm4e v0.4s, v27.4s; - sm4e v0.4s, v28.4s; - sm4e v0.4s, v29.4s; - sm4e v0.4s, v30.4s; - sm4e v0.4s, v31.4s; - rev64 v0.4s, v0.4s; - ext v0.16b, v0.16b, v0.16b, #8; - rev32 v0.16b, v0.16b; - - eor v0.16b, v0.16b, v20.16b + tbx v10.16b, {v11.16b}, v14.16b + + rev32 v10.16b, v10.16b; + sm4e v10.4s, v0.4s; + sm4e v10.4s, v1.4s; + sm4e v10.4s, v2.4s; + sm4e v10.4s, v3.4s; + sm4e v10.4s, v4.4s; + sm4e v10.4s, v5.4s; + sm4e v10.4s, v6.4s; + sm4e v10.4s, v7.4s; + rev64 v10.4s, v10.4s; + ext v10.16b, v10.16b, v10.16b, #8; + rev32 v10.16b, v10.16b; + + eor v10.16b, v10.16b, v8.16b
/* overlapping stores */ - add x5, x1, x5 - st1 {v2.16b}, [x5] - st1 {v0.16b}, [x1] + add x5, x1, x5 + st1 {v12.16b}, [x5] + st1 {v10.16b}, [x1]
ret .size sm4_v8_cbc_cts_decrypt,.-sm4_v8_cbc_cts_decrypt
-.Lcts_permute_table: - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .globl sm4_v8_ecb_encrypt .type sm4_v8_ecb_encrypt,%function .align 5