From: Yang Shen <shenyang39@huawei.com>
This patch implements the SM4 CBC_CTS modes using CE instructions.
Signed-off-by: Yang Shen <shenyang39@huawei.com> --- drv/isa_ce_sm4.c | 91 ++++++++++++++++++++++++- drv/isa_ce_sm4.h | 24 ++++--- drv/isa_ce_sm4_armv8.S | 146 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+), 10 deletions(-)
diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c index bbc2c07..3a6b477 100644 --- a/drv/isa_ce_sm4.c +++ b/drv/isa_ce_sm4.c @@ -129,6 +129,82 @@ static void sm4_cbc_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rke sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, msg->iv, SM4_DECRYPT); }
+/* + * In some situations, the cts mode can use cbc mode instead to improve performance. + */ +static int sm4_cts_cbc_instead(struct wd_cipher_msg *msg) +{ + if (msg->in_bytes == SM4_BLOCK_SIZE) + return true; + + if (!(msg->in_bytes % SM4_BLOCK_SIZE) && msg->mode != WD_CIPHER_CBC_CS3) + return true; + + return false; +} + +static void sm4_cts_cs1_mode_adapt(__u8 *cts_in, __u8 *cts_out, + const __u32 cts_bytes, const int enc) +{ + __u32 rsv_bytes = cts_bytes % SM4_BLOCK_SIZE; + __u8 blocks[SM4_BLOCK_SIZE] = {0}; + + if (enc == SM4_ENCRYPT) { + memcpy(blocks, cts_out + SM4_BLOCK_SIZE, rsv_bytes); + memcpy(cts_out + rsv_bytes, cts_out, SM4_BLOCK_SIZE); + memcpy(cts_out, blocks, rsv_bytes); + } else { + memcpy(blocks, cts_in + rsv_bytes, SM4_BLOCK_SIZE); + memcpy(cts_in + SM4_BLOCK_SIZE, cts_in, rsv_bytes); + memcpy(cts_in, blocks, SM4_BLOCK_SIZE); + } +} + +static void sm4_cts_cbc_crypt(struct wd_cipher_msg *msg, + const struct SM4_KEY *rkey_enc, const int enc) +{ + enum wd_cipher_mode mode = msg->mode; + __u32 in_bytes = msg->in_bytes; + __u8 *cts_in, *cts_out; + __u32 cts_bytes; + + if (sm4_cts_cbc_instead(msg)) + return sm4_v8_cbc_encrypt(msg->in, msg->out, in_bytes, rkey_enc, msg->iv, enc); + + cts_bytes = in_bytes % SM4_BLOCK_SIZE + SM4_BLOCK_SIZE; + if (cts_bytes == SM4_BLOCK_SIZE) + cts_bytes += SM4_BLOCK_SIZE; + + in_bytes -= cts_bytes; + if (in_bytes) + sm4_v8_cbc_encrypt(msg->in, msg->out, in_bytes, rkey_enc, msg->iv, enc); + + cts_in = msg->in + in_bytes; + cts_out = msg->out + in_bytes; + + if (enc == SM4_ENCRYPT) { + sm4_v8_cbc_cts_encrypt(cts_in, cts_out, cts_bytes, rkey_enc, msg->iv); + + if (mode == WD_CIPHER_CBC_CS1) + sm4_cts_cs1_mode_adapt(cts_in, cts_out, cts_bytes, enc); + } else { + if (mode == WD_CIPHER_CBC_CS1) + sm4_cts_cs1_mode_adapt(cts_in, cts_out, cts_bytes, enc); + + sm4_v8_cbc_cts_decrypt(cts_in, cts_out, cts_bytes, rkey_enc, msg->iv); + } +} + +static void sm4_cbc_cts_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) 
+{ + sm4_cts_cbc_crypt(msg, rkey_enc, SM4_ENCRYPT); +} + +static void sm4_cbc_cts_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) +{ + sm4_cts_cbc_crypt(msg, rkey_enc, SM4_DECRYPT); +} + static void sm4_ecb_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) { sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, SM4_ENCRYPT); @@ -139,12 +215,12 @@ static void sm4_ecb_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rke sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, SM4_DECRYPT); }
-void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) +static void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) { sm4_v8_set_encrypt_key(userKey, key); }
-void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) +static void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) { sm4_v8_set_decrypt_key(userKey, key); } @@ -287,6 +363,14 @@ static int isa_ce_cipher_send(handle_t ctx, void *wd_msg) else sm4_cbc_decrypt(msg, &rkey); break; + case WD_CIPHER_CBC_CS1: + case WD_CIPHER_CBC_CS2: + case WD_CIPHER_CBC_CS3: + if (msg->op_type == WD_CIPHER_ENCRYPTION) + sm4_cbc_cts_encrypt(msg, &rkey); + else + sm4_cbc_cts_decrypt(msg, &rkey); + break; case WD_CIPHER_CTR: sm4_ctr_encrypt(msg, &rkey); break; @@ -342,6 +426,9 @@ static int cipher_recv(handle_t ctx, void *msg)
static struct wd_alg_driver cipher_alg_driver[] = { GEN_CE_ALG_DRIVER("cbc(sm4)", cipher), + GEN_CE_ALG_DRIVER("cbc-cs1(sm4)", cipher), + GEN_CE_ALG_DRIVER("cbc-cs2(sm4)", cipher), + GEN_CE_ALG_DRIVER("cbc-cs3(sm4)", cipher), GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), GEN_CE_ALG_DRIVER("cfb(sm4)", cipher), GEN_CE_ALG_DRIVER("xts(sm4)", cipher), diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h index 89e882e..9ae027d 100644 --- a/drv/isa_ce_sm4.h +++ b/drv/isa_ce_sm4.h @@ -20,27 +20,35 @@ struct SM4_KEY {
void sm4_v8_set_encrypt_key(const unsigned char *userKey, struct SM4_KEY *key); void sm4_v8_set_decrypt_key(const unsigned char *userKey, struct SM4_KEY *key); + void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, size_t length, const struct SM4_KEY *key, unsigned char *ivec, const int enc); +void sm4_v8_cbc_cts_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, const unsigned char ivec[16]); +void sm4_v8_cbc_cts_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, const unsigned char ivec[16]); + void sm4_v8_ecb_encrypt(const unsigned char *in, unsigned char *out, size_t length, const struct SM4_KEY *key, const int enc); + void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, - size_t len, const void *key, const unsigned char ivec[16]); + size_t len, const void *key, const unsigned char ivec[16]);
void sm4_v8_cfb_encrypt_blocks(const unsigned char *in, unsigned char *out, - size_t length, const struct SM4_KEY *key, unsigned char *ivec); + size_t length, const struct SM4_KEY *key, unsigned char *ivec); void sm4_v8_cfb_decrypt_blocks(const unsigned char *in, unsigned char *out, - size_t length, const struct SM4_KEY *key, unsigned char *ivec); + size_t length, const struct SM4_KEY *key, unsigned char *ivec); + void sm4_v8_crypt_block(const unsigned char *in, unsigned char *out, - const struct SM4_KEY *key); + const struct SM4_KEY *key);
int sm4_v8_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, - const struct SM4_KEY *key, unsigned char *ivec, - const struct SM4_KEY *key2); + const struct SM4_KEY *key, unsigned char *ivec, + const struct SM4_KEY *key2); int sm4_v8_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, - const struct SM4_KEY *key, unsigned char *ivec, - const struct SM4_KEY *key2); + const struct SM4_KEY *key, unsigned char *ivec, + const struct SM4_KEY *key2);
#ifdef __cplusplus } diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S index 7d84496..2f9477a 100644 --- a/drv/isa_ce_sm4_armv8.S +++ b/drv/isa_ce_sm4_armv8.S @@ -24,6 +24,11 @@ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd .endm
+.macro adr_l, dst, sym + adrp \dst, \sym + add \dst, \dst, :lo12:\sym +.endm + .text .align 6 .Lck: @@ -506,6 +511,147 @@ sm4_v8_cbc_encrypt: ldp d8,d9,[sp],#16 ret .size sm4_v8_cbc_encrypt,.-sm4_v8_cbc_encrypt + +.globl sm4_v8_cbc_cts_encrypt +.type sm4_v8_cbc_cts_encrypt,%function +.align 5 +sm4_v8_cbc_cts_encrypt: + AARCH64_VALID_CALL_TARGET + ld1 {v24.16b-v27.16b}, [x3], #64; + ld1 {v28.16b-v31.16b}, [x3]; + + sub x5, x2, #16 + + ld1 {v20.16b}, [x4] + + ld1 {v0.16b}, [x0] + eor v20.16b, v20.16b, v0.16b + rev32 v20.16b, v20.16b; + sm4e v20.4s, v24.4s; + sm4e v20.4s, v25.4s; + sm4e v20.4s, v26.4s; + sm4e v20.4s, v27.4s; + sm4e v20.4s, v28.4s; + sm4e v20.4s, v29.4s; + sm4e v20.4s, v30.4s; + sm4e v20.4s, v31.4s; + rev64 v20.4s, v20.4s; + ext v20.16b, v20.16b, v20.16b, #8; + rev32 v20.16b, v20.16b; + + /* load permute table */ + adr x6, .Lcts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v3.16b}, [x6] + ld1 {v4.16b}, [x7] + + /* overlapping loads */ + add x0, x0, x5 + ld1 {v1.16b}, [x0] + + /* create Cn from En-1 */ + tbl v0.16b, {v20.16b}, v3.16b + /* padding Pn with zeros */ + tbl v1.16b, {v1.16b}, v4.16b + + eor v1.16b, v1.16b, v20.16b + rev32 v1.16b, v1.16b; + sm4e v1.4s, v24.4s; + sm4e v1.4s, v25.4s; + sm4e v1.4s, v26.4s; + sm4e v1.4s, v27.4s; + sm4e v1.4s, v28.4s; + sm4e v1.4s, v29.4s; + sm4e v1.4s, v30.4s; + sm4e v1.4s, v31.4s; + rev64 v1.4s, v1.4s; + ext v1.16b, v1.16b, v1.16b, #8; + rev32 v1.16b, v1.16b; + + /* overlapping stores */ + add x5, x1, x5 + st1 {v0.16b}, [x5] + st1 {v1.16b}, [x1] + + ret +.size sm4_v8_cbc_cts_encrypt,.-sm4_v8_cbc_cts_encrypt + +.globl sm4_v8_cbc_cts_decrypt +.type sm4_v8_cbc_cts_decrypt,%function +.align 5 +sm4_v8_cbc_cts_decrypt: + ld1 {v24.16b-v27.16b}, [x3], #64; + ld1 {v28.16b-v31.16b}, [x3]; + + sub x5, x2, #16 + + ld1 {v20.16b}, [x4] + + /* load permute table */ + adr_l x6, .Lcts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v3.16b}, [x6] + ld1 {v4.16b}, [x7] + + /* 
overlapping loads */ + ld1 {v0.16b}, [x0], x5 + ld1 {v1.16b}, [x0] + + rev32 v0.16b, v0.16b; + sm4e v0.4s, v24.4s; + sm4e v0.4s, v25.4s; + sm4e v0.4s, v26.4s; + sm4e v0.4s, v27.4s; + sm4e v0.4s, v28.4s; + sm4e v0.4s, v29.4s; + sm4e v0.4s, v30.4s; + sm4e v0.4s, v31.4s; + rev64 v0.4s, v0.4s; + ext v0.16b, v0.16b, v0.16b, #8; + rev32 v0.16b, v0.16b; + + /* select the first Ln bytes of Xn to create Pn */ + tbl v2.16b, {v0.16b}, v3.16b + eor v2.16b, v2.16b, v1.16b + + /* overwrite the first Ln bytes with Cn to create En-1 */ + tbx v0.16b, {v1.16b}, v4.16b + + rev32 v0.16b, v0.16b; + sm4e v0.4s, v24.4s; + sm4e v0.4s, v25.4s; + sm4e v0.4s, v26.4s; + sm4e v0.4s, v27.4s; + sm4e v0.4s, v28.4s; + sm4e v0.4s, v29.4s; + sm4e v0.4s, v30.4s; + sm4e v0.4s, v31.4s; + rev64 v0.4s, v0.4s; + ext v0.16b, v0.16b, v0.16b, #8; + rev32 v0.16b, v0.16b; + + eor v0.16b, v0.16b, v20.16b + + /* overlapping stores */ + add x5, x1, x5 + st1 {v2.16b}, [x5] + st1 {v0.16b}, [x1] + + ret +.size sm4_v8_cbc_cts_decrypt,.-sm4_v8_cbc_cts_decrypt + +.Lcts_permute_table: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .globl sm4_v8_ecb_encrypt .type sm4_v8_ecb_encrypt,%function .align 5