From: Yang Shen <shenyang39@huawei.com>

Rewrite the cbc_cts mode to follow the OpenSSL code style.

Signed-off-by: Yang Shen <shenyang39@huawei.com>
---
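For reference, a plain-C sketch of the ciphertext-stealing (CTS) tail handling
that sm4_v8_cbc_cts_encrypt/sm4_v8_cbc_cts_decrypt implement below. The names
(block_fn, cbc_cts_encrypt_tail, cbc_cts_decrypt_tail) and the calling
convention are illustrative assumptions, not the driver's API: 'in' holds the
final 16+tail input bytes (16 < len <= 32), 'iv' is the running CBC chaining
value, and the round keys behind 'key' are assumed to be prepared for the
given direction.

#include <stddef.h>
#include <string.h>

/* Illustrative stand-in for a single-block SM4 primitive; not the uadk API. */
typedef void (*block_fn)(const void *key, const unsigned char in[16],
                         unsigned char out[16]);

/* Encrypt-side CTS tail: produces C(n-1) || Cn from P(n-1) || Pn. */
static void cbc_cts_encrypt_tail(block_fn enc, const void *key,
                                 const unsigned char *in, size_t len,
                                 const unsigned char iv[16],
                                 unsigned char *out)
{
    unsigned char eprev[16], last[16] = { 0 };
    size_t tail = len - 16;
    size_t i;

    /* E(n-1) = Enc(P(n-1) ^ IV) */
    for (i = 0; i < 16; i++)
        eprev[i] = in[i] ^ iv[i];
    enc(key, eprev, eprev);

    /* C(n-1) = Enc(E(n-1) ^ (Pn padded with zeros)) */
    memcpy(last, in + 16, tail);
    for (i = 0; i < 16; i++)
        last[i] ^= eprev[i];
    enc(key, last, last);

    /* Swapped ("stolen") output: full block C(n-1) first,
     * then Cn = the first 'tail' bytes of E(n-1). */
    memcpy(out, last, 16);
    memcpy(out + 16, eprev, tail);
}

/* Decrypt-side CTS tail: recovers P(n-1) || Pn from C(n-1) || Cn. */
static void cbc_cts_decrypt_tail(block_fn dec, const void *key,
                                 const unsigned char *in, size_t len,
                                 const unsigned char iv[16],
                                 unsigned char *out)
{
    unsigned char xn[16], eprev[16], cn[16];
    size_t tail = len - 16;
    size_t i;

    memcpy(cn, in + 16, tail);
    dec(key, in, xn);                       /* Xn = Dec(C(n-1)) */

    /* Rebuild E(n-1) = Cn || last (16 - tail) bytes of Xn */
    memcpy(eprev, cn, tail);
    memcpy(eprev + tail, xn + tail, 16 - tail);
    dec(key, eprev, eprev);

    for (i = 0; i < 16; i++)
        out[i] = eprev[i] ^ iv[i];          /* P(n-1) */
    for (i = 0; i < tail; i++)
        out[16 + i] = xn[i] ^ cn[i];        /* Pn */
}

The overlapping loads/stores and the tbl/tbx permutes in the assembly are a
branch-free way of doing the zero padding, byte stealing, and swapped stores
shown above.
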
drv/isa_ce_sm4_armv8.S | 201 +++++++++++++++++++----------------------
1 file changed, 94 insertions(+), 107 deletions(-)
diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S
index 2f9477a..6ebf39b 100644
--- a/drv/isa_ce_sm4_armv8.S
+++ b/drv/isa_ce_sm4_armv8.S
@@ -24,11 +24,6 @@
.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
 
-.macro adr_l, dst, sym
- adrp \dst, \sym
- add \dst, \dst, :lo12:\sym
-.endm
-
.text
.align 6
.Lck:
@@ -517,63 +512,62 @@ sm4_v8_cbc_encrypt:
.align 5
sm4_v8_cbc_cts_encrypt:
AARCH64_VALID_CALL_TARGET
- ld1 {v24.16b-v27.16b}, [x3], #64;
- ld1 {v28.16b-v31.16b}, [x3];
-
- sub x5, x2, #16
-
- ld1 {v20.16b}, [x4]
-
- ld1 {v0.16b}, [x0]
- eor v20.16b, v20.16b, v0.16b
- rev32 v20.16b, v20.16b;
- sm4e v20.4s, v24.4s;
- sm4e v20.4s, v25.4s;
- sm4e v20.4s, v26.4s;
- sm4e v20.4s, v27.4s;
- sm4e v20.4s, v28.4s;
- sm4e v20.4s, v29.4s;
- sm4e v20.4s, v30.4s;
- sm4e v20.4s, v31.4s;
- rev64 v20.4s, v20.4s;
- ext v20.16b, v20.16b, v20.16b, #8;
- rev32 v20.16b, v20.16b;
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x3], #64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x3]
+ sub x5, x2, #16
+
+ ld1 {v8.4s}, [x4]
+
+ ld1 {v10.4s}, [x0]
+ eor v8.16b, v8.16b, v10.16b
+ rev32 v8.16b, v8.16b;
+ sm4e v8.4s, v0.4s;
+ sm4e v8.4s, v1.4s;
+ sm4e v8.4s, v2.4s;
+ sm4e v8.4s, v3.4s;
+ sm4e v8.4s, v4.4s;
+ sm4e v8.4s, v5.4s;
+ sm4e v8.4s, v6.4s;
+ sm4e v8.4s, v7.4s;
+ rev64 v8.4s, v8.4s;
+ ext v8.16b, v8.16b, v8.16b, #8;
+ rev32 v8.16b, v8.16b;
/* load permute table */
- adr x6, .Lcts_permute_table
- add x7, x6, #32
- add x6, x6, x5
- sub x7, x7, x5
- ld1 {v3.16b}, [x6]
- ld1 {v4.16b}, [x7]
+ adr x6, .cts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5
+ sub x7, x7, x5
+ ld1 {v13.4s}, [x6]
+ ld1 {v14.4s}, [x7]
/* overlapping loads */
- add x0, x0, x5
- ld1 {v1.16b}, [x0]
+ add x0, x0, x5
+ ld1 {v11.4s}, [x0]
/* create Cn from En-1 */
- tbl v0.16b, {v20.16b}, v3.16b
+ tbl v10.16b, {v8.16b}, v13.16b
/* padding Pn with zeros */
- tbl v1.16b, {v1.16b}, v4.16b
-
- eor v1.16b, v1.16b, v20.16b
- rev32 v1.16b, v1.16b;
- sm4e v1.4s, v24.4s;
- sm4e v1.4s, v25.4s;
- sm4e v1.4s, v26.4s;
- sm4e v1.4s, v27.4s;
- sm4e v1.4s, v28.4s;
- sm4e v1.4s, v29.4s;
- sm4e v1.4s, v30.4s;
- sm4e v1.4s, v31.4s;
- rev64 v1.4s, v1.4s;
- ext v1.16b, v1.16b, v1.16b, #8;
- rev32 v1.16b, v1.16b;
+ tbl v11.16b, {v11.16b}, v14.16b
+
+ eor v11.16b, v11.16b, v8.16b
+ rev32 v11.16b, v11.16b;
+ sm4e v11.4s, v0.4s;
+ sm4e v11.4s, v1.4s;
+ sm4e v11.4s, v2.4s;
+ sm4e v11.4s, v3.4s;
+ sm4e v11.4s, v4.4s;
+ sm4e v11.4s, v5.4s;
+ sm4e v11.4s, v6.4s;
+ sm4e v11.4s, v7.4s;
+ rev64 v11.4s, v11.4s;
+ ext v11.16b, v11.16b, v11.16b, #8;
+ rev32 v11.16b, v11.16b;
/* overlapping stores */
- add x5, x1, x5
- st1 {v0.16b}, [x5]
- st1 {v1.16b}, [x1]
+ add x5, x1, x5
+ st1 {v10.16b}, [x5]
+ st1 {v11.16b}, [x1]
ret
.size sm4_v8_cbc_cts_encrypt,.-sm4_v8_cbc_cts_encrypt
@@ -582,76 +576,69 @@ sm4_v8_cbc_cts_encrypt:
.type sm4_v8_cbc_cts_decrypt,%function
.align 5
sm4_v8_cbc_cts_decrypt:
- ld1 {v24.16b-v27.16b}, [x3], #64;
- ld1 {v28.16b-v31.16b}, [x3];
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x3], #64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x3]
 
- sub x5, x2, #16
+ sub x5, x2, #16
 
- ld1 {v20.16b}, [x4]
+ ld1 {v8.4s}, [x4]
 
/* load permute table */
- adr_l x6, .Lcts_permute_table
- add x7, x6, #32
- add x6, x6, x5
- sub x7, x7, x5
- ld1 {v3.16b}, [x6]
- ld1 {v4.16b}, [x7]
+ adr x6, .cts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5
+ sub x7, x7, x5
+ ld1 {v13.4s}, [x6]
+ ld1 {v14.4s}, [x7]
/* overlapping loads */
- ld1 {v0.16b}, [x0], x5
- ld1 {v1.16b}, [x0]
-
- rev32 v0.16b, v0.16b;
- sm4e v0.4s, v24.4s;
- sm4e v0.4s, v25.4s;
- sm4e v0.4s, v26.4s;
- sm4e v0.4s, v27.4s;
- sm4e v0.4s, v28.4s;
- sm4e v0.4s, v29.4s;
- sm4e v0.4s, v30.4s;
- sm4e v0.4s, v31.4s;
- rev64 v0.4s, v0.4s;
- ext v0.16b, v0.16b, v0.16b, #8;
- rev32 v0.16b, v0.16b;
+ ld1 {v10.16b}, [x0], x5
+ ld1 {v11.16b}, [x0]
+
+ rev32 v10.16b, v10.16b;
+ sm4e v10.4s, v0.4s;
+ sm4e v10.4s, v1.4s;
+ sm4e v10.4s, v2.4s;
+ sm4e v10.4s, v3.4s;
+ sm4e v10.4s, v4.4s;
+ sm4e v10.4s, v5.4s;
+ sm4e v10.4s, v6.4s;
+ sm4e v10.4s, v7.4s;
+ rev64 v10.4s, v10.4s;
+ ext v10.16b, v10.16b, v10.16b, #8;
+ rev32 v10.16b, v10.16b;
/* select the first Ln bytes of Xn to create Pn */
- tbl v2.16b, {v0.16b}, v3.16b
- eor v2.16b, v2.16b, v1.16b
+ tbl v12.16b, {v10.16b}, v13.16b
+ eor v12.16b, v12.16b, v11.16b
/* overwrite the first Ln bytes with Cn to create En-1 */
- tbx v0.16b, {v1.16b}, v4.16b
-
- rev32 v0.16b, v0.16b;
- sm4e v0.4s, v24.4s;
- sm4e v0.4s, v25.4s;
- sm4e v0.4s, v26.4s;
- sm4e v0.4s, v27.4s;
- sm4e v0.4s, v28.4s;
- sm4e v0.4s, v29.4s;
- sm4e v0.4s, v30.4s;
- sm4e v0.4s, v31.4s;
- rev64 v0.4s, v0.4s;
- ext v0.16b, v0.16b, v0.16b, #8;
- rev32 v0.16b, v0.16b;
-
- eor v0.16b, v0.16b, v20.16b
+ tbx v10.16b, {v11.16b}, v14.16b
+
+ rev32 v10.16b, v10.16b;
+ sm4e v10.4s, v0.4s;
+ sm4e v10.4s, v1.4s;
+ sm4e v10.4s, v2.4s;
+ sm4e v10.4s, v3.4s;
+ sm4e v10.4s, v4.4s;
+ sm4e v10.4s, v5.4s;
+ sm4e v10.4s, v6.4s;
+ sm4e v10.4s, v7.4s;
+ rev64 v10.4s, v10.4s;
+ ext v10.16b, v10.16b, v10.16b, #8;
+ rev32 v10.16b, v10.16b;
+
+ eor v10.16b, v10.16b, v8.16b
/* overlapping stores */
- add x5, x1, x5
- st1 {v2.16b}, [x5]
- st1 {v0.16b}, [x1]
+ add x5, x1, x5
+ st1 {v12.16b}, [x5]
+ st1 {v10.16b}, [x1]
ret
.size sm4_v8_cbc_cts_decrypt,.-sm4_v8_cbc_cts_decrypt
-.Lcts_permute_table:
- .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
- .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
- .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
- .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-
.globl sm4_v8_ecb_encrypt
.type sm4_v8_ecb_encrypt,%function
.align 5
--
2.33.0