From: Yang Shen <shenyang39@huawei.com>
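Rewrite the cbc_cts mode routines to follow the OpenSSL code style. The round keys now live in v0-v7 and the working data in v8 and v10-v14, the permute table is addressed with a plain adr of .cts_permute_table (the adr_l macro and the local .Lcts_permute_table copy are removed), and AARCH64_VALID_CALL_TARGET is added at the entry of sm4_v8_cbc_cts_decrypt.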
Signed-off-by: Yang Shen <shenyang39@huawei.com> --- drv/isa_ce_sm4_armv8.S | 201 +++++++++++++++++++---------------------- 1 file changed, 94 insertions(+), 107 deletions(-)
diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S index 2f9477a..6ebf39b 100644 --- a/drv/isa_ce_sm4_armv8.S +++ b/drv/isa_ce_sm4_armv8.S @@ -24,11 +24,6 @@ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd .endm
-.macro adr_l, dst, sym - adrp \dst, \sym - add \dst, \dst, :lo12:\sym -.endm - .text .align 6 .Lck: @@ -517,63 +512,62 @@ sm4_v8_cbc_encrypt: .align 5 sm4_v8_cbc_cts_encrypt: AARCH64_VALID_CALL_TARGET - ld1 {v24.16b-v27.16b}, [x3], #64; - ld1 {v28.16b-v31.16b}, [x3]; - - sub x5, x2, #16 - - ld1 {v20.16b}, [x4] - - ld1 {v0.16b}, [x0] - eor v20.16b, v20.16b, v0.16b - rev32 v20.16b, v20.16b; - sm4e v20.4s, v24.4s; - sm4e v20.4s, v25.4s; - sm4e v20.4s, v26.4s; - sm4e v20.4s, v27.4s; - sm4e v20.4s, v28.4s; - sm4e v20.4s, v29.4s; - sm4e v20.4s, v30.4s; - sm4e v20.4s, v31.4s; - rev64 v20.4s, v20.4s; - ext v20.16b, v20.16b, v20.16b, #8; - rev32 v20.16b, v20.16b; + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x3], #64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x3] + sub x5, x2, #16 + + ld1 {v8.4s}, [x4] + + ld1 {v10.4s}, [x0] + eor v8.16b, v8.16b, v10.16b + rev32 v8.16b, v8.16b; + sm4e v8.4s, v0.4s; + sm4e v8.4s, v1.4s; + sm4e v8.4s, v2.4s; + sm4e v8.4s, v3.4s; + sm4e v8.4s, v4.4s; + sm4e v8.4s, v5.4s; + sm4e v8.4s, v6.4s; + sm4e v8.4s, v7.4s; + rev64 v8.4s, v8.4s; + ext v8.16b, v8.16b, v8.16b, #8; + rev32 v8.16b, v8.16b;
/* load permute table */ - adr x6, .Lcts_permute_table - add x7, x6, #32 - add x6, x6, x5 - sub x7, x7, x5 - ld1 {v3.16b}, [x6] - ld1 {v4.16b}, [x7] + adr x6, .cts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v13.4s}, [x6] + ld1 {v14.4s}, [x7]
/* overlapping loads */ - add x0, x0, x5 - ld1 {v1.16b}, [x0] + add x0, x0, x5 + ld1 {v11.4s}, [x0]
/* create Cn from En-1 */ - tbl v0.16b, {v20.16b}, v3.16b + tbl v10.16b, {v8.16b}, v13.16b /* padding Pn with zeros */ - tbl v1.16b, {v1.16b}, v4.16b - - eor v1.16b, v1.16b, v20.16b - rev32 v1.16b, v1.16b; - sm4e v1.4s, v24.4s; - sm4e v1.4s, v25.4s; - sm4e v1.4s, v26.4s; - sm4e v1.4s, v27.4s; - sm4e v1.4s, v28.4s; - sm4e v1.4s, v29.4s; - sm4e v1.4s, v30.4s; - sm4e v1.4s, v31.4s; - rev64 v1.4s, v1.4s; - ext v1.16b, v1.16b, v1.16b, #8; - rev32 v1.16b, v1.16b; + tbl v11.16b, {v11.16b}, v14.16b + + eor v11.16b, v11.16b, v8.16b + rev32 v11.16b, v11.16b; + sm4e v11.4s, v0.4s; + sm4e v11.4s, v1.4s; + sm4e v11.4s, v2.4s; + sm4e v11.4s, v3.4s; + sm4e v11.4s, v4.4s; + sm4e v11.4s, v5.4s; + sm4e v11.4s, v6.4s; + sm4e v11.4s, v7.4s; + rev64 v11.4s, v11.4s; + ext v11.16b, v11.16b, v11.16b, #8; + rev32 v11.16b, v11.16b;
/* overlapping stores */ - add x5, x1, x5 - st1 {v0.16b}, [x5] - st1 {v1.16b}, [x1] + add x5, x1, x5 + st1 {v10.16b}, [x5] + st1 {v11.16b}, [x1]
ret .size sm4_v8_cbc_cts_encrypt,.-sm4_v8_cbc_cts_encrypt @@ -582,76 +576,69 @@ sm4_v8_cbc_cts_encrypt: .type sm4_v8_cbc_cts_decrypt,%function .align 5 sm4_v8_cbc_cts_decrypt: - ld1 {v24.16b-v27.16b}, [x3], #64; - ld1 {v28.16b-v31.16b}, [x3]; + AARCH64_VALID_CALL_TARGET + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x3], #64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x3]
- sub x5, x2, #16 + sub x5, x2, #16
- ld1 {v20.16b}, [x4] + ld1 {v8.4s}, [x4]
/* load permute table */ - adr_l x6, .Lcts_permute_table - add x7, x6, #32 - add x6, x6, x5 - sub x7, x7, x5 - ld1 {v3.16b}, [x6] - ld1 {v4.16b}, [x7] + adr x6, .cts_permute_table + add x7, x6, #32 + add x6, x6, x5 + sub x7, x7, x5 + ld1 {v13.4s}, [x6] + ld1 {v14.4s}, [x7]
/* overlapping loads */ - ld1 {v0.16b}, [x0], x5 - ld1 {v1.16b}, [x0] - - rev32 v0.16b, v0.16b; - sm4e v0.4s, v24.4s; - sm4e v0.4s, v25.4s; - sm4e v0.4s, v26.4s; - sm4e v0.4s, v27.4s; - sm4e v0.4s, v28.4s; - sm4e v0.4s, v29.4s; - sm4e v0.4s, v30.4s; - sm4e v0.4s, v31.4s; - rev64 v0.4s, v0.4s; - ext v0.16b, v0.16b, v0.16b, #8; - rev32 v0.16b, v0.16b; + ld1 {v10.16b}, [x0], x5 + ld1 {v11.16b}, [x0] + + rev32 v10.16b, v10.16b; + sm4e v10.4s, v0.4s; + sm4e v10.4s, v1.4s; + sm4e v10.4s, v2.4s; + sm4e v10.4s, v3.4s; + sm4e v10.4s, v4.4s; + sm4e v10.4s, v5.4s; + sm4e v10.4s, v6.4s; + sm4e v10.4s, v7.4s; + rev64 v10.4s, v10.4s; + ext v10.16b, v10.16b, v10.16b, #8; + rev32 v10.16b, v10.16b;
/* select the first Ln bytes of Xn to create Pn */ - tbl v2.16b, {v0.16b}, v3.16b - eor v2.16b, v2.16b, v1.16b + tbl v12.16b, {v10.16b}, v13.16b + eor v12.16b, v12.16b, v11.16b
/* overwrite the first Ln bytes with Cn to create En-1 */ - tbx v0.16b, {v1.16b}, v4.16b - - rev32 v0.16b, v0.16b; - sm4e v0.4s, v24.4s; - sm4e v0.4s, v25.4s; - sm4e v0.4s, v26.4s; - sm4e v0.4s, v27.4s; - sm4e v0.4s, v28.4s; - sm4e v0.4s, v29.4s; - sm4e v0.4s, v30.4s; - sm4e v0.4s, v31.4s; - rev64 v0.4s, v0.4s; - ext v0.16b, v0.16b, v0.16b, #8; - rev32 v0.16b, v0.16b; - - eor v0.16b, v0.16b, v20.16b + tbx v10.16b, {v11.16b}, v14.16b + + rev32 v10.16b, v10.16b; + sm4e v10.4s, v0.4s; + sm4e v10.4s, v1.4s; + sm4e v10.4s, v2.4s; + sm4e v10.4s, v3.4s; + sm4e v10.4s, v4.4s; + sm4e v10.4s, v5.4s; + sm4e v10.4s, v6.4s; + sm4e v10.4s, v7.4s; + rev64 v10.4s, v10.4s; + ext v10.16b, v10.16b, v10.16b, #8; + rev32 v10.16b, v10.16b; + + eor v10.16b, v10.16b, v8.16b
/* overlapping stores */ - add x5, x1, x5 - st1 {v2.16b}, [x5] - st1 {v0.16b}, [x1] + add x5, x1, x5 + st1 {v12.16b}, [x5] + st1 {v10.16b}, [x1]
ret .size sm4_v8_cbc_cts_decrypt,.-sm4_v8_cbc_cts_decrypt
-.Lcts_permute_table: - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff - .globl sm4_v8_ecb_encrypt .type sm4_v8_ecb_encrypt,%function .align 5