From: YueHaibing yuehaibing@huawei.com
hulk inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I8K36D CVE: NA
-------------------------------------------------
This is the ARM64 CRC T10 DIF transform accelerated with the ARMv8 NEON instruction.
Signed-off-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Li Bin huawei.libin@huawei.com Signed-off-by: GUO Zihua guozihua@huawei.com --- arch/arm64/crypto/Makefile | 3 +- arch/arm64/crypto/crct10dif-neon-asm_64.S | 752 ++++++++++++++++++++++ arch/arm64/crypto/crct10dif-neon_glue.c | 117 ++++ 3 files changed, 871 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/crypto/crct10dif-neon-asm_64.S create mode 100644 arch/arm64/crypto/crct10dif-neon_glue.c
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index fbe64dce66e0..c9d88436411b 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -45,7 +45,8 @@ obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o -crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o +crct10dif-ce-y := crct10dif-neon-asm_64.o crct10dif-neon_glue.o +AFLAGS_crct10dif-neon-asm_64.o := -march=armv8-a+crypto
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm64/crypto/crct10dif-neon-asm_64.S b/arch/arm64/crypto/crct10dif-neon-asm_64.S new file mode 100644 index 000000000000..a37204bf5a7a --- /dev/null +++ b/arch/arm64/crypto/crct10dif-neon-asm_64.S @@ -0,0 +1,752 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +.global crc_t10dif_neon +.text + +/* X0 is initial CRC value + * X1 is data buffer + * X2 is the length of buffer + * X3 is the backup buffer(for extend) + * X4 for other extend parameter(for extend) + * Q0, Q1, Q2, Q3 maybe as parameter for other functions, + * the value of Q0, Q1, Q2, Q3 maybe modified. + * + * suggestion: + * 1. dont use general purpose register for calculation + * 2. set data endianness outside of the kernel + * 3. use ext as shifting around + * 4. dont use LD3/LD4, ST3/ST4 + */ + +crc_t10dif_neon: + /* push the register to stack that CRC16 will use */ + STP X5, X6, [sp, #-0x10]! + STP X7, X8, [sp, #-0x10]! + STP X9, X10, [sp, #-0x10]! + STP X11, X12, [sp, #-0x10]! + STP X13, X14, [sp, #-0x10]! + STP Q10, Q11, [sp, #-0x20]! + STP Q12, Q13, [sp, #-0x20]! + STP Q4, Q5, [sp, #-0x20]! + STP Q6, Q7, [sp, #-0x20]! + STP Q8, Q9, [sp, #-0x20]! + STP Q14, Q15, [sp, #-0x20]! + STP Q16, Q17, [sp, #-0x20]! + STP Q18, Q19, [sp, #-0x20]! + + SUB sp,sp,#0x20 + + MOV X11, #0 // PUSH STACK FLAG + + CMP X2, #0x80 + B.LT 2f // _less_than_128, <128 + + /* V10/V11/V12/V13 is 128bit. + * we get data 512bit( by cacheline ) each time + */ + LDP Q10, Q11, [X1], #0x20 + LDP Q12, Q13, [X1], #0x20 + + /* move the initial value to V6 register */ + LSL X0, X0, #48 + EOR V6.16B, V6.16B, V6.16B + MOV V6.D[1], X0 + + /* big-little end change. because the data in memory is little-end, + * we deal the data for bigend + */ + + REV64 V10.16B, V10.16B + REV64 V11.16B, V11.16B + REV64 V12.16B, V12.16B + REV64 V13.16B, V13.16B + EXT V10.16B, V10.16B, V10.16B, #8 + EXT V11.16B, V11.16B, V11.16B, #8 + EXT V12.16B, V12.16B, V12.16B, #8 + EXT V13.16B, V13.16B, V13.16B, #8 + + EOR V10.16B, V10.16B, V6.16B + + SUB X2, X2, #0x80 + ADD X5, X1, #0x20 + + /* deal data when the size of buffer bigger than 128 bytes */ + /* _fold_64_B_loop */ + LDR Q6,=0xe658000000000000044c000000000000 +1: + + LDP Q16, Q17, [X1] ,#0x40 + LDP Q18, Q19, [X5], #0x40 + + /* carry-less multiply. + * V10 high-64bits carry-less multiply + * V6 high-64bits(PMULL2) + * V11 low-64bits carry-less multiply V6 low-64bits(PMULL) + */ + + PMULL2 V4.1Q, V10.2D, V6.2D + PMULL V10.1Q, V10.1D, V6.1D + PMULL2 V5.1Q, V11.2D, V6.2D + PMULL V11.1Q, V11.1D, V6.1D + + REV64 V16.16B, V16.16B + REV64 V17.16B, V17.16B + REV64 V18.16B, V18.16B + REV64 V19.16B, V19.16B + + PMULL2 V14.1Q, V12.2D, V6.2D + PMULL V12.1Q, V12.1D, V6.1D + PMULL2 V15.1Q, V13.2D, V6.2D + PMULL V13.1Q, V13.1D, V6.1D + + EXT V16.16B, V16.16B, V16.16B, #8 + EOR V10.16B, V10.16B, V4.16B + + EXT V17.16B, V17.16B, V17.16B, #8 + EOR V11.16B, V11.16B, V5.16B + + EXT V18.16B, V18.16B, V18.16B, #8 + EOR V12.16B, V12.16B, V14.16B + + EXT V19.16B, V19.16B, V19.16B, #8 + EOR V13.16B, V13.16B, V15.16B + + SUB X2, X2, #0x40 + + + EOR V10.16B, V10.16B, V16.16B + EOR V11.16B, V11.16B, V17.16B + + EOR V12.16B, V12.16B, V18.16B + EOR V13.16B, V13.16B, V19.16B + + CMP X2, #0x0 + B.GE 1b // >=0 + + LDR Q6, =0x06df0000000000002d56000000000000 + MOV V4.16B, V10.16B + /* V10 carry-less 0x06df000000000000([127:64]*[127:64]) */ + PMULL V4.1Q, V4.1D, V6.1D //switch PMULL & PMULL2 order + PMULL2 V10.1Q, V10.2D, V6.2D + EOR V11.16B, V11.16B, V4.16B + EOR V11.16B, V11.16B, V10.16B + + MOV V4.16B, V11.16B + PMULL V4.1Q, V4.1D, V6.1D //switch PMULL & PMULL2 order + PMULL2 V11.1Q, V11.2D, V6.2D + EOR V12.16B, V12.16B, V4.16B + EOR V12.16B, V12.16B, V11.16B + + MOV V4.16B, V12.16B + PMULL V4.1Q, V4.1D, V6.1D //switch PMULL & PMULL2 order + PMULL2 V12.1Q, V12.2D, V6.2D + EOR V13.16B, V13.16B, V4.16B + EOR V13.16B, V13.16B, V12.16B + + ADD X2, X2, #48 + CMP X2, #0x0 + B.LT 3f // _final_reduction_for_128, <0 + + /* _16B_reduction_loop */ +4: + /* unrelated load as early as possible*/ + LDR Q10, [X1], #0x10 + + MOV V4.16B, V13.16B + PMULL2 V13.1Q, V13.2D, V6.2D + PMULL V4.1Q, V4.1D, V6.1D + EOR V13.16B, V13.16B, V4.16B + + REV64 V10.16B, V10.16B + EXT V10.16B, V10.16B, V10.16B, #8 + + EOR V13.16B, V13.16B, V10.16B + + SUB X2, X2, #0x10 + CMP X2, #0x0 + B.GE 4b // _16B_reduction_loop, >=0 + + /* _final_reduction_for_128 */ +3: ADD X2, X2, #0x10 + CMP X2, #0x0 + B.EQ 5f // _128_done, ==0 + + /* _get_last_two_xmms */ +6: MOV V12.16B, V13.16B + SUB X1, X1, #0x10 + ADD X1, X1, X2 + LDR Q11, [X1], #0x10 + REV64 V11.16B, V11.16B + EXT V11.16B, V11.16B, V11.16B, #8 + + CMP X2, #8 + B.EQ 50f + B.LT 51f + B.GT 52f + +50: + /* dont use X register as temp one */ + FMOV D14, D12 + MOVI D12, #0 + MOV V12.D[1],V14.D[0] + B 53f +51: + MOV X9, #64 + LSL X13, X2, #3 // <<3 equal x8 + SUB X9, X9, X13 + MOV X5, V12.D[0] // low 64-bit + MOV X6, V12.D[1] // high 64-bit + LSR X10, X5, X9 // high bit of low 64-bit + LSL X7, X5, X13 + LSL X8, X6, X13 + ORR X8, X8, X10 // combination of high 64-bit + MOV V12.D[1], X8 + MOV V12.D[0], X7 + + B 53f +52: + LSL X13, X2, #3 // <<3 equal x8 + SUB X13, X13, #64 + + DUP V18.2D, X13 + FMOV D16, D12 + USHL D16, D16, D18 + EXT V12.16B, V16.16B, V16.16B, #8 + +53: + MOVI D14, #0 //add one zero constant + + CMP X2, #0 + B.EQ 30f + CMP X2, #1 + B.EQ 31f + CMP X2, #2 + B.EQ 32f + CMP X2, #3 + B.EQ 33f + CMP X2, #4 + B.EQ 34f + CMP X2, #5 + B.EQ 35f + CMP X2, #6 + B.EQ 36f + CMP X2, #7 + B.EQ 37f + CMP X2, #8 + B.EQ 38f + CMP X2, #9 + B.EQ 39f + CMP X2, #10 + B.EQ 40f + CMP X2, #11 + B.EQ 41f + CMP X2, #12 + B.EQ 42f + CMP X2, #13 + B.EQ 43f + CMP X2, #14 + B.EQ 44f + CMP X2, #15 + B.EQ 45f + + // >> 128bit +30: + EOR V13.16B, V13.16B, V13.16B + EOR V8.16B, V8.16B, V8.16B + LDR Q9,=0xffffffffffffffffffffffffffffffff + B 46f + + // >> 120bit +31: + USHR V13.2D, V13.2D, #56 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xff + LDR Q9,=0xffffffffffffffffffffffffffffff00 + B 46f + + // >> 112bit +32: + USHR V13.2D, V13.2D, #48 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffff + LDR Q9,=0xffffffffffffffffffffffffffff0000 + B 46f + + // >> 104bit +33: + USHR V13.2D, V13.2D, #40 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffff + LDR Q9,=0xffffffffffffffffffffffffff000000 + B 46f + + // >> 96bit +34: + USHR V13.2D, V13.2D, #32 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffff + LDR Q9,=0xffffffffffffffffffffffff00000000 + B 46f + + // >> 88bit +35: + USHR V13.2D, V13.2D, #24 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffffff + LDR Q9,=0xffffffffffffffffffffff0000000000 + B 46f + + // >> 80bit +36: + USHR V13.2D, V13.2D, #16 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffffffff + LDR Q9,=0xffffffffffffffffffff000000000000 + B 46f + + // >> 72bit +37: + USHR V13.2D, V13.2D, #8 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffffffffff + LDR Q9,=0xffffffffffffffffff00000000000000 + B 46f + + // >> 64bit +38: + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffffffffffff + LDR Q9,=0xffffffffffffffff0000000000000000 + B 46f + + // >> 56bit +39: + EXT V13.16B, V13.16B, V13.16B, #7 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + MOV V13.B[9], V14.B[0] + + LDR Q8,=0xffffffffffffffffff + LDR Q9,=0xffffffffffffff000000000000000000 + B 46f + + // >> 48bit +40: + EXT V13.16B, V13.16B, V13.16B, #6 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + + LDR Q8,=0xffffffffffffffffffff + LDR Q9,=0xffffffffffff00000000000000000000 + B 46f + + // >> 40bit +41: + EXT V13.16B, V13.16B, V13.16B, #5 + MOV V13.S[3], V14.S[0] + MOV V13.B[11], V14.B[0] + + LDR Q8,=0xffffffffffffffffffffff + LDR Q9,=0xffffffffff0000000000000000000000 + B 46f + + // >> 32bit +42: + EXT V13.16B, V13.16B, V13.16B, #4 + MOV V13.S[3], V14.S[0] + + LDR Q8,=0xffffffffffffffffffffffff + LDR Q9,=0xffffffff000000000000000000000000 + B 46f + + // >> 24bit +43: + EXT V13.16B, V13.16B, V13.16B, #3 + MOV V13.H[7], V14.H[0] + MOV V13.B[13], V14.B[0] + + LDR Q8,=0xffffffffffffffffffffffffff + LDR Q9,=0xffffff00000000000000000000000000 + B 46f + + // >> 16bit +44: + EXT V13.16B, V13.16B, V13.16B, #2 + MOV V13.H[7], V14.H[0] + + LDR Q8,=0xffffffffffffffffffffffffffff + LDR Q9,=0xffff0000000000000000000000000000 + B 46f + + // >> 8bit +45: + EXT V13.16B, V13.16B, V13.16B, #1 + MOV V13.B[15], V14.B[0] + + LDR Q8,=0xffffffffffffffffffffffffffffff + LDR Q9,=0xff000000000000000000000000000000 + + // backup V12 first + // pblendvb xmm1, xmm2 +46: + AND V12.16B, V12.16B, V9.16B + AND V11.16B, V11.16B, V8.16B + ORR V11.16B, V11.16B, V12.16B + + MOV V12.16B, V11.16B + MOV V4.16B, V13.16B + PMULL2 V13.1Q, V13.2D, V6.2D + PMULL V4.1Q, V4.1D, V6.1D + EOR V13.16B, V13.16B, V4.16B + EOR V13.16B, V13.16B, V12.16B + + /* _128_done. we change the Q6 D[0] and D[1] */ +5: LDR Q6, =0x2d560000000000001368000000000000 + MOVI D14, #0 + MOV V10.16B, V13.16B + PMULL2 V13.1Q, V13.2D, V6.2D + + MOV V10.D[1], V10.D[0] + MOV V10.D[0], V14.D[0] //set zero + + EOR V13.16B, V13.16B, V10.16B + + MOV V10.16B, V13.16B + LDR Q7, =0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + AND V10.16B, V10.16B, V7.16B + + MOV S13, V13.S[3] + + PMULL V13.1Q, V13.1D, V6.1D + EOR V13.16B, V13.16B, V10.16B + + /* _barrett */ +7: LDR Q6, =0x00000001f65a57f8000000018bb70000 + MOVI D14, #0 + MOV V10.16B, V13.16B + PMULL2 V13.1Q, V13.2D, V6.2D + + EXT V13.16B, V13.16B, V13.16B, #12 + MOV V13.S[0], V14.S[0] + + EXT V6.16B, V6.16B, V6.16B, #8 + PMULL2 V13.1Q, V13.2D, V6.2D + + EXT V13.16B, V13.16B, V13.16B, #12 + MOV V13.S[0], V14.S[0] + + EOR V13.16B, V13.16B, V10.16B + MOV X0, V13.D[0] + + /* _cleanup */ +8: MOV X14, #48 + LSR X0, X0, X14 +99: + ADD sp, sp, #0x20 + + LDP Q18, Q19, [sp], #0x20 + LDP Q16, Q17, [sp], #0x20 + LDP Q14, Q15, [sp], #0x20 + + LDP Q8, Q9, [sp], #0x20 + LDP Q6, Q7, [sp], #0x20 + LDP Q4, Q5, [sp], #0x20 + LDP Q12, Q13, [sp], #0x20 + LDP Q10, Q11, [sp], #0x20 + LDP X13, X14, [sp], #0x10 + LDP X11, X12, [sp], #0x10 + LDP X9, X10, [sp], #0x10 + LDP X7, X8, [sp], #0x10 + LDP X5, X6, [sp], #0x10 + + RET + + /* _less_than_128 */ +2: CMP X2, #32 + B.LT 9f // _less_than_32 + LDR Q6, =0x06df0000000000002d56000000000000 + + LSL X0, X0, #48 + LDR Q10, =0x0 + MOV V10.D[1], X0 + LDR Q13, [X1], #0x10 + REV64 V13.16B, V13.16B + EXT V13.16B, V13.16B, V13.16B, #8 + + EOR V13.16B, V13.16B, V10.16B + + SUB X2, X2, #32 + B 4b + + /* _less_than_32 */ +9: CMP X2, #0 + B.EQ 99b // _cleanup + LSL X0, X0, #48 + LDR Q10,=0x0 + MOV V10.D[1], X0 + + CMP X2, #16 + B.EQ 10f // _exact_16_left + B.LE 11f // _less_than_16_left + LDR Q13, [X1], #0x10 + + REV64 V13.16B, V13.16B + EXT V13.16B, V13.16B, V13.16B, #8 + + EOR V13.16B, V13.16B, V10.16B + SUB X2, X2, #16 + LDR Q6, =0x06df0000000000002d56000000000000 + B 6b // _get_last_two_xmms + + /* _less_than_16_left */ +11: CMP X2, #4 + B.LT 13f // _only_less_than_4 + + /* backup the length of data, we used in _less_than_2_left */ + MOV X8, X2 + CMP X2, #8 + B.LT 14f // _less_than_8_left + + LDR X14, [X1], #8 + /* push the data to stack, we backup the data to V10 */ + STR X14, [sp, #0] + SUB X2, X2, #8 + ADD X11, X11, #8 + + /* _less_than_8_left */ +14: CMP X2, #4 + B.LT 15f // _less_than_4_left + + /* get 32bit data */ + LDR W5, [X1], #4 + + /* push the data to stack */ + STR W5, [sp, X11] + SUB X2, X2, #4 + ADD X11, X11, #4 + + /* _less_than_4_left */ +15: CMP X2, #2 + B.LT 16f // _less_than_2_left + + /* get 16bits data */ + LDRH W6, [X1], #2 + + /* push the data to stack */ + STRH W6, [sp, X11] + SUB X2, X2, #2 + ADD X11, X11, #2 + + /* _less_than_2_left */ +16: + /* get 8bits data */ + LDRB W7, [X1], #1 + STRB W7, [sp, X11] + ADD X11, X11, #1 + + /* POP data from stack, store to V13 */ + LDR Q13, [sp] + MOVI D14, #0 + REV64 V13.16B, V13.16B + MOV V8.16B, V13.16B + MOV V13.D[1], V8.D[0] + MOV V13.D[0], V8.D[1] + + EOR V13.16B, V13.16B, V10.16B + CMP X8, #15 + B.EQ 80f + CMP X8, #14 + B.EQ 81f + CMP X8, #13 + B.EQ 82f + CMP X8, #12 + B.EQ 83f + CMP X8, #11 + B.EQ 84f + CMP X8, #10 + B.EQ 85f + CMP X8, #9 + B.EQ 86f + CMP X8, #8 + B.EQ 87f + CMP X8, #7 + B.EQ 88f + CMP X8, #6 + B.EQ 89f + CMP X8, #5 + B.EQ 90f + CMP X8, #4 + B.EQ 91f + CMP X8, #3 + B.EQ 92f + CMP X8, #2 + B.EQ 93f + CMP X8, #1 + B.EQ 94f + CMP X8, #0 + B.EQ 95f + +80: + EXT V13.16B, V13.16B, V13.16B, #1 + MOV V13.B[15], V14.B[0] + B 5b + +81: + EXT V13.16B, V13.16B, V13.16B, #2 + MOV V13.H[7], V14.H[0] + B 5b + +82: + EXT V13.16B, V13.16B, V13.16B, #3 + MOV V13.H[7], V14.H[0] + MOV V13.B[13], V14.B[0] + B 5b +83: + + EXT V13.16B, V13.16B, V13.16B, #4 + MOV V13.S[3], V14.S[0] + B 5b + +84: + EXT V13.16B, V13.16B, V13.16B, #5 + MOV V13.S[3], V14.S[0] + MOV V13.B[11], V14.B[0] + B 5b + +85: + EXT V13.16B, V13.16B, V13.16B, #6 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + B 5b + +86: + EXT V13.16B, V13.16B, V13.16B, #7 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + MOV V13.B[9], V14.B[0] + B 5b + +87: + MOV V13.D[0], V13.D[1] + MOV V13.D[1], V14.D[0] + B 5b + +88: + EXT V13.16B, V13.16B, V13.16B, #9 + MOV V13.D[1], V14.D[0] + MOV V13.B[7], V14.B[0] + B 5b + +89: + EXT V13.16B, V13.16B, V13.16B, #10 + MOV V13.D[1], V14.D[0] + MOV V13.H[3], V14.H[0] + B 5b + +90: + EXT V13.16B, V13.16B, V13.16B, #11 + MOV V13.D[1], V14.D[0] + MOV V13.H[3], V14.H[0] + MOV V13.B[5], V14.B[0] + B 5b + +91: + MOV V13.S[0], V13.S[3] + MOV V13.D[1], V14.D[0] + MOV V13.S[1], V14.S[0] + B 5b + +92: + EXT V13.16B, V13.16B, V13.16B, #13 + MOV V13.D[1], V14.D[0] + MOV V13.S[1], V14.S[0] + MOV V13.B[3], V14.B[0] + B 5b + +93: + MOV V15.H[0], V13.H[7] + MOV V13.16B, V14.16B + MOV V13.H[0], V15.H[0] + B 5b + +94: + MOV V15.B[0], V13.B[15] + MOV V13.16B, V14.16B + MOV V13.B[0], V15.B[0] + B 5b + +95: + LDR Q13,=0x0 + B 5b // _128_done + + /* _exact_16_left */ +10: + LD1 { V13.2D }, [X1], #0x10 + + REV64 V13.16B, V13.16B + EXT V13.16B, V13.16B, V13.16B, #8 + EOR V13.16B, V13.16B, V10.16B + B 5b // _128_done + + /* _only_less_than_4 */ +13: CMP X2, #3 + MOVI D14, #0 + B.LT 17f //_only_less_than_3 + + LDR S13, [X1], #4 + MOV V13.B[15], V13.B[0] + MOV V13.B[14], V13.B[1] + MOV V13.B[13], V13.B[2] + MOV V13.S[0], V13.S[1] + + EOR V13.16B, V13.16B, V10.16B + + EXT V13.16B, V13.16B, V13.16B, #5 + + MOV V13.S[3], V14.S[0] + MOV V13.B[11], V14.B[0] + + B 7b // _barrett + /* _only_less_than_3 */ +17: + CMP X2, #2 + B.LT 18f // _only_less_than_2 + + LDR H13, [X1], #2 + MOV V13.B[15], V13.B[0] + MOV V13.B[14], V13.B[1] + MOV V13.H[0], V13.H[1] + + EOR V13.16B, V13.16B, V10.16B + + EXT V13.16B, V13.16B, V13.16B, #6 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + + B 7b // _barrett + + /* _only_less_than_2 */ +18: + LDRB W7, [X1], #1 + LDR Q13, = 0x0 + MOV V13.B[15], W7 + + EOR V13.16B, V13.16B, V10.16B + + EXT V13.16B, V13.16B, V13.16B, #7 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + MOV V13.B[9], V14.B[0] + + B 7b // _barrett diff --git a/arch/arm64/crypto/crct10dif-neon_glue.c b/arch/arm64/crypto/crct10dif-neon_glue.c new file mode 100644 index 000000000000..5ddfd6d41364 --- /dev/null +++ b/arch/arm64/crypto/crct10dif-neon_glue.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/crc-t10dif.h> +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/kernel.h> + +asmlinkage __u16 crc_t10dif_neon(__u16 crc, const unsigned char *buf, + size_t len); + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* + * Steps through buffer one byte at a time, calculates reflected + * crc using table. + */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = crc_t10dif_neon(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, + u8 *out) +{ + *(__u16 *)out = crc_t10dif_neon(crc, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + return __chksum_finup(0, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-neon", + .cra_priority = 200, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init crct10dif_arm64_mod_init(void) +{ + if (cpu_have_named_feature(PMULL)) + return crypto_register_shash(&alg); + else + return -ENODEV; +} + +static void __exit crct10dif_arm64_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_arm64_mod_init); +module_exit(crct10dif_arm64_mod_fini); + +MODULE_AUTHOR("YueHaibing yuehaibing@huawei.com"); +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with ARM64 NEON instruction."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS_CRYPTO("crct10dif"); +MODULE_ALIAS_CRYPTO("crct10dif-neon");