This patchset introduces an ARM64 NEON implementation of the CRC T10 DIF algorithm.
The implementation is enabled by setting CONFIG_CRYPTO_CRCT10DIF_ARM64_NEON, and the resulting kernel module is named crct10dif-neon.ko.
GUO Zihua (2): arm64/crypto: Introduce individual config for CRCT10DIF-NEON algorithm crypto: crct10dif-neon - fix use via crypto_shash_digest()
YueHaibing (1): arm64/crypto: Accelerated CRC T10 DIF computation
arch/arm64/crypto/Kconfig | 10 + arch/arm64/crypto/Makefile | 4 + arch/arm64/crypto/crct10dif-neon-asm_64.S | 752 ++++++++++++++++++++++ arch/arm64/crypto/crct10dif-neon_glue.c | 114 ++++ 4 files changed, 880 insertions(+) create mode 100644 arch/arm64/crypto/crct10dif-neon-asm_64.S create mode 100644 arch/arm64/crypto/crct10dif-neon_glue.c
From: YueHaibing yuehaibing@huawei.com
hulk inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I8K36D CVE: NA
-------------------------------------------------
This is the ARM64 CRC T10 DIF transform accelerated with ARMv8 NEON instructions.
Signed-off-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Li Bin huawei.libin@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Conflicts: arch/arm64/crypto/Makefile Signed-off-by: GUO Zihua guozihua@huawei.com --- arch/arm64/crypto/Makefile | 3 +- arch/arm64/crypto/crct10dif-neon-asm_64.S | 752 ++++++++++++++++++++++ arch/arm64/crypto/crct10dif-neon_glue.c | 116 ++++ 3 files changed, 870 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/crypto/crct10dif-neon-asm_64.S create mode 100644 arch/arm64/crypto/crct10dif-neon_glue.c
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index fbe64dce66e0..c9d88436411b 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -45,7 +45,8 @@ obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o -crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o +crct10dif-ce-y := crct10dif-neon-asm_64.o crct10dif-neon_glue.o +AFLAGS_crct10dif-neon-asm_64.o := -march=armv8-a+crypto
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm64/crypto/crct10dif-neon-asm_64.S b/arch/arm64/crypto/crct10dif-neon-asm_64.S new file mode 100644 index 000000000000..a37204bf5a7a --- /dev/null +++ b/arch/arm64/crypto/crct10dif-neon-asm_64.S @@ -0,0 +1,752 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +.global crc_t10dif_neon +.text + +/* X0 is initial CRC value + * X1 is data buffer + * X2 is the length of buffer + * X3 is the backup buffer(for extend) + * X4 for other extend parameter(for extend) + * Q0, Q1, Q2, Q3 maybe as parameter for other functions, + * the value of Q0, Q1, Q2, Q3 maybe modified. + * + * suggestion: + * 1. dont use general purpose register for calculation + * 2. set data endianness outside of the kernel + * 3. use ext as shifting around + * 4. dont use LD3/LD4, ST3/ST4 + */ + +crc_t10dif_neon: + /* push the register to stack that CRC16 will use */ + STP X5, X6, [sp, #-0x10]! + STP X7, X8, [sp, #-0x10]! + STP X9, X10, [sp, #-0x10]! + STP X11, X12, [sp, #-0x10]! + STP X13, X14, [sp, #-0x10]! + STP Q10, Q11, [sp, #-0x20]! + STP Q12, Q13, [sp, #-0x20]! + STP Q4, Q5, [sp, #-0x20]! + STP Q6, Q7, [sp, #-0x20]! + STP Q8, Q9, [sp, #-0x20]! + STP Q14, Q15, [sp, #-0x20]! + STP Q16, Q17, [sp, #-0x20]! + STP Q18, Q19, [sp, #-0x20]! + + SUB sp,sp,#0x20 + + MOV X11, #0 // PUSH STACK FLAG + + CMP X2, #0x80 + B.LT 2f // _less_than_128, <128 + + /* V10/V11/V12/V13 is 128bit. 
+ * we get data 512bit( by cacheline ) each time + */ + LDP Q10, Q11, [X1], #0x20 + LDP Q12, Q13, [X1], #0x20 + + /* move the initial value to V6 register */ + LSL X0, X0, #48 + EOR V6.16B, V6.16B, V6.16B + MOV V6.D[1], X0 + + /* big-little end change. because the data in memory is little-end, + * we deal the data for bigend + */ + + REV64 V10.16B, V10.16B + REV64 V11.16B, V11.16B + REV64 V12.16B, V12.16B + REV64 V13.16B, V13.16B + EXT V10.16B, V10.16B, V10.16B, #8 + EXT V11.16B, V11.16B, V11.16B, #8 + EXT V12.16B, V12.16B, V12.16B, #8 + EXT V13.16B, V13.16B, V13.16B, #8 + + EOR V10.16B, V10.16B, V6.16B + + SUB X2, X2, #0x80 + ADD X5, X1, #0x20 + + /* deal data when the size of buffer bigger than 128 bytes */ + /* _fold_64_B_loop */ + LDR Q6,=0xe658000000000000044c000000000000 +1: + + LDP Q16, Q17, [X1] ,#0x40 + LDP Q18, Q19, [X5], #0x40 + + /* carry-less multiply. + * V10 high-64bits carry-less multiply + * V6 high-64bits(PMULL2) + * V11 low-64bits carry-less multiply V6 low-64bits(PMULL) + */ + + PMULL2 V4.1Q, V10.2D, V6.2D + PMULL V10.1Q, V10.1D, V6.1D + PMULL2 V5.1Q, V11.2D, V6.2D + PMULL V11.1Q, V11.1D, V6.1D + + REV64 V16.16B, V16.16B + REV64 V17.16B, V17.16B + REV64 V18.16B, V18.16B + REV64 V19.16B, V19.16B + + PMULL2 V14.1Q, V12.2D, V6.2D + PMULL V12.1Q, V12.1D, V6.1D + PMULL2 V15.1Q, V13.2D, V6.2D + PMULL V13.1Q, V13.1D, V6.1D + + EXT V16.16B, V16.16B, V16.16B, #8 + EOR V10.16B, V10.16B, V4.16B + + EXT V17.16B, V17.16B, V17.16B, #8 + EOR V11.16B, V11.16B, V5.16B + + EXT V18.16B, V18.16B, V18.16B, #8 + EOR V12.16B, V12.16B, V14.16B + + EXT V19.16B, V19.16B, V19.16B, #8 + EOR V13.16B, V13.16B, V15.16B + + SUB X2, X2, #0x40 + + + EOR V10.16B, V10.16B, V16.16B + EOR V11.16B, V11.16B, V17.16B + + EOR V12.16B, V12.16B, V18.16B + EOR V13.16B, V13.16B, V19.16B + + CMP X2, #0x0 + B.GE 1b // >=0 + + LDR Q6, =0x06df0000000000002d56000000000000 + MOV V4.16B, V10.16B + /* V10 carry-less 0x06df000000000000([127:64]*[127:64]) */ + PMULL V4.1Q, V4.1D, V6.1D //switch PMULL 
& PMULL2 order + PMULL2 V10.1Q, V10.2D, V6.2D + EOR V11.16B, V11.16B, V4.16B + EOR V11.16B, V11.16B, V10.16B + + MOV V4.16B, V11.16B + PMULL V4.1Q, V4.1D, V6.1D //switch PMULL & PMULL2 order + PMULL2 V11.1Q, V11.2D, V6.2D + EOR V12.16B, V12.16B, V4.16B + EOR V12.16B, V12.16B, V11.16B + + MOV V4.16B, V12.16B + PMULL V4.1Q, V4.1D, V6.1D //switch PMULL & PMULL2 order + PMULL2 V12.1Q, V12.2D, V6.2D + EOR V13.16B, V13.16B, V4.16B + EOR V13.16B, V13.16B, V12.16B + + ADD X2, X2, #48 + CMP X2, #0x0 + B.LT 3f // _final_reduction_for_128, <0 + + /* _16B_reduction_loop */ +4: + /* unrelated load as early as possible*/ + LDR Q10, [X1], #0x10 + + MOV V4.16B, V13.16B + PMULL2 V13.1Q, V13.2D, V6.2D + PMULL V4.1Q, V4.1D, V6.1D + EOR V13.16B, V13.16B, V4.16B + + REV64 V10.16B, V10.16B + EXT V10.16B, V10.16B, V10.16B, #8 + + EOR V13.16B, V13.16B, V10.16B + + SUB X2, X2, #0x10 + CMP X2, #0x0 + B.GE 4b // _16B_reduction_loop, >=0 + + /* _final_reduction_for_128 */ +3: ADD X2, X2, #0x10 + CMP X2, #0x0 + B.EQ 5f // _128_done, ==0 + + /* _get_last_two_xmms */ +6: MOV V12.16B, V13.16B + SUB X1, X1, #0x10 + ADD X1, X1, X2 + LDR Q11, [X1], #0x10 + REV64 V11.16B, V11.16B + EXT V11.16B, V11.16B, V11.16B, #8 + + CMP X2, #8 + B.EQ 50f + B.LT 51f + B.GT 52f + +50: + /* dont use X register as temp one */ + FMOV D14, D12 + MOVI D12, #0 + MOV V12.D[1],V14.D[0] + B 53f +51: + MOV X9, #64 + LSL X13, X2, #3 // <<3 equal x8 + SUB X9, X9, X13 + MOV X5, V12.D[0] // low 64-bit + MOV X6, V12.D[1] // high 64-bit + LSR X10, X5, X9 // high bit of low 64-bit + LSL X7, X5, X13 + LSL X8, X6, X13 + ORR X8, X8, X10 // combination of high 64-bit + MOV V12.D[1], X8 + MOV V12.D[0], X7 + + B 53f +52: + LSL X13, X2, #3 // <<3 equal x8 + SUB X13, X13, #64 + + DUP V18.2D, X13 + FMOV D16, D12 + USHL D16, D16, D18 + EXT V12.16B, V16.16B, V16.16B, #8 + +53: + MOVI D14, #0 //add one zero constant + + CMP X2, #0 + B.EQ 30f + CMP X2, #1 + B.EQ 31f + CMP X2, #2 + B.EQ 32f + CMP X2, #3 + B.EQ 33f + CMP X2, #4 + B.EQ 34f + CMP 
X2, #5 + B.EQ 35f + CMP X2, #6 + B.EQ 36f + CMP X2, #7 + B.EQ 37f + CMP X2, #8 + B.EQ 38f + CMP X2, #9 + B.EQ 39f + CMP X2, #10 + B.EQ 40f + CMP X2, #11 + B.EQ 41f + CMP X2, #12 + B.EQ 42f + CMP X2, #13 + B.EQ 43f + CMP X2, #14 + B.EQ 44f + CMP X2, #15 + B.EQ 45f + + // >> 128bit +30: + EOR V13.16B, V13.16B, V13.16B + EOR V8.16B, V8.16B, V8.16B + LDR Q9,=0xffffffffffffffffffffffffffffffff + B 46f + + // >> 120bit +31: + USHR V13.2D, V13.2D, #56 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xff + LDR Q9,=0xffffffffffffffffffffffffffffff00 + B 46f + + // >> 112bit +32: + USHR V13.2D, V13.2D, #48 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffff + LDR Q9,=0xffffffffffffffffffffffffffff0000 + B 46f + + // >> 104bit +33: + USHR V13.2D, V13.2D, #40 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffff + LDR Q9,=0xffffffffffffffffffffffffff000000 + B 46f + + // >> 96bit +34: + USHR V13.2D, V13.2D, #32 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffff + LDR Q9,=0xffffffffffffffffffffffff00000000 + B 46f + + // >> 88bit +35: + USHR V13.2D, V13.2D, #24 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffffff + LDR Q9,=0xffffffffffffffffffffff0000000000 + B 46f + + // >> 80bit +36: + USHR V13.2D, V13.2D, #16 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffffffff + LDR Q9,=0xffffffffffffffffffff000000000000 + B 46f + + // >> 72bit +37: + USHR V13.2D, V13.2D, #8 + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffffffffff + LDR Q9,=0xffffffffffffffffff00000000000000 + B 46f + + // >> 64bit +38: + EXT V13.16B, V13.16B, V14.16B, #8 + LDR Q8,=0xffffffffffffffff + LDR Q9,=0xffffffffffffffff0000000000000000 + B 46f + + // >> 56bit +39: + EXT V13.16B, V13.16B, V13.16B, #7 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + MOV V13.B[9], V14.B[0] + + LDR Q8,=0xffffffffffffffffff + LDR Q9,=0xffffffffffffff000000000000000000 + B 46f + + // >> 48bit +40: + EXT V13.16B, V13.16B, V13.16B, #6 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + + LDR 
Q8,=0xffffffffffffffffffff + LDR Q9,=0xffffffffffff00000000000000000000 + B 46f + + // >> 40bit +41: + EXT V13.16B, V13.16B, V13.16B, #5 + MOV V13.S[3], V14.S[0] + MOV V13.B[11], V14.B[0] + + LDR Q8,=0xffffffffffffffffffffff + LDR Q9,=0xffffffffff0000000000000000000000 + B 46f + + // >> 32bit +42: + EXT V13.16B, V13.16B, V13.16B, #4 + MOV V13.S[3], V14.S[0] + + LDR Q8,=0xffffffffffffffffffffffff + LDR Q9,=0xffffffff000000000000000000000000 + B 46f + + // >> 24bit +43: + EXT V13.16B, V13.16B, V13.16B, #3 + MOV V13.H[7], V14.H[0] + MOV V13.B[13], V14.B[0] + + LDR Q8,=0xffffffffffffffffffffffffff + LDR Q9,=0xffffff00000000000000000000000000 + B 46f + + // >> 16bit +44: + EXT V13.16B, V13.16B, V13.16B, #2 + MOV V13.H[7], V14.H[0] + + LDR Q8,=0xffffffffffffffffffffffffffff + LDR Q9,=0xffff0000000000000000000000000000 + B 46f + + // >> 8bit +45: + EXT V13.16B, V13.16B, V13.16B, #1 + MOV V13.B[15], V14.B[0] + + LDR Q8,=0xffffffffffffffffffffffffffffff + LDR Q9,=0xff000000000000000000000000000000 + + // backup V12 first + // pblendvb xmm1, xmm2 +46: + AND V12.16B, V12.16B, V9.16B + AND V11.16B, V11.16B, V8.16B + ORR V11.16B, V11.16B, V12.16B + + MOV V12.16B, V11.16B + MOV V4.16B, V13.16B + PMULL2 V13.1Q, V13.2D, V6.2D + PMULL V4.1Q, V4.1D, V6.1D + EOR V13.16B, V13.16B, V4.16B + EOR V13.16B, V13.16B, V12.16B + + /* _128_done. 
we change the Q6 D[0] and D[1] */ +5: LDR Q6, =0x2d560000000000001368000000000000 + MOVI D14, #0 + MOV V10.16B, V13.16B + PMULL2 V13.1Q, V13.2D, V6.2D + + MOV V10.D[1], V10.D[0] + MOV V10.D[0], V14.D[0] //set zero + + EOR V13.16B, V13.16B, V10.16B + + MOV V10.16B, V13.16B + LDR Q7, =0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + AND V10.16B, V10.16B, V7.16B + + MOV S13, V13.S[3] + + PMULL V13.1Q, V13.1D, V6.1D + EOR V13.16B, V13.16B, V10.16B + + /* _barrett */ +7: LDR Q6, =0x00000001f65a57f8000000018bb70000 + MOVI D14, #0 + MOV V10.16B, V13.16B + PMULL2 V13.1Q, V13.2D, V6.2D + + EXT V13.16B, V13.16B, V13.16B, #12 + MOV V13.S[0], V14.S[0] + + EXT V6.16B, V6.16B, V6.16B, #8 + PMULL2 V13.1Q, V13.2D, V6.2D + + EXT V13.16B, V13.16B, V13.16B, #12 + MOV V13.S[0], V14.S[0] + + EOR V13.16B, V13.16B, V10.16B + MOV X0, V13.D[0] + + /* _cleanup */ +8: MOV X14, #48 + LSR X0, X0, X14 +99: + ADD sp, sp, #0x20 + + LDP Q18, Q19, [sp], #0x20 + LDP Q16, Q17, [sp], #0x20 + LDP Q14, Q15, [sp], #0x20 + + LDP Q8, Q9, [sp], #0x20 + LDP Q6, Q7, [sp], #0x20 + LDP Q4, Q5, [sp], #0x20 + LDP Q12, Q13, [sp], #0x20 + LDP Q10, Q11, [sp], #0x20 + LDP X13, X14, [sp], #0x10 + LDP X11, X12, [sp], #0x10 + LDP X9, X10, [sp], #0x10 + LDP X7, X8, [sp], #0x10 + LDP X5, X6, [sp], #0x10 + + RET + + /* _less_than_128 */ +2: CMP X2, #32 + B.LT 9f // _less_than_32 + LDR Q6, =0x06df0000000000002d56000000000000 + + LSL X0, X0, #48 + LDR Q10, =0x0 + MOV V10.D[1], X0 + LDR Q13, [X1], #0x10 + REV64 V13.16B, V13.16B + EXT V13.16B, V13.16B, V13.16B, #8 + + EOR V13.16B, V13.16B, V10.16B + + SUB X2, X2, #32 + B 4b + + /* _less_than_32 */ +9: CMP X2, #0 + B.EQ 99b // _cleanup + LSL X0, X0, #48 + LDR Q10,=0x0 + MOV V10.D[1], X0 + + CMP X2, #16 + B.EQ 10f // _exact_16_left + B.LE 11f // _less_than_16_left + LDR Q13, [X1], #0x10 + + REV64 V13.16B, V13.16B + EXT V13.16B, V13.16B, V13.16B, #8 + + EOR V13.16B, V13.16B, V10.16B + SUB X2, X2, #16 + LDR Q6, =0x06df0000000000002d56000000000000 + B 6b // _get_last_two_xmms + + /* 
_less_than_16_left */ +11: CMP X2, #4 + B.LT 13f // _only_less_than_4 + + /* backup the length of data, we used in _less_than_2_left */ + MOV X8, X2 + CMP X2, #8 + B.LT 14f // _less_than_8_left + + LDR X14, [X1], #8 + /* push the data to stack, we backup the data to V10 */ + STR X14, [sp, #0] + SUB X2, X2, #8 + ADD X11, X11, #8 + + /* _less_than_8_left */ +14: CMP X2, #4 + B.LT 15f // _less_than_4_left + + /* get 32bit data */ + LDR W5, [X1], #4 + + /* push the data to stack */ + STR W5, [sp, X11] + SUB X2, X2, #4 + ADD X11, X11, #4 + + /* _less_than_4_left */ +15: CMP X2, #2 + B.LT 16f // _less_than_2_left + + /* get 16bits data */ + LDRH W6, [X1], #2 + + /* push the data to stack */ + STRH W6, [sp, X11] + SUB X2, X2, #2 + ADD X11, X11, #2 + + /* _less_than_2_left */ +16: + /* get 8bits data */ + LDRB W7, [X1], #1 + STRB W7, [sp, X11] + ADD X11, X11, #1 + + /* POP data from stack, store to V13 */ + LDR Q13, [sp] + MOVI D14, #0 + REV64 V13.16B, V13.16B + MOV V8.16B, V13.16B + MOV V13.D[1], V8.D[0] + MOV V13.D[0], V8.D[1] + + EOR V13.16B, V13.16B, V10.16B + CMP X8, #15 + B.EQ 80f + CMP X8, #14 + B.EQ 81f + CMP X8, #13 + B.EQ 82f + CMP X8, #12 + B.EQ 83f + CMP X8, #11 + B.EQ 84f + CMP X8, #10 + B.EQ 85f + CMP X8, #9 + B.EQ 86f + CMP X8, #8 + B.EQ 87f + CMP X8, #7 + B.EQ 88f + CMP X8, #6 + B.EQ 89f + CMP X8, #5 + B.EQ 90f + CMP X8, #4 + B.EQ 91f + CMP X8, #3 + B.EQ 92f + CMP X8, #2 + B.EQ 93f + CMP X8, #1 + B.EQ 94f + CMP X8, #0 + B.EQ 95f + +80: + EXT V13.16B, V13.16B, V13.16B, #1 + MOV V13.B[15], V14.B[0] + B 5b + +81: + EXT V13.16B, V13.16B, V13.16B, #2 + MOV V13.H[7], V14.H[0] + B 5b + +82: + EXT V13.16B, V13.16B, V13.16B, #3 + MOV V13.H[7], V14.H[0] + MOV V13.B[13], V14.B[0] + B 5b +83: + + EXT V13.16B, V13.16B, V13.16B, #4 + MOV V13.S[3], V14.S[0] + B 5b + +84: + EXT V13.16B, V13.16B, V13.16B, #5 + MOV V13.S[3], V14.S[0] + MOV V13.B[11], V14.B[0] + B 5b + +85: + EXT V13.16B, V13.16B, V13.16B, #6 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + B 5b + +86: + 
EXT V13.16B, V13.16B, V13.16B, #7 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + MOV V13.B[9], V14.B[0] + B 5b + +87: + MOV V13.D[0], V13.D[1] + MOV V13.D[1], V14.D[0] + B 5b + +88: + EXT V13.16B, V13.16B, V13.16B, #9 + MOV V13.D[1], V14.D[0] + MOV V13.B[7], V14.B[0] + B 5b + +89: + EXT V13.16B, V13.16B, V13.16B, #10 + MOV V13.D[1], V14.D[0] + MOV V13.H[3], V14.H[0] + B 5b + +90: + EXT V13.16B, V13.16B, V13.16B, #11 + MOV V13.D[1], V14.D[0] + MOV V13.H[3], V14.H[0] + MOV V13.B[5], V14.B[0] + B 5b + +91: + MOV V13.S[0], V13.S[3] + MOV V13.D[1], V14.D[0] + MOV V13.S[1], V14.S[0] + B 5b + +92: + EXT V13.16B, V13.16B, V13.16B, #13 + MOV V13.D[1], V14.D[0] + MOV V13.S[1], V14.S[0] + MOV V13.B[3], V14.B[0] + B 5b + +93: + MOV V15.H[0], V13.H[7] + MOV V13.16B, V14.16B + MOV V13.H[0], V15.H[0] + B 5b + +94: + MOV V15.B[0], V13.B[15] + MOV V13.16B, V14.16B + MOV V13.B[0], V15.B[0] + B 5b + +95: + LDR Q13,=0x0 + B 5b // _128_done + + /* _exact_16_left */ +10: + LD1 { V13.2D }, [X1], #0x10 + + REV64 V13.16B, V13.16B + EXT V13.16B, V13.16B, V13.16B, #8 + EOR V13.16B, V13.16B, V10.16B + B 5b // _128_done + + /* _only_less_than_4 */ +13: CMP X2, #3 + MOVI D14, #0 + B.LT 17f //_only_less_than_3 + + LDR S13, [X1], #4 + MOV V13.B[15], V13.B[0] + MOV V13.B[14], V13.B[1] + MOV V13.B[13], V13.B[2] + MOV V13.S[0], V13.S[1] + + EOR V13.16B, V13.16B, V10.16B + + EXT V13.16B, V13.16B, V13.16B, #5 + + MOV V13.S[3], V14.S[0] + MOV V13.B[11], V14.B[0] + + B 7b // _barrett + /* _only_less_than_3 */ +17: + CMP X2, #2 + B.LT 18f // _only_less_than_2 + + LDR H13, [X1], #2 + MOV V13.B[15], V13.B[0] + MOV V13.B[14], V13.B[1] + MOV V13.H[0], V13.H[1] + + EOR V13.16B, V13.16B, V10.16B + + EXT V13.16B, V13.16B, V13.16B, #6 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], V14.H[0] + + B 7b // _barrett + + /* _only_less_than_2 */ +18: + LDRB W7, [X1], #1 + LDR Q13, = 0x0 + MOV V13.B[15], W7 + + EOR V13.16B, V13.16B, V10.16B + + EXT V13.16B, V13.16B, V13.16B, #7 + MOV V13.S[3], V14.S[0] + MOV V13.H[5], 
V14.H[0] + MOV V13.B[9], V14.B[0] + + B 7b // _barrett diff --git a/arch/arm64/crypto/crct10dif-neon_glue.c b/arch/arm64/crypto/crct10dif-neon_glue.c new file mode 100644 index 000000000000..e0c4a9acee27 --- /dev/null +++ b/arch/arm64/crypto/crct10dif-neon_glue.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/crc-t10dif.h> +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/kernel.h> + +asmlinkage __u16 crc_t10dif_neon(__u16 crc, const unsigned char *buf, + size_t len); + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* + * Steps through buffer one byte at at time, calculates reflected + * crc using table. 
+ */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = crc_t10dif_neon(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__u16 *)out = crc_t10dif_neon(*crcp, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-neon", + .cra_priority = 200, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init crct10dif_arm64_mod_init(void) +{ + return crypto_register_shash(&alg); +} + +static void __exit crct10dif_arm64_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_arm64_mod_init); +module_exit(crct10dif_arm64_mod_fini); + +MODULE_AUTHOR("YueHaibing yuehaibing@huawei.com"); +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with ARM64 NEON instruction."); +MODULE_LICENSE("GPL"); + 
+MODULE_ALIAS_CRYPTO("crct10dif"); +MODULE_ALIAS_CRYPTO("crct10dif-neon");
Offering: HULK hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8K36D CVE: NA
--------------------------------
The original patch that introduced the NEON implementation of the CRCT10DIF algorithm reused the config option CONFIG_CRYPTO_CRCT10DIF_ARM64_CE. This hides the mainline CE implementation of the same algorithm, which is not good practice. This patch fixes the problem by introducing a new config option, CONFIG_CRYPTO_CRCT10DIF_ARM64_NEON.
Signed-off-by: GUO Zihua guozihua@huawei.com --- arch/arm64/crypto/Kconfig | 10 ++++++++++ arch/arm64/crypto/Makefile | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 6d06b448a66e..cc743c2cc7b6 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -323,5 +323,15 @@ config CRYPTO_CRCT10DIF_ARM64_CE Architecture: arm64 using - PMULL (Polynomial Multiply Long) instructions
+config CRYPTO_CRCT10DIF_ARM64_NEON + tristate "CRCT10DIF (NEON)" + depends on KERNEL_MODE_NEON && CRC_T10DIF + select CRYPTO_HASH + help + CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF) + + Architecture: arm64 using + - NEON instructions + endmenu
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index c9d88436411b..44f49f9725d2 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -45,7 +45,10 @@ obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o -crct10dif-ce-y := crct10dif-neon-asm_64.o crct10dif-neon_glue.o +crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o + +obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_NEON) += crct10dif-neon.o +crct10dif-neon-y := crct10dif-neon-asm_64.o crct10dif-neon_glue.o AFLAGS_crct10dif-neon-asm_64.o := -march=armv8-a+crypto
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8K36D CVE: NA
Reference: https://lore.kernel.org/lkml/20190609164147.971147667@linuxfoundation.org/
--------------------------------
The digest() hook relies on a CRC value from the shash_desc context. However, this context is not initialized when the digest() hook is called, so an arbitrary value is read and the algorithm produces a wrong result.
This patch fixes the issue by passing 0 as the initial CRC value in the digest() hook.
Signed-off-by: GUO Zihua guozihua@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Signed-off-by: GUO Zihua guozihua@huawei.com --- arch/arm64/crypto/crct10dif-neon_glue.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/crypto/crct10dif-neon_glue.c b/arch/arm64/crypto/crct10dif-neon_glue.c index e0c4a9acee27..47638a9f318a 100644 --- a/arch/arm64/crypto/crct10dif-neon_glue.c +++ b/arch/arm64/crypto/crct10dif-neon_glue.c @@ -55,10 +55,10 @@ static int chksum_final(struct shash_desc *desc, u8 *out) return 0; }
-static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, +static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out) { - *(__u16 *)out = crc_t10dif_neon(*crcp, data, len); + *(__u16 *)out = crc_t10dif_neon(crc, data, len); return 0; }
@@ -67,15 +67,13 @@ static int chksum_finup(struct shash_desc *desc, const u8 *data, { struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
- return __chksum_finup(&ctx->crc, data, len, out); + return __chksum_finup(ctx->crc, data, len, out); }
static int chksum_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) { - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(&ctx->crc, data, length, out); + return __chksum_finup(0, data, length, out); }
static struct shash_alg alg = {
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/3151 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/H...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3151 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/H...