hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I9PXLD?from=project-issue
CVE: NA
--------------------------------
In the copy_from/to_user path, the number of bytes already copied must be known exactly for the fixup code, so src and dst are incremented after every single load/store. This creates read-after-write (RAW) dependencies on the address registers, which introduces bubbles into the pipeline and degrades IPC.
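For illustration only (the register numbers below are arbitrary and the snippet is not taken from the patch), the two addressing patterns differ roughly as follows: with post-increment every access serializes on the write-back of the address register, while plain offset addressing lets the accesses issue independently and bumps the pointer once per stage.

    /* post-increment: each load waits for the previous address write-back */
    ldr	x7, [x1], #8
    ldr	x8, [x1], #8

    /* offset addressing: loads are independent, one pointer update per stage */
    ldr	x7, [x1, #0]
    ldr	x8, [x1, #8]
    add	x1, x1, #16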
Instead, update the values of src and dst only once the copy for the current stage has completed, to reduce the data dependency. On its own this would break the fixup logic, so to keep fixup working a new macro, USER_OFF, is introduced: it records how many bytes the load/store at this stage has already covered, so the final number of bytes not copied can still be determined.
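As a rough sketch of the scheme (the FIXUP_OFFSET body below matches the patch; the concrete USER_OFF line is only an example), dst is advanced only after a whole stage completes, so the handler for an access at in-stage offset n reports (end - dst) - n bytes as not copied:

    USER_OFF(8, ldtr A_h, [src, #8])	// a fault here branches to fixup_offset_8

    fixup_offset_8:
    	sub	x0, end, dst	// bytes still outstanding when this stage began
    	sub	x0, x0, 8	// minus the 8 bytes already handled in this stage
    	ret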
The following are the UnixBench test results on HiSilicon KunPeng920:

                                         without this patch    with this patch
System Benchmarks Index Values                INDEX                 INDEX
Dhrystone 2 using register variables         3714.0                3770.5
Double-Precision Whetstone                    797.1                 797.1
Execl Throughput                              665.8                 663.9
File Copy 1024 bufsize 2000 maxblocks        2465.5                2435.5
File Copy 256 bufsize 500 maxblocks          1748.2                1724.4
File Copy 4096 bufsize 8000 maxblocks        3734.4                3634.8
Pipe Throughput                              1019.9                1022.0
Pipe-based Context Switching                  346.9                 394.0
Process Creation                              500.1                 512.0
Shell Scripts (1 concurrent)                 1495.8                1521.4
Shell Scripts (8 concurrent)                 5132.4                5202.2
System Call Overhead                          681.6                 697.2
                                           ========              ========
System Benchmarks Index Score                1325.3                1343.7
Signed-off-by: Li Zetao <lizetao1@huawei.com>
---
 arch/arm64/include/asm/asm-uaccess.h |  29 ----
 arch/arm64/lib/copy_from_user.S      | 241 +++++++++++++++++++++-----
 arch/arm64/lib/copy_to_user.S        | 242 ++++++++++++++++++++++-----
 3 files changed, 406 insertions(+), 106 deletions(-)
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 7bbebfa5b7103..7a872c77c03ac 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -65,33 +65,4 @@ alternative_else_nop_endif
 9999:	x;					\
 	_asm_extable_uaccess	9999b, l
 
-/*
- * Generate the assembly for LDTR/STTR with exception table entries.
- * This is complicated as there is no post-increment or pair versions of the
- * unprivileged instructions, and USER() only works for single instructions.
- */
-	.macro user_ldp l, reg1, reg2, addr, post_inc
-8888:		ldtr	\reg1, [\addr];
-8889:		ldtr	\reg2, [\addr, #8];
-		add	\addr, \addr, \post_inc;
-
-		_asm_extable_uaccess	8888b, \l;
-		_asm_extable_uaccess	8889b, \l;
-	.endm
-
-	.macro user_stp l, reg1, reg2, addr, post_inc
-8888:		sttr	\reg1, [\addr];
-8889:		sttr	\reg2, [\addr, #8];
-		add	\addr, \addr, \post_inc;
-
-		_asm_extable_uaccess	8888b,\l;
-		_asm_extable_uaccess	8889b,\l;
-	.endm
-
-	.macro user_ldst l, inst, reg, addr, post_inc
-8888:		\inst		\reg, [\addr];
-		add		\addr, \addr, \post_inc;
-
-		_asm_extable_uaccess	8888b, \l;
-	.endm
 #endif
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e3179075244..a80b8679c4b58 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -20,54 +20,219 @@
 *	x0 - bytes not copied
 */
 
-	.macro ldrb1 reg, ptr, val
-	user_ldst 9998f, ldtrb, \reg, \ptr, \val
-	.endm
+dstin	.req	x0
+end	.req	x5
+src	.req	x1
+srcin	.req	x15
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+dst	.req	x6
 
-	.macro strb1 reg, ptr, val
-	strb \reg, [\ptr], \val
-	.endm
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
 
-	.macro ldrh1 reg, ptr, val
-	user_ldst 9997f, ldtrh, \reg, \ptr, \val
-	.endm
+#define USER_OFF(off, x...)	USER(fixup_offset_##off, x)
+#define FIXUP_OFFSET(n)						\
+fixup_offset_##n:						\
+	sub	x0, end, dst;					\
+	sub	x0, x0, n;					\
+	ret
 
-	.macro strh1 reg, ptr, val
-	strh \reg, [\ptr], \val
-	.endm
+FIXUP_OFFSET(0)
+FIXUP_OFFSET(8)
+FIXUP_OFFSET(16)
+FIXUP_OFFSET(24)
+FIXUP_OFFSET(32)
+FIXUP_OFFSET(40)
+FIXUP_OFFSET(48)
+FIXUP_OFFSET(56)
 
-	.macro ldr1 reg, ptr, val
-	user_ldst 9997f, ldtr, \reg, \ptr, \val
-	.endm
+SYM_FUNC_START(__arch_copy_from_user)
+	add	end, x0, x2
+	mov	srcin, x1
 
-	.macro str1 reg, ptr, val
-	str \reg, [\ptr], \val
-	.endm
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accessed are not aligned.*/
+	b.lo	.Ltiny15
 
-	.macro ldp1 reg1, reg2, ptr, val
-	user_ldp 9997f, \reg1, \reg2, \ptr, \val
-	.endm
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	 * Copy the leading memory data from src to dst in an increasing
+	 * address order.By this way,the risk of overwriting the source
+	 * memory data is eliminated when the distance between src and
+	 * dst is less than 16. The memory accesses here are alignment.
+	 */
+	tbz	tmp2, #0, 1f
+USER_OFF(0, ldtrb tmp1w, [src, #0])
+	strb	tmp1w, [dst], #1
+	add	src, src, #1
+1:
+	tbz	tmp2, #1, 2f
+USER_OFF(0, ldtrh tmp1w, [src, #0])
+	strh	tmp1w, [dst], #2
+	add	src, src, #2
+2:
+	tbz	tmp2, #2, 3f
+USER_OFF(0, ldtr tmp1w, [src, #0])
+	str	tmp1w, [dst], #4
+	add	src, src, #4
+3:
+	tbz	tmp2, #3, .LSrcAligned
+USER_OFF(0, ldtr tmp1, [src, #0])
+	str	tmp1, [dst], #8
+	add	src, src, #8
 
-	.macro stp1 reg1, reg2, ptr, val
-	stp \reg1, \reg2, [\ptr], \val
-	.endm
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	USER_OFF(0, ldtr A_l, [src, #0])
+	USER_OFF(8, ldtr A_h, [src, #8])
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	stp	A_l, A_h, [dst], #16
+	add	src, src, #16
+	USER_OFF(0, ldtr A_l, [src, #0])
+	USER_OFF(8, ldtr A_h, [src, #8])
+1:
+	stp	A_l, A_h, [dst], #16
+	add	src, src, #16
+	USER_OFF(0, ldtr A_l, [src, #0])
+	USER_OFF(8, ldtr A_h, [src, #8])
+2:
+	stp	A_l, A_h, [dst], #16
+	add	src, src, #16
+.Ltiny15:
+	/*
+	 * Prefer to break one ldp/stp into several load/store to access
+	 * memory in an increasing address order,rather than to load/store 16
+	 * bytes from (src-16) to (dst-16) and to backward the src to aligned
+	 * address,which way is used in original cortex memcpy. If keeping
+	 * the original memcpy process here, memmove need to satisfy the
+	 * precondition that src address is at least 16 bytes bigger than dst
+	 * address,otherwise some source data will be overwritten when memove
+	 * call memcpy directly. To make memmove simpler and decouple the
+	 * memcpy's dependency on memmove, withdrew the original process.
+	 */
+	tbz	count, #3, 1f
+USER_OFF(0, ldtr tmp1, [src, #0])
+	str	tmp1, [dst], #8
+	add	src, src, #8
+1:
+	tbz	count, #2, 2f
+USER_OFF(0, ldtr tmp1w, [src, #0])
+	str	tmp1w, [dst], #4
+	add	src, src, #4
+2:
+	tbz	count, #1, 3f
+USER_OFF(0, ldtrh tmp1w, [src, #0])
+	strh	tmp1w, [dst], #2
+	add	src, src, #2
+3:
+	tbz	count, #0, .Lexitfunc
+USER_OFF(0, ldtrb tmp1w, [src, #0])
+	strb	tmp1w, [dst], #1
+	add	src, src, #1
 
-end	.req	x5
-srcin	.req	x15
-SYM_FUNC_START(__arch_copy_from_user)
-	add	end, x0, x2
-	mov	srcin, x1
-#include "copy_template.S"
+	b	.Lexitfunc
+
+.Lcpy_over64:
+	.p2align	L1_CACHE_SHIFT
+	USER_OFF(0, ldtr A_l, [src, #0])
+	USER_OFF(8, ldtr A_h, [src, #8])
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+	stp	A_l, A_h, [dst, #0]
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+	stp	B_l, B_h, [dst, #16]
+	stp	C_l, C_h, [dst, #32]
+USER_OFF(48, ldtr D_l, [src, #48])
+USER_OFF(56, ldtr D_h, [src, #56])
+	add	src, src, #64
+	stp	D_l, D_h, [dst, #48]
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	b	.Lexitfunc
+
+	/*
+	 * Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+.Lcpy_body_large:
+	/* pre-get 64 bytes data. */
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+USER_OFF(48, ldtr D_l, [src, #48])
+USER_OFF(56, ldtr D_h, [src, #56])
+	add	src, src, #64
+
+1:
+	/*
+	 * interlace the load of next 64 bytes data block with store of the last
+	 * loaded 64 bytes data.
+	 */
+	stp	A_l, A_h, [dst, #0]
+USER_OFF(0, ldtr A_l, [src, #0])
+USER_OFF(8, ldtr A_h, [src, #8])
+	stp	B_l, B_h, [dst, #16]
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+	stp	C_l, C_h, [dst, #32]
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+	stp	D_l, D_h, [dst, #48]
+USER_OFF(48, ldtr D_l, [src, #48])
+	add	dst, dst, #64
+USER_OFF(56, ldtr D_h, [src, #56])
+	add	src, src, #64
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst, #0]
+	stp	B_l, B_h, [dst, #16]
+	stp	C_l, C_h, [dst, #32]
+	stp	D_l, D_h, [dst, #48]
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+.Lexitfunc:
 	mov	x0, #0				// Nothing to copy
 	ret
 
-	// Exception fixups
-9997:	cmp	dst, dstin
-	b.ne	9998f
-	// Before being absolutely sure we couldn't copy anything, try harder
-USER(9998f, ldtrb tmp1w, [srcin])
-	strb	tmp1w, [dst], #1
-9998:	sub	x0, end, dst			// bytes not copied
-	ret
 SYM_FUNC_END(__arch_copy_from_user)
 EXPORT_SYMBOL(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 2ac716c0d6d8c..7b69dece56f6d 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -19,55 +19,219 @@
 * Returns:
 *	x0 - bytes not copied
 */
-	.macro ldrb1 reg, ptr, val
-	KERNEL_ME_SAFE(9998f, ldrb \reg, [\ptr], \val)
-	.endm
 
-	.macro strb1 reg, ptr, val
-	user_ldst 9998f, sttrb, \reg, \ptr, \val
-	.endm
-
-	.macro ldrh1 reg, ptr, val
-	KERNEL_ME_SAFE(9998f, ldrh \reg, [\ptr], \val)
-	.endm
-
-	.macro strh1 reg, ptr, val
-	user_ldst 9997f, sttrh, \reg, \ptr, \val
-	.endm
-
-	.macro ldr1 reg, ptr, val
-	KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
-	.endm
+dstin	.req	x0
+src	.req	x1
+end	.req	x5
+srcin	.req	x15
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+dst	.req	x6
 
-	.macro str1 reg, ptr, val
-	user_ldst 9997f, sttr, \reg, \ptr, \val
-	.endm
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
 
-	.macro ldp1 reg1, reg2, ptr, val
-	KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
-	.endm
+#define USER_OFF(off, x...)	USER(fixup_offset_##off, x)
+#define FIXUP_OFFSET(n)						\
+fixup_offset_##n:						\
+	sub	x0, end, dst;					\
+	sub	x0, x0, n;					\
+	ret
 
-	.macro stp1 reg1, reg2, ptr, val
-	user_stp 9997f, \reg1, \reg2, \ptr, \val
-	.endm
+FIXUP_OFFSET(0)
+FIXUP_OFFSET(8)
+FIXUP_OFFSET(16)
+FIXUP_OFFSET(24)
+FIXUP_OFFSET(32)
+FIXUP_OFFSET(40)
+FIXUP_OFFSET(48)
+FIXUP_OFFSET(56)
 
-end	.req	x5
-srcin	.req	x15
 SYM_FUNC_START(__arch_copy_to_user)
 	add	end, x0, x2
 	mov	srcin, x1
-#include "copy_template.S"
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accessed are not aligned.*/
+	b.lo	.Ltiny15
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	 * Copy the leading memory data from src to dst in an increasing
+	 * address order.By this way,the risk of overwriting the source
+	 * memory data is eliminated when the distance between src and
+	 * dst is less than 16. The memory accesses here are alignment.
+	 */
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+USER_OFF(0, sttrb tmp1w, [dst, #0])
+	add	dst, dst, #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh	tmp1w, [src], #2
+USER_OFF(0, sttrh tmp1w, [dst, #0])
+	add	dst, dst, #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr	tmp1w, [src], #4
+USER_OFF(0, sttr tmp1w, [dst, #0])
+	add	dst, dst, #4
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src], #8
+USER_OFF(0, sttr tmp1, [dst, #0])
+	add	dst, dst, #8
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	ldp	A_l, A_h, [src], #16
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	ldp	A_l, A_h, [src], #16
+	add	dst, dst, #16
+1:
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	ldp	A_l, A_h, [src], #16
+	add	dst, dst, #16
+2:
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	add	dst, dst, #16
+.Ltiny15:
+	/*
+	 * Prefer to break one ldp/stp into several load/store to access
+	 * memory in an increasing address order,rather than to load/store 16
+	 * bytes from (src-16) to (dst-16) and to backward the src to aligned
+	 * address,which way is used in original cortex memcpy. If keeping
+	 * the original memcpy process here, memmove need to satisfy the
+	 * precondition that src address is at least 16 bytes bigger than dst
+	 * address,otherwise some source data will be overwritten when memove
+	 * call memcpy directly. To make memmove simpler and decouple the
+	 * memcpy's dependency on memmove, withdrew the original process.
+	 */
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+USER_OFF(0, sttr tmp1, [dst, #0])
+	add	dst, dst, #8
+1:
+	tbz	count, #2, 2f
+	ldr	tmp1w, [src], #4
+USER_OFF(0, sttr tmp1w, [dst, #0])
+	add	dst, dst, #4
+2:
+	tbz	count, #1, 3f
+	ldrh	tmp1w, [src], #2
+USER_OFF(0, sttrh tmp1w, [dst, #0])
+	add	dst, dst, #2
+3:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src], #1
+USER_OFF(0, sttrb tmp1w, [dst, #0])
+	add	dst, dst, #1
+
+	b	.Lexitfunc
+
+.Lcpy_over64:
+	.p2align	L1_CACHE_SHIFT
+	ldp	A_l, A_h, [src, #0]
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+USER_OFF(16, sttr B_l, [dst, #16])
+USER_OFF(24, sttr B_h, [dst, #24])
+USER_OFF(32, sttr C_l, [dst, #32])
+USER_OFF(40, sttr C_h, [dst, #40])
+	ldp	D_l, D_h, [src, #48]
+	add	src, src, #64
+USER_OFF(48, sttr D_l, [dst, #48])
+USER_OFF(56, sttr D_h, [dst, #56])
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	b	.Lexitfunc
+
+	/*
+	 * Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+.Lcpy_body_large:
+	/* pre-get 64 bytes data. */
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]
+	add	src, src, #64
+1:
+	/*
+	 * interlace the load of next 64 bytes data block with store of the last
+	 * loaded 64 bytes data.
+	 */
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+	ldp	A_l, A_h, [src, #0]
+USER_OFF(16, sttr B_l, [dst, #16])
+USER_OFF(24, sttr B_h, [dst, #24])
+	ldp	B_l, B_h, [src, #16]
+USER_OFF(32, sttr C_l, [dst, #32])
+USER_OFF(40, sttr C_h, [dst, #40])
+	ldp	C_l, C_h, [src, #32]
+USER_OFF(48, sttr D_l, [dst, #48])
+USER_OFF(56, sttr D_h, [dst, #56])
+	add	dst, dst, #64
+	ldp	D_l, D_h, [src, #48]
+	add	src, src, #64
+	subs	count, count, #64
+	b.ge	1b
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+USER_OFF(16, sttr B_l, [dst, #16])
+USER_OFF(24, sttr B_h, [dst, #24])
+USER_OFF(32, sttr C_l, [dst, #32])
+USER_OFF(40, sttr C_h, [dst, #40])
+USER_OFF(48, sttr D_l, [dst, #48])
+USER_OFF(56, sttr D_h, [dst, #56])
+	add	dst, dst, #64
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+.Lexitfunc:
 	mov	x0, #0
 	ret
 
-	// Exception fixups
-9997:	cmp	dst, dstin
-	b.ne	9998f
-	// Before being absolutely sure we couldn't copy anything, try harder
-KERNEL_ME_SAFE(9998f, ldrb tmp1w, [srcin])
-USER(9998f, sttrb tmp1w, [dst])
-	add	dst, dst, #1
-9998:	sub	x0, end, dst			// bytes not copied
-	ret
 SYM_FUNC_END(__arch_copy_to_user)
 EXPORT_SYMBOL(__arch_copy_to_user)
FeedBack: The patch(es) you have sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/7473
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/P...