From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 187685, https://gitee.com/openeuler/kernel/issues/I5VVZX CVE: NA
--------------------------------
Fix bad space budget when symlink file is encrypted. Bad space budget may let make_reservation() return with -ENOSPC, which could turn ubifs to read-only mode in do_writepage() process.
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216490 Fixes: ca7f85be8d6cf9 ("ubifs: Add support for encrypted symlinks") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ubifs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index acd7e83a35e4..647a47ade5bd 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1148,7 +1148,6 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, int err, sz_change, len = strlen(symname); struct fscrypt_str disk_link; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = ALIGN(len, 8), .dirtied_ino = 1 }; struct fscrypt_name nm;
@@ -1164,6 +1163,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, * Budget request settings: new inode, new direntry and changing parent * directory inode. */ + req.new_ino_d = ALIGN(disk_link.len - 1, 8); err = ubifs_budget_space(c, &req); if (err) return err;
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 187685, https://gitee.com/openeuler/kernel/issues/I5VVZX CVE: NA
--------------------------------
There is no space budget for ubifs_xrename(). It may let make_reservation() return with -ENOSPC, which could turn ubifs to read-only mode in do_writepage() process. Fix it by adding space budget for ubifs_xrename().
Fixes: 9ec64962afb170 ("ubifs: Implement RENAME_EXCHANGE") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ubifs/dir.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 647a47ade5bd..38e3f50ac54d 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1573,6 +1573,10 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, return err; }
+ err = ubifs_budget_space(c, &req); + if (err) + goto out; + lock_4_inodes(old_dir, new_dir, NULL, NULL);
time = current_time(old_dir); @@ -1598,6 +1602,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, unlock_4_inodes(old_dir, new_dir, NULL, NULL); ubifs_release_budget(c, &req);
+out: fscrypt_free_filename(&fst_nm); fscrypt_free_filename(&snd_nm); return err;
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 187685, https://gitee.com/openeuler/kernel/issues/I5VVZX CVE: NA
-------------------------------
Each dirty inode should reserve 'c->bi.inode_budget' bytes in space budget calculation. Currently, space budget for dirty inode reports more space than what UBIFS actually needs to write.
Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ubifs/budget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index c0b84e960b20..bdb79be6dc0e 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -403,7 +403,7 @@ static int calc_dd_growth(const struct ubifs_info *c, dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
if (req->dirtied_ino) - dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1); + dd_growth += c->bi.inode_budget * req->dirtied_ino; if (req->mod_dent) dd_growth += c->bi.dent_budget; dd_growth += req->dirtied_ino_d;
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 187685, https://gitee.com/openeuler/kernel/issues/I5VVZX CVE: NA
--------------------------------
If target inode is a special file (eg. block/char device) with nlink count greater than 1, the inode with ui->data will be re-written on disk. However, UBIFS losts target inode's data_len while doing space budget. Bad space budget may let make_reservation() return with -ENOSPC, which could turn ubifs to read-only mode in do_writepage() process.
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216494 Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ubifs/dir.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 38e3f50ac54d..aedd545886cb 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1321,6 +1321,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlink) { ubifs_assert(c, inode_is_locked(new_inode));
+ /* Budget for old inode's data when its nlink > 1. */ + req.dirtied_ino_d = ALIGN(ubifs_inode(new_inode)->data_len, 8); err = ubifs_purge_xattrs(new_inode); if (err) return err;
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 187685, https://gitee.com/openeuler/kernel/issues/I5VVZX CVE: NA
--------------------------------
UBIFS calculates available space by c->main_bytes - c->lst.total_used (which means non-index lebs' free and dirty space is accounted into total available), then index lebs and four lebs (one for gc_lnum, one for deletions, two for journal heads) are deducted. In following situation, ubifs may get -ENOSPC from make_reservation(): LEB 84: DATAHD free 122880 used 1920 dirty 2176 dark 6144 LEB 110:DELETION free 126976 used 0 dirty 0 dark 6144 (empty) LEB 201:gc_lnum free 126976 used 0 dirty 0 dark 6144 LEB 272:GCHD free 77824 used 47672 dirty 1480 dark 6144 LEB 356:BASEHD free 0 used 39776 dirty 87200 dark 6144 OTHERS: index lebs, zero-available non-index lebs
UBIFS calculates the available bytes is 6888 (How to calculate it: 126976 * 5[remain main bytes] - 1920[used] - 47672[used] - 39776[used] - 126976 * 1[deletions] - 126976 * 1[gc_lnum] - 126976 * 2[journal heads] - 6144 * 5[dark] = 6888) after doing budget, however UBIFS cannot use BASEHD's dirty space(87200), because UBIFS cannot find next BASEHD to reclaim current BASEHD. (c->bi.min_idx_lebs equals to c->lst.idx_lebs, the empty leb won't be found by ubifs_find_free_space(), and dirty index lebs won't be picked as gced lebs. All non-index lebs has dirty space less then c->dead_wm, non-index lebs won't be picked as gced lebs either. So new free lebs won't be produced.). See more details in Link.
To fix it, reserve one leb for each journal head while doing budget.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216562 Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ubifs/budget.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index bdb79be6dc0e..9cb05ef9b9dd 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -212,11 +212,10 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) subtract_lebs += 1;
/* - * The GC journal head LEB is not really accessible. And since - * different write types go to different heads, we may count only on - * one head's space. + * Since different write types go to different heads, we should + * reserve one leb for each head. */ - subtract_lebs += c->jhead_cnt - 1; + subtract_lebs += c->jhead_cnt;
/* We also reserve one LEB for deletions, which bypass budgeting */ subtract_lebs += 1;
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v6.1-rc1 commit ec7eede369fe5b0d085ac51fdbb95184f87bfc6c category: bugfix bugzilla: 187823, https://gitee.com/src-openeuler/kernel/issues/I5VZ0N CVE: CVE-2022-3521
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
syzbot found that kcm_tx_work() could crash [1] in:
/* Primarily for SOCK_SEQPACKET sockets */ if (likely(sk->sk_socket) && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { <<*>> clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_space(sk); }
I think the reason is that another thread might concurrently run in kcm_release() and call sock_orphan(sk) while sk is not locked. kcm_tx_work() find sk->sk_socket being NULL.
[1] BUG: KASAN: null-ptr-deref in instrument_atomic_write include/linux/instrumented.h:86 [inline] BUG: KASAN: null-ptr-deref in clear_bit include/asm-generic/bitops/instrumented-atomic.h:41 [inline] BUG: KASAN: null-ptr-deref in kcm_tx_work+0xff/0x160 net/kcm/kcmsock.c:742 Write of size 8 at addr 0000000000000008 by task kworker/u4:3/53
CPU: 0 PID: 53 Comm: kworker/u4:3 Not tainted 5.19.0-rc3-next-20220621-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: kkcmd kcm_tx_work Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 kasan_report+0xbe/0x1f0 mm/kasan/report.c:495 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x13d/0x180 mm/kasan/generic.c:189 instrument_atomic_write include/linux/instrumented.h:86 [inline] clear_bit include/asm-generic/bitops/instrumented-atomic.h:41 [inline] kcm_tx_work+0xff/0x160 net/kcm/kcmsock.c:742 process_one_work+0x996/0x1610 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e9/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:302 </TASK>
Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: Eric Dumazet edumazet@google.com Cc: Tom Herbert tom@herbertland.com Link: https://lore.kernel.org/r/20221012133412.519394-1-edumazet@google.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Baisong Zhong zhongbaisong@huawei.com Reviewed-by: Liu Jian liujian56@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/kcm/kcmsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 56dad9565bc9..977f2de89dce 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1838,10 +1838,10 @@ static int kcm_release(struct socket *sock) kcm = kcm_sk(sk); mux = kcm->mux;
+ lock_sock(sk); sock_orphan(sk); kfree_skb(kcm->seq_skb);
- lock_sock(sk); /* Purge queue under lock to avoid race condition with tx_work trying * to act when queue is nonempty. If tx_work runs after this point * it will just return.
From: Kuniyuki Iwashima kuniyu@amazon.com
mainline inclusion from mainline-v6.1-rc1 commit 3c52c6bb831f6335c176a0fc7214e26f43adbd11 category: bugfix bugzilla: 187825, https://gitee.com/src-openeuler/kernel/issues/I5VZ0J CVE: CVE-2022-3524
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
syzbot reported a memory leak [0] related to IPV6_ADDRFORM.
The scenario is that while one thread is converting an IPv6 socket into IPv4 with IPV6_ADDRFORM, another thread calls do_ipv6_setsockopt() and allocates memory to inet6_sk(sk)->XXX after conversion.
Then, the converted sk with (tcp|udp)_prot never frees the IPv6 resources, which inet6_destroy_sock() should have cleaned up.
setsockopt(IPV6_ADDRFORM) setsockopt(IPV6_DSTOPTS) +-----------------------+ +----------------------+ - do_ipv6_setsockopt(sk, ...) - sockopt_lock_sock(sk) - do_ipv6_setsockopt(sk, ...) - lock_sock(sk) ^._ called via tcpv6_prot - WRITE_ONCE(sk->sk_prot, &tcp_prot) before WRITE_ONCE() - xchg(&np->opt, NULL) - txopt_put(opt) - sockopt_release_sock(sk) - release_sock(sk) - sockopt_lock_sock(sk) - lock_sock(sk) - ipv6_set_opt_hdr(sk, ...) - ipv6_update_options(sk, opt) - xchg(&inet6_sk(sk)->opt, opt) ^._ opt is never freed.
- sockopt_release_sock(sk) - release_sock(sk)
Since IPV6_DSTOPTS allocates options under lock_sock(), we can avoid this memory leak by testing whether sk_family is changed by IPV6_ADDRFORM after acquiring the lock.
This issue exists from the initial commit between IPV6_ADDRFORM and IPV6_PKTOPTIONS.
[0]: BUG: memory leak unreferenced object 0xffff888009ab9f80 (size 96): comm "syz-executor583", pid 328, jiffies 4294916198 (age 13.034s) hex dump (first 32 bytes): 01 00 00 00 48 00 00 00 08 00 00 00 00 00 00 00 ....H........... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000002ee98ae1>] kmalloc include/linux/slab.h:605 [inline] [<000000002ee98ae1>] sock_kmalloc+0xb3/0x100 net/core/sock.c:2566 [<0000000065d7b698>] ipv6_renew_options+0x21e/0x10b0 net/ipv6/exthdrs.c:1318 [<00000000a8c756d7>] ipv6_set_opt_hdr net/ipv6/ipv6_sockglue.c:354 [inline] [<00000000a8c756d7>] do_ipv6_setsockopt.constprop.0+0x28b7/0x4350 net/ipv6/ipv6_sockglue.c:668 [<000000002854d204>] ipv6_setsockopt+0xdf/0x190 net/ipv6/ipv6_sockglue.c:1021 [<00000000e69fdcf8>] tcp_setsockopt+0x13b/0x2620 net/ipv4/tcp.c:3789 [<0000000090da4b9b>] __sys_setsockopt+0x239/0x620 net/socket.c:2252 [<00000000b10d192f>] __do_sys_setsockopt net/socket.c:2263 [inline] [<00000000b10d192f>] __se_sys_setsockopt net/socket.c:2260 [inline] [<00000000b10d192f>] __x64_sys_setsockopt+0xbe/0x160 net/socket.c:2260 [<000000000a80d7aa>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<000000000a80d7aa>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<000000004562b5c6>] entry_SYSCALL_64_after_hwframe+0x63/0xcd
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: Kuniyuki Iwashima kuniyu@amazon.com Signed-off-by: Jakub Kicinski kuba@kernel.org
Conflicts: net/ipv6/ipv6_sockglue.c
Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Liu Jian liujian56@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv6/ipv6_sockglue.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 1ae33a882b9a..16af0c700d05 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -417,6 +417,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, rtnl_lock(); lock_sock(sk);
+ /* Another thread has converted the socket into IPv4 with + * IPV6_ADDRFORM concurrently. + */ + if (unlikely(sk->sk_family != AF_INET6)) + goto unlock; + switch (optname) {
case IPV6_ADDRFORM: @@ -978,6 +984,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; }
+unlock: release_sock(sk); if (needs_rtnl) rtnl_unlock();
From: Thadeu Lima de Souza Cascardo cascardo@canonical.com
stable inclusion from stable-v5.10.144 commit 33015556a943d6cbb18c555925a54b8c0e46f521 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5WL0J CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
This reverts commit 00b136bb6254e0abf6aaafe62c4da5f6c4fea4cb.
This temporarily reverts the backport of upstream commit 1f001e9da6bbf482311e45e48f53c2bd2179e59c. It was not correct to copy the ftrace stub as it would contain a relative jump to the return thunk which would not apply to the context where it was being copied to, leading to ftrace support to be broken.
Signed-off-by: Thadeu Lima de Souza Cascardo cascardo@canonical.com Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Lin Yujun linyujun809@huawei.com Reviewed-by: Zhang Jianhua chris.zjh@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kernel/ftrace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index dca5cf82144c..fbcd144260ac 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -308,7 +308,7 @@ union ftrace_op_code_union { } __attribute__((packed)); };
-#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) +#define RET_SIZE 1 + IS_ENABLED(CONFIG_SLS)
static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) @@ -367,10 +367,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* The trampoline ends with ret(q) */ retq = (unsigned long)ftrace_stub; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - memcpy(ip, text_gen_insn(JMP32_INSN_OPCODE, ip, &__x86_return_thunk), JMP32_INSN_SIZE); - else - ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE); + ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE); if (WARN_ON(ret < 0)) goto fail;
From: Peter Zijlstra peterz@infradead.org
stable inclusion from stable-v5.10.144 commit 4586df06a02049f4315c25b947c6dde2627c0d18 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5WL0J CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
commit e52fc2cf3f662828cc0d51c4b73bed73ad275fce upstream.
Return trampoline must not use indirect branch to return; while this preserves the RSB, it is fundamentally incompatible with IBT. Instead use a retpoline like ROP gadget that defeats IBT while not unbalancing the RSB.
And since ftrace_stub is no longer a plain RET, don't use it to copy from. Since RET is a trivial instruction, poke it directly.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Acked-by: Josh Poimboeuf jpoimboe@redhat.com Link: https://lore.kernel.org/r/20220308154318.347296408@infradead.org [cascardo: remove ENDBR] Signed-off-by: Thadeu Lima de Souza Cascardo cascardo@canonical.com [OP: adjusted context for 5.10-stable] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Lin Yujun linyujun809@huawei.com Reviewed-by: Zhang Jianhua chris.zjh@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kernel/ftrace.c | 9 ++------- arch/x86/kernel/ftrace_64.S | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fbcd144260ac..5c7583449df7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -321,12 +321,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long offset; unsigned long npages; unsigned long size; - unsigned long retq; unsigned long *ptr; void *trampoline; void *ip; /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; + unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; int ret;
@@ -364,12 +364,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) goto fail;
ip = trampoline + size; - - /* The trampoline ends with ret(q) */ - retq = (unsigned long)ftrace_stub; - ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE); - if (WARN_ON(ret < 0)) - goto fail; + memcpy(ip, retq, RET_SIZE);
/* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index e3a375185a1b..5b2dabedcf66 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -170,7 +170,6 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
/* * This is weak to keep gas from relaxing the jumps. - * It is also used to copy the RET for trampolines. */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) UNWIND_HINT_FUNC @@ -325,7 +324,7 @@ SYM_FUNC_END(ftrace_graph_caller)
SYM_CODE_START(return_to_handler) UNWIND_HINT_EMPTY - subq $24, %rsp + subq $16, %rsp
/* Save the return values */ movq %rax, (%rsp) @@ -337,7 +336,19 @@ SYM_CODE_START(return_to_handler) movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $24, %rsp - JMP_NOSPEC rdi + + addq $16, %rsp + /* + * Jump back to the old return address. This cannot be JMP_NOSPEC rdi + * since IBT would demand that contain ENDBR, which simply isn't so for + * return addresses. Use a retpoline here to keep the RSB balanced. + */ + ANNOTATE_INTRA_FUNCTION_CALL + call .Ldo_rop + int3 +.Ldo_rop: + mov %rdi, (%rsp) + UNWIND_HINT_FUNC + RET SYM_CODE_END(return_to_handler) #endif
From: Peter Zijlstra peterz@infradead.org
stable inclusion from stable-v5.10.144 commit 35371fd68807f41a4072c01c166de5425a2a47e5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5WL0J CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
commit 1f001e9da6bbf482311e45e48f53c2bd2179e59c upstream.
Use the return thunk in ftrace trampolines, if needed.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Borislav Petkov bp@suse.de Reviewed-by: Josh Poimboeuf jpoimboe@kernel.org Signed-off-by: Borislav Petkov bp@suse.de [cascardo: use memcpy(text_gen_insn) as there is no __text_gen_insn] Signed-off-by: Thadeu Lima de Souza Cascardo cascardo@canonical.com Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Lin Yujun linyujun809@huawei.com Reviewed-by: Zhang Jianhua chris.zjh@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kernel/ftrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5c7583449df7..e760f6b5f0b4 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -308,7 +308,7 @@ union ftrace_op_code_union { } __attribute__((packed)); };
-#define RET_SIZE 1 + IS_ENABLED(CONFIG_SLS) +#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS))
static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) @@ -364,7 +364,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) goto fail;
ip = trampoline + size; - memcpy(ip, retq, RET_SIZE); + + /* The trampoline ends with ret(q) */ + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + memcpy(ip, text_gen_insn(JMP32_INSN_OPCODE, ip, &__x86_return_thunk), JMP32_INSN_SIZE); + else + memcpy(ip, retq, sizeof(retq));
/* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
From: Miklos Szeredi mszeredi@redhat.com
mainline inclusion from mainline-v5.11-rc1 commit b6650dab404c701d7fe08a108b746542a934da84 category: bugfix bugzilla: 187815, https://gitee.com/openeuler/kernel/issues/I5WLBD CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
In case the file cannot be opened with O_NOATIME because of lack of capabilities, then clear O_NOATIME instead of failing.
Remove WARN_ON(), since it would now trigger if O_NOATIME was cleared. Noticed by Amir Goldstein.
Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/overlayfs/file.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index b019f27c1360..71019814c0b5 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -54,9 +54,10 @@ static struct file *ovl_open_realfile(const struct file *file, err = inode_permission(realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); - } else if (!inode_owner_or_capable(realinode)) { - realfile = ERR_PTR(-EPERM); } else { + if (!inode_owner_or_capable(realinode)) + flags &= ~O_NOATIME; + realfile = open_with_fake_path(&file->f_path, flags, realinode, current_cred()); } @@ -76,12 +77,6 @@ static int ovl_change_flags(struct file *file, unsigned int flags) struct inode *inode = file_inode(file); int err;
- flags |= OVL_OPEN_FLAGS; - - /* If some flag changed that cannot be changed then something's amiss */ - if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK)) - return -EIO; - flags &= OVL_SETFL_MASK;
if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
From: Johannes Berg johannes.berg@intel.com
stable inclusion from stable-v5.10.148 commit 58c0306d0bcd5f541714bea8765d23111c9af68a category: bugfix bugzilla: 187817, https://gitee.com/src-openeuler/kernel/issues/I5VMMW CVE: CVE-2022-42722
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit b2d03cabe2b2e150ff5a381731ea0355459be09f upstream.
If beacon protection is active but the beacon cannot be decrypted or is otherwise malformed, we call the cfg80211 API to report this to userspace, but that uses a netdev pointer, which isn't present for P2P-Device. Fix this to call it only conditionally to ensure cfg80211 won't crash in the case of P2P-Device.
This fixes CVE-2022-42722.
Reported-by: Sönke Huster shuster@seemoo.tu-darmstadt.de Fixes: 9eaf183af741 ("mac80211: Report beacon protection failures to user space") Signed-off-by: Johannes Berg johannes.berg@intel.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.orgi Signed-off-by: Dong Chenchen dongchenchen2@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Liu Jian liujian56@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/mac80211/rx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e991abb45f68..97a63b940482 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1975,10 +1975,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
if (mmie_keyidx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + - NUM_DEFAULT_BEACON_KEYS) { - cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, - skb->data, - skb->len); + NUM_DEFAULT_BEACON_KEYS) { + if (rx->sdata->dev) + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, + skb->len); return RX_DROP_MONITOR; /* unexpected BIP keyidx */ }
@@ -2126,7 +2127,8 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED;
- if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE)) + if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE && + rx->sdata->dev)) cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, skb->data, skb->len);
From: Kuniyuki Iwashima kuniyu@amazon.com
mainline inclusion from mainline-v6.1-rc1 commit f49cd2f4d6170d27a2c61f1fecb03d8a70c91f57 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5W7ZF?from=project-issue CVE: CVE-2022-3566
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=...
--------------------------------
setsockopt(IPV6_ADDRFORM) and tcp_v6_connect() change icsk->icsk_af_ops under lock_sock(), but tcp_(get|set)sockopt() read it locklessly. To avoid load/store tearing, we need to add READ_ONCE() and WRITE_ONCE() for the reads and writes.
Thanks to Eric Dumazet for providing the syzbot report:
BUG: KCSAN: data-race in tcp_setsockopt / tcp_v6_connect
write to 0xffff88813c624518 of 8 bytes by task 23936 on cpu 0: tcp_v6_connect+0x5b3/0xce0 net/ipv6/tcp_ipv6.c:240 __inet_stream_connect+0x159/0x6d0 net/ipv4/af_inet.c:660 inet_stream_connect+0x44/0x70 net/ipv4/af_inet.c:724 __sys_connect_file net/socket.c:1976 [inline] __sys_connect+0x197/0x1b0 net/socket.c:1993 __do_sys_connect net/socket.c:2003 [inline] __se_sys_connect net/socket.c:2000 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:2000 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd
read to 0xffff88813c624518 of 8 bytes by task 23937 on cpu 1: tcp_setsockopt+0x147/0x1c80 net/ipv4/tcp.c:3789 sock_common_setsockopt+0x5d/0x70 net/core/sock.c:3585 __sys_setsockopt+0x212/0x2b0 net/socket.c:2252 __do_sys_setsockopt net/socket.c:2263 [inline] __se_sys_setsockopt net/socket.c:2260 [inline] __x64_sys_setsockopt+0x62/0x70 net/socket.c:2260 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd
value changed: 0xffffffff8539af68 -> 0xffffffff8539aff8
Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 23937 Comm: syz-executor.5 Not tainted 6.0.0-rc4-syzkaller-00331-g4ed9c1e971b1-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot syzkaller@googlegroups.com Reported-by: Eric Dumazet edumazet@google.com Signed-off-by: Kuniyuki Iwashima kuniyu@amazon.com Signed-off-by: Jakub Kicinski kuba@kernel.org Conflicts: net/ipv4/tcp.c net/ipv6/ipv6_sockglue.c Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv4/tcp.c | 10 ++++++---- net/ipv6/ipv6_sockglue.c | 3 ++- net/ipv6/tcp_ipv6.c | 6 ++++-- 3 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4d5280780a8e..e7183cc8e998 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3448,8 +3448,9 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, const struct inet_connection_sock *icsk = inet_csk(sk);
if (level != SOL_TCP) - return icsk->icsk_af_ops->setsockopt(sk, level, optname, - optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, + optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); @@ -3993,8 +3994,9 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, struct inet_connection_sock *icsk = inet_csk(sk);
if (level != SOL_TCP) - return icsk->icsk_af_ops->getsockopt(sk, level, optname, - optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, + optval, optlen); return do_tcp_getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_getsockopt); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 16af0c700d05..6cfb4042994e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -481,7 +481,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, local_bh_enable(); /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ WRITE_ONCE(sk->sk_prot, &tcp_prot); - icsk->icsk_af_ops = &ipv4_specific; + /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ + WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ef36c34e435e..0862fe07cf06 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -237,7 +237,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
- icsk->icsk_af_ops = &ipv6_mapped; + /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ + WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; @@ -249,7 +250,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (err) { icsk->icsk_ext_hdr_len = exthdrlen; - icsk->icsk_af_ops = &ipv6_specific; + /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ + WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv;
From: Lee Jones lee@kernel.org
stable inclusion from stable-v5.10.134 commit 2ee0cab11f6626071f8a64c7792406dabdd94c8d category: bugfix bugzilla: 187845, https://gitee.com/src-openeuler/kernel/issues/I5UDNW CVE: CVE-2022-20409
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
This issue is conceptually identical to the one fixed in 29f077d07051 ("io_uring: always use original task when preparing req identity"), so rather than reinvent the wheel, I'm shamelessly quoting the commit message from that patch - thanks Jens:
"If the ring is setup with IORING_SETUP_IOPOLL and we have more than one task doing submissions on a ring, we can up in a situation where we assign the context from the current task rather than the request originator.
Always use req->task rather than assume it's the same as current.
No upstream patch exists for this issue, as only older kernels with the non-native workers have this problem."
Cc: Jens Axboe axboe@kernel.dk Cc: Pavel Begunkov asml.silence@gmail.com Cc: Alexander Viro viro@zeniv.linux.org.uk Cc: io-uring@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Fixes: 5c3462cfd123b ("io_uring: store io_identity in io_uring_task") Signed-off-by: Lee Jones lee@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Li Lingfeng lilingfeng3@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 257e4af176e6..71cabd60a01d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1325,7 +1325,7 @@ static void io_req_clean_work(struct io_kiocb *req) */ static bool io_identity_cow(struct io_kiocb *req) { - struct io_uring_task *tctx = current->io_uring; + struct io_uring_task *tctx = req->task->io_uring; const struct cred *creds = NULL; struct io_identity *id;
From: Kuniyuki Iwashima kuniyu@amazon.com
mainline inclusion from mainline-6.1-rc1 commit 364f997b5cfe1db0d63a390fe7c801fa2b3115f6 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5W7YP CVE: CVE-2022-3567
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
Commit 086d49058cd8 ("ipv6: annotate some data-races around sk->sk_prot") fixed some data-races around sk->sk_prot but it was not enough.
Some functions in inet6_(stream|dgram)_ops still access sk->sk_prot without lock_sock() or rtnl_lock(), so they need READ_ONCE() to avoid load tearing.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima kuniyu@amazon.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Liu Jian liujian56@huawei.com
Conflicts: net/ipv6/ipv6_sockglue.c Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/sock.c | 6 ++++-- net/ipv4/af_inet.c | 23 ++++++++++++++++------- net/ipv6/ipv6_sockglue.c | 4 ++-- 3 files changed, 22 insertions(+), 11 deletions(-)
diff --git a/net/core/sock.c b/net/core/sock.c index 2fa8863caee0..3f49f1117753 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3225,7 +3225,8 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk;
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt);
@@ -3252,7 +3253,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk;
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 911ad595dbb9..d95233b903a9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -559,22 +559,27 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + const struct proto *prot; int err;
if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; + + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + if (uaddr->sa_family == AF_UNSPEC) - return sk->sk_prot->disconnect(sk, flags); + return prot->disconnect(sk, flags);
if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { - err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + err = prot->pre_connect(sk, uaddr, addr_len); if (err) return err; }
if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->connect(sk, uaddr, addr_len); + return prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect);
@@ -735,10 +740,11 @@ EXPORT_SYMBOL(inet_stream_connect); int inet_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { - struct sock *sk1 = sock->sk; + struct sock *sk1 = sock->sk, *sk2; int err = -EINVAL; - struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern); if (!sk2) goto do_err;
@@ -824,12 +830,15 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { struct sock *sk = sock->sk; + const struct proto *prot;
if (unlikely(inet_send_prepare(sk))) return -EAGAIN;
- if (sk->sk_prot->sendpage) - return sk->sk_prot->sendpage(sk, page, offset, size, flags); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + if (prot->sendpage) + return prot->sendpage(sk, page, offset, size, flags); return sock_no_sendpage(sock, page, offset, size, flags); } EXPORT_SYMBOL(inet_sendpage); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 6cfb4042994e..65f83ce728fa 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -479,7 +479,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); local_bh_enable(); - /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ + /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); @@ -495,7 +495,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); local_bh_enable(); - /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ + /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET;
From: Zhihao Cheng chengzhihao1@huawei.com
mainline inclusion from mainline-v6.1-rc1 commit 669d204469c46e91d99da24914130f78277a71d3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5WQRM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[1] suggests that fastmap is suitable for large flash devices. Module parameter 'fm_autoconvert' is a coarse grained switch to enable all ubi devices to generate fastmap, which may turn on fastmap even for small flash devices.
This patch imports a new field 'disable_fm' in struct 'ubi_attach_req' to support following situations by ioctl 'UBI_IOCATT'. [old functions] A. Disable 'fm_autoconvert': Disbable fastmap for all ubi devices B. Enable 'fm_autoconvert': Enable fastmap for all ubi devices [new function] C. Enable 'fm_autoconvert', set 'disable_fm' for given device: Don't create new fastmap and do full scan (existed fastmap will be destroyed) for the given ubi device.
A simple test case in [2].
[1] http://www.linux-mtd.infradead.org/doc/ubi.html#L_fastmap [2] https://bugzilla.kernel.org/show_bug.cgi?id=216278
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Signed-off-by: Richard Weinberger richard@nod.at Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/mtd/ubi/build.c | 14 ++++++++++---- drivers/mtd/ubi/cdev.c | 2 +- drivers/mtd/ubi/ubi.h | 3 ++- include/uapi/mtd/ubi-user.h | 8 +++++++- 4 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 4153e0d15c5f..ac97f6f9bcc2 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -808,6 +808,7 @@ static int autoresize(struct ubi_device *ubi, int vol_id) * @ubi_num: number to assign to the new UBI device * @vid_hdr_offset: VID header offset * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs + * @disable_fm: whether disable fastmap * * This function attaches MTD device @mtd_dev to UBI and assign @ubi_num number * to the newly created UBI device, unless @ubi_num is %UBI_DEV_NUM_AUTO, in @@ -815,11 +816,15 @@ static int autoresize(struct ubi_device *ubi, int vol_id) * automatically. Returns the new UBI device number in case of success and a * negative error code in case of failure. * + * If @disable_fm is true, ubi doesn't create new fastmap even the module param + * 'fm_autoconvert' is set, and existed old fastmap will be destroyed after + * doing full scanning. + * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex. */ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, - int vid_hdr_offset, int max_beb_per1024) + int vid_hdr_offset, int max_beb_per1024, bool disable_fm) { struct ubi_device *ubi; int i, err; @@ -922,7 +927,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, UBI_FM_MIN_POOL_SIZE);
ubi->fm_wl_pool.max_size = ubi->fm_pool.max_size / 2; - ubi->fm_disabled = !fm_autoconvert; + ubi->fm_disabled = (!fm_autoconvert || disable_fm) ? 1 : 0; if (fm_debug) ubi_enable_dbg_chk_fastmap(ubi);
@@ -963,7 +968,7 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, if (!ubi->fm_buf) goto out_free; #endif - err = ubi_attach(ubi, 0); + err = ubi_attach(ubi, disable_fm ? 1 : 0); if (err) { ubi_err(ubi, "failed to attach mtd%d, error %d", mtd->index, err); @@ -1243,7 +1248,8 @@ static int __init ubi_init(void)
mutex_lock(&ubi_devices_mutex); err = ubi_attach_mtd_dev(mtd, p->ubi_num, - p->vid_hdr_offs, p->max_beb_per1024); + p->vid_hdr_offs, p->max_beb_per1024, + false); mutex_unlock(&ubi_devices_mutex); if (err < 0) { pr_err("UBI error: cannot attach mtd%d\n", diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index cc9a28cf9d82..0680ba82255b 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -1041,7 +1041,7 @@ static long ctrl_cdev_ioctl(struct file *file, unsigned int cmd, */ mutex_lock(&ubi_devices_mutex); err = ubi_attach_mtd_dev(mtd, req.ubi_num, req.vid_hdr_offset, - req.max_beb_per1024); + req.max_beb_per1024, !!req.disable_fm); mutex_unlock(&ubi_devices_mutex); if (err < 0) put_mtd_device(mtd); diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index da0bee13fe7f..4f84607dafb4 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -939,7 +939,8 @@ int ubi_io_write_vid_hdr(struct ubi_device *ubi, int pnum,
/* build.c */ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, - int vid_hdr_offset, int max_beb_per1024); + int vid_hdr_offset, int max_beb_per1024, + bool disable_fm); int ubi_detach_mtd_dev(int ubi_num, int anyway); struct ubi_device *ubi_get_device(int ubi_num); void ubi_put_device(struct ubi_device *ubi); diff --git a/include/uapi/mtd/ubi-user.h b/include/uapi/mtd/ubi-user.h index b69e9ba6742b..dcb179de4358 100644 --- a/include/uapi/mtd/ubi-user.h +++ b/include/uapi/mtd/ubi-user.h @@ -247,6 +247,7 @@ enum { * @vid_hdr_offset: VID header offset (use defaults if %0) * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * @padding: reserved for future, not used, has to be zeroed + * @disable_fm: whether disable fastmap * * This data structure is used to specify MTD device UBI has to attach and the * parameters it has to use. The number which should be assigned to the new UBI @@ -281,13 +282,18 @@ enum { * eraseblocks for new bad eraseblocks, but attempts to use available * eraseblocks (if any). The accepted range is 0-768. If 0 is given, the * default kernel value of %CONFIG_MTD_UBI_BEB_LIMIT will be used. + * + * If @disable_fm is not zero, ubi doesn't create new fastmap even the module + * param 'fm_autoconvert' is set, and existed old fastmap will be destroyed + * after doing full scanning. */ struct ubi_attach_req { __s32 ubi_num; __s32 mtd_num; __s32 vid_hdr_offset; __s16 max_beb_per1024; - __s8 padding[10]; + __s8 disable_fm; + __s8 padding[9]; };
/*
From: ZhaoLong Wang wangzhaolong1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5WQRM CVE: NA
--------------------------------
This patch add 'enable_fm' as 5th module init parameters to control fastmap enable or not. When the value is 0, fastmap will not create and existed fastmap will destroyed for the given ubi device. Default value is 0.
To enable or disable fastmap during module loading, fm_autoconvert must be set to non-zero.
+-----------------+---------------+---------------------------+ | \ | enable_fm=0 | enable_fm=1 | +-----------------+---------------+---------------------------+ |fm_autoconvert=Y | disable fm | enable fm | +---------------------------------+---------------------------+ |fm_autoconvert=N | disable fm | Enable fastmap. If fastmap| | | | exists in the old image, | +------------------------------------------------------------ +
Example: # - Attach mtd1 to ubi1, disable fastmap, mtd2 to ubi2, enable fastmap. # modprobe ubi mtd=1,0,0,1,0 mtd=2,0,0,2,1 fm_autoconvert=1
# - If 5th parameter is not specified, the value is 0, fastmap is disable # modprobe ubi mtd=1 fm_autoconvert=1
Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/mtd/ubi/build.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index ac97f6f9bcc2..62cdbf5bf562 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -35,7 +35,7 @@ #define MTD_PARAM_LEN_MAX 64
/* Maximum number of comma-separated items in the 'mtd=' parameter */ -#define MTD_PARAM_MAX_COUNT 4 +#define MTD_PARAM_MAX_COUNT 5
/* Maximum value for the number of bad PEBs per 1024 PEBs */ #define MAX_MTD_UBI_BEB_LIMIT 768 @@ -52,12 +52,14 @@ * string * @vid_hdr_offs: VID header offset * @max_beb_per1024: maximum expected number of bad PEBs per 1024 PEBs + * @enable_fm: enable fastmap when value is non-zero */ struct mtd_dev_param { char name[MTD_PARAM_LEN_MAX]; int ubi_num; int vid_hdr_offs; int max_beb_per1024; + int enable_fm; };
/* Numbers of elements set in the @mtd_dev_param array */ @@ -1249,7 +1251,7 @@ static int __init ubi_init(void) mutex_lock(&ubi_devices_mutex); err = ubi_attach_mtd_dev(mtd, p->ubi_num, p->vid_hdr_offs, p->max_beb_per1024, - false); + p->enable_fm == 0 ? true : false); mutex_unlock(&ubi_devices_mutex); if (err < 0) { pr_err("UBI error: cannot attach mtd%d\n", @@ -1429,7 +1431,7 @@ static int ubi_mtd_param_parse(const char *val, const struct kernel_param *kp) int err = kstrtoint(token, 10, &p->max_beb_per1024);
if (err) { - pr_err("UBI error: bad value for max_beb_per1024 parameter: %s", + pr_err("UBI error: bad value for max_beb_per1024 parameter: %s\n", token); return -EINVAL; } @@ -1440,13 +1442,25 @@ static int ubi_mtd_param_parse(const char *val, const struct kernel_param *kp) int err = kstrtoint(token, 10, &p->ubi_num);
if (err) { - pr_err("UBI error: bad value for ubi_num parameter: %s", + pr_err("UBI error: bad value for ubi_num parameter: %s\n", token); return -EINVAL; } } else p->ubi_num = UBI_DEV_NUM_AUTO;
+ token = tokens[4]; + if (token) { + int err = kstrtoint(token, 10, &p->enable_fm); + + if (err) { + pr_err("UBI error: bad value for enable_fm parameter: %s\n", + token); + return -EINVAL; + } + } else + p->enable_fm = 0; + mtd_devs += 1; return 0; } @@ -1459,11 +1473,13 @@ MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: mtd=<name|num|pa "Optional "max_beb_per1024" parameter specifies the maximum expected bad eraseblock per 1024 eraseblocks. (default value (" __stringify(CONFIG_MTD_UBI_BEB_LIMIT) ") if 0)\n" "Optional "ubi_num" parameter specifies UBI device number which have to be assigned to the newly created UBI device (assigned automatically by default)\n" + "Optional "enable_fm" parameter determines whether to enable fastmap during attach. If the value is non-zero, fastmap is enabled. Default value is 0.\n" "\n" "Example 1: mtd=/dev/mtd0 - attach MTD device /dev/mtd0.\n" "Example 2: mtd=content,1984 mtd=4 - attach MTD device with name "content" using VID header offset 1984, and MTD device number 4 with default VID header offset.\n" "Example 3: mtd=/dev/mtd1,0,25 - attach MTD device /dev/mtd1 using default VID header offset and reserve 25*nand_size_in_blocks/1024 erase blocks for bad block handling.\n" "Example 4: mtd=/dev/mtd1,0,0,5 - attach MTD device /dev/mtd1 to UBI 5 and using default values for the other fields.\n" + "example 5: mtd=1,0,0,5 mtd=2,0,0,6,1 - attach MTD device /dev/mtd1 to UBI 5 and disable fastmap; attach MTD device /dev/mtd2 to UBI 6 and enable fastmap(If supported).\n" "\t(e.g. if the NAND *chipset* has 4096 PEB, 100 will be reserved for this UBI device)."); #ifdef CONFIG_MTD_UBI_FASTMAP module_param(fm_autoconvert, bool, 0644);
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion from stable-v6.1-rc1 commit 2568a7e0832ee30b0a351016d03062ab4e0e0a3f category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5W7YH CVE: CVE-2022-3565
Reference: https://nvd.nist.gov/vuln/detail/CVE-2022-3565 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The l1oip_cleanup() traverses the l1oip_ilist and calls release_card() to cleanup module and stack. However, release_card() calls del_timer() to delete the timers such as keep_tl and timeout_tl. If the timer handler is running, the del_timer() will not stop it and result in UAF bugs. One of the processes is shown below:
(cleanup routine) | (timer handler) release_card() | l1oip_timeout() ... | del_timer() | ... ... | kfree(hc) //FREE | | hc->timeout_on = 0 //USE
Fix by calling del_timer_sync() in release_card(), which makes sure the timer handlers have finished before the resources, such as l1oip and so on, have been deallocated.
What's more, the hc->workq and hc->socket_thread can kick those timers right back in. We add a bool flag to show if card is released. Then, check this flag in hc->workq and hc->socket_thread.
Fixes: 3712b42d4b1b ("Add layer1 over IP support") Signed-off-by: Duoming Zhou duoming@zju.edu.cn Reviewed-by: Leon Romanovsky leonro@nvidia.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: GUO Zihua guozihua@huawei.com Reviewed-by: GONG Ruiqi gongruiqi1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/isdn/mISDN/l1oip.h | 1 + drivers/isdn/mISDN/l1oip_core.c | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/drivers/isdn/mISDN/l1oip.h b/drivers/isdn/mISDN/l1oip.h index 7ea10db20e3a..48133d022812 100644 --- a/drivers/isdn/mISDN/l1oip.h +++ b/drivers/isdn/mISDN/l1oip.h @@ -59,6 +59,7 @@ struct l1oip { int bundle; /* bundle channels in one frm */ int codec; /* codec to use for transmis. */ int limit; /* limit number of bchannels */ + bool shutdown; /* if card is released */
/* timer */ struct timer_list keep_tl; diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index b57dcb834594..aec4f2a69c3b 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -275,7 +275,7 @@ l1oip_socket_send(struct l1oip *hc, u8 localcodec, u8 channel, u32 chanmask, p = frame;
/* restart timer */ - if (time_before(hc->keep_tl.expires, jiffies + 5 * HZ)) + if (time_before(hc->keep_tl.expires, jiffies + 5 * HZ) && !hc->shutdown) mod_timer(&hc->keep_tl, jiffies + L1OIP_KEEPALIVE * HZ); else hc->keep_tl.expires = jiffies + L1OIP_KEEPALIVE * HZ; @@ -601,7 +601,9 @@ l1oip_socket_parse(struct l1oip *hc, struct sockaddr_in *sin, u8 *buf, int len) goto multiframe;
/* restart timer */ - if (time_before(hc->timeout_tl.expires, jiffies + 5 * HZ) || !hc->timeout_on) { + if ((time_before(hc->timeout_tl.expires, jiffies + 5 * HZ) || + !hc->timeout_on) && + !hc->shutdown) { hc->timeout_on = 1; mod_timer(&hc->timeout_tl, jiffies + L1OIP_TIMEOUT * HZ); } else /* only adjust timer */ @@ -1232,11 +1234,10 @@ release_card(struct l1oip *hc) { int ch;
- if (timer_pending(&hc->keep_tl)) - del_timer(&hc->keep_tl); + hc->shutdown = true;
- if (timer_pending(&hc->timeout_tl)) - del_timer(&hc->timeout_tl); + del_timer_sync(&hc->keep_tl); + del_timer_sync(&hc->timeout_tl);
cancel_work_sync(&hc->workq);
From: Pavel Begunkov asml.silence@gmail.com
mainline inclusion from mainline-v6.1-rc1 commit 0091bfc81741b8d3aeb3b7ab8636f911b2de6e80 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5WFKI CVE: CVE-2022-2602
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?h=...
--------------------------------
Instead of putting io_uring's registered files in unix_gc() we want it to be done by io_uring itself. The trick here is to consider io_uring registered files for cycle detection but not actually putting them down. Because io_uring can't register other ring instances, this will remove all refs to the ring file triggering the ->release path and clean up with io_ring_ctx_free().
Cc: stable@vger.kernel.org Fixes: 6b06314c47e1 ("io_uring: add file set registration") Reported-and-tested-by: David Bouman dbouman03@gmail.com Signed-off-by: Pavel Begunkov asml.silence@gmail.com Signed-off-by: Thadeu Lima de Souza Cascardo cascardo@canonical.com [axboe: add kerneldoc comment to skb, fold in skb leak fix] Signed-off-by: Jens Axboe axboe@kernel.dk Conflicts: fs/io_uring.c include/linux/skbuff.h Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: zhongbaisong zhongbaisong@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/io_uring.c | 1 + include/linux/skbuff.h | 2 ++ net/unix/garbage.c | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 71cabd60a01d..0da8aa6651a6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7338,6 +7338,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) }
skb->sk = sk; + skb->scm_io_uring = 1;
nr_files = 0; fpl->user = get_uid(ctx->user); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 72cfb047fd2c..6600a269d675 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -706,6 +706,7 @@ typedef unsigned char *sk_buff_data_t; * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header + * @scm_io_uring: SKB holds io_uring registered files * @tail: Tail pointer * @end: End pointer * @head: Head of buffer @@ -866,6 +867,7 @@ struct sk_buff { #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif + __u8 scm_io_uring:1;
#ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d45d5366115a..dc2763540393 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -204,6 +204,7 @@ void wait_for_unix_gc(void) /* The external entry point: unix_gc() */ void unix_gc(void) { + struct sk_buff *next_skb, *skb; struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; @@ -297,11 +298,30 @@ void unix_gc(void)
spin_unlock(&unix_gc_lock);
+ /* We need io_uring to clean its registered files, ignore all io_uring + * originated skbs. It's fine as io_uring doesn't keep references to + * other io_uring instances and so killing all other files in the cycle + * will put all io_uring references forcing it to go through normal + * release.path eventually putting registered files. + */ + skb_queue_walk_safe(&hitlist, skb, next_skb) { + if (skb->scm_io_uring) { + __skb_unlink(skb, &hitlist); + skb_queue_tail(&skb->sk->sk_receive_queue, skb); + } + } + /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist);
spin_lock(&unix_gc_lock);
+ /* There could be io_uring registered files, just push them back to + * the inflight list + */ + list_for_each_entry_safe(u, next, &gc_candidates, link) + list_move_tail(&u->link, &gc_inflight_list); + /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates));
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5WFKI CVE: CVE-2022-2602
--------------------------------
In order to fix CVE-2022-2602, member 'scm_io_uring' is imported into sk_buff, fix broken kabi in sk_buff.
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/skbuff.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6600a269d675..4739ce5f0913 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -867,7 +867,6 @@ struct sk_buff { #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif - __u8 scm_io_uring:1;
#ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -918,7 +917,7 @@ struct sk_buff { __u32 headers_end[0]; /* public: */
- KABI_RESERVE(1) + KABI_USE(1, __u8 scm_io_uring:1) KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4)
From: Linus Torvalds torvalds@linux-foundation.org
stable inclusion from stable-v5.10.134 commit 0adf21eec59040b31af113e626efd85eb153c728 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5X57Q CVE: CVE-2022-1882
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 353f7988dd8413c47718f7ca79c030b6fb62cfe5 upstream.
When the pipe is closed, we mark the associated watchqueue defunct by calling watch_queue_clear(). However, while that is protected by the watchqueue lock, new watchqueue entries aren't actually added under that lock at all: they use the pipe->rd_wait.lock instead, and looking up that pipe happens without any locking.
The watchqueue code uses the RCU read-side section to make sure that the wqueue entry itself hasn't disappeared, but that does not protect the pipe_info in any way.
So make sure to actually hold the wqueue lock when posting watch events, properly serializing against the pipe being torn down.
Reported-by: Noam Rathaus noamr@ssd-disclosure.com Cc: Greg KH gregkh@linuxfoundation.org Cc: David Howells dhowells@redhat.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Wang Hai wanghai38@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/watch_queue.c | 53 +++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 16 deletions(-)
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 249ed3259144..dcf1e676797d 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -34,6 +34,27 @@ MODULE_LICENSE("GPL"); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
+/* + * This must be called under the RCU read-lock, which makes + * sure that the wqueue still exists. It can then take the lock, + * and check that the wqueue hasn't been destroyed, which in + * turn makes sure that the notification pipe still exists. + */ +static inline bool lock_wqueue(struct watch_queue *wqueue) +{ + spin_lock_bh(&wqueue->lock); + if (unlikely(wqueue->defunct)) { + spin_unlock_bh(&wqueue->lock); + return false; + } + return true; +} + +static inline void unlock_wqueue(struct watch_queue *wqueue) +{ + spin_unlock_bh(&wqueue->lock); +} + static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -69,6 +90,10 @@ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
/* * Post a notification to a watch queue. + * + * Must be called with the RCU lock for reading, and the + * watch_queue lock held, which guarantees that the pipe + * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) @@ -85,9 +110,6 @@ static bool post_one_notification(struct watch_queue *wqueue,
spin_lock_irq(&pipe->rd_wait.lock);
- if (wqueue->defunct) - goto out; - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; @@ -203,7 +225,10 @@ void __post_watch_notification(struct watch_list *wlist, if (security_post_notification(watch->cred, cred, n) < 0) continue;
- post_one_notification(wqueue, n); + if (lock_wqueue(wqueue)) { + post_one_notification(wqueue, n); + unlock_wqueue(wqueue);; + } }
rcu_read_unlock(); @@ -465,11 +490,12 @@ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) return -EAGAIN; }
- spin_lock_bh(&wqueue->lock); - kref_get(&wqueue->usage); - kref_get(&watch->usage); - hlist_add_head(&watch->queue_node, &wqueue->watches); - spin_unlock_bh(&wqueue->lock); + if (lock_wqueue(wqueue)) { + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + unlock_wqueue(wqueue); + }
hlist_add_head(&watch->list_node, &wlist->watchers); return 0; @@ -523,20 +549,15 @@ int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq,
wqueue = rcu_dereference(watch->queue);
- /* We don't need the watch list lock for the next bit as RCU is - * protecting *wqueue from deallocation. - */ - if (wqueue) { + if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch);
- spin_lock_bh(&wqueue->lock); - if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); }
- spin_unlock_bh(&wqueue->lock); + unlock_wqueue(wqueue); }
if (wlist->release_watch) {
From: Linus Torvalds torvalds@linux-foundation.org
stable inclusion from stable-v5.10.134 commit bb1990a3005e3655e84f9a57b3f30ce96d597dee category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5X57Q CVE: CVE-2022-1882
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 44e29e64cf1ac0cffb152e0532227ea6d002aa28 upstream.
Sedat Dilek noticed that I had an extraneous semicolon at the end of a line in the previous patch.
It's harmless, but unintentional, and while compilers just treat it as an extra empty statement, for all I know some other tooling might warn about it. So clean it up before other people notice too ;)
Fixes: 353f7988dd84 ("watchqueue: make sure to serialize 'wqueue->defunct' properly") Reported-by: Sedat Dilek sedat.dilek@gmail.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Reported-by: Sedat Dilek sedat.dilek@gmail.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index dcf1e676797d..e5d22af43fa0 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -227,7 +227,7 @@ void __post_watch_notification(struct watch_list *wlist,
if (lock_wqueue(wqueue)) { post_one_notification(wqueue, n); - unlock_wqueue(wqueue);; + unlock_wqueue(wqueue); } }
From: Toke Høiland-Jørgensen toke@toke.dk
stable inclusion from stable-v5.10.143 commit 2ee85ac1b29dbd2ebd2d8e5ac1dd5793235d516b category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5WF14 CVE: CVE-2022-3586
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/patch/?id=2...
--------------------------------
[ Upstream commit 9efd23297cca530bb35e1848665805d3fcdd7889 ]
The sch_sfb enqueue() routine assumes the skb is still alive after it has been enqueued into a child qdisc, using the data in the skb cb field in the increment_qlen() routine after enqueue. However, the skb may in fact have been freed, causing a use-after-free in this case. In particular, this happens if sch_cake is used as a child of sfb, and the GSO splitting mode of CAKE is enabled (in which case the skb will be split into segments and the original skb freed).
Fix this by copying the sfb cb data to the stack before enqueueing the skb, and using this stack copy in increment_qlen() instead of the skb pointer itself.
Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-18231 Fixes: e13e02a3c68d ("net_sched: SFB flow scheduler") Signed-off-by: Toke Høiland-Jørgensen toke@toke.dk Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Guo Mengqi guomengqi3@huawei.com Reviewed-by: chenweilong chenweilong@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/sched/sch_sfb.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index da047a37a3bf..f180cf95cfc9 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -135,15 +135,15 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) } }
-static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) { u32 sfbhash;
- sfbhash = sfb_hash(skb, 0); + sfbhash = cb->hashes[0]; if (sfbhash) increment_one_qlen(sfbhash, 0, q);
- sfbhash = sfb_hash(skb, 1); + sfbhash = cb->hashes[1]; if (sfbhash) increment_one_qlen(sfbhash, 1, q); } @@ -283,6 +283,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; + struct sfb_skb_cb cb; int i; u32 p_min = ~0; u32 minqlen = ~0; @@ -399,11 +400,12 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, }
enqueue: + memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; - increment_qlen(skb, q); + increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; qdisc_qstats_drop(sch);
From: Toke Høiland-Jørgensen toke@toke.dk
stable inclusion from stable-v5.10.143 commit 2ead78fbe6b523e6232ad286e3c13d2a410de22a category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5WF14 CVE: CVE-2022-3586
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/patch/?id=2...
--------------------------------
[ Upstream commit 2f09707d0c972120bf794cfe0f0c67e2c2ddb252 ]
Cong Wang noticed that the previous fix for sch_sfb accessing the queued skb after enqueueing it to a child qdisc was incomplete: the SFB enqueue function was also calling qdisc_qstats_backlog_inc() after enqueue, which reads the pkt len from the skb cb field. Fix this by also storing the skb len, and using the stored value to increment the backlog after enqueueing.
Fixes: 9efd23297cca ("sch_sfb: Don't assume the skb is still around after enqueueing to child") Signed-off-by: Toke Høiland-Jørgensen toke@toke.dk Acked-by: Cong Wang cong.wang@bytedance.com Link: https://lore.kernel.org/r/20220905192137.965549-1-toke@toke.dk Signed-off-by: Paolo Abeni pabeni@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Guo Mengqi guomengqi3@huawei.com Reviewed-by: chenweilong chenweilong@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/sched/sch_sfb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index f180cf95cfc9..b2724057629f 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -281,6 +281,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, {
struct sfb_sched_data *q = qdisc_priv(sch); + unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; struct sfb_skb_cb cb; @@ -403,7 +404,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) {
From: Xu Kuohai xukuohai@huawei.com
maillist inclusion category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5W7AT CVE: CVE-2022-3534
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=...
--------------------------------
ASAN reports an use-after-free in btf_dump_name_dups:
ERROR: AddressSanitizer: heap-use-after-free on address 0xffff927006db at pc 0xaaaab5dfb618 bp 0xffffdd89b890 sp 0xffffdd89b928 READ of size 2 at 0xffff927006db thread T0 #0 0xaaaab5dfb614 in __interceptor_strcmp.part.0 (test_progs+0x21b614) #1 0xaaaab635f144 in str_equal_fn tools/lib/bpf/btf_dump.c:127 #2 0xaaaab635e3e0 in hashmap_find_entry tools/lib/bpf/hashmap.c:143 #3 0xaaaab635e72c in hashmap__find tools/lib/bpf/hashmap.c:212 #4 0xaaaab6362258 in btf_dump_name_dups tools/lib/bpf/btf_dump.c:1525 #5 0xaaaab636240c in btf_dump_resolve_name tools/lib/bpf/btf_dump.c:1552 #6 0xaaaab6362598 in btf_dump_type_name tools/lib/bpf/btf_dump.c:1567 #7 0xaaaab6360b48 in btf_dump_emit_struct_def tools/lib/bpf/btf_dump.c:912 #8 0xaaaab6360630 in btf_dump_emit_type tools/lib/bpf/btf_dump.c:798 #9 0xaaaab635f720 in btf_dump__dump_type tools/lib/bpf/btf_dump.c:282 #10 0xaaaab608523c in test_btf_dump_incremental tools/testing/selftests/bpf/prog_tests/btf_dump.c:236 #11 0xaaaab6097530 in test_btf_dump tools/testing/selftests/bpf/prog_tests/btf_dump.c:875 #12 0xaaaab6314ed0 in run_one_test tools/testing/selftests/bpf/test_progs.c:1062 #13 0xaaaab631a0a8 in main tools/testing/selftests/bpf/test_progs.c:1697 #14 0xffff9676d214 in __libc_start_main ../csu/libc-start.c:308 #15 0xaaaab5d65990 (test_progs+0x185990)
0xffff927006db is located 11 bytes inside of 16-byte region [0xffff927006d0,0xffff927006e0) freed by thread T0 here: #0 0xaaaab5e2c7c4 in realloc (test_progs+0x24c7c4) #1 0xaaaab634f4a0 in libbpf_reallocarray tools/lib/bpf/libbpf_internal.h:191 #2 0xaaaab634f840 in libbpf_add_mem tools/lib/bpf/btf.c:163 #3 0xaaaab636643c in strset_add_str_mem tools/lib/bpf/strset.c:106 #4 0xaaaab6366560 in strset__add_str tools/lib/bpf/strset.c:157 #5 0xaaaab6352d70 in btf__add_str tools/lib/bpf/btf.c:1519 #6 0xaaaab6353e10 in btf__add_field tools/lib/bpf/btf.c:2032 #7 0xaaaab6084fcc in test_btf_dump_incremental tools/testing/selftests/bpf/prog_tests/btf_dump.c:232 #8 0xaaaab6097530 in test_btf_dump tools/testing/selftests/bpf/prog_tests/btf_dump.c:875 #9 0xaaaab6314ed0 in run_one_test tools/testing/selftests/bpf/test_progs.c:1062 #10 0xaaaab631a0a8 in main tools/testing/selftests/bpf/test_progs.c:1697 #11 0xffff9676d214 in __libc_start_main ../csu/libc-start.c:308 #12 0xaaaab5d65990 (test_progs+0x185990)
previously allocated by thread T0 here: #0 0xaaaab5e2c7c4 in realloc (test_progs+0x24c7c4) #1 0xaaaab634f4a0 in libbpf_reallocarray tools/lib/bpf/libbpf_internal.h:191 #2 0xaaaab634f840 in libbpf_add_mem tools/lib/bpf/btf.c:163 #3 0xaaaab636643c in strset_add_str_mem tools/lib/bpf/strset.c:106 #4 0xaaaab6366560 in strset__add_str tools/lib/bpf/strset.c:157 #5 0xaaaab6352d70 in btf__add_str tools/lib/bpf/btf.c:1519 #6 0xaaaab6353ff0 in btf_add_enum_common tools/lib/bpf/btf.c:2070 #7 0xaaaab6354080 in btf__add_enum tools/lib/bpf/btf.c:2102 #8 0xaaaab6082f50 in test_btf_dump_incremental tools/testing/selftests/bpf/prog_tests/btf_dump.c:162 #9 0xaaaab6097530 in test_btf_dump tools/testing/selftests/bpf/prog_tests/btf_dump.c:875 #10 0xaaaab6314ed0 in run_one_test tools/testing/selftests/bpf/test_progs.c:1062 #11 0xaaaab631a0a8 in main tools/testing/selftests/bpf/test_progs.c:1697 #12 0xffff9676d214 in __libc_start_main ../csu/libc-start.c:308 #13 0xaaaab5d65990 (test_progs+0x185990)
The reason is that the key stored in hash table name_map is a string address, and the string memory is allocated by realloc() function, when the memory is resized by realloc() later, the old memory may be freed, so the address stored in name_map references to a freed memory, causing use-after-free.
Fix it by storing duplicated string address in name_map.
Fixes: 919d2b1dbb07 ("libbpf: Allow modification of BTF and add btf__add_str API") Signed-off-by: Xu Kuohai xukuohai@huawei.com Signed-off-by: Andrii Nakryiko andrii@kernel.org Acked-by: Martin KaFai Lau martin.lau@kernel.org Link: https://lore.kernel.org/bpf/20221011120108.782373-2-xukuohai@huaweicloud.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/lib/bpf/btf_dump.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-)
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index bd22853be4a6..9ab0964475c1 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -188,6 +188,17 @@ static int btf_dump_resize(struct btf_dump *d) return 0; }
+static void btf_dump_free_names(struct hashmap *map) +{ + size_t bkt; + struct hashmap_entry *cur; + + hashmap__for_each_entry(map, cur, bkt) + free((void *)cur->key); + + hashmap__free(map); +} + void btf_dump__free(struct btf_dump *d) { int i; @@ -206,8 +217,8 @@ void btf_dump__free(struct btf_dump *d) free(d->cached_names); free(d->emit_queue); free(d->decl_stack); - hashmap__free(d->type_names); - hashmap__free(d->ident_names); + btf_dump_free_names(d->type_names); + btf_dump_free_names(d->ident_names);
free(d); } @@ -1392,11 +1403,24 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, const char *orig_name) { + int err; + char *old_name; + char *new_name; size_t dup_cnt = 0;
+ new_name = strdup(orig_name); + if (!new_name) + return 1; + hashmap__find(name_map, orig_name, (void **)&dup_cnt); dup_cnt++; - hashmap__set(name_map, orig_name, (void *)dup_cnt, NULL, NULL); + + err = hashmap__set(name_map, new_name, (void *)dup_cnt, + (const void **)&old_name, NULL); + if (err) + free(new_name); + + free(old_name);
return dup_cnt; }
From: Al Viro viro@zeniv.linux.org.uk
mainline inclusion from mainline-v6.1-rc1 commit 9d64d2405f7d30d49818f6682acd0392348f0fdb category: bugfix bugzilla: 187857, https://gitee.com/src-openeuler/kernel/issues/I5X9DT CVE: CVE-2022-3577
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
So far we relied on .put_super = binderfs_put_super() to destroy info we stashed in sb->s_fs_info. This gave us the required ordering between ->evict_inode() and sb->s_fs_info destruction.
But the current implementation of binderfs_fill_super() has a memory leak in the rare circumstance that d_make_root() fails because ->put_super() is only called when sb->s_root is initialized. Fix this by removing ->put_super() and simply do all that work in binderfs_kill_super().
Reported-by: Dongliang Mu mudongliangabcd@gmail.com Signed-off-by: Al Viro viro@zeniv.linux.org.uk Signed-off-by: Christian Brauner (Microsoft) brauner@kernel.org Link: https://lore.kernel.org/r/20220823095339.853371-1-brauner@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Li Huafei lihuafei1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/android/binderfs.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-)
diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c index 7b4f154f07e6..a3beb685748c 100644 --- a/drivers/android/binderfs.c +++ b/drivers/android/binderfs.c @@ -330,22 +330,10 @@ static int binderfs_show_options(struct seq_file *seq, struct dentry *root) return 0; }
-static void binderfs_put_super(struct super_block *sb) -{ - struct binderfs_info *info = sb->s_fs_info; - - if (info && info->ipc_ns) - put_ipc_ns(info->ipc_ns); - - kfree(info); - sb->s_fs_info = NULL; -} - static const struct super_operations binderfs_super_ops = { .evict_inode = binderfs_evict_inode, .show_options = binderfs_show_options, .statfs = simple_statfs, - .put_super = binderfs_put_super, };
static inline bool is_binderfs_control_device(const struct dentry *dentry) @@ -763,11 +751,27 @@ static int binderfs_init_fs_context(struct fs_context *fc) return 0; }
+static void binderfs_kill_super(struct super_block *sb) +{ + struct binderfs_info *info = sb->s_fs_info; + + /* + * During inode eviction struct binderfs_info is needed. + * So first wipe the super_block then free struct binderfs_info. + */ + kill_litter_super(sb); + + if (info && info->ipc_ns) + put_ipc_ns(info->ipc_ns); + + kfree(info); +} + static struct file_system_type binder_fs_type = { .name = "binder", .init_fs_context = binderfs_init_fs_context, .parameters = binderfs_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = binderfs_kill_super, .fs_flags = FS_USERNS_MOUNT, };
From: Dongliang Mu mudongliangabcd@gmail.com
mainline inclusion from mainline-v6.1-rc1 commit 945a9a8e448b65bec055d37eba58f711b39f66f0 category: bugfix bugzilla: 187857, https://gitee.com/src-openeuler/kernel/issues/I5X9DT CVE: CVE-2022-3577
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The error handling code in pvr2_hdw_create forgets to unregister the v4l2 device. When pvr2_hdw_create returns back to pvr2_context_create, it calls pvr2_context_destroy to destroy context, but mp->hdw is NULL, which leads to that pvr2_hdw_destroy directly returns.
Fix this by adding v4l2_device_unregister to decrease the refcount of usb interface.
Reported-by: syzbot+77b432d57c4791183ed4@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu mudongliangabcd@gmail.com Signed-off-by: Hans Verkuil hverkuil-cisco@xs4all.nl Signed-off-by: Mauro Carvalho Chehab mchehab@kernel.org Signed-off-by: Li Huafei lihuafei1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/media/usb/pvrusb2/pvrusb2-hdw.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/media/usb/pvrusb2/pvrusb2-hdw.c b/drivers/media/usb/pvrusb2/pvrusb2-hdw.c index fccd1798445d..d22ce328a279 100644 --- a/drivers/media/usb/pvrusb2/pvrusb2-hdw.c +++ b/drivers/media/usb/pvrusb2/pvrusb2-hdw.c @@ -2610,6 +2610,7 @@ struct pvr2_hdw *pvr2_hdw_create(struct usb_interface *intf, del_timer_sync(&hdw->encoder_run_timer); del_timer_sync(&hdw->encoder_wait_timer); flush_work(&hdw->workpoll); + v4l2_device_unregister(&hdw->v4l2_dev); usb_free_urb(hdw->ctl_read_urb); usb_free_urb(hdw->ctl_write_urb); kfree(hdw->ctl_read_buffer);
From: Maxim Mikityanskiy maxtram95@gmail.com
maillist inclusion category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5W7XW CVE: CVE-2022-3564
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git...
--------------------------------
Fix the race condition between the following two flows that run in parallel:
1. l2cap_reassemble_sdu -> chan->ops->recv (l2cap_sock_recv_cb) -> __sock_queue_rcv_skb.
2. bt_sock_recvmsg -> skb_recv_datagram, skb_free_datagram.
An SKB can be queued by the first flow and immediately dequeued and freed by the second flow, therefore the callers of l2cap_reassemble_sdu can't use the SKB after that function returns. However, some places continue accessing struct l2cap_ctrl that resides in the SKB's CB for a short time after l2cap_reassemble_sdu returns, leading to a use-after-free condition (the stack trace is below, line numbers for kernel 5.19.8).
Fix it by keeping a local copy of struct l2cap_ctrl.
BUG: KASAN: use-after-free in l2cap_rx_state_recv (net/bluetooth/l2cap_core.c:6906) bluetooth Read of size 1 at addr ffff88812025f2f0 by task kworker/u17:3/43169
Workqueue: hci0 hci_rx_work [bluetooth] Call Trace: <TASK> dump_stack_lvl (lib/dump_stack.c:107 (discriminator 4)) print_report.cold (mm/kasan/report.c:314 mm/kasan/report.c:429) ? l2cap_rx_state_recv (net/bluetooth/l2cap_core.c:6906) bluetooth kasan_report (mm/kasan/report.c:162 mm/kasan/report.c:493) ? l2cap_rx_state_recv (net/bluetooth/l2cap_core.c:6906) bluetooth l2cap_rx_state_recv (net/bluetooth/l2cap_core.c:6906) bluetooth l2cap_rx (net/bluetooth/l2cap_core.c:7236 net/bluetooth/l2cap_core.c:7271) bluetooth ret_from_fork (arch/x86/entry/entry_64.S:306) </TASK>
Allocated by task 43169: kasan_save_stack (mm/kasan/common.c:39) __kasan_slab_alloc (mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) kmem_cache_alloc_node (mm/slab.h:750 mm/slub.c:3243 mm/slub.c:3293) __alloc_skb (net/core/skbuff.c:414) l2cap_recv_frag (./include/net/bluetooth/bluetooth.h:425 net/bluetooth/l2cap_core.c:8329) bluetooth l2cap_recv_acldata (net/bluetooth/l2cap_core.c:8442) bluetooth hci_rx_work (net/bluetooth/hci_core.c:3642 net/bluetooth/hci_core.c:3832) bluetooth process_one_work (kernel/workqueue.c:2289) worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2437) kthread (kernel/kthread.c:376) ret_from_fork (arch/x86/entry/entry_64.S:306)
Freed by task 27920: kasan_save_stack (mm/kasan/common.c:39) kasan_set_track (mm/kasan/common.c:45) kasan_set_free_info (mm/kasan/generic.c:372) ____kasan_slab_free (mm/kasan/common.c:368 mm/kasan/common.c:328) slab_free_freelist_hook (mm/slub.c:1780) kmem_cache_free (mm/slub.c:3536 mm/slub.c:3553) skb_free_datagram (./include/net/sock.h:1578 ./include/net/sock.h:1639 net/core/datagram.c:323) bt_sock_recvmsg (net/bluetooth/af_bluetooth.c:295) bluetooth l2cap_sock_recvmsg (net/bluetooth/l2cap_sock.c:1212) bluetooth sock_read_iter (net/socket.c:1087) new_sync_read (./include/linux/fs.h:2052 fs/read_write.c:401) vfs_read (fs/read_write.c:482) ksys_read (fs/read_write.c:620) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
Link: https://lore.kernel.org/linux-bluetooth/CAKErNvoqga1WcmoR3-0875esY6TVWFQDand... Fixes: d2a7ac5d5d3a ("Bluetooth: Add the ERTM receive state machine") Fixes: 4b51dae96731 ("Bluetooth: Add streaming mode receive and incoming packet classifier") Signed-off-by: Maxim Mikityanskiy maxtram95@gmail.com Signed-off-by: Luiz Augusto von Dentz luiz.von.dentz@intel.com
conflicts: net/bluetooth/l2cap_core.c
Signed-off-by: Xia Longlong xialonglong1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/bluetooth/l2cap_core.c | 48 ++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-)
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 2557cd917f5e..e4464a35b419 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6832,6 +6832,7 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan, struct l2cap_ctrl *control, struct sk_buff *skb, u8 event) { + struct l2cap_ctrl local_control; int err = 0; bool skb_in_use = false;
@@ -6856,15 +6857,32 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan, chan->buffer_seq = chan->expected_tx_seq; skb_in_use = true;
+ /* l2cap_reassemble_sdu may free skb, hence invalidate + * control, so make a copy in advance to use it after + * l2cap_reassemble_sdu returns and to avoid the race + * condition, for example: + * + * The current thread calls: + * l2cap_reassemble_sdu + * chan->ops->recv == l2cap_sock_recv_cb + * __sock_queue_rcv_skb + * Another thread calls: + * bt_sock_recvmsg + * skb_recv_datagram + * skb_free_datagram + * Then the current thread tries to access control, but + * it was freed by skb_free_datagram. + */ + local_control = *control; err = l2cap_reassemble_sdu(chan, skb, control); if (err) break;
- if (control->final) { + if (local_control.final) { if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state)) { - control->final = 0; - l2cap_retransmit_all(chan, control); + local_control.final = 0; + l2cap_retransmit_all(chan, &local_control); l2cap_ertm_send(chan); } } @@ -7244,11 +7262,27 @@ static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, struct sk_buff *skb) { + /* l2cap_reassemble_sdu may free skb, hence invalidate control, so store + * the txseq field in advance to use it after l2cap_reassemble_sdu + * returns and to avoid the race condition, for example: + * + * The current thread calls: + * l2cap_reassemble_sdu + * chan->ops->recv == l2cap_sock_recv_cb + * __sock_queue_rcv_skb + * Another thread calls: + * bt_sock_recvmsg + * skb_recv_datagram + * skb_free_datagram + * Then the current thread tries to access control, but it was freed by + * skb_free_datagram. + */ + u16 txseq = control->txseq; + BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb, chan->rx_state);
- if (l2cap_classify_txseq(chan, control->txseq) == - L2CAP_TXSEQ_EXPECTED) { + if (l2cap_classify_txseq(chan, txseq) == L2CAP_TXSEQ_EXPECTED) { l2cap_pass_to_tx(chan, control);
BT_DBG("buffer_seq %d->%d", chan->buffer_seq, @@ -7271,8 +7305,8 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, } }
- chan->last_acked_seq = control->txseq; - chan->expected_tx_seq = __next_seq(chan, control->txseq); + chan->last_acked_seq = txseq; + chan->expected_tx_seq = __next_seq(chan, txseq);
return 0; }
From: Jialiang Wang wangjialiang0806@163.com
mainline inclusion from mainline-v6.0-rc1 commit 02e1a114fdb71e59ee6770294166c30d437bf86a category: bugfix bugzilla: 187867, https://gitee.com/src-openeuler/kernel/issues/I5W7B5 CVE: CVE-2022-3545
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git/comm...
--------------------------------
area_cache_get() is used to distribute cache->area and set cache->id, and if cache->id is not 0 and cache->area->kref refcount is 0, it will release the cache->area by nfp_cpp_area_release(). area_cache_get() set cache->id before cpp->op->area_init() and nfp_cpp_area_acquire().
But if area_init() or nfp_cpp_area_acquire() fails, the cache->id is is already set but the refcount is not increased as expected. At this time, calling the nfp_cpp_area_release() will cause use-after-free.
To avoid the use-after-free, set cache->id after area_init() and nfp_cpp_area_acquire() complete successfully.
Note: This vulnerability is triggerable by providing emulated device equipped with specified configuration.
BUG: KASAN: use-after-free in nfp6000_area_init (drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c:760) Write of size 4 at addr ffff888005b7f4a0 by task swapper/0/1
Call Trace: <TASK> nfp6000_area_init (drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c:760) area_cache_get.constprop.8 (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:884)
Allocated by task 1: nfp_cpp_area_alloc_with_name (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:303) nfp_cpp_area_cache_add (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:802) nfp6000_init (drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c:1230) nfp_cpp_from_operations (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:1215) nfp_pci_probe (drivers/net/ethernet/netronome/nfp/nfp_main.c:744)
Freed by task 1: kfree (mm/slub.c:4562) area_cache_get.constprop.8 (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:873) nfp_cpp_read (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:924 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c:973) nfp_cpp_readl (drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpplib.c:48)
Signed-off-by: Jialiang Wang wangjialiang0806@163.com Reviewed-by: Yinjun Zhang yinjun.zhang@corigine.com Acked-by: Simon Horman simon.horman@corigine.com Link: https://lore.kernel.org/r/20220810073057.4032-1-wangjialiang0806@163.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Yuyao Lin linyuyao1@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c index 6ef48eb3a77d..b163489489e9 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c @@ -874,7 +874,6 @@ area_cache_get(struct nfp_cpp *cpp, u32 id, }
/* Adjust the start address to be cache size aligned */ - cache->id = id; cache->addr = addr & ~(u64)(cache->size - 1);
/* Re-init to the new ID and address */ @@ -894,6 +893,8 @@ area_cache_get(struct nfp_cpp *cpp, u32 id, return NULL; }
+ cache->id = id; + exit: /* Adjust offset */ *offset = addr - cache->addr;
From: Andrew Gaul gaul@gaul.org
mainline inclusion from mainline-v6.1-rc1 commit 93e2be344a7db169b7119de21ac1bf253b8c6907 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5WFKR CVE: CVE-2022-3594
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=...
--------------------------------
My system shows almost 10 million of these messages over a 24-hour period which pollutes my logs.
Signed-off-by: Andrew Gaul gaul@google.com Link: https://lore.kernel.org/r/20221002034128.2026653-1-gaul@google.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/usb/r8152.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 842f16815395..eaff8f83b5ac 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1689,7 +1689,9 @@ static void intr_callback(struct urb *urb) "Stop submitting intr, status %d\n", status); return; case -EOVERFLOW: - netif_info(tp, intr, tp->netdev, "intr status -EOVERFLOW\n"); + if (net_ratelimit()) + netif_info(tp, intr, tp->netdev, + "intr status -EOVERFLOW\n"); goto resubmit; /* -EPIPE: should clear the halt */ default:
From: Fedor Pchelkin pchelkin@ispras.ru
stable inclusion from stable-v5.10.138 commit a220ff343396bae8d3b6abee72ab51f1f34b3027 category: bugfix bugzilla: 187869, https://gitee.com/src-openeuler/kernel/issues/I5X3MU CVE: CVE-2022-3633
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 8c21c54a53ab21842f5050fa090f26b03c0313d6 upstream.
We need to drop skb references taken in j1939_session_skb_queue() when destroying a session in j1939_session_destroy(). Otherwise those skbs would be lost.
Link to Syzkaller info and repro: https://forge.ispras.ru/issues/11743.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
V1: https://lore.kernel.org/all/20220708175949.539064-1-pchelkin@ispras.ru
Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Suggested-by: Oleksij Rempel o.rempel@pengutronix.de Signed-off-by: Fedor Pchelkin pchelkin@ispras.ru Signed-off-by: Alexey Khoroshilov khoroshilov@ispras.ru Acked-by: Oleksij Rempel o.rempel@pengutronix.de Link: https://lore.kernel.org/all/20220805150216.66313-1-pchelkin@ispras.ru Signed-off-by: Marc Kleine-Budde mkl@pengutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Dong Chenchen dongchenchen2@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/can/j1939/transport.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 9c39b0f5d6e0..2830a12a4dd1 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -260,6 +260,8 @@ static void __j1939_session_drop(struct j1939_session *session)
static void j1939_session_destroy(struct j1939_session *session) { + struct sk_buff *skb; + if (session->err) j1939_sk_errqueue(session, J1939_ERRQUEUE_ABORT); else @@ -270,7 +272,11 @@ static void j1939_session_destroy(struct j1939_session *session) WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry)); WARN_ON_ONCE(!list_empty(&session->active_session_list_entry));
- skb_queue_purge(&session->skb_queue); + while ((skb = skb_dequeue(&session->skb_queue)) != NULL) { + /* drop ref taken in j1939_session_skb_queue() */ + skb_unref(skb); + kfree_skb(skb); + } __j1939_session_drop(session); j1939_priv_put(session->priv); kfree(session);
From: Ryusuke Konishi konishi.ryusuke@gmail.com
stable inclusion from stable-v5.10.148 commit 21ee3cffed8fbabb669435facfd576ba18ac8652 category: bugfix bugzilla:187877,https://gitee.com/src-openeuler/kernel/issues/I5X3MX CVE: CVE-2022-3649
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit d325dc6eb763c10f591c239550b8c7e5466a5d09 upstream.
If the beginning of the inode bitmap area is corrupted on disk, an inode with the same inode number as the root inode can be allocated and fail soon after. In this case, the subsequent call to nilfs_clear_inode() on that bogus root inode will wrongly decrement the reference counter of struct nilfs_root, and this will erroneously free struct nilfs_root, causing kernel oopses.
This fixes the problem by changing nilfs_new_inode() to skip reserved inode numbers while repairing the inode bitmap.
Link: https://lkml.kernel.org/r/20221003150519.39789-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi konishi.ryusuke@gmail.com Reported-by: syzbot+b8c672b0e22615c80fe0@syzkaller.appspotmail.com Reported-by: Khalid Masum khalid.masum.92@gmail.com Tested-by: Ryusuke Konishi konishi.ryusuke@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/nilfs2/inode.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 95684fa3c985..3e4874b0c55c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -332,6 +332,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) struct inode *inode; struct nilfs_inode_info *ii; struct nilfs_root *root; + struct buffer_head *bh; int err = -ENOMEM; ino_t ino;
@@ -347,11 +348,25 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) ii->i_state = BIT(NILFS_I_NEW); ii->i_root = root;
- err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); if (unlikely(err)) goto failed_ifile_create_inode; /* reference count of i_bh inherits from nilfs_mdt_read_block() */
+ if (unlikely(ino < NILFS_USER_INO)) { + nilfs_warn(sb, + "inode bitmap is inconsistent for reserved inodes"); + do { + brelse(bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + } while (ino < NILFS_USER_INO); + + nilfs_info(sb, "repaired inode bitmap for reserved inodes"); + } + ii->i_bh = bh; + atomic64_inc(&root->inodes_count); inode_init_owner(inode, dir, mode); inode->i_ino = ino;
From: Peng Wu wupeng58@huawei.com
mainline inclusion from mainline-v6.0-rc1 commit 18178e03b124b0c6be17abbbca914157642f5d7a category: bugfix bugzilla: 187764, https://gitee.com/openeuler/kernel/issues/I5XJD7 CVE: N/A
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h...
--------------------------------
Driver should call pci_disable_device() if it returns from cafe_nand_probe() with error.
Meanwhile, the driver calls pci_enable_device() in cafe_nand_probe(), but never calls pci_disable_device() during removal.
Signed-off-by: Peng Wu wupeng58@huawei.com Signed-off-by: Miquel Raynal miquel.raynal@bootlin.com Link: https://lore.kernel.org/linux-mtd/20220520084425.116686-1-wupeng58@huawei.co... Signed-off-by: GONG, Ruiqi gongruiqi1@huawei.com Reviewed-by: guozihua guozihua@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/mtd/nand/raw/cafe_nand.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/mtd/nand/raw/cafe_nand.c b/drivers/mtd/nand/raw/cafe_nand.c index 04502d22efc9..a0fd1b22921f 100644 --- a/drivers/mtd/nand/raw/cafe_nand.c +++ b/drivers/mtd/nand/raw/cafe_nand.c @@ -679,8 +679,10 @@ static int cafe_nand_probe(struct pci_dev *pdev, pci_set_master(pdev);
cafe = kzalloc(sizeof(*cafe), GFP_KERNEL); - if (!cafe) - return -ENOMEM; + if (!cafe) { + err = -ENOMEM; + goto out_disable_device; + }
mtd = nand_to_mtd(&cafe->nand); mtd->dev.parent = &pdev->dev; @@ -801,6 +803,8 @@ static int cafe_nand_probe(struct pci_dev *pdev, pci_iounmap(pdev, cafe->mmio); out_free_mtd: kfree(cafe); + out_disable_device: + pci_disable_device(pdev); out: return err; } @@ -822,6 +826,7 @@ static void cafe_nand_remove(struct pci_dev *pdev) pci_iounmap(pdev, cafe->mmio); dma_free_coherent(&cafe->pdev->dev, 2112, cafe->dmabuf, cafe->dmaaddr); kfree(cafe); + pci_disable_device(pdev); }
static const struct pci_device_id cafe_nand_tbl[] = {
From: Zheng Yejian zhengyejian1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5WF0G CVE: NA
--------------------------------
File '/proc/livepatch/state' should be removed before removing '/proc/livepatch', otherwise following log will appear: remove_proc_entry: removing non-empty directory '/proc/livepatch', leaking at least 'state'
Fixes: c33e42836a74 ("livepatch/core: Allow implementation without ftrace") Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/livepatch/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 780a825cee8b..f6981faa18a8 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -2331,13 +2331,15 @@ static int __init klp_init(void)
klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); if (!klp_root_kobj) - goto error_remove; + goto error_remove_state;
#ifdef CONFIG_LIVEPATCH_STOP_MACHINE_CONSISTENCY arch_klp_init(); #endif return 0;
+error_remove_state: + remove_proc_entry("livepatch/state", NULL); error_remove: remove_proc_entry("livepatch", NULL); error_out:
From: Jie Zhan zhanjie9@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5GNIE
------------------------------------------------------------------------
Since commit 4615fbc3788d ("genirq/irqdomain: Don't try to free an interrupt that has no mapping"), we have found failures when re-inserting some specific drivers:
[root@localhost ~]# rmmod hisi_sas_v3_hw [root@localhost ~]# modprobe hisi_sas_v3_hw [ 1295.622525] hisi_sas_v3_hw: probe of 0000:30:04.0 failed with error -2
A relevant discussion can be found at: https://lore.kernel.org/lkml/3d3d0155e66429968cb4f6b4feeae4b3@kernel.org/
This is because IRQs from a low-level domain are not freed together, leaving some leaked. Thus, the next driver insertion fails to allocate the same number of IRQs.
Free a contiguous group of IRQs in one go to fix this issue.
Fixes: 4615fbc3788d ("genirq/irqdomain: Don't try to free an interrupt that has no mapping") Signed-off-by: Jie Zhan zhanjie9@hisilicon.com Reviewed-by: Liao Chang liaochang1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/irq/irqdomain.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c6b419db68ef..ed2fb45e8cae 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1374,13 +1374,24 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int nr_irqs) { unsigned int i; + int n;
if (!domain->ops->free) return;
for (i = 0; i < nr_irqs; i++) { - if (irq_domain_get_irq_data(domain, irq_base + i)) - domain->ops->free(domain, irq_base + i, 1); + /* Find the largest possible span of IRQs to free in one go */ + for (n = 0; + ((i + n) < nr_irqs) && + (irq_domain_get_irq_data(domain, irq_base + i + n)); + n++) + ; + + if (!n) + continue; + + domain->ops->free(domain, irq_base + i, n); + i += n; } }
From: ZhaoLong Wang wangzhaolong1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5MFXJ CVE: NA
-----------------------------------------
The readahead feature of the openEuler-22.03-LTS ebpf enhancement involves interface changes and needs to be compatible with the ebpf tool of openEuler-1.0-LTS. This patch changes the _ctl_mode to _mode of fs_file_read_ctx structure.
Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/read_write.c | 8 ++++---- include/linux/fs.h | 10 +++++----- .../selftests/bpf/progs/file_read_pattern_prog.c | 16 ++++++++-------- 3 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c index d175b5e8d3d3..c8df4ce598e1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1688,7 +1688,7 @@ static void fs_file_read_ctx_init(struct fs_file_read_ctx *ctx, { memset(ctx, 0, sizeof(*ctx)); ctx->name = file_dentry(filp)->d_name.name; - ctx->f_ctl_mode = filp->f_ctl_mode; + ctx->f_mode = filp->f_mode; ctx->key = (unsigned long)filp; ctx->i_size = file_inode(filp)->i_size; ctx->prev_index = filp->f_ra.prev_pos >> PAGE_SHIFT; @@ -1706,11 +1706,11 @@ void fs_file_read_update_args_by_trace(struct kiocb *iocb) fs_file_read_ctx_init(&ctx, filp, iocb->ki_pos); trace_fs_file_read(&ctx, FS_FILE_READ_VERSION);
- if (!ctx.set_f_ctl_mode && !ctx.clr_f_ctl_mode) + if (!ctx.set_f_mode && !ctx.clr_f_mode) return;
- filp->f_ctl_mode |= ctx.set_f_ctl_mode & FS_FILE_READ_MODE_MASK; - filp->f_ctl_mode &= ~(ctx.clr_f_ctl_mode & FS_FILE_READ_MODE_MASK); + filp->f_ctl_mode |= ctx.set_f_mode & FS_FILE_READ_MODE_MASK; + filp->f_ctl_mode &= ~(ctx.clr_f_mode & FS_FILE_READ_MODE_MASK); } EXPORT_SYMBOL_GPL(fs_file_read_update_args_by_trace); #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index fa9d89379da1..45ea1243118c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -185,10 +185,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000)
/* File mode control flag, expect random access pattern */ -#define FMODE_CTL_RANDOM ((__force fmode_t)0x1) +#define FMODE_CTL_RANDOM ((__force fmode_t)0x1000)
/* File mode control flag, will try to read head of the file into pagecache */ -#define FMODE_CTL_WILLNEED ((__force fmode_t)0x2) +#define FMODE_CTL_WILLNEED ((__force fmode_t)0x400000)
/* * Attribute flags. These should be or-ed together to figure out what @@ -3565,12 +3565,12 @@ static inline int inode_drain_writes(struct inode *inode)
struct fs_file_read_ctx { const unsigned char *name; - unsigned int f_ctl_mode; + unsigned int f_mode; unsigned int rsvd; /* clear from f_ctl_mode */ - unsigned int clr_f_ctl_mode; + unsigned int clr_f_mode; /* set into f_ctl_mode */ - unsigned int set_f_ctl_mode; + unsigned int set_f_mode; unsigned long key; /* file size */ long long i_size; diff --git a/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c index cd2dcd7c64bb..1d9f199d8370 100644 --- a/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c +++ b/tools/testing/selftests/bpf/progs/file_read_pattern_prog.c @@ -11,17 +11,17 @@ #endif
/* Need to keep consistent with definitions in include/linux/fs.h */ -#define FMODE_CTL_RANDOM 0x1 -#define FMODE_CTL_WILLNEED 0x2 +#define FMODE_CTL_RANDOM 0x1000 +#define FMODE_CTL_WILLNEED 0x400000
struct fs_file_read_ctx { const unsigned char *name; - unsigned int f_ctl_mode; + unsigned int f_mode; unsigned int rsvd; /* clear from f_ctl_mode */ - unsigned int clr_f_ctl_mode; + unsigned int clr_f_mode; /* set into f_ctl_mode */ - unsigned int set_f_ctl_mode; + unsigned int set_f_mode; unsigned long key; /* file size */ long long i_size; @@ -80,7 +80,7 @@ int fs_file_read(struct fs_file_read_args *args) return 0;
if (rd_ctx->i_size <= (4 << 20)) { - rd_ctx->set_f_ctl_mode = FMODE_CTL_WILLNEED; + rd_ctx->set_f_mode = FMODE_CTL_WILLNEED; return 0; }
@@ -112,9 +112,9 @@ int fs_file_read(struct fs_file_read_args *args) if (now - hist->last_nsec >= 500000000ULL || hist->tot_nr >= 10) { if (hist->tot_nr >= 10) { if (hist->seq_nr <= hist->tot_nr * 3 / 10) - rd_ctx->set_f_ctl_mode = FMODE_CTL_RANDOM; + rd_ctx->set_f_mode = FMODE_CTL_RANDOM; else if (hist->seq_nr >= hist->tot_nr * 7 / 10) - rd_ctx->clr_f_ctl_mode = FMODE_CTL_RANDOM; + rd_ctx->clr_f_mode = FMODE_CTL_RANDOM; }
hist->last_nsec = now;
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.18-rc1 commit cf5a501d985ba1b6ace9b18c64346441819bffea category: bugfix bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5Q9PS CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When a contiguous HugeTLB page is mapped, set_pte_at() will be called CONT_PTES/CONT_PMDS times. Therefore, __sync_icache_dcache() will flush cache multiple times if the page is executable (to ensure the I-D cache coherency). However, the first flushing cache already covers subsequent cache flush operations. So only flusing cache for the head page if it is a HugeTLB page to avoid redundant cache flushing. In the next patch, it is also depends on this change since the tail vmemmap pages of HugeTLB is mapped with read-only meanning only head page struct can be modified.
Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Catalin Marinas catalin.marinas@arm.com Link: https://lore.kernel.org/r/20220302084624.33340-1-songmuchun@bytedance.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/mm/flush.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index c7678e7df53a..6904a70450d9 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -55,6 +55,13 @@ void __sync_icache_dcache(pte_t pte) { struct page *page = pte_page(pte);
+ /* + * HugeTLB pages are always fully mapped, so only setting head page's + * PG_dcache_clean flag is enough. + */ + if (PageHuge(page)) + page = compound_head(page); + if (!test_bit(PG_dcache_clean, &page->flags)) { sync_icache_aliases(page_address(page), page_size(page)); set_bit(PG_dcache_clean, &page->flags);
From: Qi Liu liuqi115@huawei.com
mainline inclusion from mainline-v6.1-rc1 commit 45a3975f8e4c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5WGEF CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Add find_pmu_for_event() and use to simplify logic in auxtrace_record_init(). find_pmu_for_event() will be reused in subsequent patches.
Reviewed-by: John Garry john.garry@huawei.com Reviewed-by: Jonathan Cameron Jonathan.Cameron@huawei.com Reviewed-by: Leo Yan leo.yan@linaro.org Signed-off-by: Qi Liu liuqi115@huawei.com Signed-off-by: Yicong Yang yangyicong@hisilicon.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Bjorn Helgaas helgaas@kernel.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Ingo Molnar mingo@redhat.com Cc: James Clark james.clark@arm.com Cc: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Cc: Mark Rutland mark.rutland@arm.com Cc: Mathieu Poirier mathieu.poirier@linaro.org Cc: Mike Leach mike.leach@linaro.org Cc: Peter Zijlstra peterz@infradead.org Cc: Qi Liu liuqi6124@gmail.com Cc: Shameerali Kolothum Thodi shameerali.kolothum.thodi@huawei.com Cc: Shaokun Zhang zhangshaokun@hisilicon.com Cc: Suzuki Poulouse suzuki.poulose@arm.com Cc: Will Deacon will@kernel.org Cc: Zeng Prime prime.zeng@huawei.com Cc: linux-arm-kernel@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/20220927081400.14364-2-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Wangming Shao shaowangming@h-partners.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Reviewed-by: Jay Fang f.fangjian@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/perf/arch/arm/util/auxtrace.c | 53 ++++++++++++++++++----------- 1 file changed, 34 insertions(+), 19 deletions(-)
diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c index b187bddbd01a..22a1ac8f648f 100644 --- a/tools/perf/arch/arm/util/auxtrace.c +++ b/tools/perf/arch/arm/util/auxtrace.c @@ -50,16 +50,32 @@ static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err) return arm_spe_pmus; }
+static struct perf_pmu *find_pmu_for_event(struct perf_pmu **pmus, + int pmu_nr, struct evsel *evsel) +{ + int i; + + if (!pmus) + return NULL; + + for (i = 0; i < pmu_nr; i++) { + if (evsel->core.attr.type == pmus[i]->type) + return pmus[i]; + } + + return NULL; +} + struct auxtrace_record *auxtrace_record__init(struct evlist *evlist, int *err) { - struct perf_pmu *cs_etm_pmu; + struct perf_pmu *cs_etm_pmu = NULL; + struct perf_pmu **arm_spe_pmus = NULL; struct evsel *evsel; - bool found_etm = false; + struct perf_pmu *found_etm = NULL; struct perf_pmu *found_spe = NULL; - struct perf_pmu **arm_spe_pmus = NULL; + int auxtrace_event_cnt = 0; int nr_spes = 0; - int i = 0;
if (!evlist) return NULL; @@ -68,24 +84,23 @@ struct auxtrace_record arm_spe_pmus = find_all_arm_spe_pmus(&nr_spes, err);
evlist__for_each_entry(evlist, evsel) { - if (cs_etm_pmu && - evsel->core.attr.type == cs_etm_pmu->type) - found_etm = true; - - if (!nr_spes || found_spe) - continue; - - for (i = 0; i < nr_spes; i++) { - if (evsel->core.attr.type == arm_spe_pmus[i]->type) { - found_spe = arm_spe_pmus[i]; - break; - } - } + if (cs_etm_pmu && !found_etm) + found_etm = find_pmu_for_event(&cs_etm_pmu, 1, evsel); + + if (arm_spe_pmus && !found_spe) + found_spe = find_pmu_for_event(arm_spe_pmus, nr_spes, evsel); } + free(arm_spe_pmus);
- if (found_etm && found_spe) { - pr_err("Concurrent ARM Coresight ETM and SPE operation not currently supported\n"); + if (found_etm) + auxtrace_event_cnt++; + + if (found_spe) + auxtrace_event_cnt++; + + if (auxtrace_event_cnt > 1) { + pr_err("Concurrent AUX trace operation not currently supported\n"); *err = -EOPNOTSUPP; return NULL; }
From: Qi Liu liuqi115@huawei.com
mainline inclusion from mainline-v6.1-rc1 commit 057381a7ece1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5WGEF CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
HiSilicon PCIe tune and trace device (PTT) could dynamically tune the PCIe link's events, and trace the TLP headers).
This patch add support for PTT device in perf tool, so users could use 'perf record' to get TLP headers trace data.
Reviewed-by: Leo Yan leo.yan@linaro.org Signed-off-by: Qi Liu liuqi115@huawei.com Signed-off-by: Yicong Yang yangyicong@hisilicon.com Acked-by: John Garry john.garry@huawei.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Bjorn Helgaas helgaas@kernel.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Ingo Molnar mingo@redhat.com Cc: James Clark james.clark@arm.com Cc: Jonathan Cameron jonathan.cameron@huawei.com Cc: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Cc: Mark Rutland mark.rutland@arm.com Cc: Mathieu Poirier mathieu.poirier@linaro.org Cc: Mike Leach mike.leach@linaro.org Cc: Peter Zijlstra peterz@infradead.org Cc: Qi Liu liuqi6124@gmail.com Cc: Shameerali Kolothum Thodi shameerali.kolothum.thodi@huawei.com Cc: Shaokun Zhang zhangshaokun@hisilicon.com Cc: Suzuki Poulouse suzuki.poulose@arm.com Cc: Will Deacon will@kernel.org Cc: Zeng Prime prime.zeng@huawei.com Cc: linux-arm-kernel@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/20220927081400.14364-3-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Wangming Shao shaowangming@h-partners.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Reviewed-by: Jay Fang f.fangjian@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/perf/arch/arm/util/auxtrace.c | 63 +++++++++ tools/perf/arch/arm/util/pmu.c | 3 + tools/perf/arch/arm64/util/Build | 2 +- tools/perf/arch/arm64/util/hisi-ptt.c | 188 ++++++++++++++++++++++++++ tools/perf/util/auxtrace.c | 1 + tools/perf/util/auxtrace.h | 1 + tools/perf/util/hisi-ptt.h | 16 +++ 7 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 tools/perf/arch/arm64/util/hisi-ptt.c create mode 100644 tools/perf/util/hisi-ptt.h
diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c index 22a1ac8f648f..e282fcea5e05 100644 --- a/tools/perf/arch/arm/util/auxtrace.c +++ b/tools/perf/arch/arm/util/auxtrace.c @@ -4,9 +4,11 @@ * Author: Mathieu Poirier mathieu.poirier@linaro.org */
+#include <dirent.h> #include <stdbool.h> #include <linux/coresight-pmu.h> #include <linux/zalloc.h> +#include <api/fs/fs.h>
#include "../../util/auxtrace.h" #include "../../util/debug.h" @@ -14,6 +16,7 @@ #include "../../util/pmu.h" #include "cs-etm.h" #include "arm-spe.h" +#include "hisi-ptt.h"
static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err) { @@ -50,6 +53,52 @@ static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err) return arm_spe_pmus; }
+static struct perf_pmu **find_all_hisi_ptt_pmus(int *nr_ptts, int *err) +{ + const char *sysfs = sysfs__mountpoint(); + struct perf_pmu **hisi_ptt_pmus = NULL; + struct dirent *dent; + char path[PATH_MAX]; + DIR *dir = NULL; + int idx = 0; + + snprintf(path, PATH_MAX, "%s" EVENT_SOURCE_DEVICE_PATH, sysfs); + dir = opendir(path); + if (!dir) { + pr_err("can't read directory '%s'\n", EVENT_SOURCE_DEVICE_PATH); + *err = -EINVAL; + return NULL; + } + + while ((dent = readdir(dir))) { + if (strstr(dent->d_name, HISI_PTT_PMU_NAME)) + (*nr_ptts)++; + } + + if (!(*nr_ptts)) + goto out; + + hisi_ptt_pmus = zalloc(sizeof(struct perf_pmu *) * (*nr_ptts)); + if (!hisi_ptt_pmus) { + pr_err("hisi_ptt alloc failed\n"); + *err = -ENOMEM; + goto out; + } + + rewinddir(dir); + while ((dent = readdir(dir))) { + if (strstr(dent->d_name, HISI_PTT_PMU_NAME) && idx < *nr_ptts) { + hisi_ptt_pmus[idx] = perf_pmu__find(dent->d_name); + if (hisi_ptt_pmus[idx]) + idx++; + } + } + +out: + closedir(dir); + return hisi_ptt_pmus; +} + static struct perf_pmu *find_pmu_for_event(struct perf_pmu **pmus, int pmu_nr, struct evsel *evsel) { @@ -71,17 +120,21 @@ struct auxtrace_record { struct perf_pmu *cs_etm_pmu = NULL; struct perf_pmu **arm_spe_pmus = NULL; + struct perf_pmu **hisi_ptt_pmus = NULL; struct evsel *evsel; struct perf_pmu *found_etm = NULL; struct perf_pmu *found_spe = NULL; + struct perf_pmu *found_ptt = NULL; int auxtrace_event_cnt = 0; int nr_spes = 0; + int nr_ptts = 0;
if (!evlist) return NULL;
cs_etm_pmu = perf_pmu__find(CORESIGHT_ETM_PMU_NAME); arm_spe_pmus = find_all_arm_spe_pmus(&nr_spes, err); + hisi_ptt_pmus = find_all_hisi_ptt_pmus(&nr_ptts, err);
evlist__for_each_entry(evlist, evsel) { if (cs_etm_pmu && !found_etm) @@ -89,9 +142,13 @@ struct auxtrace_record
if (arm_spe_pmus && !found_spe) found_spe = find_pmu_for_event(arm_spe_pmus, nr_spes, evsel); + + if (hisi_ptt_pmus && !found_ptt) + found_ptt = find_pmu_for_event(hisi_ptt_pmus, nr_ptts, evsel); }
free(arm_spe_pmus); + free(hisi_ptt_pmus);
if (found_etm) auxtrace_event_cnt++; @@ -99,6 +156,9 @@ struct auxtrace_record if (found_spe) auxtrace_event_cnt++;
+ if (found_ptt) + auxtrace_event_cnt++; + if (auxtrace_event_cnt > 1) { pr_err("Concurrent AUX trace operation not currently supported\n"); *err = -EOPNOTSUPP; @@ -111,6 +171,9 @@ struct auxtrace_record #if defined(__aarch64__) if (found_spe) return arm_spe_recording_init(err, found_spe); + + if (found_ptt) + return hisi_ptt_recording_init(err, found_ptt); #endif
/* diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c index bbc297a7e2e3..2223083c6825 100644 --- a/tools/perf/arch/arm/util/pmu.c +++ b/tools/perf/arch/arm/util/pmu.c @@ -10,6 +10,7 @@ #include <linux/string.h>
#include "arm-spe.h" +#include "hisi-ptt.h" #include "../../util/pmu.h"
struct perf_event_attr @@ -22,6 +23,8 @@ struct perf_event_attr #if defined(__aarch64__) } else if (strstarts(pmu->name, ARM_SPE_PMU_NAME)) { return arm_spe_pmu_default_config(pmu); + } else if (strstarts(pmu->name, HISI_PTT_PMU_NAME)) { + pmu->selectable = true; #endif }
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build index 4f153da4c00c..b0f0a5257034 100644 --- a/tools/perf/arch/arm64/util/Build +++ b/tools/perf/arch/arm64/util/Build @@ -9,4 +9,4 @@ perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-$(CONFIG_AUXTRACE) += ../../arm/util/pmu.o \ ../../arm/util/auxtrace.o \ ../../arm/util/cs-etm.o \ - arm-spe.o mem-events.o + arm-spe.o mem-events.o hisi-ptt.o diff --git a/tools/perf/arch/arm64/util/hisi-ptt.c b/tools/perf/arch/arm64/util/hisi-ptt.c new file mode 100644 index 000000000000..ba97c8a562a0 --- /dev/null +++ b/tools/perf/arch/arm64/util/hisi-ptt.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/bitops.h> +#include <linux/log2.h> +#include <linux/zalloc.h> +#include <time.h> + +#include <internal/lib.h> // page_size +#include "../../../util/auxtrace.h" +#include "../../../util/cpumap.h" +#include "../../../util/debug.h" +#include "../../../util/event.h" +#include "../../../util/evlist.h" +#include "../../../util/evsel.h" +#include "../../../util/hisi-ptt.h" +#include "../../../util/pmu.h" +#include "../../../util/record.h" +#include "../../../util/session.h" +#include "../../../util/tsc.h" + +#define KiB(x) ((x) * 1024) +#define MiB(x) ((x) * 1024 * 1024) + +struct hisi_ptt_recording { + struct auxtrace_record itr; + struct perf_pmu *hisi_ptt_pmu; + struct evlist *evlist; +}; + +static size_t +hisi_ptt_info_priv_size(struct auxtrace_record *itr __maybe_unused, + struct evlist *evlist __maybe_unused) +{ + return HISI_PTT_AUXTRACE_PRIV_SIZE; +} + +static int hisi_ptt_info_fill(struct auxtrace_record *itr, + struct perf_session *session, + struct perf_record_auxtrace_info *auxtrace_info, + size_t priv_size) +{ + struct hisi_ptt_recording *pttr = + container_of(itr, struct hisi_ptt_recording, itr); + struct perf_pmu *hisi_ptt_pmu = pttr->hisi_ptt_pmu; + + if (priv_size != HISI_PTT_AUXTRACE_PRIV_SIZE) + return -EINVAL; + + if (!session->evlist->core.nr_mmaps) + return -EINVAL; + + auxtrace_info->type = PERF_AUXTRACE_HISI_PTT; + auxtrace_info->priv[0] = hisi_ptt_pmu->type; + + return 0; +} + +static int hisi_ptt_set_auxtrace_mmap_page(struct record_opts *opts) +{ + bool privileged = perf_event_paranoid_check(-1); + + if (!opts->full_auxtrace) + return 0; + + if (opts->full_auxtrace && !opts->auxtrace_mmap_pages) { + if (privileged) { + opts->auxtrace_mmap_pages = MiB(16) / page_size; + } else { + opts->auxtrace_mmap_pages = KiB(128) / page_size; + if (opts->mmap_pages == UINT_MAX) + opts->mmap_pages = KiB(256) / page_size; + } + } + + /* Validate auxtrace_mmap_pages */ + if (opts->auxtrace_mmap_pages) { + size_t sz = opts->auxtrace_mmap_pages * (size_t)page_size; + size_t min_sz = KiB(8); + + if (sz < min_sz || !is_power_of_2(sz)) { + pr_err("Invalid mmap size for HISI PTT: must be at least %zuKiB and a power of 2\n", + min_sz / 1024); + return -EINVAL; + } + } + + return 0; +} + +static int hisi_ptt_recording_options(struct auxtrace_record *itr, + struct evlist *evlist, + struct record_opts *opts) +{ + struct hisi_ptt_recording *pttr = + container_of(itr, struct hisi_ptt_recording, itr); + struct perf_pmu *hisi_ptt_pmu = pttr->hisi_ptt_pmu; + struct evsel *evsel, *hisi_ptt_evsel = NULL; + struct evsel *tracking_evsel; + int err; + + pttr->evlist = evlist; + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.type == hisi_ptt_pmu->type) { + if (hisi_ptt_evsel) { + pr_err("There may be only one " HISI_PTT_PMU_NAME "x event\n"); + return -EINVAL; + } + evsel->core.attr.freq = 0; + evsel->core.attr.sample_period = 1; + evsel->needs_auxtrace_mmap = true; + hisi_ptt_evsel = evsel; + opts->full_auxtrace = true; + } + } + + err = hisi_ptt_set_auxtrace_mmap_page(opts); + if (err) + return err; + /* + * To obtain the auxtrace buffer file descriptor, the auxtrace event + * must come first. + */ + evlist__to_front(evlist, hisi_ptt_evsel); + evsel__set_sample_bit(hisi_ptt_evsel, TIME); + + /* Add dummy event to keep tracking */ + err = parse_event(evlist, "dummy:u"); + if (err) + return err; + + tracking_evsel = evlist__last(evlist); + evlist__set_tracking_event(evlist, tracking_evsel); + + tracking_evsel->core.attr.freq = 0; + tracking_evsel->core.attr.sample_period = 1; + evsel__set_sample_bit(tracking_evsel, TIME); + + return 0; +} + +static u64 hisi_ptt_reference(struct auxtrace_record *itr __maybe_unused) +{ + return rdtsc(); +} + +static void hisi_ptt_recording_free(struct auxtrace_record *itr) +{ + struct hisi_ptt_recording *pttr = + container_of(itr, struct hisi_ptt_recording, itr); + + free(pttr); +} + +struct auxtrace_record *hisi_ptt_recording_init(int *err, + struct perf_pmu *hisi_ptt_pmu) +{ + struct hisi_ptt_recording *pttr; + + if (!hisi_ptt_pmu) { + *err = -ENODEV; + return NULL; + } + + pttr = zalloc(sizeof(*pttr)); + if (!pttr) { + *err = -ENOMEM; + return NULL; + } + + pttr->hisi_ptt_pmu = hisi_ptt_pmu; + pttr->itr.pmu = hisi_ptt_pmu; + pttr->itr.recording_options = hisi_ptt_recording_options; + pttr->itr.info_priv_size = hisi_ptt_info_priv_size; + pttr->itr.info_fill = hisi_ptt_info_fill; + pttr->itr.free = hisi_ptt_recording_free; + pttr->itr.reference = hisi_ptt_reference; + pttr->itr.read_finish = auxtrace_record__read_finish; + pttr->itr.alignment = 0; + + *err = 0; + return &pttr->itr; +} diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 1ee9207f3d96..26365a3d197d 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1273,6 +1273,7 @@ int perf_event__process_auxtrace_info(struct perf_session *session, case PERF_AUXTRACE_S390_CPUMSF: err = s390_cpumsf_process_auxtrace_info(event, session); break; + case PERF_AUXTRACE_HISI_PTT: case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 7e5c9e1552bd..ac94d3974e16 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -47,6 +47,7 @@ enum auxtrace_type { PERF_AUXTRACE_CS_ETM, PERF_AUXTRACE_ARM_SPE, PERF_AUXTRACE_S390_CPUMSF, + PERF_AUXTRACE_HISI_PTT, };
enum itrace_period_type { diff --git a/tools/perf/util/hisi-ptt.h b/tools/perf/util/hisi-ptt.h new file mode 100644 index 000000000000..82283c81b4c1 --- /dev/null +++ b/tools/perf/util/hisi-ptt.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. + */ + +#ifndef INCLUDE__PERF_HISI_PTT_H__ +#define INCLUDE__PERF_HISI_PTT_H__ + +#define HISI_PTT_PMU_NAME "hisi_ptt" +#define HISI_PTT_AUXTRACE_PRIV_SIZE sizeof(u64) + +struct auxtrace_record *hisi_ptt_recording_init(int *err, + struct perf_pmu *hisi_ptt_pmu); + +#endif
From: Qi Liu liuqi115@huawei.com
mainline inclusion from mainline-v6.1-rc1 commit 5e91e57e6809 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5WGEF CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
Add support for using 'perf report --dump-raw-trace' to parse PTT packet.
Example usage:
Output will contain raw PTT data and its textual representation, such as (8DW format):
0 0 0x5810 [0x30]: PERF_RECORD_AUXTRACE size: 0x400000 offset: 0 ref: 0xa5d50c725 idx: 0 tid: -1 cpu: 0 . . ... HISI PTT data: size 4194304 bytes . 00000000: 00 00 00 00 Prefix . 00000004: 08 20 00 60 Header DW0 . 00000008: ff 02 00 01 Header DW1 . 0000000c: 20 08 00 00 Header DW2 . 00000010: 10 e7 44 ab Header DW3 . 00000014: 2a a8 1e 01 Time . 00000020: 00 00 00 00 Prefix . 00000024: 01 00 00 60 Header DW0 . 00000028: 0f 1e 00 01 Header DW1 . 0000002c: 04 00 00 00 Header DW2 . 00000030: 40 00 81 02 Header DW3 . 00000034: ee 02 00 00 Time ....
This patch only add basic parsing support according to the definition of the PTT packet described in Documentation/trace/hisi-ptt.rst. And the fields of each packet can be further decoded following the PCIe Spec's definition of TLP packet.
Signed-off-by: Qi Liu liuqi115@huawei.com Signed-off-by: Yicong Yang yangyicong@hisilicon.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Bjorn Helgaas helgaas@kernel.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Ingo Molnar mingo@redhat.com Cc: James Clark james.clark@arm.com Cc: John Garry john.garry@huawei.com Cc: Jonathan Cameron jonathan.cameron@huawei.com Cc: Leo Yan leo.yan@linaro.org Cc: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Cc: Mark Rutland mark.rutland@arm.com Cc: Mathieu Poirier mathieu.poirier@linaro.org Cc: Mike Leach mike.leach@linaro.org Cc: Peter Zijlstra peterz@infradead.org Cc: Qi Liu liuqi6124@gmail.com Cc: Shameerali Kolothum Thodi shameerali.kolothum.thodi@huawei.com Cc: Shaokun Zhang zhangshaokun@hisilicon.com Cc: Suzuki Poulouse suzuki.poulose@arm.com Cc: Will Deacon will@kernel.org Cc: Zeng Prime prime.zeng@huawei.com Cc: linux-arm-kernel@lists.infradead.org Cc: linux-pci@vger.kernel.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/20220927081400.14364-4-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Wangming Shao shaowangming@h-partners.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Reviewed-by: Jay Fang f.fangjian@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/perf/util/Build | 2 + tools/perf/util/auxtrace.c | 3 + tools/perf/util/hisi-ptt-decoder/Build | 1 + .../hisi-ptt-decoder/hisi-ptt-pkt-decoder.c | 164 +++++++++++++++ .../hisi-ptt-decoder/hisi-ptt-pkt-decoder.h | 31 +++ tools/perf/util/hisi-ptt.c | 192 ++++++++++++++++++ tools/perf/util/hisi-ptt.h | 3 + 7 files changed, 396 insertions(+) create mode 100644 tools/perf/util/hisi-ptt-decoder/Build create mode 100644 tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.c create mode 100644 tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.h create mode 100644 tools/perf/util/hisi-ptt.c
diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 661d995f5016..6bf0547bf134 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -109,6 +109,8 @@ perf-$(CONFIG_AUXTRACE) += intel-pt.o perf-$(CONFIG_AUXTRACE) += intel-bts.o perf-$(CONFIG_AUXTRACE) += arm-spe.o perf-$(CONFIG_AUXTRACE) += arm-spe-decoder/ +perf-$(CONFIG_AUXTRACE) += hisi-ptt.o +perf-$(CONFIG_AUXTRACE) += hisi-ptt-decoder/ perf-$(CONFIG_AUXTRACE) += s390-cpumsf.o
ifdef CONFIG_LIBOPENCSD diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 26365a3d197d..ead536d02160 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -51,6 +51,7 @@ #include "intel-pt.h" #include "intel-bts.h" #include "arm-spe.h" +#include "hisi-ptt.h" #include "s390-cpumsf.h" #include "util/mmap.h"
@@ -1274,6 +1275,8 @@ int perf_event__process_auxtrace_info(struct perf_session *session, err = s390_cpumsf_process_auxtrace_info(event, session); break; case PERF_AUXTRACE_HISI_PTT: + err = hisi_ptt_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; diff --git a/tools/perf/util/hisi-ptt-decoder/Build b/tools/perf/util/hisi-ptt-decoder/Build new file mode 100644 index 000000000000..db3db8b75033 --- /dev/null +++ b/tools/perf/util/hisi-ptt-decoder/Build @@ -0,0 +1 @@ +perf-$(CONFIG_AUXTRACE) += hisi-ptt-pkt-decoder.o diff --git a/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.c b/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.c new file mode 100644 index 000000000000..a17c423a526d --- /dev/null +++ b/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <endian.h> +#include <byteswap.h> +#include <linux/bitops.h> +#include <stdarg.h> + +#include "../color.h" +#include "hisi-ptt-pkt-decoder.h" + +/* + * For 8DW format, the bit[31:11] of DW0 is always 0x1fffff, which can be + * used to distinguish the data format. + * 8DW format is like: + * bits [ 31:11 ][ 10:0 ] + * |---------------------------------------|-------------------| + * DW0 [ 0x1fffff ][ Reserved (0x7ff) ] + * DW1 [ Prefix ] + * DW2 [ Header DW0 ] + * DW3 [ Header DW1 ] + * DW4 [ Header DW2 ] + * DW5 [ Header DW3 ] + * DW6 [ Reserved (0x0) ] + * DW7 [ Time ] + * + * 4DW format is like: + * bits [31:30] [ 29:25 ][24][23][22][21][ 20:11 ][ 10:0 ] + * |-----|---------|---|---|---|---|-------------|-------------| + * DW0 [ Fmt ][ Type ][T9][T8][TH][SO][ Length ][ Time ] + * DW1 [ Header DW1 ] + * DW2 [ Header DW2 ] + * DW3 [ Header DW3 ] + */ + +enum hisi_ptt_8dw_pkt_field_type { + HISI_PTT_8DW_CHK_AND_RSV0, + HISI_PTT_8DW_PREFIX, + HISI_PTT_8DW_HEAD0, + HISI_PTT_8DW_HEAD1, + HISI_PTT_8DW_HEAD2, + HISI_PTT_8DW_HEAD3, + HISI_PTT_8DW_RSV1, + HISI_PTT_8DW_TIME, + HISI_PTT_8DW_TYPE_MAX +}; + +enum hisi_ptt_4dw_pkt_field_type { + HISI_PTT_4DW_HEAD1, + HISI_PTT_4DW_HEAD2, + HISI_PTT_4DW_HEAD3, + HISI_PTT_4DW_TYPE_MAX +}; + +static const char * const hisi_ptt_8dw_pkt_field_name[] = { + [HISI_PTT_8DW_PREFIX] = "Prefix", + [HISI_PTT_8DW_HEAD0] = "Header DW0", + [HISI_PTT_8DW_HEAD1] = "Header DW1", + [HISI_PTT_8DW_HEAD2] = "Header DW2", + [HISI_PTT_8DW_HEAD3] = "Header DW3", + [HISI_PTT_8DW_TIME] = "Time" +}; + +static const char * const hisi_ptt_4dw_pkt_field_name[] = { + [HISI_PTT_4DW_HEAD1] = "Header DW1", + [HISI_PTT_4DW_HEAD2] = "Header DW2", + [HISI_PTT_4DW_HEAD3] = "Header DW3", +}; + +union hisi_ptt_4dw { + struct { + uint32_t format : 2; + uint32_t type : 5; + uint32_t t9 : 1; + uint32_t t8 : 1; + uint32_t th : 1; + uint32_t so : 1; + uint32_t len : 10; + uint32_t time : 11; + }; + uint32_t value; +}; + +static void hisi_ptt_print_pkt(const unsigned char *buf, int pos, const char *desc) +{ + const char *color = PERF_COLOR_BLUE; + int i; + + printf("."); + color_fprintf(stdout, color, " %08x: ", pos); + for (i = 0; i < HISI_PTT_FIELD_LENTH; i++) + color_fprintf(stdout, color, "%02x ", buf[pos + i]); + for (i = 0; i < HISI_PTT_MAX_SPACE_LEN; i++) + color_fprintf(stdout, color, " "); + color_fprintf(stdout, color, " %s\n", desc); +} + +static int hisi_ptt_8dw_kpt_desc(const unsigned char *buf, int pos) +{ + int i; + + for (i = 0; i < HISI_PTT_8DW_TYPE_MAX; i++) { + /* Do not show 8DW check field and reserved fields */ + if (i == HISI_PTT_8DW_CHK_AND_RSV0 || i == HISI_PTT_8DW_RSV1) { + pos += HISI_PTT_FIELD_LENTH; + continue; + } + + hisi_ptt_print_pkt(buf, pos, hisi_ptt_8dw_pkt_field_name[i]); + pos += HISI_PTT_FIELD_LENTH; + } + + return hisi_ptt_pkt_size[HISI_PTT_8DW_PKT]; +} + +static void hisi_ptt_4dw_print_dw0(const unsigned char *buf, int pos) +{ + const char *color = PERF_COLOR_BLUE; + union hisi_ptt_4dw dw0; + int i; + + dw0.value = *(uint32_t *)(buf + pos); + printf("."); + color_fprintf(stdout, color, " %08x: ", pos); + for (i = 0; i < HISI_PTT_FIELD_LENTH; i++) + color_fprintf(stdout, color, "%02x ", buf[pos + i]); + for (i = 0; i < HISI_PTT_MAX_SPACE_LEN; i++) + color_fprintf(stdout, color, " "); + + color_fprintf(stdout, color, + " %s %x %s %x %s %x %s %x %s %x %s %x %s %x %s %x\n", + "Format", dw0.format, "Type", dw0.type, "T9", dw0.t9, + "T8", dw0.t8, "TH", dw0.th, "SO", dw0.so, "Length", + dw0.len, "Time", dw0.time); +} + +static int hisi_ptt_4dw_kpt_desc(const unsigned char *buf, int pos) +{ + int i; + + hisi_ptt_4dw_print_dw0(buf, pos); + pos += HISI_PTT_FIELD_LENTH; + + for (i = 0; i < HISI_PTT_4DW_TYPE_MAX; i++) { + hisi_ptt_print_pkt(buf, pos, hisi_ptt_4dw_pkt_field_name[i]); + pos += HISI_PTT_FIELD_LENTH; + } + + return hisi_ptt_pkt_size[HISI_PTT_4DW_PKT]; +} + +int hisi_ptt_pkt_desc(const unsigned char *buf, int pos, enum hisi_ptt_pkt_type type) +{ + if (type == HISI_PTT_8DW_PKT) + return hisi_ptt_8dw_kpt_desc(buf, pos); + + return hisi_ptt_4dw_kpt_desc(buf, pos); +} diff --git a/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.h b/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.h new file mode 100644 index 000000000000..e78f1b5bc836 --- /dev/null +++ b/tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. + */ + +#ifndef INCLUDE__HISI_PTT_PKT_DECODER_H__ +#define INCLUDE__HISI_PTT_PKT_DECODER_H__ + +#include <stddef.h> +#include <stdint.h> + +#define HISI_PTT_8DW_CHECK_MASK GENMASK(31, 11) +#define HISI_PTT_IS_8DW_PKT GENMASK(31, 11) +#define HISI_PTT_MAX_SPACE_LEN 10 +#define HISI_PTT_FIELD_LENTH 4 + +enum hisi_ptt_pkt_type { + HISI_PTT_4DW_PKT, + HISI_PTT_8DW_PKT, + HISI_PTT_PKT_MAX +}; + +static int hisi_ptt_pkt_size[] = { + [HISI_PTT_4DW_PKT] = 16, + [HISI_PTT_8DW_PKT] = 32, +}; + +int hisi_ptt_pkt_desc(const unsigned char *buf, int pos, enum hisi_ptt_pkt_type type); + +#endif diff --git a/tools/perf/util/hisi-ptt.c b/tools/perf/util/hisi-ptt.c new file mode 100644 index 000000000000..45b614bb73bf --- /dev/null +++ b/tools/perf/util/hisi-ptt.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HiSilicon PCIe Trace and Tuning (PTT) support + * Copyright (c) 2022 HiSilicon Technologies Co., Ltd. + */ + +#include <byteswap.h> +#include <endian.h> +#include <errno.h> +#include <inttypes.h> +#include <linux/bitops.h> +#include <linux/kernel.h> +#include <linux/log2.h> +#include <linux/types.h> +#include <linux/zalloc.h> +#include <stdlib.h> +#include <unistd.h> + +#include "auxtrace.h" +#include "color.h" +#include "debug.h" +#include "evsel.h" +#include "hisi-ptt.h" +#include "hisi-ptt-decoder/hisi-ptt-pkt-decoder.h" +#include "machine.h" +#include "session.h" +#include "tool.h" +#include <internal/lib.h> + +struct hisi_ptt { + struct auxtrace auxtrace; + u32 auxtrace_type; + struct perf_session *session; + struct machine *machine; + u32 pmu_type; +}; + +struct hisi_ptt_queue { + struct hisi_ptt *ptt; + struct auxtrace_buffer *buffer; +}; + +static enum hisi_ptt_pkt_type hisi_ptt_check_packet_type(unsigned char *buf) +{ + uint32_t head = *(uint32_t *)buf; + + if ((HISI_PTT_8DW_CHECK_MASK & head) == HISI_PTT_IS_8DW_PKT) + return HISI_PTT_8DW_PKT; + + return HISI_PTT_4DW_PKT; +} + +static void hisi_ptt_dump(struct hisi_ptt *ptt __maybe_unused, + unsigned char *buf, size_t len) +{ + const char *color = PERF_COLOR_BLUE; + enum hisi_ptt_pkt_type type; + size_t pos = 0; + int pkt_len; + + type = hisi_ptt_check_packet_type(buf); + len = round_down(len, hisi_ptt_pkt_size[type]); + color_fprintf(stdout, color, ". ... HISI PTT data: size %zu bytes\n", + len); + + while (len > 0) { + pkt_len = hisi_ptt_pkt_desc(buf, pos, type); + if (!pkt_len) + color_fprintf(stdout, color, " Bad packet!\n"); + + pos += pkt_len; + len -= pkt_len; + } +} + +static void hisi_ptt_dump_event(struct hisi_ptt *ptt, unsigned char *buf, + size_t len) +{ + printf(".\n"); + + hisi_ptt_dump(ptt, buf, len); +} + +static int hisi_ptt_process_event(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct perf_tool *tool __maybe_unused) +{ + return 0; +} + +static int hisi_ptt_process_auxtrace_event(struct perf_session *session, + union perf_event *event, + struct perf_tool *tool __maybe_unused) +{ + struct hisi_ptt *ptt = container_of(session->auxtrace, struct hisi_ptt, + auxtrace); + int fd = perf_data__fd(session->data); + int size = event->auxtrace.size; + void *data = malloc(size); + off_t data_offset; + int err; + + if (!data) + return -errno; + + if (perf_data__is_pipe(session->data)) { + data_offset = 0; + } else { + data_offset = lseek(fd, 0, SEEK_CUR); + if (data_offset == -1) + return -errno; + } + + err = readn(fd, data, size); + if (err != (ssize_t)size) { + free(data); + return -errno; + } + + if (dump_trace) + hisi_ptt_dump_event(ptt, data, size); + + return 0; +} + +static int hisi_ptt_flush(struct perf_session *session __maybe_unused, + struct perf_tool *tool __maybe_unused) +{ + return 0; +} + +static void hisi_ptt_free_events(struct perf_session *session __maybe_unused) +{ +} + +static void hisi_ptt_free(struct perf_session *session) +{ + struct hisi_ptt *ptt = container_of(session->auxtrace, struct hisi_ptt, + auxtrace); + + session->auxtrace = NULL; + free(ptt); +} + +static bool hisi_ptt_evsel_is_auxtrace(struct perf_session *session, + struct evsel *evsel) +{ + struct hisi_ptt *ptt = container_of(session->auxtrace, struct hisi_ptt, auxtrace); + + return evsel->core.attr.type == ptt->pmu_type; +} + +static void hisi_ptt_print_info(__u64 type) +{ + if (!dump_trace) + return; + + fprintf(stdout, " PMU Type %" PRId64 "\n", (s64) type); +} + +int hisi_ptt_process_auxtrace_info(union perf_event *event, + struct perf_session *session) +{ + struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; + struct hisi_ptt *ptt; + + if (auxtrace_info->header.size < HISI_PTT_AUXTRACE_PRIV_SIZE + + sizeof(struct perf_record_auxtrace_info)) + return -EINVAL; + + ptt = zalloc(sizeof(*ptt)); + if (!ptt) + return -ENOMEM; + + ptt->session = session; + ptt->machine = &session->machines.host; /* No kvm support */ + ptt->auxtrace_type = auxtrace_info->type; + ptt->pmu_type = auxtrace_info->priv[0]; + + ptt->auxtrace.process_event = hisi_ptt_process_event; + ptt->auxtrace.process_auxtrace_event = hisi_ptt_process_auxtrace_event; + ptt->auxtrace.flush_events = hisi_ptt_flush; + ptt->auxtrace.free_events = hisi_ptt_free_events; + ptt->auxtrace.free = hisi_ptt_free; + ptt->auxtrace.evsel_is_auxtrace = hisi_ptt_evsel_is_auxtrace; + session->auxtrace = &ptt->auxtrace; + + hisi_ptt_print_info(auxtrace_info->priv[0]); + + return 0; +} diff --git a/tools/perf/util/hisi-ptt.h b/tools/perf/util/hisi-ptt.h index 82283c81b4c1..2db9b4056214 100644 --- a/tools/perf/util/hisi-ptt.h +++ b/tools/perf/util/hisi-ptt.h @@ -13,4 +13,7 @@ struct auxtrace_record *hisi_ptt_recording_init(int *err, struct perf_pmu *hisi_ptt_pmu);
+int hisi_ptt_process_auxtrace_info(union perf_event *event, + struct perf_session *session); + #endif
From: zhengfeng luo luozhengfeng@h-partners.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5XJ01
----------------------------------------------------------------------
Add default algorithm processing.
Fixes: f91696f2f053 ("RDMA/hns: Support congestion control type selection according to the FW") Signed-off-by: zhengfeng luo luozhengfeng@h-partners.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 851c8c6fc9fa..820a0c496c54 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4889,10 +4889,11 @@ static int check_cong_type(struct ib_qp *ibqp, cong_alg->wnd_mode_sel = WND_LIMIT; break; default: - ibdev_err(&hr_dev->ib_dev, - "error type(%u) for congestion selection.\n", - hr_dev->caps.cong_type); - return -EINVAL; + cong_alg->alg_sel = CONG_DCQCN; + cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL; + cong_alg->dip_vld = DIP_INVALID; + cong_alg->wnd_mode_sel = WND_LIMIT; + break; }
return 0;
From: Luoyouming luoyouming@huawei.com
driver inclusion category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/I5USIG
-----------------------------------------------------------
This reverts commit 5a18385d182db24e95c8f4ce9fb0e34ed6139803.
Fixes: 5a18385d182d ("RDMA/hns: Support cqe inline in user space")
Signed-off-by: Luoyouming luoyouming@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 - drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 ------------ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 3 +-- drivers/infiniband/hw/hns/hns_roce_main.c | 4 ---- include/uapi/rdma/hns-abi.h | 1 - 5 files changed, 1 insertion(+), 20 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 218d66166b05..ab052b54e206 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -145,7 +145,6 @@ enum { HNS_ROCE_CAP_FLAG_DIRECT_WQE = BIT(12), HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), HNS_ROCE_CAP_FLAG_STASH = BIT(17), - HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(20), };
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 820a0c496c54..c9309b8addd8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4726,18 +4726,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_clear(qpc_mask, QPC_RQIE); }
- if (udata && - (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_XRC_TGT) && - (uctx->config & HNS_ROCE_ALLOC_UCTX_CQE_INLINE_FLAGS)) { - hr_reg_write_bool(context, QPC_CQEIE, - hr_dev->caps.flags & - HNS_ROCE_CAP_FLAG_CQE_INLINE); - hr_reg_clear(qpc_mask, QPC_CQEIE); - - hr_reg_write(context, QPC_CQEIS, 0); - hr_reg_clear(qpc_mask, QPC_CQEIS); - } - return 0; }
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index dbb19fe680c9..16437df320ea 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -529,8 +529,7 @@ struct hns_roce_v2_qp_context { #define QPC_RQ_RTY_TX_ERR QPC_FIELD_LOC(607, 607) #define QPC_RX_CQN QPC_FIELD_LOC(631, 608) #define QPC_XRC_QP_TYPE QPC_FIELD_LOC(632, 632) -#define QPC_CQEIE QPC_FIELD_LOC(633, 633) -#define QPC_CQEIS QPC_FIELD_LOC(634, 634) +#define QPC_RSV3 QPC_FIELD_LOC(634, 633) #define QPC_MIN_RNR_TIME QPC_FIELD_LOC(639, 635) #define QPC_RQ_PRODUCER_IDX QPC_FIELD_LOC(655, 640) #define QPC_RQ_CONSUMER_IDX QPC_FIELD_LOC(671, 656) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 2d59e0bebf98..0b04eb196fa2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -383,10 +383,6 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, context->config |= ucmd.config & HNS_ROCE_ALLOC_UCTX_RQ_INLINE_FLAGS; if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) resp.config |= HNS_ROCE_ALLOC_UCTX_RQ_INLINE_FLAGS; - - context->config |= ucmd.config & HNS_ROCE_ALLOC_UCTX_CQE_INLINE_FLAGS; - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQE_INLINE) - resp.config |= HNS_ROCE_ALLOC_UCTX_CQE_INLINE_FLAGS; }
ret = hns_roce_uar_alloc(hr_dev, &context->uar); diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 7d09da8d99ce..90d1c3629b7b 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -88,7 +88,6 @@ struct hns_roce_ib_create_qp_resp { enum { HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE = 1 << 0, HNS_ROCE_ALLOC_UCTX_RQ_INLINE_FLAGS = 1 << 1, - HNS_ROCE_ALLOC_UCTX_CQE_INLINE_FLAGS = 1 << 2, };
struct hns_roce_ib_alloc_ucontext_resp {
From: Luoyouming luoyouming@huawei.com
driver inclusion category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/I5USIG
-----------------------------------------------------------
This reverts commit d0b40fc54df4b63fc279b048fb855a25ee006ed5.
Fixes: d0b40fc54df4 ("RDMA/hns: Remove enable rq inline in kernel and add compatibility handling")
Signed-off-by: Luoyouming luoyouming@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 8 ++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 34 ++++++++------------- drivers/infiniband/hw/hns/hns_roce_main.c | 4 --- drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +- include/uapi/rdma/hns-abi.h | 1 - 5 files changed, 17 insertions(+), 32 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index ab052b54e206..47f44012f44a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -132,8 +132,7 @@ enum hns_roce_event { enum { HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), - /* discard this bit, reserved for compatibility */ - HNS_ROCE_CAP_FLAG_DISCARD = BIT(2), + HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2), HNS_ROCE_CAP_FLAG_CQ_RECORD_DB = BIT(3), HNS_ROCE_CAP_FLAG_QP_RECORD_DB = BIT(4), HNS_ROCE_CAP_FLAG_SRQ = BIT(5), @@ -145,7 +144,6 @@ enum { HNS_ROCE_CAP_FLAG_DIRECT_WQE = BIT(12), HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), HNS_ROCE_CAP_FLAG_STASH = BIT(17), - HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(20), };
#define HNS_ROCE_DB_TYPE_COUNT 2 @@ -204,7 +202,7 @@ struct hns_roce_ucontext { struct list_head page_list; struct mutex page_mutex; struct hns_user_mmap_entry *db_mmap_entry; - u32 config; + u32 config; };
struct hns_roce_pd { @@ -891,7 +889,7 @@ struct hns_roce_hw { u32 step_idx); int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, - enum ib_qp_state new_state, struct ib_udata *udata); + enum ib_qp_state new_state); int (*qp_flow_control_init)(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); void (*dereg_mr)(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index c9309b8addd8..ae36f84940e1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2801,7 +2801,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, attr->port_num = 1; attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, - IB_QPS_INIT, NULL); + IB_QPS_INIT); if (ret) { ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", ret); @@ -2823,7 +2823,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, rdma_ah_set_sl(&attr->ah_attr, (u8)sl_num);
ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, - IB_QPS_RTR, NULL); + IB_QPS_RTR); hr_dev->loop_idc = loopback; if (ret) { ibdev_err(ibdev, "failed to modify qp to rtr, ret = %d.\n", @@ -2838,7 +2838,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, attr->retry_cnt = HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT; attr->timeout = HNS_ROCE_FREE_MR_USED_QP_TIMEOUT; ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_RTR, - IB_QPS_RTS, NULL); + IB_QPS_RTS); if (ret) ibdev_err(ibdev, "failed to modify qp to rts, ret = %d.\n", ret); @@ -4404,6 +4404,10 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, hr_reg_write(context, QPC_RQ_DB_RECORD_ADDR_H, upper_32_bits(hr_qp->rdb.dma));
+ if (ibqp->qp_type != IB_QPT_UD && ibqp->qp_type != IB_QPT_GSI) + hr_reg_write_bool(context, QPC_RQIE, + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE); + hr_reg_write(context, QPC_RX_CQN, get_cqn(ibqp->recv_cq));
if (ibqp->srq) { @@ -4594,11 +4598,8 @@ static inline enum ib_mtu get_mtu(struct ib_qp *ibqp, static int modify_qp_init_to_rtr(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask, - struct ib_udata *udata) + struct hns_roce_v2_qp_context *qpc_mask) { - struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, - struct hns_roce_ucontext, ibucontext); struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct ib_device *ibdev = &hr_dev->ib_dev; @@ -4718,14 +4719,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_write(context, QPC_LP_SGEN_INI, 3); hr_reg_clear(qpc_mask, QPC_LP_SGEN_INI);
- if (udata && (ibqp->qp_type == IB_QPT_RC) && - (uctx->config & HNS_ROCE_ALLOC_UCTX_RQ_INLINE_FLAGS)) { - hr_reg_write_bool(context, QPC_RQIE, - hr_dev->caps.flags & - HNS_ROCE_CAP_FLAG_RQ_INLINE); - hr_reg_clear(qpc_mask, QPC_RQIE); - } - return 0; }
@@ -5074,8 +5067,7 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, enum ib_qp_state cur_state, enum ib_qp_state new_state, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask, - struct ib_udata *udata) + struct hns_roce_v2_qp_context *qpc_mask) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); int ret = 0; @@ -5094,7 +5086,7 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { ret = modify_qp_init_to_rtr(ibqp, attr, attr_mask, context, - qpc_mask, udata); + qpc_mask); } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { ret = modify_qp_rtr_to_rts(ibqp, attr, attr_mask, context, qpc_mask); @@ -5274,7 +5266,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp, static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, - enum ib_qp_state new_state, struct ib_udata *udata) + enum ib_qp_state new_state) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); @@ -5294,7 +5286,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, memset(qpc_mask, 0xff, hr_dev->caps.qpc_sz);
ret = hns_roce_v2_set_abs_fields(ibqp, attr, attr_mask, cur_state, - new_state, context, qpc_mask, udata); + new_state, context, qpc_mask); if (ret) goto out;
@@ -5495,7 +5487,7 @@ int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, if (modify_qp_is_ok(hr_qp)) { /* Modify qp to reset before destroying qp */ ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0, - hr_qp->state, IB_QPS_RESET, udata); + hr_qp->state, IB_QPS_RESET); if (ret) ibdev_err(ibdev, "failed to modify QP to RST, ret = %d.\n", diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 0b04eb196fa2..ff6f0e6f2ac4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -379,10 +379,6 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.config = HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE; resp.max_inline_data = hr_dev->caps.max_sq_inline; } - - context->config |= ucmd.config & HNS_ROCE_ALLOC_UCTX_RQ_INLINE_FLAGS; - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) - resp.config |= HNS_ROCE_ALLOC_UCTX_RQ_INLINE_FLAGS; }
ret = hns_roce_uar_alloc(hr_dev, &context->uar); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index ef66c3b29d80..99119c97fd96 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1446,7 +1446,7 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out;
ret = hr_dev->hw->modify_qp(ibqp, attr, attr_mask, cur_state, - new_state, udata); + new_state);
out: mutex_unlock(&hr_qp->mutex); diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 90d1c3629b7b..499ccc84aa39 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -87,7 +87,6 @@ struct hns_roce_ib_create_qp_resp {
enum { HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE = 1 << 0, - HNS_ROCE_ALLOC_UCTX_RQ_INLINE_FLAGS = 1 << 1, };
struct hns_roce_ib_alloc_ucontext_resp {
From: Luoyouming luoyouming@huawei.com
driver inclusion category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/I5USIG
-----------------------------------------------------------
This reverts commit 345423493ae7dc1964c45db54953a5108644be0c.
Fixes: 345423493ae7 ("RDMA/hns: Fix the problem of sge nums")
Signed-off-by: Luoyouming luoyouming@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 - drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 +- drivers/infiniband/hw/hns/hns_roce_main.c | 16 --- drivers/infiniband/hw/hns/hns_roce_qp.c | 134 ++++---------------- include/uapi/rdma/hns-abi.h | 11 -- 5 files changed, 35 insertions(+), 140 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 47f44012f44a..1386a7720d2d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -202,7 +202,6 @@ struct hns_roce_ucontext { struct list_head page_list; struct mutex page_mutex; struct hns_user_mmap_entry *db_mmap_entry; - u32 config; };
struct hns_roce_pd { @@ -330,7 +329,6 @@ struct hns_roce_wq { spinlock_t lock; u32 wqe_cnt; /* WQE num */ u32 max_gs; - u32 ext_sge_cnt; u32 rsv_sge; u32 offset; u32 wqe_shift; /* WQE size */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ae36f84940e1..a4b8fcdf9066 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -188,6 +188,14 @@ static void set_atomic_seg(const struct ib_send_wr *wr, hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); }
+static unsigned int get_std_sge_num(struct hns_roce_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) + return 0; + + return HNS_ROCE_SGE_IN_WQE; +} + static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_idx, u32 msg_len) @@ -195,12 +203,14 @@ static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, struct ib_device *ibdev = &(to_hr_dev(qp->ibqp.device))->ib_dev; unsigned int left_len_in_pg; unsigned int idx = *sge_idx; + unsigned int std_sge_num; unsigned int i = 0; unsigned int len; void *addr; void *dseg;
- if (msg_len > qp->sq.ext_sge_cnt * HNS_ROCE_SGE_SIZE) { + std_sge_num = get_std_sge_num(qp); + if (msg_len > (qp->sq.max_gs - std_sge_num) * HNS_ROCE_SGE_SIZE) { ibdev_err(ibdev, "no enough extended sge space for inline data.\n"); return -EINVAL; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index ff6f0e6f2ac4..b4ce05d12f8c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -358,7 +358,6 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, struct hns_roce_ucontext *context = to_hr_ucontext(uctx); struct hns_roce_ib_alloc_ucontext_resp resp = {}; struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device); - struct hns_roce_ib_alloc_ucontext ucmd = {};
if (!hr_dev->active) return -EAGAIN; @@ -366,21 +365,6 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.qp_tab_size = hr_dev->caps.num_qps; resp.srq_tab_size = hr_dev->caps.num_srqs;
- if (udata->inlen == sizeof(struct hns_roce_ib_alloc_ucontext)) { - ret = ib_copy_from_udata(&ucmd, udata, - min(udata->inlen, sizeof(ucmd))); - if (ret) - return ret; - - if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) - context->config = ucmd.config & HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE; - - if (context->config & HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE) { - resp.config = HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE; - resp.max_inline_data = hr_dev->caps.max_sq_inline; - } - } - ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 99119c97fd96..43530a7c8304 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -482,118 +482,38 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, return 0; }
-static u32 get_max_inline_data(struct hns_roce_dev *hr_dev, - struct ib_qp_cap *cap) +static u32 get_wqe_ext_sge_cnt(struct hns_roce_qp *qp) { - if (cap->max_inline_data) { - cap->max_inline_data = roundup_pow_of_two( - cap->max_inline_data); - return min(cap->max_inline_data, - hr_dev->caps.max_sq_inline); - } - - return cap->max_inline_data; -} - -static void update_inline_data(struct hns_roce_qp *hr_qp, - struct ib_qp_cap *cap, u32 config) -{ - bool is_ud_or_gsi_type = (hr_qp->ibqp.qp_type == IB_QPT_GSI || - hr_qp->ibqp.qp_type == IB_QPT_UD); - u32 sge_num = hr_qp->sq.ext_sge_cnt; - - if (config & HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE) { - if (!is_ud_or_gsi_type) - sge_num = max((u32)HNS_ROCE_SGE_IN_WQE, sge_num); - - cap->max_inline_data = max(cap->max_inline_data, - sge_num * HNS_ROCE_SGE_SIZE); - } + /* GSI/UD QP only has extended sge */ + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) + return qp->sq.max_gs;
- hr_qp->max_inline_data = cap->max_inline_data; -} - -/** - * Calculated sge num according to attr's max_send_sge - */ -static u32 get_sge_num_from_max_send_sge(bool is_ud_or_gsi_type, - u32 max_send_sge) -{ - unsigned int std_sge_num; - unsigned int min_sge; + if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) + return qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE;
- std_sge_num = is_ud_or_gsi_type ? 0 : HNS_ROCE_SGE_IN_WQE; - min_sge = is_ud_or_gsi_type ? 1 : 0; - return max_send_sge > std_sge_num ? max(min_sge, - (max_send_sge - std_sge_num)) : min_sge; -} - -/** - * Calculated sge num according to attr's max_inline_data - */ -static unsigned int get_sge_num_from_max_inl_data(bool is_ud_or_gsi_type, - u32 max_inline_data) -{ - unsigned int inline_sge = 0; - - inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; - - /* - * if max_inline_data less than - * HNS_ROCE_SGE_IN_WQE * HNS_ROCE_SGE_SIZE, - * In addition to ud's mode, no need to extend sge. - */ - if ((!is_ud_or_gsi_type) && (inline_sge <= HNS_ROCE_SGE_IN_WQE)) - inline_sge = 0; - - return inline_sge; + return 0; }
static void set_ext_sge_param(struct hns_roce_dev *hr_dev, u32 sq_wqe_cnt, - struct hns_roce_qp *hr_qp, struct ib_qp_cap *cap, - u32 config) + struct hns_roce_qp *hr_qp, struct ib_qp_cap *cap) { - bool is_ud_or_gsi_type = (hr_qp->ibqp.qp_type == IB_QPT_GSI || - hr_qp->ibqp.qp_type == IB_QPT_UD); - unsigned int std_sge_num; - u32 inline_ext_sge = 0; - u32 ext_wqe_sge_cnt; u32 total_sge_cnt; - - cap->max_inline_data = get_max_inline_data(hr_dev, cap); + u32 wqe_sge_cnt;
hr_qp->sge.sge_shift = HNS_ROCE_SGE_SHIFT; - std_sge_num = is_ud_or_gsi_type ? 0 : HNS_ROCE_SGE_IN_WQE; - ext_wqe_sge_cnt = get_sge_num_from_max_send_sge(is_ud_or_gsi_type, - cap->max_send_sge);
- if (config & HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE) { - inline_ext_sge = max(ext_wqe_sge_cnt, - get_sge_num_from_max_inl_data( - is_ud_or_gsi_type, cap->max_inline_data)); - hr_qp->sq.ext_sge_cnt = !!(inline_ext_sge) ? - roundup_pow_of_two(inline_ext_sge) : 0; + hr_qp->sq.max_gs = max(1U, cap->max_send_sge);
- hr_qp->sq.max_gs = max(1U, (hr_qp->sq.ext_sge_cnt + std_sge_num)); - hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); - - ext_wqe_sge_cnt = hr_qp->sq.ext_sge_cnt; - } else { - hr_qp->sq.max_gs = max(1U, cap->max_send_sge); - hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); - hr_qp->sq.ext_sge_cnt = hr_qp->sq.max_gs; - } + wqe_sge_cnt = get_wqe_ext_sge_cnt(hr_qp);
/* If the number of extended sge is not zero, they MUST use the * space of HNS_HW_PAGE_SIZE at least. */ - if (ext_wqe_sge_cnt) { - total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * ext_wqe_sge_cnt); + if (wqe_sge_cnt) { + total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * wqe_sge_cnt); hr_qp->sge.sge_cnt = max(total_sge_cnt, (u32)HNS_HW_PAGE_SIZE / HNS_ROCE_SGE_SIZE); } - - update_inline_data(hr_qp, cap, config); }
static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev, @@ -621,7 +541,7 @@ static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev,
static int set_user_sq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp, - struct hns_roce_ib_create_qp *ucmd, u32 config) + struct hns_roce_ib_create_qp *ucmd) { struct ib_device *ibdev = &hr_dev->ib_dev; u32 cnt = 0; @@ -638,11 +558,10 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, return ret; }
- set_ext_sge_param(hr_dev, cnt, hr_qp, cap, config); + set_ext_sge_param(hr_dev, cnt, hr_qp, cap);
hr_qp->sq.wqe_shift = ucmd->log_sq_stride; hr_qp->sq.wqe_cnt = cnt; - cap->max_send_sge = hr_qp->sq.max_gs;
return 0; } @@ -699,8 +618,7 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev, }
static int set_kernel_sq_size(struct hns_roce_dev *hr_dev, - struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp, - u32 config) + struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp) { struct ib_device *ibdev = &hr_dev->ib_dev; u32 cnt; @@ -721,7 +639,7 @@ static int set_kernel_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift = ilog2(hr_dev->caps.max_sq_desc_sz); hr_qp->sq.wqe_cnt = cnt;
- set_ext_sge_param(hr_dev, cnt, hr_qp, cap, config); + set_ext_sge_param(hr_dev, cnt, hr_qp, cap);
/* sync the parameters of kernel QP to user's configuration */ cap->max_send_wr = cnt; @@ -1073,12 +991,15 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_ib_create_qp *ucmd) { struct ib_device *ibdev = &hr_dev->ib_dev; - struct hns_roce_ucontext *uctx; - u32 config = 0; int ret;
hr_qp->ibqp.qp_type = init_attr->qp_type;
+ if (init_attr->cap.max_inline_data > hr_dev->caps.max_sq_inline) + init_attr->cap.max_inline_data = hr_dev->caps.max_sq_inline; + + hr_qp->max_inline_data = init_attr->cap.max_inline_data; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR; else @@ -1101,12 +1022,7 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, return ret; }
- uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, - ibucontext); - config = uctx->config; - ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd, - config); - + ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd); if (ret) ibdev_err(ibdev, "Failed to set user SQ size, ret = %d\n", ret); @@ -1122,9 +1038,7 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, return -EINVAL; }
- if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) - config = HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE; - ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp, config); + ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp); if (ret) ibdev_err(ibdev, "Failed to set kernel SQ size, ret = %d\n", ret); diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 499ccc84aa39..abfd36e27f5e 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -85,24 +85,13 @@ struct hns_roce_ib_create_qp_resp { __aligned_u64 dwqe_mmap_key; };
-enum { - HNS_ROCE_UCONTEXT_EXSGE_CALC_MODE = 1 << 0, -}; - struct hns_roce_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 cqe_size; __u32 srq_tab_size; - __u32 max_inline_data; - __u32 config; __u32 reserved; };
-struct hns_roce_ib_alloc_ucontext { - __u32 config; - __u32 reserved; -}; - struct hns_roce_ib_alloc_pd_resp { __u32 pdn; };
From: Luoyouming luoyouming@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5USIG
----------------------------------------------------------
Currently, the driver only uses max_send_sge to initialize sge num when creating_qp. So, in the sq inline scenario, the driver may not has enough sge to send data. For example, if max_send_sge is 16 and max_inline_data is 1024, the driver needs 1024/16=64 sge to send data. Therefore, the calculation method of sge num is modified to take the maximum value of max_send_sge and max_inline_data/16 to solve this problem.
Fixes:05201e01be93("RDMA/hns: Refactor process of setting extended sge") Fixes:30b707886aeb("RDMA/hns: Support inline data in extented sge space for RC")
Signed-off-by: Luoyouming luoyouming@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 +- drivers/infiniband/hw/hns/hns_roce_main.c | 18 ++- drivers/infiniband/hw/hns/hns_roce_qp.c | 115 +++++++++++++++++--- include/uapi/rdma/hns-abi.h | 15 +++ 5 files changed, 133 insertions(+), 30 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 1386a7720d2d..2caea7c68e03 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -202,6 +202,7 @@ struct hns_roce_ucontext { struct list_head page_list; struct mutex page_mutex; struct hns_user_mmap_entry *db_mmap_entry; + u32 config; };
struct hns_roce_pd { @@ -335,6 +336,7 @@ struct hns_roce_wq { u32 head; u32 tail; void __iomem *db_reg; + u32 ext_sge_cnt; };
struct hns_roce_sge { @@ -637,6 +639,7 @@ struct hns_roce_qp { struct list_head rq_node; /* all recv qps are on a list */ struct list_head sq_node; /* all send qps are on a list */ struct hns_user_mmap_entry *dwqe_mmap_entry; + u32 config; };
struct hns_roce_ib_iboe { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a4b8fcdf9066..ae36f84940e1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -188,14 +188,6 @@ static void set_atomic_seg(const struct ib_send_wr *wr, hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); }
-static unsigned int get_std_sge_num(struct hns_roce_qp *qp) -{ - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) - return 0; - - return HNS_ROCE_SGE_IN_WQE; -} - static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_idx, u32 msg_len) @@ -203,14 +195,12 @@ static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, struct ib_device *ibdev = &(to_hr_dev(qp->ibqp.device))->ib_dev; unsigned int left_len_in_pg; unsigned int idx = *sge_idx; - unsigned int std_sge_num; unsigned int i = 0; unsigned int len; void *addr; void *dseg;
- std_sge_num = get_std_sge_num(qp); - if (msg_len > (qp->sq.max_gs - std_sge_num) * HNS_ROCE_SGE_SIZE) { + if (msg_len > qp->sq.ext_sge_cnt * HNS_ROCE_SGE_SIZE) { ibdev_err(ibdev, "no enough extended sge space for inline data.\n"); return -EINVAL; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index b4ce05d12f8c..d04168676df6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -354,10 +354,11 @@ static int hns_roce_alloc_uar_entry(struct ib_ucontext *uctx) static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { - int ret; struct hns_roce_ucontext *context = to_hr_ucontext(uctx); - struct hns_roce_ib_alloc_ucontext_resp resp = {}; struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device); + struct hns_roce_ib_alloc_ucontext_resp resp = {}; + struct hns_roce_ib_alloc_ucontext ucmd = {}; + int ret;
if (!hr_dev->active) return -EAGAIN; @@ -365,6 +366,19 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.qp_tab_size = hr_dev->caps.num_qps; resp.srq_tab_size = hr_dev->caps.num_srqs;
+ ret = ib_copy_from_udata(&ucmd, udata, + min(udata->inlen, sizeof(ucmd))); + if (ret) + return ret; + + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + context->config = ucmd.config & HNS_ROCE_EXSGE_FLAGS; + + if (context->config & HNS_ROCE_EXSGE_FLAGS) { + resp.config |= HNS_ROCE_RSP_EXSGE_FLAGS; + resp.max_inline_data = hr_dev->caps.max_sq_inline; + } + ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 43530a7c8304..f39cc5b6e58c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -482,38 +482,116 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, return 0; }
-static u32 get_wqe_ext_sge_cnt(struct hns_roce_qp *qp) +static u32 get_max_inline_data(struct hns_roce_dev *hr_dev, + struct ib_qp_cap *cap) { - /* GSI/UD QP only has extended sge */ - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) - return qp->sq.max_gs; - - if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) - return qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE; + if (cap->max_inline_data) { + cap->max_inline_data = roundup_pow_of_two( + cap->max_inline_data); + return min(cap->max_inline_data, + hr_dev->caps.max_sq_inline); + }
return 0; }
+static void update_inline_data(struct hns_roce_qp *hr_qp, + struct ib_qp_cap *cap) +{ + u32 sge_num = hr_qp->sq.ext_sge_cnt; + + if (hr_qp->config & HNS_ROCE_EXSGE_FLAGS) { + if (!(hr_qp->ibqp.qp_type == IB_QPT_GSI || + hr_qp->ibqp.qp_type == IB_QPT_UD)) + sge_num = max((u32)HNS_ROCE_SGE_IN_WQE, sge_num); + + cap->max_inline_data = max(cap->max_inline_data, + sge_num * HNS_ROCE_SGE_SIZE); + } + + hr_qp->max_inline_data = cap->max_inline_data; +} + +/** + * Calculated sge num according to attr's max_send_sge + */ +static u32 get_sge_num_from_max_send_sge(bool is_ud_or_gsi, + u32 max_send_sge) +{ + unsigned int std_sge_num; + unsigned int min_sge; + + std_sge_num = is_ud_or_gsi ? 0 : HNS_ROCE_SGE_IN_WQE; + min_sge = is_ud_or_gsi ? 1 : 0; + return max_send_sge > std_sge_num ? (max_send_sge - std_sge_num) : + min_sge; +} + +/** + * Calculated sge num according to attr's max_inline_data + */ +static unsigned int get_sge_num_from_max_inl_data(bool is_ud_or_gsi, + u32 max_inline_data) +{ + unsigned int inline_sge; + + inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; + + /* + * if max_inline_data less than + * HNS_ROCE_SGE_IN_WQE * HNS_ROCE_SGE_SIZE, + * In addition to ud's mode, no need to extend sge. + */ + if ((!is_ud_or_gsi) && (inline_sge <= HNS_ROCE_SGE_IN_WQE)) + inline_sge = 0; + + return inline_sge; +} + static void set_ext_sge_param(struct hns_roce_dev *hr_dev, u32 sq_wqe_cnt, struct hns_roce_qp *hr_qp, struct ib_qp_cap *cap) { + bool is_ud_or_gsi = (hr_qp->ibqp.qp_type == IB_QPT_GSI || + hr_qp->ibqp.qp_type == IB_QPT_UD); + unsigned int std_sge_num; + u32 inline_ext_sge = 0; + u32 ext_wqe_sge_cnt; u32 total_sge_cnt; - u32 wqe_sge_cnt; + + cap->max_inline_data = get_max_inline_data(hr_dev, cap);
hr_qp->sge.sge_shift = HNS_ROCE_SGE_SHIFT; + std_sge_num = is_ud_or_gsi ? 0 : HNS_ROCE_SGE_IN_WQE; + ext_wqe_sge_cnt = get_sge_num_from_max_send_sge(is_ud_or_gsi, + cap->max_send_sge);
- hr_qp->sq.max_gs = max(1U, cap->max_send_sge); + if (hr_qp->config & HNS_ROCE_EXSGE_FLAGS) { + inline_ext_sge = max(ext_wqe_sge_cnt, + get_sge_num_from_max_inl_data( + is_ud_or_gsi, cap->max_inline_data)); + hr_qp->sq.ext_sge_cnt = !!(inline_ext_sge) ? + roundup_pow_of_two(inline_ext_sge) : 0;
- wqe_sge_cnt = get_wqe_ext_sge_cnt(hr_qp); + hr_qp->sq.max_gs = max(1U, (hr_qp->sq.ext_sge_cnt + std_sge_num)); + hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); + + ext_wqe_sge_cnt = hr_qp->sq.ext_sge_cnt; + } else { + hr_qp->sq.max_gs = max(1U, cap->max_send_sge); + hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); + hr_qp->sq.ext_sge_cnt = hr_qp->sq.max_gs; + }
/* If the number of extended sge is not zero, they MUST use the * space of HNS_HW_PAGE_SIZE at least. */ - if (wqe_sge_cnt) { - total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * wqe_sge_cnt); + if (ext_wqe_sge_cnt) { + total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * ext_wqe_sge_cnt); hr_qp->sge.sge_cnt = max(total_sge_cnt, (u32)HNS_HW_PAGE_SIZE / HNS_ROCE_SGE_SIZE); } + + update_inline_data(hr_qp, cap); }
static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev, @@ -562,6 +640,7 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev,
hr_qp->sq.wqe_shift = ucmd->log_sq_stride; hr_qp->sq.wqe_cnt = cnt; + cap->max_send_sge = hr_qp->sq.max_gs;
return 0; } @@ -991,15 +1070,11 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_ib_create_qp *ucmd) { struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_ucontext *uctx; int ret;
hr_qp->ibqp.qp_type = init_attr->qp_type;
- if (init_attr->cap.max_inline_data > hr_dev->caps.max_sq_inline) - init_attr->cap.max_inline_data = hr_dev->caps.max_sq_inline; - - hr_qp->max_inline_data = init_attr->cap.max_inline_data; - if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR; else @@ -1022,7 +1097,11 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, return ret; }
+ uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, + ibucontext); + hr_qp->config = uctx->config; ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd); + if (ret) ibdev_err(ibdev, "Failed to set user SQ size, ret = %d\n", ret); @@ -1038,6 +1117,8 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, return -EINVAL; }
+ if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + hr_qp->config = HNS_ROCE_EXSGE_FLAGS; ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp); if (ret) ibdev_err(ibdev, "Failed to set kernel SQ size, ret = %d\n", diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index abfd36e27f5e..964b76d4f99d 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -85,11 +85,26 @@ struct hns_roce_ib_create_qp_resp { __aligned_u64 dwqe_mmap_key; };
+enum { + HNS_ROCE_EXSGE_FLAGS = 1 << 0, +}; + +enum { + HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0, +}; + struct hns_roce_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 cqe_size; __u32 srq_tab_size; __u32 reserved; + __u32 config; + __u32 max_inline_data; +}; + +struct hns_roce_ib_alloc_ucontext { + __u32 config; + __u32 reserved; };
struct hns_roce_ib_alloc_pd_resp {
From: Luoyouming luoyouming@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5USIG
----------------------------------------------------------
The rq inline makes some changes as follows, Firstly, it is only used in user space. Secondly, it should notify hardware in QP RTR status. Thirdly, Add compatibility processing between different user space and kernel space. Change the HNS_ROCE_CAP_FLAG_RQ_INLINE to a new bit to prevent old kernel spaces / spaced from enabling rq inline.
Signed-off-by: Luoyouming luoyouming@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 6 ++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 34 +++++++++++++-------- drivers/infiniband/hw/hns/hns_roce_main.c | 5 +++ drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +- include/uapi/rdma/hns-abi.h | 2 ++ 5 files changed, 33 insertions(+), 16 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 2caea7c68e03..7ed23f8ed78a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -132,7 +132,8 @@ enum hns_roce_event { enum { HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), - HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2), + /* discard this bit, reserved for compatibility */ + HNS_ROCE_CAP_FLAG_DISCARD = BIT(2), HNS_ROCE_CAP_FLAG_CQ_RECORD_DB = BIT(3), HNS_ROCE_CAP_FLAG_QP_RECORD_DB = BIT(4), HNS_ROCE_CAP_FLAG_SRQ = BIT(5), @@ -144,6 +145,7 @@ enum { HNS_ROCE_CAP_FLAG_DIRECT_WQE = BIT(12), HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), HNS_ROCE_CAP_FLAG_STASH = BIT(17), + HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(20), };
#define HNS_ROCE_DB_TYPE_COUNT 2 @@ -890,7 +892,7 @@ struct hns_roce_hw { u32 step_idx); int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, - enum ib_qp_state new_state); + enum ib_qp_state new_state, struct ib_udata *udata); int (*qp_flow_control_init)(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); void (*dereg_mr)(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ae36f84940e1..6891afb39d73 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2801,7 +2801,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, attr->port_num = 1; attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, - IB_QPS_INIT); + IB_QPS_INIT, NULL); if (ret) { ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", ret); @@ -2823,7 +2823,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, rdma_ah_set_sl(&attr->ah_attr, (u8)sl_num);
ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, - IB_QPS_RTR); + IB_QPS_RTR, NULL); hr_dev->loop_idc = loopback; if (ret) { ibdev_err(ibdev, "failed to modify qp to rtr, ret = %d.\n", @@ -2838,7 +2838,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, attr->retry_cnt = HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT; attr->timeout = HNS_ROCE_FREE_MR_USED_QP_TIMEOUT; ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_RTR, - IB_QPS_RTS); + IB_QPS_RTS, NULL); if (ret) ibdev_err(ibdev, "failed to modify qp to rts, ret = %d.\n", ret); @@ -4404,10 +4404,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, hr_reg_write(context, QPC_RQ_DB_RECORD_ADDR_H, upper_32_bits(hr_qp->rdb.dma));
- if (ibqp->qp_type != IB_QPT_UD && ibqp->qp_type != IB_QPT_GSI) - hr_reg_write_bool(context, QPC_RQIE, - hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE); - hr_reg_write(context, QPC_RX_CQN, get_cqn(ibqp->recv_cq));
if (ibqp->srq) { @@ -4598,8 +4594,11 @@ static inline enum ib_mtu get_mtu(struct ib_qp *ibqp, static int modify_qp_init_to_rtr(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *qpc_mask, + struct ib_udata *udata) { + struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, + struct hns_roce_ucontext, ibucontext); struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct ib_device *ibdev = &hr_dev->ib_dev; @@ -4719,6 +4718,14 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_write(context, QPC_LP_SGEN_INI, 3); hr_reg_clear(qpc_mask, QPC_LP_SGEN_INI);
+ if (udata && (ibqp->qp_type == IB_QPT_RC) && + (uctx->config & HNS_ROCE_RQ_INLINE_FLAGS)) { + hr_reg_write_bool(context, QPC_RQIE, + hr_dev->caps.flags & + HNS_ROCE_CAP_FLAG_RQ_INLINE); + hr_reg_clear(qpc_mask, QPC_RQIE); + } + return 0; }
@@ -5067,7 +5074,8 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, enum ib_qp_state cur_state, enum ib_qp_state new_state, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *qpc_mask, + struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); int ret = 0; @@ -5086,7 +5094,7 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { ret = modify_qp_init_to_rtr(ibqp, attr, attr_mask, context, - qpc_mask); + qpc_mask, udata); } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { ret = modify_qp_rtr_to_rts(ibqp, attr, attr_mask, context, qpc_mask); @@ -5266,7 +5274,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp, static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, - enum ib_qp_state new_state) + enum ib_qp_state new_state, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); @@ -5286,7 +5294,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, memset(qpc_mask, 0xff, hr_dev->caps.qpc_sz);
ret = hns_roce_v2_set_abs_fields(ibqp, attr, attr_mask, cur_state, - new_state, context, qpc_mask); + new_state, context, qpc_mask, udata); if (ret) goto out;
@@ -5487,7 +5495,7 @@ int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, if (modify_qp_is_ok(hr_qp)) { /* Modify qp to reset before destroying qp */ ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0, - hr_qp->state, IB_QPS_RESET); + hr_qp->state, IB_QPS_RESET, udata); if (ret) ibdev_err(ibdev, "failed to modify QP to RST, ret = %d.\n", diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index d04168676df6..046d3eed1b3f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -379,6 +379,11 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.max_inline_data = hr_dev->caps.max_sq_inline; }
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + context->config |= ucmd.config & HNS_ROCE_RQ_INLINE_FLAGS; + resp.config |= HNS_ROCE_RSP_RQ_INLINE_FLAGS; + } + ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index f39cc5b6e58c..44310c0c06b6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1441,7 +1441,7 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out;
ret = hr_dev->hw->modify_qp(ibqp, attr, attr_mask, cur_state, - new_state); + new_state, udata);
out: mutex_unlock(&hr_qp->mutex); diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 964b76d4f99d..32054c96ffb5 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -87,10 +87,12 @@ struct hns_roce_ib_create_qp_resp {
enum { HNS_ROCE_EXSGE_FLAGS = 1 << 0, + HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1, };
enum { HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0, + HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1, };
struct hns_roce_ib_alloc_ucontext_resp {
From: Luoyouming luoyouming@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5USIG
----------------------------------------------------------
Enable the CQEIE field and configure the CQEIS field of QPC. And add compatibility handling.
Signed-off-by: Luoyouming luoyouming@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 ++++++++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 3 ++- drivers/infiniband/hw/hns/hns_roce_main.c | 5 +++++ include/uapi/rdma/hns-abi.h | 2 ++ 5 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 7ed23f8ed78a..a077dd50d48a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -145,6 +145,7 @@ enum { HNS_ROCE_CAP_FLAG_DIRECT_WQE = BIT(12), HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), HNS_ROCE_CAP_FLAG_STASH = BIT(17), + HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(20), };
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 6891afb39d73..e391a93c61bb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4726,6 +4726,18 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_clear(qpc_mask, QPC_RQIE); }
+ if (udata && + (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_XRC_TGT) && + (uctx->config & HNS_ROCE_CQE_INLINE_FLAGS)) { + hr_reg_write_bool(context, QPC_CQEIE, + hr_dev->caps.flags & + HNS_ROCE_CAP_FLAG_CQE_INLINE); + hr_reg_clear(qpc_mask, QPC_CQEIE); + + hr_reg_write(context, QPC_CQEIS, 0); + hr_reg_clear(qpc_mask, QPC_CQEIS); + } + return 0; }
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 16437df320ea..dbb19fe680c9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -529,7 +529,8 @@ struct hns_roce_v2_qp_context { #define QPC_RQ_RTY_TX_ERR QPC_FIELD_LOC(607, 607) #define QPC_RX_CQN QPC_FIELD_LOC(631, 608) #define QPC_XRC_QP_TYPE QPC_FIELD_LOC(632, 632) -#define QPC_RSV3 QPC_FIELD_LOC(634, 633) +#define QPC_CQEIE QPC_FIELD_LOC(633, 633) +#define QPC_CQEIS QPC_FIELD_LOC(634, 634) #define QPC_MIN_RNR_TIME QPC_FIELD_LOC(639, 635) #define QPC_RQ_PRODUCER_IDX QPC_FIELD_LOC(655, 640) #define QPC_RQ_CONSUMER_IDX QPC_FIELD_LOC(671, 656) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 046d3eed1b3f..4fb8c4488195 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -384,6 +384,11 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.config |= HNS_ROCE_RSP_RQ_INLINE_FLAGS; }
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQE_INLINE) { + context->config |= ucmd.config & HNS_ROCE_CQE_INLINE_FLAGS; + resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS; + } + ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 32054c96ffb5..cd334ee5804c 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -88,11 +88,13 @@ struct hns_roce_ib_create_qp_resp { enum { HNS_ROCE_EXSGE_FLAGS = 1 << 0, HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1, + HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2, };
enum { HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0, HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1, + HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2, };
struct hns_roce_ib_alloc_ucontext_resp {
From: Luoyouming luoyouming@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5USIG
----------------------------------------------------------
The kernel space does not support the rq inline feature anymore, so remove the code associated with rq inline.
Signed-off-by: Luoyouming luoyouming@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_device.h | 16 ------ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 53 ----------------- drivers/infiniband/hw/hns/hns_roce_qp.c | 64 --------------------- 3 files changed, 133 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index a077dd50d48a..de5cca595ca3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -571,21 +571,6 @@ struct hns_roce_mbox_msg {
struct hns_roce_dev;
-struct hns_roce_rinl_sge { - void *addr; - u32 len; -}; - -struct hns_roce_rinl_wqe { - struct hns_roce_rinl_sge *sg_list; - u32 sge_cnt; -}; - -struct hns_roce_rinl_buf { - struct hns_roce_rinl_wqe *wqe_list; - u32 wqe_cnt; -}; - enum { HNS_ROCE_FLUSH_FLAG = 0, }; @@ -637,7 +622,6 @@ struct hns_roce_qp { /* 0: flush needed, 1: unneeded */ unsigned long flush_flag; struct hns_roce_work flush_work; - struct hns_roce_rinl_buf rq_inl_buf; struct list_head node; /* all qps are on a list */ struct list_head rq_node; /* all recv qps are on a list */ struct list_head sq_node; /* all send qps are on a list */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e391a93c61bb..26baacd4db0b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -825,22 +825,10 @@ static void fill_recv_sge_to_wqe(const struct ib_recv_wr *wr, void *wqe, static void fill_rq_wqe(struct hns_roce_qp *hr_qp, const struct ib_recv_wr *wr, u32 wqe_idx, u32 max_sge) { - struct hns_roce_rinl_sge *sge_list; void *wqe = NULL; - u32 i;
wqe = hns_roce_get_recv_wqe(hr_qp, wqe_idx); fill_recv_sge_to_wqe(wr, wqe, max_sge, hr_qp->rq.rsv_sge); - - /* rq support inline data */ - if (hr_qp->rq_inl_buf.wqe_cnt) { - sge_list = hr_qp->rq_inl_buf.wqe_list[wqe_idx].sg_list; - hr_qp->rq_inl_buf.wqe_list[wqe_idx].sge_cnt = (u32)wr->num_sge; - for (i = 0; i < wr->num_sge; i++) { - sge_list[i].addr = (void *)(u64)wr->sg_list[i].addr; - sge_list[i].len = wr->sg_list[i].length; - } - } }
static int hns_roce_v2_post_recv(struct ib_qp *ibqp, @@ -3684,39 +3672,6 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, return 0; }
-static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe, - struct hns_roce_qp *qp, - struct ib_wc *wc) -{ - struct hns_roce_rinl_sge *sge_list; - u32 wr_num, wr_cnt, sge_num; - u32 sge_cnt, data_len, size; - void *wqe_buf; - - wr_num = hr_reg_read(cqe, CQE_WQE_IDX); - wr_cnt = wr_num & (qp->rq.wqe_cnt - 1); - - sge_list = qp->rq_inl_buf.wqe_list[wr_cnt].sg_list; - sge_num = qp->rq_inl_buf.wqe_list[wr_cnt].sge_cnt; - wqe_buf = hns_roce_get_recv_wqe(qp, wr_cnt); - data_len = wc->byte_len; - - for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); sge_cnt++) { - size = min(sge_list[sge_cnt].len, data_len); - memcpy((void *)sge_list[sge_cnt].addr, wqe_buf, size); - - data_len -= size; - wqe_buf += size; - } - - if (unlikely(data_len)) { - wc->status = IB_WC_LOC_LEN_ERR; - return -EAGAIN; - } - - return 0; -} - static int sw_comp(struct hns_roce_qp *hr_qp, struct hns_roce_wq *wq, int num_entries, struct ib_wc *wc) { @@ -3944,10 +3899,8 @@ static inline bool is_rq_inl_enabled(struct ib_wc *wc, u32 hr_opcode,
static int fill_recv_wc(struct ib_wc *wc, struct hns_roce_v2_cqe *cqe) { - struct hns_roce_qp *qp = to_hr_qp(wc->qp); u32 hr_opcode; int ib_opcode; - int ret;
wc->byte_len = le32_to_cpu(cqe->byte_cnt);
@@ -3972,12 +3925,6 @@ static int fill_recv_wc(struct ib_wc *wc, struct hns_roce_v2_cqe *cqe) else wc->opcode = ib_opcode;
- if (is_rq_inl_enabled(wc, hr_opcode, cqe)) { - ret = hns_roce_handle_recv_inl_wqe(cqe, qp, wc); - if (unlikely(ret)) - return ret; - } - wc->sl = hr_reg_read(cqe, CQE_SL); wc->src_qp = hr_reg_read(cqe, CQE_RMT_QPN); wc->slid = 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 44310c0c06b6..50177c47b247 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -436,7 +436,6 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, if (!has_rq) { hr_qp->rq.wqe_cnt = 0; hr_qp->rq.max_gs = 0; - hr_qp->rq_inl_buf.wqe_cnt = 0; cap->max_recv_wr = 0; cap->max_recv_sge = 0;
@@ -469,12 +468,6 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, hr_qp->rq.max_gs);
hr_qp->rq.wqe_cnt = cnt; - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE && - hr_qp->ibqp.qp_type != IB_QPT_UD && - hr_qp->ibqp.qp_type != IB_QPT_GSI) - hr_qp->rq_inl_buf.wqe_cnt = cnt; - else - hr_qp->rq_inl_buf.wqe_cnt = 0;
cap->max_recv_wr = cnt; cap->max_recv_sge = hr_qp->rq.max_gs - hr_qp->rq.rsv_sge; @@ -745,49 +738,6 @@ static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr) return 1; }
-static int alloc_rq_inline_buf(struct hns_roce_qp *hr_qp, - struct ib_qp_init_attr *init_attr) -{ - u32 max_recv_sge = init_attr->cap.max_recv_sge; - u32 wqe_cnt = hr_qp->rq_inl_buf.wqe_cnt; - struct hns_roce_rinl_wqe *wqe_list; - int i; - - /* allocate recv inline buf */ - wqe_list = kcalloc(wqe_cnt, sizeof(struct hns_roce_rinl_wqe), - GFP_KERNEL); - if (!wqe_list) - goto err; - - /* Allocate a continuous buffer for all inline sge we need */ - wqe_list[0].sg_list = kcalloc(wqe_cnt, (max_recv_sge * - sizeof(struct hns_roce_rinl_sge)), - GFP_KERNEL); - if (!wqe_list[0].sg_list) - goto err_wqe_list; - - /* Assign buffers of sg_list to each inline wqe */ - for (i = 1; i < wqe_cnt; i++) - wqe_list[i].sg_list = &wqe_list[0].sg_list[i * max_recv_sge]; - - hr_qp->rq_inl_buf.wqe_list = wqe_list; - - return 0; - -err_wqe_list: - kfree(wqe_list); - -err: - return -ENOMEM; -} - -static void free_rq_inline_buf(struct hns_roce_qp *hr_qp) -{ - if (hr_qp->rq_inl_buf.wqe_list) - kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); - kfree(hr_qp->rq_inl_buf.wqe_list); -} - static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, unsigned long addr) @@ -796,18 +746,6 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_buf_attr buf_attr = {}; int ret;
- if (!udata && hr_qp->rq_inl_buf.wqe_cnt) { - ret = alloc_rq_inline_buf(hr_qp, init_attr); - if (ret) { - ibdev_err(ibdev, - "failed to alloc inline buf, ret = %d.\n", - ret); - return ret; - } - } else { - hr_qp->rq_inl_buf.wqe_list = NULL; - } - ret = set_wqe_buf_attr(hr_dev, hr_qp, &buf_attr); if (ret) { ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret); @@ -826,7 +764,6 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
return 0; err_inline: - free_rq_inline_buf(hr_qp);
return ret; } @@ -834,7 +771,6 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr); - free_rq_inline_buf(hr_qp); }
static inline bool user_qp_has_sdb(struct hns_roce_dev *hr_dev,
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/I5X0C3
-----------------------------------------------------------
This reverts commit f9d4c8481cd49c2c3b08124e796471755f3fb1d0.
This patch is synchronized from the linux mainline, but some important code is lost during synchronization. This results in a large code difference between openEuler and the linux mainline. It is a better option to revert the bad patch and resync a full patch from the linux mainline. The resynchronized patch is the second patch in this patchlist.
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_cq.c | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_mr.c | 2 +- drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +- drivers/infiniband/hw/hns/hns_roce_srq.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 5320f4a4c312..74ad294e7ffd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -209,7 +209,7 @@ static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, buf_attr.region_count = 1;
ret = hns_roce_mtr_create(hr_dev, &hr_cq->mtr, &buf_attr, - hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT, + hr_dev->caps.cqe_ba_pg_sz + HNS_HW_PAGE_SHIFT, udata, addr); if (ret) ibdev_err(ibdev, "Failed to alloc CQ mtr, ret = %d\n", ret); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 26baacd4db0b..a703fbd98bf9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6365,14 +6365,14 @@ static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) else eq->hop_num = hr_dev->caps.eqe_hop_num;
- buf_attr.page_shift = hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT; + buf_attr.page_shift = hr_dev->caps.eqe_buf_pg_sz + HNS_HW_PAGE_SHIFT; buf_attr.region[0].size = eq->entries * eq->eqe_size; buf_attr.region[0].hopnum = eq->hop_num; buf_attr.region_count = 1;
err = hns_roce_mtr_create(hr_dev, &eq->mtr, &buf_attr, hr_dev->caps.eqe_ba_pg_sz + - PAGE_SHIFT, NULL, 0); + HNS_HW_PAGE_SHIFT, NULL, 0); if (err) dev_err(hr_dev->dev, "Failed to alloc EQE mtr, err %d\n", err);
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index d1a9200a312d..469ce0526ffd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -104,7 +104,7 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, buf_attr.mtt_only = is_fast;
err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr, - hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT, + hr_dev->caps.pbl_ba_pg_sz + HNS_HW_PAGE_SHIFT, udata, start); if (err) ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 50177c47b247..d377e0e91479 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -752,7 +752,7 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, goto err_inline; } ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr, - PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz, + HNS_HW_PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz, udata, addr); if (ret) { ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index f3e19c66283f..71f291907976 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -179,7 +179,7 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, buf_attr.region_count = 1;
ret = hns_roce_mtr_create(hr_dev, &idx_que->mtr, &buf_attr, - hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT, + hr_dev->caps.idx_ba_pg_sz + HNS_HW_PAGE_SHIFT, udata, addr); if (ret) { ibdev_err(ibdev, "Failed to alloc SRQ idx mtr, ret = %d.\n", ret);
From: Xi Wang wangxi11@huawei.com
mainline inclusion from mainline-v5.14-rc1 commit 7b0006db6800 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5X0C3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=7b0...
---------------------------------------------------------------------
The base address table is allocated by dma allocator, and the size is always aligned to PAGE_SIZE. If a fixed size is used to allocate the table, the number of base address entries stored in the table will be smaller than that can actually stored.
Link: https://lore.kernel.org/r/1621589395-2435-2-git-send-email-liweihang@huawei.... Signed-off-by: Xi Wang wangxi11@huawei.com Signed-off-by: Weihang Li liweihang@huawei.com Signed-off-by: Jason Gunthorpe jgg@nvidia.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 51 +++++++++------------ drivers/infiniband/hw/hns/hns_roce_cq.c | 4 +- drivers/infiniband/hw/hns/hns_roce_device.h | 15 ++++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 26 +++++++++-- drivers/infiniband/hw/hns/hns_roce_mr.c | 14 +++--- drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +- drivers/infiniband/hw/hns/hns_roce_srq.c | 10 ++-- 7 files changed, 68 insertions(+), 54 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 1a9da5c84183..11a78ceae568 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -86,10 +86,10 @@ struct hns_roce_buf *hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size,
/* Calc the trunk size and num by required size and page_shift */ if (flags & HNS_ROCE_BUF_DIRECT) { - buf->trunk_shift = ilog2(ALIGN(size, PAGE_SIZE)); + buf->trunk_shift = order_base_2(ALIGN(size, PAGE_SIZE)); ntrunk = 1; } else { - buf->trunk_shift = ilog2(ALIGN(page_size, PAGE_SIZE)); + buf->trunk_shift = order_base_2(ALIGN(page_size, PAGE_SIZE)); ntrunk = DIV_ROUND_UP(size, 1 << buf->trunk_shift); }
@@ -130,50 +130,41 @@ struct hns_roce_buf *hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, }
int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, - int buf_cnt, int start, struct hns_roce_buf *buf) + int buf_cnt, struct hns_roce_buf *buf, + unsigned int page_shift) { - int i, end; - int total; - - end = start + buf_cnt; - if (end > buf->npages) { - dev_err(hr_dev->dev, - "failed to check kmem bufs, end %d + %d total %u!\n", - start, buf_cnt, buf->npages); + unsigned int offset, max_size; + int total = 0; + int i; + + if (page_shift > buf->trunk_shift) { + dev_err(hr_dev->dev, "failed to check kmem buf shift %u > %u\n", + page_shift, buf->trunk_shift); return -EINVAL; }
- total = 0; - for (i = start; i < end; i++) - bufs[total++] = hns_roce_buf_page(buf, i); + offset = 0; + max_size = buf->ntrunks << buf->trunk_shift; + for (i = 0; i < buf_cnt && offset < max_size; i++) { + bufs[total++] = hns_roce_buf_dma_addr(buf, offset); + offset += (1 << page_shift); + }
return total; }
int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, - int buf_cnt, int start, struct ib_umem *umem, + int buf_cnt, struct ib_umem *umem, unsigned int page_shift) { struct ib_block_iter biter; int total = 0; - int idx = 0; - u64 addr; - - if (page_shift < HNS_HW_PAGE_SHIFT) { - dev_err(hr_dev->dev, "failed to check umem page shift %u!\n", - page_shift); - return -EINVAL; - }
/* convert system page cnt to hw page cnt */ rdma_umem_for_each_dma_block(umem, &biter, 1 << page_shift) { - addr = rdma_block_iter_dma_address(&biter); - if (idx >= start) { - bufs[total++] = addr; - if (total >= buf_cnt) - goto done; - } - idx++; + bufs[total++] = rdma_block_iter_dma_address(&biter); + if (total >= buf_cnt) + goto done; }
done: diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 74ad294e7ffd..98dcbc40f964 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -203,13 +203,13 @@ static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, struct hns_roce_buf_attr buf_attr = {}; int ret;
- buf_attr.page_shift = hr_dev->caps.cqe_buf_pg_sz + HNS_HW_PAGE_SHIFT; + buf_attr.page_shift = hr_dev->caps.cqe_buf_pg_sz + PAGE_SHIFT; buf_attr.region[0].size = hr_cq->cq_depth * hr_cq->cqe_size; buf_attr.region[0].hopnum = hr_dev->caps.cqe_hop_num; buf_attr.region_count = 1;
ret = hns_roce_mtr_create(hr_dev, &hr_cq->mtr, &buf_attr, - hr_dev->caps.cqe_ba_pg_sz + HNS_HW_PAGE_SHIFT, + hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT, udata, addr); if (ret) ibdev_err(ibdev, "Failed to alloc CQ mtr, ret = %d\n", ret); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index de5cca595ca3..6804c12ce146 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1029,14 +1029,18 @@ static inline void *hns_roce_buf_offset(struct hns_roce_buf *buf, (offset & ((1 << buf->trunk_shift) - 1)); }
-static inline dma_addr_t hns_roce_buf_page(struct hns_roce_buf *buf, u32 idx) +static inline dma_addr_t hns_roce_buf_dma_addr(struct hns_roce_buf *buf, + unsigned int offset) { - unsigned int offset = idx << buf->page_shift; - return buf->trunk_list[offset >> buf->trunk_shift].map + (offset & ((1 << buf->trunk_shift) - 1)); }
+static inline dma_addr_t hns_roce_buf_page(struct hns_roce_buf *buf, u32 idx) +{ + return hns_roce_buf_dma_addr(buf, idx << buf->page_shift); +} + #define hr_hw_page_align(x) ALIGN(x, 1 << HNS_HW_PAGE_SHIFT)
static inline u64 to_hr_hw_page_addr(u64 addr) @@ -1152,9 +1156,10 @@ struct hns_roce_buf *hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 page_shift, u32 flags);
int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, - int buf_cnt, int start, struct hns_roce_buf *buf); + int buf_cnt, struct hns_roce_buf *buf, + unsigned int page_shift); int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, - int buf_cnt, int start, struct ib_umem *umem, + int buf_cnt, struct ib_umem *umem, unsigned int page_shift);
int hns_roce_create_srq(struct ib_srq *srq, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a703fbd98bf9..afd78b5d5548 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2083,6 +2083,8 @@ static void set_hem_page_size(struct hns_roce_dev *hr_dev) caps->llm_buf_pg_sz = 0;
/* MR */ + caps->mpt_ba_pg_sz = 0; + caps->mpt_buf_pg_sz = 0; caps->pbl_ba_pg_sz = HNS_ROCE_BA_PG_SZ_SUPPORTED_16K; caps->pbl_buf_pg_sz = 0; calc_pg_sz(caps->num_mtpts, caps->mtpt_entry_sz, caps->mpt_hop_num, @@ -2090,8 +2092,12 @@ static void set_hem_page_size(struct hns_roce_dev *hr_dev) HEM_TYPE_MTPT);
/* QP */ - caps->qpc_timer_ba_pg_sz = 0; + caps->qpc_ba_pg_sz = 0; + caps->qpc_buf_pg_sz = 0; + caps->qpc_timer_ba_pg_sz = 0; caps->qpc_timer_buf_pg_sz = 0; + caps->sccc_ba_pg_sz = 0; + caps->sccc_buf_pg_sz = 0; caps->mtt_ba_pg_sz = 0; caps->mtt_buf_pg_sz = 0; calc_pg_sz(caps->num_qps, caps->qpc_sz, caps->qpc_hop_num, @@ -2104,6 +2110,12 @@ static void set_hem_page_size(struct hns_roce_dev *hr_dev) &caps->sccc_ba_pg_sz, HEM_TYPE_SCCC);
/* CQ */ + caps->cqc_ba_pg_sz = 0; + caps->cqc_buf_pg_sz = 0; + caps->cqc_timer_ba_pg_sz = 0; + caps->cqc_timer_buf_pg_sz = 0; + caps->cqe_ba_pg_sz = HNS_ROCE_BA_PG_SZ_SUPPORTED_256K; + caps->cqe_buf_pg_sz = 0; calc_pg_sz(caps->num_cqs, caps->cqc_entry_sz, caps->cqc_hop_num, caps->cqc_bt_num, &caps->cqc_buf_pg_sz, &caps->cqc_ba_pg_sz, HEM_TYPE_CQC); @@ -2112,6 +2124,12 @@ static void set_hem_page_size(struct hns_roce_dev *hr_dev)
/* SRQ */ if (caps->flags & HNS_ROCE_CAP_FLAG_SRQ) { + caps->srqc_ba_pg_sz = 0; + caps->srqc_buf_pg_sz = 0; + caps->srqwqe_ba_pg_sz = 0; + caps->srqwqe_buf_pg_sz = 0; + caps->idx_ba_pg_sz = 0; + caps->idx_buf_pg_sz = 0; calc_pg_sz(caps->num_srqs, caps->srqc_entry_sz, caps->srqc_hop_num, caps->srqc_bt_num, &caps->srqc_buf_pg_sz, &caps->srqc_ba_pg_sz, @@ -6365,14 +6383,14 @@ static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) else eq->hop_num = hr_dev->caps.eqe_hop_num;
- buf_attr.page_shift = hr_dev->caps.eqe_buf_pg_sz + HNS_HW_PAGE_SHIFT; + buf_attr.page_shift = hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT; buf_attr.region[0].size = eq->entries * eq->eqe_size; buf_attr.region[0].hopnum = eq->hop_num; buf_attr.region_count = 1;
err = hns_roce_mtr_create(hr_dev, &eq->mtr, &buf_attr, - hr_dev->caps.eqe_ba_pg_sz + - HNS_HW_PAGE_SHIFT, NULL, 0); + hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT, NULL, + 0); if (err) dev_err(hr_dev->dev, "Failed to alloc EQE mtr, err %d\n", err);
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 469ce0526ffd..ea9af06b3530 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -104,7 +104,7 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, buf_attr.mtt_only = is_fast;
err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr, - hr_dev->caps.pbl_ba_pg_sz + HNS_HW_PAGE_SHIFT, + hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT, udata, start); if (err) ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err); @@ -705,11 +705,11 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, return -ENOMEM;
if (mtr->umem) - npage = hns_roce_get_umem_bufs(hr_dev, pages, page_count, 0, + npage = hns_roce_get_umem_bufs(hr_dev, pages, page_count, mtr->umem, page_shift); else - npage = hns_roce_get_kmem_bufs(hr_dev, pages, page_count, 0, - mtr->kmem); + npage = hns_roce_get_kmem_bufs(hr_dev, pages, page_count, + mtr->kmem, page_shift);
if (npage != page_count) { ibdev_err(ibdev, "failed to get mtr page %d != %d.\n", npage, @@ -721,8 +721,8 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, if (mtr->hem_cfg.is_direct && npage > 1) { ret = mtr_check_direct_pages(pages, npage, page_shift); if (ret) { - ibdev_err(ibdev, "failed to check %s mtr, idx = %d.\n", - mtr->umem ? "user" : "kernel", ret); + ibdev_err(ibdev, "failed to check %s page: %d / %d.\n", + mtr->umem ? "umtr" : "kmtr", ret, npage); ret = -ENOBUFS; goto err_alloc_list; } @@ -767,7 +767,7 @@ int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, if (r->offset + r->count > page_cnt) { ret = -EINVAL; ibdev_err(ibdev, - "failed to check mtr%u end %u + %u, max %u.\n", + "failed to check mtr%u count %u + %u > %u.\n", i, r->offset, r->count, page_cnt); return ret; } diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index d377e0e91479..50177c47b247 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -752,7 +752,7 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, goto err_inline; } ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr, - HNS_HW_PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz, + PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz, udata, addr); if (ret) { ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 71f291907976..727c7a134102 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -172,14 +172,14 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
srq->idx_que.entry_shift = ilog2(HNS_ROCE_IDX_QUE_ENTRY_SZ);
- buf_attr.page_shift = hr_dev->caps.idx_buf_pg_sz + HNS_HW_PAGE_SHIFT; + buf_attr.page_shift = hr_dev->caps.idx_buf_pg_sz + PAGE_SHIFT; buf_attr.region[0].size = to_hr_hem_entries_size(srq->wqe_cnt, srq->idx_que.entry_shift); buf_attr.region[0].hopnum = hr_dev->caps.idx_hop_num; buf_attr.region_count = 1;
ret = hns_roce_mtr_create(hr_dev, &idx_que->mtr, &buf_attr, - hr_dev->caps.idx_ba_pg_sz + HNS_HW_PAGE_SHIFT, + hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT, udata, addr); if (ret) { ibdev_err(ibdev, "Failed to alloc SRQ idx mtr, ret = %d.\n", ret); @@ -226,15 +226,15 @@ static int alloc_srq_wqe_buf(struct hns_roce_dev *hr_dev, HNS_ROCE_SGE_SIZE * srq->max_gs)));
- buf_attr.page_shift = hr_dev->caps.srqwqe_buf_pg_sz + HNS_HW_PAGE_SHIFT; + buf_attr.page_shift = hr_dev->caps.srqwqe_buf_pg_sz + PAGE_SHIFT; buf_attr.region[0].size = to_hr_hem_entries_size(srq->wqe_cnt, srq->wqe_shift); buf_attr.region[0].hopnum = hr_dev->caps.srqwqe_hop_num; buf_attr.region_count = 1;
ret = hns_roce_mtr_create(hr_dev, &srq->buf_mtr, &buf_attr, - hr_dev->caps.srqwqe_ba_pg_sz + - HNS_HW_PAGE_SHIFT, udata, addr); + hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT, + udata, addr); if (ret) ibdev_err(ibdev, "failed to alloc SRQ buf mtr, ret = %d.\n", ret);
From: Yangyang Li liyangyang20@huawei.com
mainline inclusion from mainline-v5.15-rc1 commit 4ad8181426df category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5X0C3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=4ad...
---------------------------------------------------------------------
Due to the discrete nature of the HIP08 timer unit, a requester might finish the timeout period sooner, in elapsed real time, than its responder does, even when both sides share the identical RNR timeout length included in the RNR Nak packet and the responder indeed starts the timing prior to the requester. Furthermore, if a 'providential' resend packet arrived before the responder's timeout period expired, the responder is certainly entitled to drop the packet silently in the light of IB protocol.
To address this problem, our team made good use of certain hardware facts:
1) The timing resolution regards the transmission arrangements is 1 microsecond, e.g. if cq_period field is set to 3, it would be interpreted as 3 microsecond by hardware
2) A QPC field shall inform the hardware how many timing unit (ticks) constitutes a full microsecond, which, by default, is 1000
3) It takes 14ns for the processor to handle a packet in the buffer, so the RNR timeout length of 10ns would ensure our processing mechanism is disabled during the entire timeout period and the packet won't be dropped silently
To achieve (3), we permanently set the QPC field mentioned in (2) to zero which nominally indicates every time tick is equivalent to a microsecond in wall-clock time; now, a RNR timeout period at face value of 10 would only last 10 ticks, which is 10ns in wall-clock time.
It's worth noting that we adapt the driver by magnifying certain configuration parameters(cq_period, eq_period and ack_timeout)by 1000 given the user assumes the configuring timing unit to be microseconds.
Also, this particular improvisation is only deployed on HIP08 since other hardware has already solved this issue.
Fixes: cfc85f3e4b7f ("RDMA/hns: Add profile support for hip08 driver") Link: https://lore.kernel.org/r/20211209140655.49493-1-liangwenpeng@huawei.com Signed-off-by: Yangyang Li liyangyang20@huawei.com Signed-off-by: Wenpeng Liang liangwenpeng@huawei.com Signed-off-by: Jason Gunthorpe jgg@nvidia.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 64 +++++++++++++++++++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 7 +++ 2 files changed, 64 insertions(+), 7 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index afd78b5d5548..87872c6e1977 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1614,11 +1614,17 @@ static int hns_roce_config_global_param(struct hns_roce_dev *hr_dev) { struct hns_roce_cmq_desc desc; struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; + u32 clock_cycles_of_1us;
hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GLOBAL_PARAM, false);
- hr_reg_write(req, CFG_GLOBAL_PARAM_1US_CYCLES, 0x3e8); + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + clock_cycles_of_1us = HNS_ROCE_1NS_CFG; + else + clock_cycles_of_1us = HNS_ROCE_1US_CFG; + + hr_reg_write(req, CFG_GLOBAL_PARAM_1US_CYCLES, clock_cycles_of_1us); hr_reg_write(req, CFG_GLOBAL_PARAM_UDP_PORT, ROCE_V2_UDP_DPORT);
return hns_roce_cmq_send(hr_dev, &desc, 1); @@ -5080,6 +5086,30 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, return ret; }
+static bool check_qp_timeout_cfg_range(struct hns_roce_dev *hr_dev, u8 *timeout) +{ +#define QP_ACK_TIMEOUT_MAX_HIP08 20 +#define QP_ACK_TIMEOUT_OFFSET 10 +#define QP_ACK_TIMEOUT_MAX 31 + + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + if (*timeout > QP_ACK_TIMEOUT_MAX_HIP08) { + ibdev_warn(&hr_dev->ib_dev, + "Local ACK timeout shall be 0 to 20.\n"); + return false; + } + *timeout += QP_ACK_TIMEOUT_OFFSET; + } else if (hr_dev->pci_dev->revision > PCI_REVISION_ID_HIP08) { + if (*timeout > QP_ACK_TIMEOUT_MAX) { + ibdev_warn(&hr_dev->ib_dev, + "Local ACK timeout shall be 0 to 31.\n"); + return false; + } + } + + return true; +} + static int hns_roce_v2_set_opt_fields(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, @@ -5089,6 +5119,7 @@ static int hns_roce_v2_set_opt_fields(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); int ret = 0; + u8 timeout;
if (attr_mask & IB_QP_AV) { ret = hns_roce_v2_set_path(ibqp, attr, attr_mask, context, @@ -5098,12 +5129,10 @@ static int hns_roce_v2_set_opt_fields(struct ib_qp *ibqp, }
if (attr_mask & IB_QP_TIMEOUT) { - if (attr->timeout < 31) { - hr_reg_write(context, QPC_AT, attr->timeout); + timeout = attr->timeout; + if (check_qp_timeout_cfg_range(hr_dev, &timeout)) { + hr_reg_write(context, QPC_AT, timeout); hr_reg_clear(qpc_mask, QPC_AT); - } else { - ibdev_warn(&hr_dev->ib_dev, - "Local ACK timeout shall be 0 to 30.\n"); } }
@@ -5160,7 +5189,9 @@ static int hns_roce_v2_set_opt_fields(struct ib_qp *ibqp, set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask);
if (attr_mask & IB_QP_MIN_RNR_TIMER) { - hr_reg_write(context, QPC_MIN_RNR_TIME, attr->min_rnr_timer); + hr_reg_write(context, QPC_MIN_RNR_TIME, + hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 ? + HNS_ROCE_RNR_TIMER_10NS : attr->min_rnr_timer); hr_reg_clear(qpc_mask, QPC_MIN_RNR_TIME); }
@@ -5770,6 +5801,16 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
hr_reg_write(cq_context, CQC_CQ_MAX_CNT, cq_count); hr_reg_clear(cqc_mask, CQC_CQ_MAX_CNT); + + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + if (cq_period * HNS_ROCE_CLOCK_ADJUST > USHRT_MAX) { + dev_info(hr_dev->dev, + "cq_period(%u) reached the upper limit, adjusted to 65.\n", + cq_period); + cq_period = HNS_ROCE_MAX_CQ_PERIOD; + } + cq_period *= HNS_ROCE_CLOCK_ADJUST; + } hr_reg_write(cq_context, CQC_CQ_PERIOD, cq_period); hr_reg_clear(cqc_mask, CQC_CQ_PERIOD);
@@ -6356,6 +6397,15 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq, hr_reg_write(eqc, EQC_EQ_PROD_INDX, HNS_ROCE_EQ_INIT_PROD_IDX); hr_reg_write(eqc, EQC_EQ_MAX_CNT, eq->eq_max_cnt);
+ if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + if (eq->eq_period * HNS_ROCE_CLOCK_ADJUST > USHRT_MAX) { + dev_info(hr_dev->dev, "eq_period(%u) reached the upper limit, adjusted to 65.\n", + eq->eq_period); + eq->eq_period = HNS_ROCE_MAX_EQ_PERIOD; + } + eq->eq_period *= HNS_ROCE_CLOCK_ADJUST; + } + hr_reg_write(eqc, EQC_EQ_PERIOD, eq->eq_period); hr_reg_write(eqc, EQC_EQE_REPORT_TIMER, HNS_ROCE_EQ_INIT_REPORT_TIMER); hr_reg_write(eqc, EQC_EQE_BA_L, bt_ba >> 3); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index dbb19fe680c9..77e8cd067642 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1357,6 +1357,13 @@ struct fmea_ram_ecc { u32 index; };
+/* only for RNR timeout issue of HIP08 */ +#define HNS_ROCE_CLOCK_ADJUST 1000 +#define HNS_ROCE_MAX_CQ_PERIOD 65 +#define HNS_ROCE_MAX_EQ_PERIOD 65 +#define HNS_ROCE_RNR_TIMER_10NS 1 +#define HNS_ROCE_1US_CFG 999 +#define HNS_ROCE_1NS_CFG 0
#define HNS_ROCE_AEQ_DEFAULT_BURST_NUM 0x0 #define HNS_ROCE_AEQ_DEFAULT_INTERVAL 0x0
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5X0C3
---------------------------------------------------------------------
The return value of set_hem has been fixed to ENODEV, which will lead a diagnostic information missing.
Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver")
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_hem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index a5f7b8775756..e158b41328f3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -596,11 +596,12 @@ int hns_roce_table_get(struct hns_roce_dev *hr_dev, }
/* Set HEM base address(128K/page, pa) to Hardware */ - if (hr_dev->hw->set_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT)) { + ret = hr_dev->hw->set_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT); + if (ret) { hns_roce_free_hem(hr_dev, table->hem[i]); table->hem[i] = NULL; - ret = -ENODEV; - dev_err(dev, "set HEM base address to HW failed.\n"); + dev_err(dev, "set HEM base address to HW failed, ret = %d.\n", + ret); goto out; }
From: Chengchang Tang tangchengchang@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5X0C3
---------------------------------------------------------------------
Log return value of clear_hem() to help diagnose.
Signed-off-by: Chengchang Tang tangchengchang@huawei.com Reviewed-by: Yangyang Li liyangyang20@huawei.com Reviewed-by: Haoyue Xu xuhaoyue1@hisilicon.com Reviewed-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/infiniband/hw/hns/hns_roce_hem.c | 44 ++++++++++++++++-------- 1 file changed, 30 insertions(+), 14 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index e158b41328f3..f21358515bf9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -620,6 +620,7 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, u32 hop_num = mhop->hop_num; u32 chunk_ba_num; u32 step_idx; + int ret;
index->inited = HEM_INDEX_BUF; chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN; @@ -643,16 +644,24 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, else step_idx = hop_num;
- if (hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx)) - ibdev_warn(ibdev, "failed to clear hop%u HEM.\n", hop_num); - - if (index->inited & HEM_INDEX_L1) - if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1)) - ibdev_warn(ibdev, "failed to clear HEM step 1.\n"); + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx); + if (ret) + ibdev_warn(ibdev, "failed to clear hop%u HEM, ret = %d.\n", + hop_num, ret); + + if (index->inited & HEM_INDEX_L1) { + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 1); + if (ret) + ibdev_warn(ibdev, "failed to clear HEM step 1, ret = %d.\n", + ret); + }
- if (index->inited & HEM_INDEX_L0) - if (hr_dev->hw->clear_hem(hr_dev, table, obj, 0)) - ibdev_warn(ibdev, "failed to clear HEM step 0.\n"); + if (index->inited & HEM_INDEX_L0) { + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0); + if (ret) + ibdev_warn(ibdev, "failed to clear HEM step 0, ret = %d.\n", + ret); + } } }
@@ -689,6 +698,7 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, { struct device *dev = hr_dev->dev; unsigned long i; + int ret;
if (hns_roce_check_whether_mhop(hr_dev, table->type)) { hns_roce_table_mhop_put(hr_dev, table, obj, 1); @@ -701,8 +711,10 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, &table->mutex)) return;
- if (hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT)) - dev_warn(dev, "failed to clear HEM base address.\n"); + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT); + if (ret) + dev_warn(dev, "failed to clear HEM base address, ret = %d.\n", + ret);
hns_roce_free_hem(hr_dev, table->hem[i]); table->hem[i] = NULL; @@ -923,6 +935,8 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, { struct device *dev = hr_dev->dev; unsigned long i; + int obj; + int ret;
if (hns_roce_check_whether_mhop(hr_dev, table->type)) { hns_roce_cleanup_mhop_hem_table(hr_dev, table); @@ -931,9 +945,11 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev,
for (i = 0; i < table->num_hem; ++i) if (table->hem[i]) { - if (hr_dev->hw->clear_hem(hr_dev, table, - i * table->table_chunk_size / table->obj_size, 0)) - dev_err(dev, "Clear HEM base address failed.\n"); + obj = i * table->table_chunk_size / table->obj_size; + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0); + if (ret) + dev_err(dev, "Clear HEM base address failed, ret = %d.\n", + ret);
hns_roce_free_hem(hr_dev, table->hem[i]); }