From: Miklos Szeredi mszeredi@redhat.com
stable inclusion from linux-4.19.219 commit 65f1f3eb09a3907c5f5edaff6c473b99e49e6020
--------------------------------
commit 712a951025c0667ff00b25afc360f74e639dfabe upstream.
It is possible to trigger a crash by splicing anon pipe bufs to the fuse device.
The reason for this is that anon_pipe_buf_release() will reuse buf->page if the refcount is 1, but that page might have already been stolen and its flags modified (e.g. PG_lru added).
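For reference, the relevant release logic (paraphrased from fs/pipe.c; a sketch for context, not part of this patch) shows why a refcount of 1 makes the page look reusable:

  static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
                                    struct pipe_buffer *buf)
  {
          struct page *page = buf->page;

          /* If nobody else holds the page and there is no cached page yet,
           * keep it as a one-deep allocation cache instead of freeing it.
           * After a successful steal the page can have extra users and new
           * flags (e.g. PG_lru), so "reusing" it here is what crashes. */
          if (page_count(page) == 1 && !pipe->tmp_page)
                  pipe->tmp_page = page;
          else
                  put_page(page);
  }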
This happens in the unlikely case of fuse_dev_splice_write() getting around to calling pipe_buf_release() after a page has been stolen, added to the page cache and removed from the page cache.
Fix by calling pipe_buf_release() right after the page was inserted into the page cache. In this case the page has an elevated refcount so any release function will know that the page isn't reusable.
Reported-by: Frank Dinoff fdinoff@google.com Link: https://lore.kernel.org/r/CAAmZXrsGg2xsP1CK+cbuEMumtrqdvD-NKnWzhNcvn71RV3c1y... Fixes: dd3bb14f44a6 ("fuse: support splice() writing to fuse device") Cc: stable@vger.kernel.org # v2.6.35 Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/fuse/dev.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 498a4fab40801..3fee28ee63745 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -905,6 +905,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) goto out_put_old; }
+ /* + * Release while we have extra ref on stolen page. Otherwise + * anon_pipe_buf_release() might think the page can be reused. + */ + pipe_buf_release(cs->pipe, buf); + get_page(newpage);
if (!(buf->flags & PIPE_BUF_FLAG_LRU)) @@ -2054,8 +2060,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe_lock(pipe); out_free: - for (idx = 0; idx < nbuf; idx++) - pipe_buf_release(pipe, &bufs[idx]); + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + + if (buf->ops) + pipe_buf_release(pipe, buf); + } pipe_unlock(pipe);
kvfree(bufs);
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.219 commit bbaf4fe5ec34e7931af9a03a60b4071c2bc01ae0
--------------------------------
commit a55f224ff5f238013de8762c4287117e47b86e22 upstream.
If an event is filtered by pid and a trigger that requires processing of the event is attached to the event, the discard portion does not take the pid filtering into account, and the event will then be recorded when it should not have been.
Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0d27bbab52c95..d05230d21cf56 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1344,14 +1344,26 @@ __event_trigger_test_discard(struct trace_event_file *file, if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, entry, event);
- if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || - (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, entry))) { - __trace_event_discard_commit(buffer, event); - return true; - } + if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_FILTERED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + goto discard; + + if (file->flags & EVENT_FILE_FL_FILTERED && + !filter_match_preds(file->filter, entry)) + goto discard; + + if ((file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(file)) + goto discard;
return false; + discard: + __trace_event_discard_commit(buffer, event); + return true; }
/**
From: David Hildenbrand david@redhat.com
stable inclusion from linux-4.19.219 commit 9ef384ed300d1bcfb23d0ab0b487d544444d4b52
--------------------------------
commit c1e63117711977cc4295b2ce73de29dd17066c82 upstream.
To clear a user buffer we cannot simply use memset(); we have to use clear_user(). With a virtio-mem device that registers a vmcore_cb and has some logically unplugged memory inside an added Linux memory block, I can easily trigger a BUG by copying the vmcore via "cp":
systemd[1]: Starting Kdump Vmcore Save Service...
kdump[420]: Kdump is using the default log level(3).
kdump[453]: saving to /sysroot/var/crash/127.0.0.1-2021-11-11-14:59:22/
kdump[458]: saving vmcore-dmesg.txt to /sysroot/var/crash/127.0.0.1-2021-11-11-14:59:22/
kdump[465]: saving vmcore-dmesg.txt complete
kdump[467]: saving vmcore
BUG: unable to handle page fault for address: 00007f2374e01000
#PF: supervisor write access in kernel mode
#PF: error_code(0x0003) - permissions violation
PGD 7a523067 P4D 7a523067 PUD 7a528067 PMD 7a525067 PTE 800000007048f867
Oops: 0003 [#1] PREEMPT SMP NOPTI
CPU: 0 PID: 468 Comm: cp Not tainted 5.15.0+ #6
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-27-g64f37cc530f1-prebuilt.qemu.org 04/01/2014
RIP: 0010:read_from_oldmem.part.0.cold+0x1d/0x86
Code: ff ff ff e8 05 ff fe ff e9 b9 e9 7f ff 48 89 de 48 c7 c7 38 3b 60 82 e8 f1 fe fe ff 83 fd 08 72 3c 49 8d 7d 08 4c 89 e9 89 e8 <49> c7 45 00 00 00 00 00 49 c7 44 05 f8 00 00 00 00 48 83 e7 f81
RSP: 0018:ffffc9000073be08 EFLAGS: 00010212
RAX: 0000000000001000 RBX: 00000000002fd000 RCX: 00007f2374e01000
RDX: 0000000000000001 RSI: 00000000ffffdfff RDI: 00007f2374e01008
RBP: 0000000000001000 R08: 0000000000000000 R09: ffffc9000073bc50
R10: ffffc9000073bc48 R11: ffffffff829461a8 R12: 000000000000f000
R13: 00007f2374e01000 R14: 0000000000000000 R15: ffff88807bd421e8
FS:  00007f2374e12140(0000) GS:ffff88807f000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f2374e01000 CR3: 000000007a4aa000 CR4: 0000000000350eb0
Call Trace:
 read_vmcore+0x236/0x2c0
 proc_reg_read+0x55/0xa0
 vfs_read+0x95/0x190
 ksys_read+0x4f/0xc0
 do_syscall_64+0x3b/0x90
 entry_SYSCALL_64_after_hwframe+0x44/0xae
Some x86-64 CPUs have a CPU feature called "Supervisor Mode Access Prevention (SMAP)", which is used to detect wrong access from the kernel to user buffers like this: SMAP triggers a permissions violation on wrong access. In the x86-64 variant of clear_user(), SMAP is properly handled via clac()+stac().
To fix, properly use clear_user() when we're dealing with a user buffer.
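As an illustration of the pattern (a minimal sketch with a made-up helper name, mirroring the hunk below), zeroing a destination that may live in user space must go through clear_user():

  /* Hypothetical helper: 'userbuf' says whether 'buf' points into user
   * space.  clear_user() does the access checks and the stac()/clac()
   * handling required with SMAP; a plain memset() does not. */
  static ssize_t zero_fill(char *buf, size_t count, bool userbuf)
  {
          if (userbuf) {
                  if (clear_user((char __user *)buf, count))
                          return -EFAULT;
          } else {
                  memset(buf, 0, count);
          }
          return count;
  }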
Link: https://lkml.kernel.org/r/20211112092750.6921-1-david@redhat.com Fixes: 997c136f518c ("fs/proc/vmcore.c: add hook to read_from_oldmem() to check for non-ram pages") Signed-off-by: David Hildenbrand david@redhat.com Acked-by: Baoquan He bhe@redhat.com Cc: Dave Young dyoung@redhat.com Cc: Baoquan He bhe@redhat.com Cc: Vivek Goyal vgoyal@redhat.com Cc: Philipp Rudo prudo@redhat.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: David Hildenbrand david@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/proc/vmcore.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index c4147e50af98a..f5dfedc015520 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -117,14 +117,19 @@ static ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count;
/* If pfn is not ram, return zeros for sparse dump files */ - if (pfn_is_ram(pfn) == 0) - memset(buf, 0, nr_bytes); - else { + if (pfn_is_ram(pfn) == 0) { + tmp = 0; + if (!userbuf) + memset(buf, 0, nr_bytes); + else if (clear_user(buf, nr_bytes)) + tmp = -EFAULT; + } else { tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - if (tmp < 0) - return tmp; } + if (tmp < 0) + return tmp; + *ppos += nr_bytes; count -= nr_bytes; buf += nr_bytes;
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.219 commit 3b7c37106bcf08f154404f1f212a3713ebf37cec
--------------------------------
[ Upstream commit 19d36c5f294879949c9d6f57cb61d39cc4c48553 ]
We deal with IPv6 packets, so we need to use IP6CB(skb)->flags and IP6SKB_REROUTED, instead of IPCB(skb)->flags and IPSKB_REROUTED
Found by code inspection, please double check that fixing this bug does not surface other bugs.
Fixes: 09ee9dba9611 ("ipv6: Reinject IPv6 packets if IPsec policy matches after SNAT") Signed-off-by: Eric Dumazet edumazet@google.com Cc: Tobias Brunner tobias@strongswan.org Cc: Steffen Klassert steffen.klassert@secunet.com Cc: David Ahern dsahern@kernel.org Reviewed-by: David Ahern dsahern@kernel.org Tested-by: Tobias Brunner tobias@strongswan.org Acked-by: Tobias Brunner tobias@strongswan.org Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv6/ip6_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fc36f3b0dceb3..251ec12517e93 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -175,7 +175,7 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; + IP6CB(skb)->flags |= IP6SKB_REROUTED; return dst_output(net, sk, skb); } #endif
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.219 commit 2692931d92d83afaa636703e95203d53629ca7c1
--------------------------------
commit 6cb206508b621a9a0a2c35b60540e399225c8243 upstream.
When pid filtering is activated in an instance, all of the event trace files for that instance have the PID_FILTER flag set. This determines whether or not pid filtering needs to be done on the event; otherwise the event is executed as normal.
If pid filtering is enabled when an event is created (via a dynamic event or modules), its flag is not updated to reflect the current state, and the events are not filtered properly.
Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace_events.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bb4c6ac51a6de..531e6e8f765b5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2253,12 +2253,19 @@ static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { + struct trace_pid_list *pid_list; struct trace_event_file *file;
file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return NULL;
+ pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + + if (pid_list) + file->flags |= EVENT_FILE_FL_PID_FILTER; + file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0);
From: Miklos Szeredi mszeredi@redhat.com
stable inclusion from linux-4.19.219 commit 22b814fdce55caf8db1cb550a32a64159e5b83a8
--------------------------------
commit 473441720c8616dfaf4451f9c7ea14f0eb5e5d65 upstream.
Checking buf->flags should be done before the pipe_buf_release() is called on the pipe buffer, since releasing the buffer might modify the flags.
This is exactly what page_cache_pipe_buf_release() does, and which results in the same VM_BUG_ON_PAGE(PageLRU(page)) that the original patch was trying to fix.
Reported-by: Justin Forbes jmforbes@linuxtx.org Fixes: 712a951025c0 ("fuse: fix page stealing") Cc: stable@vger.kernel.org # v2.6.35 Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/fuse/dev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3fee28ee63745..8a98ffd5e4b67 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -905,17 +905,17 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) goto out_put_old; }
+ get_page(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + /* * Release while we have extra ref on stolen page. Otherwise * anon_pipe_buf_release() might think the page can be reused. */ pipe_buf_release(cs->pipe, buf);
- get_page(newpage); - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add_file(newpage); - err = 0; spin_lock(&cs->req->waitq.lock); if (test_bit(FR_ABORTED, &cs->req->flags))
From: Alexander Mikhalitsyn alexander.mikhalitsyn@virtuozzo.com
stable inclusion from linux-4.19.220 commit 4e91adc73764fd0cde00be381461e52fa641fadd
--------------------------------
commit 85b6d24646e4125c591639841169baa98a2da503 upstream.
Currently, the exit_shm() function is not designed to work properly when task->sysvshm.shm_clist holds shm objects from different IPC namespaces.
This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it leads to use-after-free (reproducer exists).
This is an attempt to fix the problem by extending the exit_shm mechanism to handle destroying shm objects from several IPC namespaces.
To achieve that we do several things:
1. add a namespace (non-refcounted) pointer to the struct shmid_kernel
2. during new shm object creation (newseg()/shmget syscall) we initialize this pointer with the current task's IPC namespace

3. exit_shm() is fully reworked so that it traverses all shp's in task->sysvshm.shm_clist and gets the IPC namespace not from the current task as before, but from the shp object itself, then calls shm_destroy(shp, ns).

Note: We need to be really careful here, because, as said in (1) above, our pointer to the IPC namespace is not refcounted. To be on the safe side we use the special helper get_ipc_ns_not_zero(), which takes a reference on the IPC namespace only if it is not already in the "state of destruction".
Q/A
Q: Why can we access shp->ns memory using a non-refcounted pointer?
A: Because the shp object's lifetime is always shorter than the IPC namespace's lifetime, so if we get the shp object from task->sysvshm.shm_clist while holding task_lock(task), nobody can steal our namespace.

Q: Does this patch change the semantics of the unshare/setns/clone syscalls?
A: No. It just fixes the previously uncovered case where a process may leave an IPC namespace without getting its task->sysvshm.shm_clist list cleaned up.
Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.... Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") Co-developed-by: Manfred Spraul manfred@colorfullife.com Signed-off-by: Manfred Spraul manfred@colorfullife.com Signed-off-by: Alexander Mikhalitsyn alexander.mikhalitsyn@virtuozzo.com Cc: "Eric W. Biederman" ebiederm@xmission.com Cc: Davidlohr Bueso dave@stgolabs.net Cc: Greg KH gregkh@linuxfoundation.org Cc: Andrei Vagin avagin@gmail.com Cc: Pavel Tikhomirov ptikhomirov@virtuozzo.com Cc: Vasily Averin vvs@virtuozzo.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/ipc_namespace.h | 15 +++ include/linux/sched/task.h | 2 +- ipc/shm.c | 189 +++++++++++++++++++++++++--------- 3 files changed, 159 insertions(+), 47 deletions(-)
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 6ab8c1bada3fc..36ffbe330abea 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,6 +129,16 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; }
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + if (ns) { + if (refcount_inc_not_zero(&ns->count)) + return ns; + } + + return NULL; +} + extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, @@ -145,6 +155,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; }
+static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + return ns; +} + static inline void put_ipc_ns(struct ipc_namespace *ns) { } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 1bf1f51db372f..c8b52b3ec8654 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -143,7 +143,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. And ->vfork_done. + * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/ipc/shm.c b/ipc/shm.c index 904fe53256af0..0a5053f5726f6 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -62,9 +62,18 @@ struct shmid_kernel /* private to the kernel */ struct pid *shm_lprid; struct user_struct *mlock_user;
- /* The task created the shm object. NULL if the task is dead. */ + /* + * The task created the shm object, for + * task_lock(shp->shm_creator) + */ struct task_struct *shm_creator; - struct list_head shm_clist; /* list by creator */ + + /* + * List by creator. task_lock(->shm_creator) required for read/write. + * If list_empty(), then the creator is dead already. + */ + struct list_head shm_clist; + struct ipc_namespace *ns; } __randomize_layout;
/* shm_mode upper byte flags */ @@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm); + WARN_ON(ns != shp->ns);
if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; @@ -225,10 +235,43 @@ static void shm_rcu_free(struct rcu_head *head) kvfree(shp); }
-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) +/* + * It has to be called with shp locked. + * It must be called before ipc_rmid() + */ +static inline void shm_clist_rm(struct shmid_kernel *shp) { - list_del(&s->shm_clist); - ipc_rmid(&shm_ids(ns), &s->shm_perm); + struct task_struct *creator; + + /* ensure that shm_creator does not disappear */ + rcu_read_lock(); + + /* + * A concurrent exit_shm may do a list_del_init() as well. + * Just do nothing if exit_shm already did the work + */ + if (!list_empty(&shp->shm_clist)) { + /* + * shp->shm_creator is guaranteed to be valid *only* + * if shp->shm_clist is not empty. + */ + creator = shp->shm_creator; + + task_lock(creator); + /* + * list_del_init() is a nop if the entry was already removed + * from the list. + */ + list_del_init(&shp->shm_clist); + task_unlock(creator); + } + rcu_read_unlock(); +} + +static inline void shm_rmid(struct shmid_kernel *s) +{ + shm_clist_rm(s); + ipc_rmid(&shm_ids(s->ns), &s->shm_perm); }
@@ -283,7 +326,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_file = shp->shm_file; shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; - shm_rmid(ns, shp); + shm_rmid(shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) shmem_lock(shm_file, 0, shp->mlock_user); @@ -306,10 +349,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) * * 2) sysctl kernel.shm_rmid_forced is set to 1. */ -static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) +static bool shm_may_destroy(struct shmid_kernel *shp) { return (shp->shm_nattch == 0) && - (ns->shm_rmid_forced || + (shp->ns->shm_rmid_forced || (shp->shm_perm.mode & SHM_DEST)); }
@@ -340,7 +383,7 @@ static void shm_close(struct vm_area_struct *vma) ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); @@ -361,10 +404,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * * As shp->* are changed under rwsem, it's safe to skip shp locking. */ - if (shp->shm_creator != NULL) + if (!list_empty(&shp->shm_clist)) return 0;
- if (shm_may_destroy(ns, shp)) { + if (shm_may_destroy(shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } @@ -382,48 +425,97 @@ void shm_destroy_orphaned(struct ipc_namespace *ns) /* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { - struct ipc_namespace *ns = task->nsproxy->ipc_ns; - struct shmid_kernel *shp, *n; + for (;;) { + struct shmid_kernel *shp; + struct ipc_namespace *ns;
- if (list_empty(&task->sysvshm.shm_clist)) - return; + task_lock(task); + + if (list_empty(&task->sysvshm.shm_clist)) { + task_unlock(task); + break; + } + + shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, + shm_clist);
- /* - * If kernel.shm_rmid_forced is not set then only keep track of - * which shmids are orphaned, so that a later set of the sysctl - * can clean them up. - */ - if (!ns->shm_rmid_forced) { - down_read(&shm_ids(ns).rwsem); - list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) - shp->shm_creator = NULL; /* - * Only under read lock but we are only called on current - * so no entry on the list will be shared. + * 1) Get pointer to the ipc namespace. It is worth to say + * that this pointer is guaranteed to be valid because + * shp lifetime is always shorter than namespace lifetime + * in which shp lives. + * We taken task_lock it means that shp won't be freed. */ - list_del(&task->sysvshm.shm_clist); - up_read(&shm_ids(ns).rwsem); - return; - } + ns = shp->ns;
- /* - * Destroy all already created segments, that were not yet mapped, - * and mark any mapped as orphan to cover the sysctl toggling. - * Destroy is skipped if shm_may_destroy() returns false. - */ - down_write(&shm_ids(ns).rwsem); - list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { - shp->shm_creator = NULL; + /* + * 2) If kernel.shm_rmid_forced is not set then only keep track of + * which shmids are orphaned, so that a later set of the sysctl + * can clean them up. + */ + if (!ns->shm_rmid_forced) + goto unlink_continue;
- if (shm_may_destroy(ns, shp)) { - shm_lock_by_ptr(shp); - shm_destroy(ns, shp); + /* + * 3) get a reference to the namespace. + * The refcount could be already 0. If it is 0, then + * the shm objects will be free by free_ipc_work(). + */ + ns = get_ipc_ns_not_zero(ns); + if (!ns) { +unlink_continue: + list_del_init(&shp->shm_clist); + task_unlock(task); + continue; } - }
- /* Remove the list head from any segments still attached. */ - list_del(&task->sysvshm.shm_clist); - up_write(&shm_ids(ns).rwsem); + /* + * 4) get a reference to shp. + * This cannot fail: shm_clist_rm() is called before + * ipc_rmid(), thus the refcount cannot be 0. + */ + WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); + + /* + * 5) unlink the shm segment from the list of segments + * created by current. + * This must be done last. After unlinking, + * only the refcounts obtained above prevent IPC_RMID + * from destroying the segment or the namespace. + */ + list_del_init(&shp->shm_clist); + + task_unlock(task); + + /* + * 6) we have all references + * Thus lock & if needed destroy shp. + */ + down_write(&shm_ids(ns).rwsem); + shm_lock_by_ptr(shp); + /* + * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's + * safe to call ipc_rcu_putref here + */ + ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); + + if (ipc_valid_object(&shp->shm_perm)) { + if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); + } else { + /* + * Someone else deleted the shp from namespace + * idr/kht while we have waited. + * Just unlock and continue. + */ + shm_unlock(shp); + } + + up_write(&shm_ids(ns).rwsem); + put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ + } }
static vm_fault_t shm_fault(struct vm_fault *vmf) @@ -680,7 +772,11 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (error < 0) goto no_id;
+ shp->ns = ns; + + task_lock(current); list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); + task_unlock(current);
/* * shmid gets reported as "inode#" in /proc/pid/maps. @@ -1549,7 +1645,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp);
From: Mike Christie michael.christie@oracle.com
stable inclusion from linux-4.19.220 commit 8368b993f3155c07a0c4a9d24953961b71e84264
--------------------------------
[ Upstream commit a0c2f8b6709a9a4af175497ca65f93804f57b248 ]
We can race where iscsi_session_recovery_timedout() has woken up the error handler thread and it's now setting the devices to offline, and session_recovery_timedout()'s call to scsi_target_unblock() is also trying to set the device's state to transport-offline. We can then get a mix of states.
For the case where we can't relogin, we want the devices to be in transport-offline so that, when we have repaired the connection, __iscsi_unblock_session() can set the state back to running.
Set the device state then call into libiscsi to wake up the error handler.
Link: https://lore.kernel.org/r/20211105221048.6541-2-michael.christie@oracle.com Reviewed-by: Lee Duncan lduncan@suse.com Signed-off-by: Mike Christie michael.christie@oracle.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/scsi_transport_iscsi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 3d489193c8907..bd699906828f5 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1894,12 +1894,12 @@ static void session_recovery_timedout(struct work_struct *work) } spin_unlock_irqrestore(&session->lock, flags);
- if (session->transport->session_recovery_timedout) - session->transport->session_recovery_timedout(session); - ISCSI_DBG_TRANS_SESSION(session, "Unblocking SCSI target\n"); scsi_target_unblock(&session->dev, SDEV_TRANSPORT_OFFLINE); ISCSI_DBG_TRANS_SESSION(session, "Completed unblocking SCSI target\n"); + + if (session->transport->session_recovery_timedout) + session->transport->session_recovery_timedout(session); }
static void __iscsi_unblock_session(struct work_struct *work)
From: Stephen Suryaputra ssuryaextr@gmail.com
stable inclusion from linux-4.19.220 commit 3d50e0b57cf4fffd6216629f45a2bf4946c65f56
--------------------------------
commit ee201011c1e1563c114a55c86eb164b236f18e84 upstream.
IPCB/IP6CB need to be initialized when processing outbound v4 or v6 packets in the codepath of the vrf device xmit function, so that leftover garbage doesn't cause further code that uses the CB to process the packet incorrectly.

One occurrence of the issue is when an MPLS route uses the vrf device as the outgoing device, such as when the route is added using the "ip -f mpls route add <label> dev <vrf>" command.

The problem seems to have existed since day one, hence the day-one commits in the Fixes tags.
Fixes: 193125dbd8eb ("net: Introduce VRF device driver") Fixes: 35402e313663 ("net: Add IPv6 support to VRF device") Cc: stable@vger.kernel.org Signed-off-by: Stephen Suryaputra ssuryaextr@gmail.com Reviewed-by: David Ahern dsahern@kernel.org Link: https://lore.kernel.org/r/20211130162637.3249-1-ssuryaextr@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/vrf.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index e745e2c09578e..6501ab4990d38 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -210,6 +210,7 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb));
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; @@ -291,6 +292,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, RT_SCOPE_LINK); }
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++;
From: Ioanna Alifieraki ioanna-maria.alifieraki@canonical.com
stable inclusion from linux-4.19.220 commit 36fbc46a792969650a44702ddd2143fe722e4d00
--------------------------------
commit 1d49eb91e86e8c1c1614c72e3e958b6b7e2472a9 upstream.
Currently when removing an ipmi_user the removal is deferred as a work on the system's workqueue. Although this guarantees the free operation will occur in non-atomic context, it can race with the ipmi_msghandler module removal (see [1]). In case a remove_user work is scheduled and shortly afterwards the ipmi_msghandler module is removed, we can end up in a situation where the module is removed first and, when the work is executed, the system crashes with:

BUG: unable to handle page fault for address: ffffffffc05c3450
PF: supervisor instruction fetch in kernel mode
PF: error_code(0x0010) - not-present page

because the pages of the module are gone. In cleanup_ipmi() there is no easy way to detect whether there are any pending works and flush them before removing the module. This patch creates a separate workqueue and schedules the remove_work works on it. When removing the module the workqueue is drained as it is destroyed, which avoids the race.
[1] https://bugs.launchpad.net/bugs/1950666
Cc: stable@vger.kernel.org # 5.1 Fixes: 3b9a907223d7 (ipmi: fix sleep-in-atomic in free_user at cleanup SRCU user->release_barrier) Signed-off-by: Ioanna Alifieraki ioanna-maria.alifieraki@canonical.com Message-Id: 20211115131645.25116-1-ioanna-maria.alifieraki@canonical.com Signed-off-by: Corey Minyard cminyard@mvista.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/char/ipmi/ipmi_msghandler.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 0698331724115..d51b82d4027d7 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -219,6 +219,8 @@ struct ipmi_user { struct work_struct remove_work; };
+struct workqueue_struct *remove_work_wq; + static struct ipmi_user *acquire_ipmi_user(struct ipmi_user *user, int *index) __acquires(user->release_barrier) { @@ -1207,7 +1209,7 @@ static void free_user(struct kref *ref) struct ipmi_user *user = container_of(ref, struct ipmi_user, refcount);
/* SRCU cleanup must happen in task context. */ - schedule_work(&user->remove_work); + queue_work(remove_work_wq, &user->remove_work); }
static void _ipmi_destroy_user(struct ipmi_user *user) @@ -5088,6 +5090,13 @@ static int ipmi_init_msghandler(void)
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+ remove_work_wq = create_singlethread_workqueue("ipmi-msghandler-remove-wq"); + if (!remove_work_wq) { + pr_err("unable to create ipmi-msghandler-remove-wq workqueue"); + rv = -ENOMEM; + goto out; + } + initialized = true;
out: @@ -5113,6 +5122,8 @@ static void __exit cleanup_ipmi(void) int count;
if (initialized) { + destroy_workqueue(remove_work_wq); + atomic_notifier_chain_unregister(&panic_notifier_list, &panic_block);
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.220 commit 2fd949365fe6628fb22f9224782592cb1fff600c
--------------------------------
commit 7a10d8c810cfad3e79372d7d1c77899d86cd6662 upstream.
syzbot found that __dev_queue_xmit() is reading txq->xmit_lock_owner without annotations.
No serious issue there, let's document what is happening there.
BUG: KCSAN: data-race in __dev_queue_xmit / __dev_queue_xmit
write to 0xffff888139d09484 of 4 bytes by interrupt on cpu 0:
 __netif_tx_unlock include/linux/netdevice.h:4437 [inline]
 __dev_queue_xmit+0x948/0xf70 net/core/dev.c:4229
 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265
 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline]
 macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567
 __netdev_start_xmit include/linux/netdevice.h:4987 [inline]
 netdev_start_xmit include/linux/netdevice.h:5001 [inline]
 xmit_one+0x105/0x2f0 net/core/dev.c:3590
 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606
 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342
 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817
 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194
 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259
 neigh_hh_output include/net/neighbour.h:511 [inline]
 neigh_output include/net/neighbour.h:525 [inline]
 ip6_finish_output2+0x995/0xbb0 net/ipv6/ip6_output.c:126
 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline]
 ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201
 NF_HOOK_COND include/linux/netfilter.h:296 [inline]
 ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224
 dst_output include/net/dst.h:450 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508
 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702
 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898
 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421
 expire_timers+0x116/0x240 kernel/time/timer.c:1466
 __run_timers+0x368/0x410 kernel/time/timer.c:1734
 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747
 __do_softirq+0x158/0x2de kernel/softirq.c:558
 __irq_exit_rcu kernel/softirq.c:636 [inline]
 irq_exit_rcu+0x37/0x70 kernel/softirq.c:648
 sysvec_apic_timer_interrupt+0x3e/0xb0 arch/x86/kernel/apic/apic.c:1097
 asm_sysvec_apic_timer_interrupt+0x12/0x20
read to 0xffff888139d09484 of 4 bytes by interrupt on cpu 1:
 __dev_queue_xmit+0x5e3/0xf70 net/core/dev.c:4213
 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265
 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline]
 macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567
 __netdev_start_xmit include/linux/netdevice.h:4987 [inline]
 netdev_start_xmit include/linux/netdevice.h:5001 [inline]
 xmit_one+0x105/0x2f0 net/core/dev.c:3590
 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606
 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342
 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817
 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194
 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259
 neigh_resolve_output+0x3db/0x410 net/core/neighbour.c:1523
 neigh_output include/net/neighbour.h:527 [inline]
 ip6_finish_output2+0x9be/0xbb0 net/ipv6/ip6_output.c:126
 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline]
 ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201
 NF_HOOK_COND include/linux/netfilter.h:296 [inline]
 ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224
 dst_output include/net/dst.h:450 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508
 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702
 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898
 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421
 expire_timers+0x116/0x240 kernel/time/timer.c:1466
 __run_timers+0x368/0x410 kernel/time/timer.c:1734
 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747
 __do_softirq+0x158/0x2de kernel/softirq.c:558
 __irq_exit_rcu kernel/softirq.c:636 [inline]
 irq_exit_rcu+0x37/0x70 kernel/softirq.c:648
 sysvec_apic_timer_interrupt+0x8d/0xb0 arch/x86/kernel/apic/apic.c:1097
 asm_sysvec_apic_timer_interrupt+0x12/0x20
 kcsan_setup_watchpoint+0x94/0x420 kernel/kcsan/core.c:443
 folio_test_anon include/linux/page-flags.h:581 [inline]
 PageAnon include/linux/page-flags.h:586 [inline]
 zap_pte_range+0x5ac/0x10e0 mm/memory.c:1347
 zap_pmd_range mm/memory.c:1467 [inline]
 zap_pud_range mm/memory.c:1496 [inline]
 zap_p4d_range mm/memory.c:1517 [inline]
 unmap_page_range+0x2dc/0x3d0 mm/memory.c:1538
 unmap_single_vma+0x157/0x210 mm/memory.c:1583
 unmap_vmas+0xd0/0x180 mm/memory.c:1615
 exit_mmap+0x23d/0x470 mm/mmap.c:3170
 __mmput+0x27/0x1b0 kernel/fork.c:1113
 mmput+0x3d/0x50 kernel/fork.c:1134
 exit_mm+0xdb/0x170 kernel/exit.c:507
 do_exit+0x608/0x17a0 kernel/exit.c:819
 do_group_exit+0xce/0x180 kernel/exit.c:929
 get_signal+0xfc3/0x1550 kernel/signal.c:2852
 arch_do_signal_or_restart+0x8c/0x2e0 arch/x86/kernel/signal.c:868
 handle_signal_work kernel/entry/common.c:148 [inline]
 exit_to_user_mode_loop kernel/entry/common.c:172 [inline]
 exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:207
 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline]
 syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300
 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86
 entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0x00000000 -> 0xffffffff
Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 28712 Comm: syz-executor.0 Tainted: G W 5.16.0-rc1-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Link: https://lore.kernel.org/r/20211130170155.2331929-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/netdevice.h | 19 +++++++++++++------ net/core/dev.c | 5 ++++- 2 files changed, 17 insertions(+), 7 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 61ad607027f3e..9a7d7e6304f44 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3957,7 +3957,8 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - txq->xmit_lock_owner = cpu; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, cpu); }
static inline bool __netif_tx_acquire(struct netdev_queue *txq) @@ -3974,26 +3975,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); }
static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = spin_trylock(&txq->_xmit_lock); - if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + + if (likely(ok)) { + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); + } return ok; }
static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); }
static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); }
diff --git a/net/core/dev.c b/net/core/dev.c index 6ed7810ed7bd4..5bc0e56216c9f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3829,7 +3829,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */
- if (txq->xmit_lock_owner != cpu) { + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert;
From: Sven Eckelmann sven@narfation.org
stable inclusion from linux-4.19.220 commit 5b75e312d5b989b041a310a5cff78e282342c218
--------------------------------
commit 7492ffc90fa126afb67d4392d56cb4134780194a upstream.
The CONSOLE_POLLING mode is used for tools like k(g)db. In this kind of setup, it is often sharing a serial device with the normal system console. This is usually no problem because the polling helpers can consume input values directly (when in kgdb context) and the normal Linux handlers can only consume new input values after kgdb switched back.
This is not true anymore when RX DMA is enabled for UARTDM controllers. Single input values can no longer be received correctly. Instead, the following seems to happen:
* on 1. input, some old input is read (continuously)
* on 2. input, two old inputs are read (continuously)
* on 3. input, three old input values are read (continuously)
* on 4. input, 4 previous inputs are received
This repeats then for each group of 4 input values.
This behavior changes slightly depending on what state the controller was in when the first input was received. But this makes working with kgdb basically impossible because control messages are always corrupted when kgdboc tries to parse them.

RX DMA should therefore be off when CONSOLE_POLLING is enabled to avoid these kinds of problems. No such problem was noticed for TX DMA.
Fixes: 99693945013a ("tty: serial: msm: Add RX DMA support") Cc: stable@vger.kernel.org Signed-off-by: Sven Eckelmann sven@narfation.org Link: https://lore.kernel.org/r/20211113121050.7266-1-sven@narfation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/serial/msm_serial.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/tty/serial/msm_serial.c b/drivers/tty/serial/msm_serial.c index 4caeca67fd143..a40827956bd15 100644 --- a/drivers/tty/serial/msm_serial.c +++ b/drivers/tty/serial/msm_serial.c @@ -603,6 +603,9 @@ static void msm_start_rx_dma(struct msm_port *msm_port) u32 val; int ret;
+ if (IS_ENABLED(CONFIG_CONSOLE_POLL)) + return; + if (!dma->chan) return;
From: Pierre Gondois Pierre.Gondois@arm.com
stable inclusion from linux-4.19.220 commit d3dbf51a923755fdf3c097fddb35cd098e77ce7a
--------------------------------
commit ac442a077acf9a6bf1db4320ec0c3f303be092b3 upstream.
The document 'ACPI for Arm Components 1.0' defines the following _HID mappings:
 - 'Prime cell UART (PL011)': ARMH0011
 - 'SBSA UART': ARMHB000
Use the sbsa-uart driver when a device is described with the 'ARMHB000' _HID.
Note: PL011 devices currently use the sbsa-uart driver instead of the uart-pl011 driver. Indeed, PL011 devices are not bound to a clock in ACPI. It is not possible to change their baudrate.
Cc: stable@vger.kernel.org Signed-off-by: Pierre Gondois Pierre.Gondois@arm.com Link: https://lore.kernel.org/r/20211109172248.19061-1-Pierre.Gondois@arm.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/serial/amba-pl011.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 35b86a2d5ddba..1d27e006826d2 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2839,6 +2839,7 @@ MODULE_DEVICE_TABLE(of, sbsa_uart_of_match);
static const struct acpi_device_id sbsa_uart_acpi_match[] = { { "ARMH0011", 0 }, + { "ARMHB000", 0 }, {}, }; MODULE_DEVICE_TABLE(acpi, sbsa_uart_acpi_match);
From: Johan Hovold johan@kernel.org
stable inclusion from linux-4.19.220 commit 1179b168fa3f3a6aae3bd140000455a0e58457db
--------------------------------
commit 00de977f9e0aa9760d9a79d1e41ff780f74e3424 upstream.
Commit 761ed4a94582 ("tty: serial_core: convert uart_close to use tty_port_close") converted serial core to use tty_port_close() but failed to notice that the transmit buffer still needs to be freed on final close.
Not freeing the transmit buffer means that the buffer is no longer cleared on next open so that any ioctl() waiting for the buffer to drain might wait indefinitely (e.g. on termios changes) or that stale data can end up being transmitted in case tx is restarted.
Furthermore, the buffer of any port that has been opened would leak on driver unbind.
Note that the port lock is held when clearing the buffer pointer due to the ldisc race worked around by commit a5ba1d95e46e ("uart: fix race between uart_put_char() and uart_shutdown()").
Also note that the tty-port shutdown() callback is not called for console ports so it is not strictly necessary to free the buffer page after releasing the lock (cf. d72402145ace ("tty/serial: do not free trasnmit buffer page under port lock")).
Link: https://lore.kernel.org/r/319321886d97c456203d5c6a576a5480d07c3478.163578168... Fixes: 761ed4a94582 ("tty: serial_core: convert uart_close to use tty_port_close") Cc: stable@vger.kernel.org # 4.9 Cc: Rob Herring robh@kernel.org Reported-by: Baruch Siach baruch@tkos.co.il Tested-by: Baruch Siach baruch@tkos.co.il Signed-off-by: Johan Hovold johan@kernel.org Link: https://lore.kernel.org/r/20211108085431.12637-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/serial/serial_core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 80fa06b16d9d7..4e60045487a86 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1520,6 +1520,7 @@ static void uart_tty_port_shutdown(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport = uart_port_check(state); + char *buf;
/* * At this point, we stop accepting input. To do this, we @@ -1541,8 +1542,18 @@ static void uart_tty_port_shutdown(struct tty_port *port) */ tty_port_set_suspended(port, 0);
- uart_change_pm(state, UART_PM_STATE_OFF); + /* + * Free the transmit buffer. + */ + spin_lock_irq(&uport->lock); + buf = state->xmit.buf; + state->xmit.buf = NULL; + spin_unlock_irq(&uport->lock);
+ if (buf) + free_page((unsigned long)buf); + + uart_change_pm(state, UART_PM_STATE_OFF); }
static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
From: Wei Yongjun weiyongjun1@huawei.com
stable inclusion from linux-4.19.220 commit dc4731f4422d94f60ce64fc9904c0ad6b9d4027f
--------------------------------
commit 5a3ba99b62d8486de0316334e72ac620d4b94fdd upstream.
The sparse tool complains as follows:
drivers/char/ipmi/ipmi_msghandler.c:194:25: warning: symbol 'remove_work_wq' was not declared. Should it be static?
This symbol is not used outside of ipmi_msghandler.c, so mark it static.
Fixes: 1d49eb91e86e ("ipmi: Move remove_work to dedicated workqueue") Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Message-Id: 20211123083618.2366808-1-weiyongjun1@huawei.com Signed-off-by: Corey Minyard cminyard@mvista.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/char/ipmi/ipmi_msghandler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index d51b82d4027d7..272c34102875c 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -219,7 +219,7 @@ struct ipmi_user { struct work_struct remove_work; };
-struct workqueue_struct *remove_work_wq; +static struct workqueue_struct *remove_work_wq;
static struct ipmi_user *acquire_ipmi_user(struct ipmi_user *user, int *index) __acquires(user->release_barrier)
From: Maxim Mikityanskiy maximmi@nvidia.com
stable inclusion from linux-4.19.221 commit c315bd96252887c20bf799293763f273e05394b2
--------------------------------
commit 2fa7d94afc1afbb4d702760c058dc2d7ed30f226 upstream.
The first commit cited below attempts to fix the off-by-one error that appeared in some comparisons with an open range. Due to this error, arithmetically equivalent pieces of code could get different verdicts from the verifier, for example (pseudocode):
// 1. Passes the verifier:
if (data + 8 > data_end)
        return early
read *(u64 *)data, i.e. [data; data+7]
// 2. Rejected by the verifier (should still pass):
if (data + 7 >= data_end)
        return early
read *(u64 *)data, i.e. [data; data+7]
The attempted fix, however, shifts the range by one in the wrong direction, so the bug not only remains, but code like the following also starts failing in the verifier:
// 3. Rejected by the verifier, but the check is stricter than in #1.
if (data + 8 >= data_end)
        return early
read *(u64 *)data, i.e. [data; data+7]
The change performed by that fix converted an off-by-one bug into an off-by-two. The second commit cited below added BPF selftests written to ensure that code chunks like #3 are rejected; however, they should be accepted.
This commit fixes the off-by-two error by adjusting new_range in the right direction and fixes the tests by changing the range into the one that should actually fail.
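For illustration only (a hypothetical XDP-style snippet, not taken from the patch), with the off-by-two corrected either of the two equivalent bounds checks should let the verifier accept an 8-byte read:

  /* 'ctx' is a hypothetical XDP context; either check proves that the
   * bytes [data, data+7] lie inside the packet. */
  void *data = (void *)(long)ctx->data;
  void *data_end = (void *)(long)ctx->data_end;
  __u64 value;

  if (data + 8 > data_end)        /* variant 1: strict '>' against data + 8 */
          return XDP_DROP;
  /* ...or, equivalently: if (data + 7 >= data_end) return XDP_DROP; */

  value = *(__u64 *)data;         /* reads [data, data+7] */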
Fixes: fb2a311a31d3 ("bpf: fix off by one for range markings with L{T, E} patterns") Fixes: b37242c773b2 ("bpf: add test cases to bpf selftests to cover all access tests") Signed-off-by: Maxim Mikityanskiy maximmi@nvidia.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Link: https://lore.kernel.org/bpf/20211130181607.593149-1-maximmi@nvidia.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2c091df344735..446655caa2625 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3818,7 +3818,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
new_range = dst_reg->off; if (range_right_open) - new_range--; + new_range++;
/* Examples for register markings: *
From: Jianguo Wu wujianguo@chinatelecom.cn
stable inclusion from linux-4.19.221 commit c90d2408288d179ecd598a686bbd05d950e21b00
--------------------------------
commit 158390e45612ef0fde160af0826f1740c36daf21 upstream.
The max number of UDP gso segments is intended to be capped at UDP_MAX_SEGMENTS; this is checked in udp_send_skb():
	if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) {
		kfree_skb(skb);
		return -EINVAL;
	}
skb->len includes the network and transport header lengths here; we should use only the data length instead.
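As a hedged user-space sketch (sizes and setup are illustrative, not from the patch): with the fix, the cap of gso_size * UDP_MAX_SEGMENTS applies to payload bytes only, so a maximal payload is no longer rejected just because header bytes were counted as well:

  #include <arpa/inet.h>
  #include <netinet/in.h>
  #include <stdio.h>
  #include <sys/socket.h>
  #include <unistd.h>

  #ifndef UDP_SEGMENT
  #define UDP_SEGMENT 103                 /* from linux/udp.h */
  #endif

  int main(void)
  {
          int fd = socket(AF_INET, SOCK_DGRAM, 0);
          int gso_size = 500;
          static char payload[64 * 500];  /* exactly UDP_MAX_SEGMENTS segments */
          struct sockaddr_in dst = { .sin_family = AF_INET,
                                     .sin_port = htons(9) };

          dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
          setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT, &gso_size, sizeof(gso_size));

          /* Accepted with this fix; before it, udp_send_skb() could return
           * EINVAL because skb->len also counted IP/UDP header bytes. */
          if (sendto(fd, payload, sizeof(payload), 0,
                     (struct sockaddr *)&dst, sizeof(dst)) < 0)
                  perror("sendto");

          close(fd);
          return 0;
  }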
Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Signed-off-by: Jianguo Wu wujianguo@chinatelecom.cn Reviewed-by: Willem de Bruijn willemb@google.com Link: https://lore.kernel.org/r/900742e5-81fb-30dc-6e0b-375c6cdd7982@163.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6ef22a9966246..0b9f3109c4efe 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -795,7 +795,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, kfree_skb(skb); return -EINVAL; } - if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) { + if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); return -EINVAL; }
From: Manjong Lee mj0123.lee@samsung.com
stable inclusion from linux-4.19.221 commit 505039dd0fe6493565e27b55700e46f43f20580d
--------------------------------
commit 3c376dfafbf7a8ea0dea212d095ddd83e93280bb upstream.
Initialize min_ratio if it is set during bdi unregistration. This can prevent problems that may occur when a bdi is removed without resetting min_ratio.
For example:
1) insert external sdcard
2) set external sdcard's min_ratio 70
3) remove external sdcard without setting min_ratio 0
4) insert external sdcard
5) set external sdcard's min_ratio 70 << error occurs (can't set)
Because when an sdcard is removed, the present bdi_min_ratio value will remain. Currently, the only way to reset bdi_min_ratio is to reboot.
[akpm@linux-foundation.org: tweak comment and coding style]
Link: https://lkml.kernel.org/r/20211021161942.5983-1-mj0123.lee@samsung.com Signed-off-by: Manjong Lee mj0123.lee@samsung.com Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: Changheun Lee nanich.lee@samsung.com Cc: Jens Axboe axboe@kernel.dk Cc: Christoph Hellwig hch@infradead.org Cc: Matthew Wilcox willy@infradead.org Cc: seunghwan.hyun@samsung.com Cc: sookwan7.kim@samsung.com Cc: yt0928.kim@samsung.com Cc: junho89.kim@samsung.com Cc: jisoo2146.oh@samsung.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: mm/backing-dev.c [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/backing-dev.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 75a61176f392f..f80d5c2ff420c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -978,6 +978,13 @@ void bdi_unregister(struct backing_dev_info *bdi) wb_shutdown(&bdi->wb); cgwb_bdi_unregister(bdi);
+ /* + * If this BDI's min ratio has been set, use bdi_set_min_ratio() to + * update the global bdi_min_ratio. + */ + if (bdi->min_ratio) + bdi_set_min_ratio(bdi, 0); + if (bdi->dev) { struct rcu_device *rcu_dev = bdi->rcu_dev; bdi_debug_unregister(bdi);
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.221 commit 7d795454ccb2a523d1d2146bda45e6f3b45d7d9f
--------------------------------
commit ee7f3666995d8537dec17b1d35425f28877671a9 upstream.
If directories in tracefs have their ownership changed, then any new files and directories that are created under those directories should inherit the ownership of the directory they are created in.
Link: https://lkml.kernel.org/r/20211208075720.4855d180@gandalf.local.home
Cc: Kees Cook keescook@chromium.org Cc: Ingo Molnar mingo@kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Al Viro viro@zeniv.linux.org.uk Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Yabin Cui yabinc@google.com Cc: Christian Brauner christian.brauner@ubuntu.com Cc: stable@vger.kernel.org Fixes: 4282d60689d4f ("tracefs: Add new tracefs file system") Reported-by: Kalesh Singh kaleshsingh@google.com Reported: https://lore.kernel.org/all/CAC_TJve8MMAv+H_NdLSJXZUSoxOEq2zB_pVaJ9p=7H6Bu3X... Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: fs/tracefs/inode.c [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/tracefs/inode.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 7cfb2c77a120a..f4ea7bf13386c 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -412,6 +412,8 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, inode->i_mode = mode; inode->i_fop = fops ? fops : &tracefs_file_operations; inode->i_private = data; + inode->i_uid = d_inode(dentry->d_parent)->i_uid; + inode->i_gid = d_inode(dentry->d_parent)->i_gid; d_instantiate(dentry, inode); fsnotify_create(dentry->d_parent->d_inode, dentry); return end_creating(dentry); @@ -433,6 +435,8 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; inode->i_op = ops; inode->i_fop = &simple_dir_operations; + inode->i_uid = d_inode(dentry->d_parent)->i_uid; + inode->i_gid = d_inode(dentry->d_parent)->i_gid;
/* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode);
From: Eric Biggers ebiggers@google.com
stable inclusion from linux-4.19.221 commit 8dd7c46a59756bdc29cb9783338b899cd3fb4b83
--------------------------------
commit 42288cb44c4b5fff7653bc392b583a2b8bd6a8c0 upstream.
Several ->poll() implementations are special in that they use a waitqueue whose lifetime is the current task, rather than the struct file as is normally the case. This is okay for blocking polls, since a blocking poll occurs within one task; however, non-blocking polls require another solution. This solution is for the queue to be cleared before it is freed, using 'wake_up_poll(wq, EPOLLHUP | POLLFREE);'.
However, that has a bug: wake_up_poll() calls __wake_up() with nr_exclusive=1. Therefore, if there are multiple "exclusive" waiters, and the wakeup function for the first one returns a positive value, only that one will be called. That's *not* what's needed for POLLFREE; POLLFREE is special in that it really needs to wake up everyone.
Considering the three non-blocking poll systems:
- io_uring poll doesn't handle POLLFREE at all, so it is broken anyway.
- aio poll is unaffected, since it doesn't support exclusive waits. However, that's fragile, as someone could add this feature later.
- epoll doesn't appear to be broken by this, since its wakeup function returns 0 when it sees POLLFREE. But this is fragile.
Although there is a workaround (see epoll), it's better to define a function which always sends POLLFREE to all waiters. Add such a function. Also make it verify that the queue really becomes empty after all waiters have been woken up.
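A minimal usage sketch (the demo_obj structure and its helpers are hypothetical, not taken from this patch): an object that embeds its own waitqueue flushes all pollers with the new helper and then RCU-delays the free, as the added kernel-doc requires:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/wait.h>

/* Hypothetical object whose waitqueue lifetime is not tied to a struct file. */
struct demo_obj {
	wait_queue_head_t wqh;
	struct rcu_head rcu;
};

static void demo_obj_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct demo_obj, rcu));
}

static void demo_obj_destroy(struct demo_obj *obj)
{
	/* Wake every waiter (epoll, aio poll, ...) with EPOLLHUP | POLLFREE. */
	wake_up_pollfree(&obj->wqh);
	/* The waitqueue memory must stay valid for a full RCU grace period. */
	call_rcu(&obj->rcu, demo_obj_free_rcu);
}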
Reported-by: Linus Torvalds torvalds@linux-foundation.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211209010455.42744-2-ebiggers@kernel.org Signed-off-by: Eric Biggers ebiggers@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/wait.h | 26 ++++++++++++++++++++++++++ kernel/sched/wait.c | 7 +++++++ 2 files changed, 33 insertions(+)
diff --git a/include/linux/wait.h b/include/linux/wait.h index 1234e6cbacd2a..60a62d3adbe66 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -204,6 +204,7 @@ void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); +void __wake_up_pollfree(struct wait_queue_head *wq_head);
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -230,6 +231,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, poll_to_key(m))
+/** + * wake_up_pollfree - signal that a polled waitqueue is going away + * @wq_head: the wait queue head + * + * In the very rare cases where a ->poll() implementation uses a waitqueue whose + * lifetime is tied to a task rather than to the 'struct file' being polled, + * this function must be called before the waitqueue is freed so that + * non-blocking polls (e.g. epoll) are notified that the queue is going away. + * + * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via + * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. + */ +static inline void wake_up_pollfree(struct wait_queue_head *wq_head) +{ + /* + * For performance reasons, we don't always take the queue lock here. + * Therefore, we might race with someone removing the last entry from + * the queue, and proceed while they still hold the queue lock. + * However, rcu_read_lock() is required to be held in such cases, so we + * can safely proceed with an RCU-delayed free. + */ + if (waitqueue_active(wq_head)) + __wake_up_pollfree(wq_head); +} + #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c95b392735672..021da4812fca7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -209,6 +209,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_e } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+void __wake_up_pollfree(struct wait_queue_head *wq_head) +{ + __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE)); + /* POLLFREE must have cleared the queue. */ + WARN_ON_ONCE(waitqueue_active(wq_head)); +} + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any
From: Eric Biggers ebiggers@google.com
stable inclusion from linux-4.19.221 commit f226fdd855b7d9c1f2a6e878d82eb3e1fbc880e9
--------------------------------
commit 9537bae0da1f8d1e2361ab6d0479e8af7824e160 upstream.
wake_up_poll() uses nr_exclusive=1, so it's not guaranteed to wake up all exclusive waiters. Yet, POLLFREE *must* wake up all waiters. epoll and aio poll are fortunately not affected by this, but it's very fragile. Thus, the new function wake_up_pollfree() has been introduced.
Convert signalfd to use wake_up_pollfree().
Reported-by: Linus Torvalds torvalds@linux-foundation.org Fixes: d80e731ecab4 ("epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree()") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211209010455.42744-4-ebiggers@kernel.org Signed-off-by: Eric Biggers ebiggers@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/signalfd.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-)
diff --git a/fs/signalfd.c b/fs/signalfd.c index 3c40a3bf772ce..94e0ae01db5c8 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -35,17 +35,7 @@
void signalfd_cleanup(struct sighand_struct *sighand) { - wait_queue_head_t *wqh = &sighand->signalfd_wqh; - /* - * The lockless check can race with remove_wait_queue() in progress, - * but in this case its caller should run under rcu_read_lock() and - * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. - */ - if (likely(!waitqueue_active(wqh))) - return; - - /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */ - wake_up_poll(wqh, EPOLLHUP | POLLFREE); + wake_up_pollfree(&sighand->signalfd_wqh); }
struct signalfd_ctx {
From: Eric Biggers ebiggers@google.com
stable inclusion from linux-4.19.221 commit 580c7e023303ce3a187adcaa40868bfc740725d2
--------------------------------
commit 363bee27e25804d8981dd1c025b4ad49dc39c530 upstream.
Currently, aio_poll_wake() will always remove the poll request from the waitqueue. Then, if aio_poll_complete_work() sees that none of the polled events are ready and the request isn't cancelled, it re-adds the request to the waitqueue. (This can easily happen when polling a file that doesn't pass an event mask when waking up its waitqueue.)
This is fundamentally broken for two reasons:
1. If a wakeup occurs between vfs_poll() and the request being re-added to the waitqueue, it will be missed because the request wasn't on the waitqueue at the time. Therefore, IOCB_CMD_POLL might never complete even if the polled file is ready.
2. When the request isn't on the waitqueue, there is no way to be notified that the waitqueue is being freed (which happens when its lifetime is shorter than the struct file's). This is supposed to happen via the waitqueue entries being woken up with POLLFREE.
Therefore, leave the requests on the waitqueue until they are actually completed (or cancelled). To keep track of when aio_poll_complete_work needs to be scheduled, use new fields in struct poll_iocb. Remove the 'done' field which is now redundant.
Note that this is consistent with how sys_poll() and eventpoll work; their wakeup functions do *not* remove the waitqueue entries.
Fixes: 2c14fa838cbe ("aio: implement IOCB_CMD_POLL") Cc: stable@vger.kernel.org # v4.18+ Link: https://lore.kernel.org/r/20211209010455.42744-5-ebiggers@kernel.org Signed-off-by: Eric Biggers ebiggers@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: fs/aio.c [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/aio.c | 84 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 21 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index 190f8a7d85c52..3a6d540f31d9e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -176,8 +176,9 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool done; bool cancelled; + bool work_scheduled; + bool work_need_resched; struct wait_queue_entry wait; struct work_struct work; }; @@ -1636,14 +1637,26 @@ static void aio_poll_complete_work(struct work_struct *work) * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); + spin_lock(&req->head->lock); if (!mask && !READ_ONCE(req->cancelled)) { - add_wait_queue(req->head, &req->wait); + /* + * The request isn't actually ready to be completed yet. + * Reschedule completion if another wakeup came in. + */ + if (req->work_need_resched) { + schedule_work(&req->work); + req->work_need_resched = false; + } else { + req->work_scheduled = false; + } + spin_unlock(&req->head->lock); spin_unlock_irq(&ctx->ctx_lock); return; } + list_del_init(&req->wait.entry); + spin_unlock(&req->head->lock); list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); - req->done = true; spin_unlock_irq(&ctx->ctx_lock);
iocb_put(iocb); @@ -1657,9 +1670,9 @@ static int aio_poll_cancel(struct kiocb *iocb)
spin_lock(&req->head->lock); WRITE_ONCE(req->cancelled, true); - if (!list_empty(&req->wait.entry)) { - list_del_init(&req->wait.entry); + if (!req->work_scheduled) { schedule_work(&aiocb->poll.work); + req->work_scheduled = true; } spin_unlock(&req->head->lock);
@@ -1678,20 +1691,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & req->events)) return 0;
- list_del_init(&req->wait.entry); - - if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + /* + * Complete the request inline if possible. This requires that three + * conditions be met: + * 1. An event mask must have been passed. If a plain wakeup was done + * instead, then mask == 0 and we have to call vfs_poll() to get + * the events, so inline completion isn't possible. + * 2. The completion work must not have already been scheduled. + * 3. ctx_lock must not be busy. We have to use trylock because we + * already hold the waitqueue lock, so this inverts the normal + * locking order. Use irqsave/irqrestore because not all + * filesystems (e.g. fuse) call this function with IRQs disabled, + * yet IRQs have to be disabled before ctx_lock is obtained. + */ + if (mask && !req->work_scheduled && + spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { struct kioctx *ctx = iocb->ki_ctx;
- /* - * Try to complete the iocb inline if we can. Use - * irqsave/irqrestore because not all filesystems (e.g. fuse) - * call this function with IRQs disabled and because IRQs - * have to be disabled before ctx_lock is obtained. - */ + list_del_init(&req->wait.entry); list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); - req->done = true; if (iocb->ki_eventfd && eventfd_signal_count()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); @@ -1701,9 +1720,21 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (iocb) iocb_put(iocb); } else { - schedule_work(&req->work); + /* + * Schedule the completion work if needed. If it was already + * scheduled, record that another wakeup came in. + * + * Don't remove the request from the waitqueue here, as it might + * not actually be complete yet (we won't know until vfs_poll() + * is called), and we must not miss any wakeups. + */ + if (req->work_scheduled) { + req->work_need_resched = true; + } else { + schedule_work(&req->work); + req->work_scheduled = true; + } } - return 1; }
@@ -1749,8 +1780,9 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
req->head = NULL; - req->done = false; req->cancelled = false; + req->work_scheduled = false; + req->work_need_resched = false;
apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; @@ -1765,17 +1797,27 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) spin_lock_irq(&ctx->ctx_lock); if (likely(req->head)) { spin_lock(&req->head->lock); - if (unlikely(list_empty(&req->wait.entry))) { - if (apt.error) + if (list_empty(&req->wait.entry) || req->work_scheduled) { + /* + * aio_poll_wake() already either scheduled the async + * completion work, or completed the request inline. + */ + if (apt.error) /* unsupported case: multiple queues */ cancel = true; apt.error = 0; mask = 0; } if (mask || apt.error) { + /* Steal to complete synchronously. */ list_del_init(&req->wait.entry); } else if (cancel) { + /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); - } else if (!req->done) { /* actually waiting for an event */ + } else if (!list_empty(&req->wait.entry)) { + /* + * Actually waiting for an event, so add the request to + * active_reqs so that it can be cancelled if needed. + */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; }
From: Eric Biggers ebiggers@google.com
stable inclusion from linux-4.19.221 commit 321fba81ec034f88aea4898993c1bf15605c023f
--------------------------------
commit 50252e4b5e989ce64555c7aef7516bdefc2fea72 upstream.
signalfd_poll() and binder_poll() are special in that they use a waitqueue whose lifetime is the current task, rather than the struct file as is normally the case. This is okay for blocking polls, since a blocking poll occurs within one task; however, non-blocking polls require another solution. This solution is for the queue to be cleared before it is freed, by sending a POLLFREE notification to all waiters.
Unfortunately, only eventpoll handles POLLFREE. A second type of non-blocking poll, aio poll, was added in kernel v4.18, and it doesn't handle POLLFREE. This allows a use-after-free to occur if a signalfd or binder fd is polled with aio poll, and the waitqueue gets freed.
Fix this by making aio poll handle POLLFREE.
A patch by Ramji Jiyani ramjiyani@google.com (https://lore.kernel.org/r/20211027011834.2497484-1-ramjiyani@google.com) tried to do this by making aio_poll_wake() always complete the request inline if POLLFREE is seen. However, that solution had two bugs. First, it introduced a deadlock, as it unconditionally locked the aio context while holding the waitqueue lock, which inverts the normal locking order. Second, it didn't consider that POLLFREE notifications are missed while the request has been temporarily de-queued.
The second problem was solved by my previous patch. This patch then properly fixes the use-after-free by handling POLLFREE in a deadlock-free way. It does this by taking advantage of the fact that freeing of the waitqueue is RCU-delayed, similar to what eventpoll does.
Fixes: 2c14fa838cbe ("aio: implement IOCB_CMD_POLL") Cc: stable@vger.kernel.org # v4.18+ Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org Signed-off-by: Eric Biggers ebiggers@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/aio.c | 137 ++++++++++++++++++++++++-------- include/uapi/asm-generic/poll.h | 2 +- 2 files changed, 107 insertions(+), 32 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c index 3a6d540f31d9e..d221260b8f30a 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1618,6 +1618,51 @@ static void aio_poll_put_work(struct work_struct *work) iocb_put(iocb); }
+/* + * Safely lock the waitqueue which the request is on, synchronizing with the + * case where the ->poll() provider decides to free its waitqueue early. + * + * Returns true on success, meaning that req->head->lock was locked, req->wait + * is on req->head, and an RCU read lock was taken. Returns false if the + * request was already removed from its waitqueue (which might no longer exist). + */ +static bool poll_iocb_lock_wq(struct poll_iocb *req) +{ + wait_queue_head_t *head; + + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us. However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us, then check whether the request is still on the queue. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + head = smp_load_acquire(&req->head); + if (head) { + spin_lock(&head->lock); + if (!list_empty(&req->wait.entry)) + return true; + spin_unlock(&head->lock); + } + rcu_read_unlock(); + return false; +} + +static void poll_iocb_unlock_wq(struct poll_iocb *req) +{ + spin_unlock(&req->head->lock); + rcu_read_unlock(); +} + static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1637,24 +1682,25 @@ static void aio_poll_complete_work(struct work_struct *work) * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - if (!mask && !READ_ONCE(req->cancelled)) { - /* - * The request isn't actually ready to be completed yet. - * Reschedule completion if another wakeup came in. - */ - if (req->work_need_resched) { - schedule_work(&req->work); - req->work_need_resched = false; - } else { - req->work_scheduled = false; + if (poll_iocb_lock_wq(req)) { + if (!mask && !READ_ONCE(req->cancelled)) { + /* + * The request isn't actually ready to be completed yet. + * Reschedule completion if another wakeup came in. + */ + if (req->work_need_resched) { + schedule_work(&req->work); + req->work_need_resched = false; + } else { + req->work_scheduled = false; + } + poll_iocb_unlock_wq(req); + spin_unlock_irq(&ctx->ctx_lock); + return; } - spin_unlock(&req->head->lock); - spin_unlock_irq(&ctx->ctx_lock); - return; - } - list_del_init(&req->wait.entry); - spin_unlock(&req->head->lock); + list_del_init(&req->wait.entry); + poll_iocb_unlock_wq(req); + } /* else, POLLFREE has freed the waitqueue, so we must complete */ list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); spin_unlock_irq(&ctx->ctx_lock); @@ -1668,13 +1714,14 @@ static int aio_poll_cancel(struct kiocb *iocb) struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll;
- spin_lock(&req->head->lock); - WRITE_ONCE(req->cancelled, true); - if (!req->work_scheduled) { - schedule_work(&aiocb->poll.work); - req->work_scheduled = true; - } - spin_unlock(&req->head->lock); + if (poll_iocb_lock_wq(req)) { + WRITE_ONCE(req->cancelled, true); + if (!req->work_scheduled) { + schedule_work(&aiocb->poll.work); + req->work_scheduled = true; + } + poll_iocb_unlock_wq(req); + } /* else, the request was force-cancelled by POLLFREE already */
return 0; } @@ -1726,7 +1773,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * * Don't remove the request from the waitqueue here, as it might * not actually be complete yet (we won't know until vfs_poll() - * is called), and we must not miss any wakeups. + * is called), and we must not miss any wakeups. POLLFREE is an + * exception to this; see below. */ if (req->work_scheduled) { req->work_need_resched = true; @@ -1734,6 +1782,28 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, schedule_work(&req->work); req->work_scheduled = true; } + + /* + * If the waitqueue is being freed early but we can't complete + * the request inline, we have to tear down the request as best + * we can. That means immediately removing the request from its + * waitqueue and preventing all further accesses to the + * waitqueue via the request. We also need to schedule the + * completion work (done above). Also mark the request as + * cancelled, to potentially skip an unneeded call to ->poll(). + */ + if (mask & POLLFREE) { + WRITE_ONCE(req->cancelled, true); + list_del_init(&req->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&req->head, NULL); + } } return 1; } @@ -1741,6 +1811,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct aio_poll_table { struct poll_table_struct pt; struct aio_kiocb *iocb; + bool queued; int error; };
@@ -1751,11 +1822,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
/* multiple wait queues per file are not supported */ - if (unlikely(pt->iocb->poll.head)) { + if (unlikely(pt->queued)) { pt->error = -EINVAL; return; }
+ pt->queued = true; pt->error = 0; pt->iocb->poll.head = head; add_wait_queue(head, &pt->iocb->poll.wait); @@ -1787,6 +1859,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; + apt.queued = false; apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */ @@ -1795,9 +1868,10 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
mask = vfs_poll(req->file, &apt.pt) & req->events; spin_lock_irq(&ctx->ctx_lock); - if (likely(req->head)) { - spin_lock(&req->head->lock); - if (list_empty(&req->wait.entry) || req->work_scheduled) { + if (likely(apt.queued)) { + bool on_queue = poll_iocb_lock_wq(req); + + if (!on_queue || req->work_scheduled) { /* * aio_poll_wake() already either scheduled the async * completion work, or completed the request inline. @@ -1813,7 +1887,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) } else if (cancel) { /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); - } else if (!list_empty(&req->wait.entry)) { + } else if (on_queue) { /* * Actually waiting for an event, so add the request to * active_reqs so that it can be cancelled if needed. @@ -1821,7 +1895,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); + if (on_queue) + poll_iocb_unlock_wq(req); } if (mask) { /* no async, we'd stolen it */ aiocb->ki_res.res = mangle_poll(mask); diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h index 41b509f410bf9..f9c520ce4bf4e 100644 --- a/include/uapi/asm-generic/poll.h +++ b/include/uapi/asm-generic/poll.h @@ -29,7 +29,7 @@ #define POLLRDHUP 0x2000 #endif
-#define POLLFREE (__force __poll_t)0x4000 /* currently only for epoll */ +#define POLLFREE (__force __poll_t)0x4000
#define POLL_BUSY_LOOP (__force __poll_t)0x8000
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.221 commit 1e06540f8d2b94d0459e6cd2724001e5c8657416
--------------------------------
commit 48b27b6b5191e2e1f2798cd80877b6e4ef47c351 upstream.
As people have been asking to allow non-root processes to have access to the tracefs directory, it was considered best to only allow groups to have access to the directory. It is easier to just set the tracefs file system to a specific group (allowing "other" would be too dangerous), and that way the admins can pick which processes have access to tracefs.
Unfortunately, this broke tooling on Android that expected the "other" bit to be set. For some special cases, in order for non-root tools to trace the system, tracefs would be mounted and the permissions of the top level directory changed, which gave all running tasks permission to access the tracing directory. Even though this would be dangerous to do in a production environment, it can be useful in testing environments.
Now with the new changes that no longer allow "other" access (which is still the proper thing to do), the testing tooling breaks, and more code needs to be loaded on the system to change the ownership of the tracing directory.
The real solution is to have tracefs honor the gid=xxx option when mounting. That is,
(assuming the tracing group "tracing" has gid 1003)
mount -t tracefs -o gid=1003 tracefs /sys/kernel/tracing
should result in all files in the tracing directory belonging to the given group.
Copy the logic from d_walk() in dcache.c and simplify it for the mount case of tracefs when gid is set. All the files in tracefs will be walked and their group will be set to the value passed in.
Link: https://lkml.kernel.org/r/20211207171729.2a54e1b3@gandalf.local.home
Cc: Ingo Molnar mingo@kernel.org Cc: Kees Cook keescook@chromium.org Cc: Andrew Morton akpm@linux-foundation.org Cc: Linus Torvalds torvalds@linux-foundation.org Cc: linux-fsdevel@vger.kernel.org Cc: Al Viro viro@ZenIV.linux.org.uk Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Reported-by: Kalesh Singh kaleshsingh@google.com Reported-by: Yabin Cui yabinc@google.com Fixes: 49d67e445742 ("tracefs: Have tracefs directories not set OTH permission bits by default") Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/tracefs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+)
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index f4ea7bf13386c..fc609c4a676f6 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -162,6 +162,77 @@ struct tracefs_fs_info { struct tracefs_mount_opts mount_opts; };
+static void change_gid(struct dentry *dentry, kgid_t gid) +{ + if (!dentry->d_inode) + return; + dentry->d_inode->i_gid = gid; +} + +/* + * Taken from d_walk, but without he need for handling renames. + * Nothing can be renamed while walking the list, as tracefs + * does not support renames. This is only called when mounting + * or remounting the file system, to set all the files to + * the given gid. + */ +static void set_gid(struct dentry *parent, kgid_t gid) +{ + struct dentry *this_parent; + struct list_head *next; + + this_parent = parent; + spin_lock(&this_parent->d_lock); + + change_gid(this_parent, gid); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + change_gid(dentry, gid); + + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); + goto repeat; + } + spin_unlock(&dentry->d_lock); + } + /* + * All done at this level ... ascend and resume the search. + */ + rcu_read_lock(); +ascend: + if (this_parent != parent) { + struct dentry *child = this_parent; + this_parent = child->d_parent; + + spin_unlock(&child->d_lock); + spin_lock(&this_parent->d_lock); + + /* go into the first sibling still alive */ + do { + next = child->d_child.next; + if (next == &this_parent->d_subdirs) + goto ascend; + child = list_entry(next, struct dentry, d_child); + } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); + rcu_read_unlock(); + goto resume; + } + rcu_read_unlock(); + spin_unlock(&this_parent->d_lock); + return; +} + static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) { substring_t args[MAX_OPT_ARGS]; @@ -194,6 +265,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) if (!gid_valid(gid)) return -EINVAL; opts->gid = gid; + set_gid(tracefs_mount->mnt_root, gid); break; case Opt_mode: if (match_octal(&args[0], &option))
From: Davidlohr Bueso dave@stgolabs.net
stable inclusion from linux-4.19.221 commit 23047a238f44bd11c0316598691eef2bfd2d557e
--------------------------------
commit e6a59aac8a8713f335a37d762db0dbe80e7f6d38 upstream.
do_each_pid_thread(PIDTYPE_PGID) can race with a concurrent change_pid(PIDTYPE_PGID) that can move the task from one hlist to another while iterating. Serialize ioprio_get() by taking the tasklist_lock in this case, just like its set counterpart does.
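For context, a small userspace sketch of the query that walks a whole process group (a hypothetical demo program, not part of the patch; the IOPRIO_WHO_PGRP constant is written out here to mirror the kernel's value):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define IOPRIO_WHO_PGRP	2	/* mirrors the kernel's definition */

int main(void)
{
	/* who == 0 means "the current process group", as in ioprio_set(). */
	long prio = syscall(SYS_ioprio_get, IOPRIO_WHO_PGRP, 0);

	if (prio < 0) {
		perror("ioprio_get");
		return 1;
	}
	printf("aggregated ioprio for this process group: %ld\n", prio);
	return 0;
}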
Fixes: d69b78ba1de (ioprio: grab rcu_read_lock in sys_ioprio_{set,get}()) Acked-by: Oleg Nesterov oleg@redhat.com Signed-off-by: Davidlohr Bueso dbueso@suse.de Link: https://lore.kernel.org/r/20211210182058.43417-1-dave@stgolabs.net Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/ioprio.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/block/ioprio.c b/block/ioprio.c index f9821080c92cc..f0ee9cc33d17b 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) pgrp = task_pgrp(current); else pgrp = find_vpid(who); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) @@ -215,6 +216,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) else ret = ioprio_best(ret, tmpio); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); + break; case IOPRIO_WHO_USER: uid = make_kuid(current_user_ns(), who);
From: Manish Chopra manishc@marvell.com
stable inclusion from linux-4.19.221 commit 358544866727256a4184512913e03922968ec2c1
--------------------------------
commit 8e227b198a55859bf790dc7f4b1e30c0859c6756 upstream.
Although it is unlikely that the stack would transmit a non-LSO skb with length > MTU, in some cases or environments such occurrences actually resulted in firmware asserts, because the packet length was greater than the maximum supported by the device (~9700B).
This patch adds the safeguard for such odd cases to avoid firmware asserts.
v2: Added "Fixes" tag with one of the initial driver commit which enabled the TX traffic actually (as this was probably day1 issue which was discovered recently by some customer environment)
Fixes: a2ec6172d29c ("qede: Add support for link") Signed-off-by: Manish Chopra manishc@marvell.com Signed-off-by: Alok Prasad palok@marvell.com Signed-off-by: Prabhakar Kushwaha pkushwaha@marvell.com Signed-off-by: Ariel Elior aelior@marvell.com Link: https://lore.kernel.org/r/20211203174413.13090-1-manishc@marvell.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index a96da16f34049..512fd0a364afa 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1606,6 +1606,13 @@ netdev_tx_t qede_start_xmit(struct sk_buff *skb, struct net_device *ndev) data_split = true; } } else { + if (unlikely(skb->len > ETH_TX_MAX_NON_LSO_PKT_LEN)) { + DP_ERR(edev, "Unexpected non LSO skb length = 0x%x\n", skb->len); + qede_free_failed_tx_pkt(txq, first_bd, 0, false); + qede_update_tx_producer(txq); + return NETDEV_TX_OK; + } + val |= ((skb->len & ETH_TX_DATA_1ST_BD_PKT_LEN_MASK) << ETH_TX_DATA_1ST_BD_PKT_LEN_SHIFT); }
From: Mateusz Palczewski mateusz.palczewski@intel.com
stable inclusion from linux-4.19.221 commit 4b689c3619619778fe58bc35adac858e61e9428e
--------------------------------
commit 8aa55ab422d9d0d825ebfb877702ed661e96e682 upstream.
After setting the pre-set combined channel count to 16 queues and reserving 16 queues via tc qdisc, the pre-set maximum combined queue count returned to its default value of 4 after a VF reset, which generated errors when removing tc. Fix this by removing the clearing of num_req_queues before the VF reset.
Fixes: e284fc280473 (i40e: Add and delete cloud filter) Signed-off-by: Grzegorz Szczurek grzegorzx.szczurek@intel.com Signed-off-by: Mateusz Palczewski mateusz.palczewski@intel.com Tested-by: Bindushree P Bindushree.p@intel.com Signed-off-by: Tony Nguyen anthony.l.nguyen@intel.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 5 ----- 1 file changed, 5 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 9154abe13b93f..4b1486f15d15f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3583,11 +3583,6 @@ static int i40e_vc_add_qch_msg(struct i40e_vf *vf, u8 *msg)
/* set this flag only after making sure all inputs are sane */ vf->adq_enabled = true; - /* num_req_queues is set when user changes number of queues via ethtool - * and this causes issue for default VSI(which depends on this variable) - * when ADq is enabled, hence reset it. - */ - vf->num_req_queues = 0;
/* reset the VF in order to allocate resources */ i40e_vc_notify_vf_reset(vf);
From: Herve Codina herve.codina@bootlin.com
stable inclusion from linux-4.19.221 commit fc0bb5f52e6599b4928ff2e3294abcd7b21454f2
--------------------------------
commit a4ca0c439f2d5ce9a3dc118d882f9f03449864c8 upstream.
The FSMC NAND controller should apply a delay after the instruction has been issued on the bus. The FSMC NAND controller driver did not handle this delay.
Add this waiting delay in the FSMC NAND controller driver.
Fixes: 4da712e70294 ("mtd: nand: fsmc: use ->exec_op()") Signed-off-by: Herve Codina herve.codina@bootlin.com Signed-off-by: Miquel Raynal miquel.raynal@bootlin.com Link: https://lore.kernel.org/linux-mtd/20211119150316.43080-4-herve.codina@bootli... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/mtd/nand/raw/fsmc_nand.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/mtd/nand/raw/fsmc_nand.c b/drivers/mtd/nand/raw/fsmc_nand.c index f418236fa020a..2bc5a5194fa4f 100644 --- a/drivers/mtd/nand/raw/fsmc_nand.c +++ b/drivers/mtd/nand/raw/fsmc_nand.c @@ -18,6 +18,7 @@
#include <linux/clk.h> #include <linux/completion.h> +#include <linux/delay.h> #include <linux/dmaengine.h> #include <linux/dma-direction.h> #include <linux/dma-mapping.h> @@ -700,6 +701,9 @@ static int fsmc_exec_op(struct nand_chip *chip, const struct nand_operation *op, instr->ctx.waitrdy.timeout_ms); break; } + + if (instr->delay_ns) + ndelay(instr->delay_ns); }
return ret;
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.221 commit 5177509b01e51457254671442902e211c34fbe60
--------------------------------
commit e195e9b5dee6459d8c8e6a314cc71a644a0537fd upstream.
Commit 2c611ad97a82 ("net, neigh: Extend neigh->flags to 32 bit to allow for extensions") enables a new KMSAN warning [1]
I think the bug is actually older, because the following instruction was only executed if ndm->ndm_flags had NTF_PROXY set.
pn->flags = ndm->ndm_flags;
Let's clear all pneigh_entry fields at alloc time.
[1] BUG: KMSAN: uninit-value in pneigh_fill_info+0x986/0xb30 net/core/neighbour.c:2593 pneigh_fill_info+0x986/0xb30 net/core/neighbour.c:2593 pneigh_dump_table net/core/neighbour.c:2715 [inline] neigh_dump_info+0x1e3f/0x2c60 net/core/neighbour.c:2832 netlink_dump+0xaca/0x16a0 net/netlink/af_netlink.c:2265 __netlink_dump_start+0xd1c/0xee0 net/netlink/af_netlink.c:2370 netlink_dump_start include/linux/netlink.h:254 [inline] rtnetlink_rcv_msg+0x181b/0x18c0 net/core/rtnetlink.c:5534 netlink_rcv_skb+0x447/0x800 net/netlink/af_netlink.c:2491 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:5589 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x1095/0x1360 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x16f3/0x1870 net/netlink/af_netlink.c:1916 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg net/socket.c:724 [inline] sock_write_iter+0x594/0x690 net/socket.c:1057 call_write_iter include/linux/fs.h:2162 [inline] new_sync_write fs/read_write.c:503 [inline] vfs_write+0x1318/0x2030 fs/read_write.c:590 ksys_write+0x28c/0x520 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0xdb/0x120 fs/read_write.c:652 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae
Uninit was created at: slab_post_alloc_hook mm/slab.h:524 [inline] slab_alloc_node mm/slub.c:3251 [inline] slab_alloc mm/slub.c:3259 [inline] __kmalloc+0xc3c/0x12d0 mm/slub.c:4437 kmalloc include/linux/slab.h:595 [inline] pneigh_lookup+0x60f/0xd70 net/core/neighbour.c:766 arp_req_set_public net/ipv4/arp.c:1016 [inline] arp_req_set+0x430/0x10a0 net/ipv4/arp.c:1032 arp_ioctl+0x8d4/0xb60 net/ipv4/arp.c:1232 inet_ioctl+0x4ef/0x820 net/ipv4/af_inet.c:947 sock_do_ioctl net/socket.c:1118 [inline] sock_ioctl+0xa3f/0x13e0 net/socket.c:1235 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:874 [inline] __se_sys_ioctl+0x2df/0x4a0 fs/ioctl.c:860 __x64_sys_ioctl+0xd8/0x110 fs/ioctl.c:860 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae
CPU: 1 PID: 20001 Comm: syz-executor.0 Not tainted 5.16.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Fixes: 62dd93181aaa ("[IPV6] NDISC: Set per-entry is_router flag in Proxy NA.") Signed-off-by: Eric Dumazet edumazet@google.com Cc: Roopa Prabhu roopa@nvidia.com Reviewed-by: David Ahern dsahern@kernel.org Link: https://lore.kernel.org/r/20211206165329.1049835-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e471c32e448f6..6233e9856016e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -635,7 +635,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
ASSERT_RTNL();
- n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out;
From: Wudi Wang wangwudi@hisilicon.com
stable inclusion from linux-4.19.221 commit 0642f669d9509d481be3c5ab02cdd9dac5f5e095
--------------------------------
commit b383a42ca523ce54bcbd63f7c8f3cf974abc9b9a upstream.
INVALL CMD specifies that the ITS must ensure any caching associated with the interrupt collection defined by ICID is consistent with the LPI configuration tables held in memory for all Redistributors. SYNC is required to ensure that INVALL is executed.
Currently, the LPI configuration data held in the ITS caches may be inconsistent with that in memory for a short period of time after the INVALL command is executed, because no SYNC command follows it.
Signed-off-by: Wudi Wang wangwudi@hisilicon.com Signed-off-by: Shaokun Zhang zhangshaokun@hisilicon.com Signed-off-by: Marc Zyngier maz@kernel.org Fixes: cc2d3216f53c ("irqchip: GICv3: ITS command queue") Link: https://lore.kernel.org/r/20211208015429.5007-1-zhangshaokun@hisilicon.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index b191dc7af7e6e..15a0292e8e61e 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -600,7 +600,7 @@ static struct its_collection *its_build_invall_cmd(struct its_node *its,
its_fixup_cmd(cmd);
- return NULL; + return desc->its_invall_cmd.col; }
static struct its_vpe *its_build_vinvall_cmd(struct its_node *its,
From: Harshit Mogalapalli harshit.m.mogalapalli@oracle.com
stable inclusion from linux-4.19.222 commit ff3f517bf7138e01a17369042908a3f345c0ee41
--------------------------------
[ Upstream commit f123cffdd8fe8ea6c7fded4b88516a42798797d0 ]
Add a check on the len parameter to avoid an empty skb. This prevents a division error in the netem_enqueue() function, which is caused when skb->len=0 and skb->data_len=0 in the randomized corruption step shown below.
skb->data[prandom_u32() % skb_headlen(skb)] ^= 1<<(prandom_u32() % 8);
Crash Report: [ 343.170349] netdevsim netdevsim0 netdevsim3: set [1, 0] type 2 family 0 port 6081 - 0 [ 343.216110] netem: version 1.3 [ 343.235841] divide error: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 343.236680] CPU: 3 PID: 4288 Comm: reproducer Not tainted 5.16.0-rc1+ [ 343.237569] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 343.238707] RIP: 0010:netem_enqueue+0x1590/0x33c0 [sch_netem] [ 343.239499] Code: 89 85 58 ff ff ff e8 5f 5d e9 d3 48 8b b5 48 ff ff ff 8b 8d 50 ff ff ff 8b 85 58 ff ff ff 48 8b bd 70 ff ff ff 31 d2 2b 4f 74 <f7> f1 48 b8 00 00 00 00 00 fc ff df 49 01 d5 4c 89 e9 48 c1 e9 03 [ 343.241883] RSP: 0018:ffff88800bcd7368 EFLAGS: 00010246 [ 343.242589] RAX: 00000000ba7c0a9c RBX: 0000000000000001 RCX: 0000000000000000 [ 343.243542] RDX: 0000000000000000 RSI: ffff88800f8edb10 RDI: ffff88800f8eda40 [ 343.244474] RBP: ffff88800bcd7458 R08: 0000000000000000 R09: ffffffff94fb8445 [ 343.245403] R10: ffffffff94fb8336 R11: ffffffff94fb8445 R12: 0000000000000000 [ 343.246355] R13: ffff88800a5a7000 R14: ffff88800a5b5800 R15: 0000000000000020 [ 343.247291] FS: 00007fdde2bd7700(0000) GS:ffff888109780000(0000) knlGS:0000000000000000 [ 343.248350] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 343.249120] CR2: 00000000200000c0 CR3: 000000000ef4c000 CR4: 00000000000006e0 [ 343.250076] Call Trace: [ 343.250423] <TASK> [ 343.250713] ? memcpy+0x4d/0x60 [ 343.251162] ? netem_init+0xa0/0xa0 [sch_netem] [ 343.251795] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.252443] netem_enqueue+0xe28/0x33c0 [sch_netem] [ 343.253102] ? stack_trace_save+0x87/0xb0 [ 343.253655] ? filter_irq_stacks+0xb0/0xb0 [ 343.254220] ? netem_init+0xa0/0xa0 [sch_netem] [ 343.254837] ? __kasan_check_write+0x14/0x20 [ 343.255418] ? _raw_spin_lock+0x88/0xd6 [ 343.255953] dev_qdisc_enqueue+0x50/0x180 [ 343.256508] __dev_queue_xmit+0x1a7e/0x3090 [ 343.257083] ? netdev_core_pick_tx+0x300/0x300 [ 343.257690] ? check_kcov_mode+0x10/0x40 [ 343.258219] ? _raw_spin_unlock_irqrestore+0x29/0x40 [ 343.258899] ? __kasan_init_slab_obj+0x24/0x30 [ 343.259529] ? setup_object.isra.71+0x23/0x90 [ 343.260121] ? new_slab+0x26e/0x4b0 [ 343.260609] ? kasan_poison+0x3a/0x50 [ 343.261118] ? kasan_unpoison+0x28/0x50 [ 343.261637] ? __kasan_slab_alloc+0x71/0x90 [ 343.262214] ? memcpy+0x4d/0x60 [ 343.262674] ? write_comp_data+0x2f/0x90 [ 343.263209] ? __kasan_check_write+0x14/0x20 [ 343.263802] ? __skb_clone+0x5d6/0x840 [ 343.264329] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.264958] dev_queue_xmit+0x1c/0x20 [ 343.265470] netlink_deliver_tap+0x652/0x9c0 [ 343.266067] netlink_unicast+0x5a0/0x7f0 [ 343.266608] ? netlink_attachskb+0x860/0x860 [ 343.267183] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.267820] ? write_comp_data+0x2f/0x90 [ 343.268367] netlink_sendmsg+0x922/0xe80 [ 343.268899] ? netlink_unicast+0x7f0/0x7f0 [ 343.269472] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.270099] ? write_comp_data+0x2f/0x90 [ 343.270644] ? netlink_unicast+0x7f0/0x7f0 [ 343.271210] sock_sendmsg+0x155/0x190 [ 343.271721] ____sys_sendmsg+0x75f/0x8f0 [ 343.272262] ? kernel_sendmsg+0x60/0x60 [ 343.272788] ? write_comp_data+0x2f/0x90 [ 343.273332] ? write_comp_data+0x2f/0x90 [ 343.273869] ___sys_sendmsg+0x10f/0x190 [ 343.274405] ? sendmsg_copy_msghdr+0x80/0x80 [ 343.274984] ? slab_post_alloc_hook+0x70/0x230 [ 343.275597] ? futex_wait_setup+0x240/0x240 [ 343.276175] ? security_file_alloc+0x3e/0x170 [ 343.276779] ? write_comp_data+0x2f/0x90 [ 343.277313] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.277969] ? 
write_comp_data+0x2f/0x90 [ 343.278515] ? __fget_files+0x1ad/0x260 [ 343.279048] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.279685] ? write_comp_data+0x2f/0x90 [ 343.280234] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.280874] ? sockfd_lookup_light+0xd1/0x190 [ 343.281481] __sys_sendmsg+0x118/0x200 [ 343.281998] ? __sys_sendmsg_sock+0x40/0x40 [ 343.282578] ? alloc_fd+0x229/0x5e0 [ 343.283070] ? write_comp_data+0x2f/0x90 [ 343.283610] ? write_comp_data+0x2f/0x90 [ 343.284135] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.284776] ? ktime_get_coarse_real_ts64+0xb8/0xf0 [ 343.285450] __x64_sys_sendmsg+0x7d/0xc0 [ 343.285981] ? syscall_enter_from_user_mode+0x4d/0x70 [ 343.286664] do_syscall_64+0x3a/0x80 [ 343.287158] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 343.287850] RIP: 0033:0x7fdde24cf289 [ 343.288344] Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b7 db 2c 00 f7 d8 64 89 01 48 [ 343.290729] RSP: 002b:00007fdde2bd6d98 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 343.291730] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fdde24cf289 [ 343.292673] RDX: 0000000000000000 RSI: 00000000200000c0 RDI: 0000000000000004 [ 343.293618] RBP: 00007fdde2bd6e20 R08: 0000000100000001 R09: 0000000000000000 [ 343.294557] R10: 0000000100000001 R11: 0000000000000246 R12: 0000000000000000 [ 343.295493] R13: 0000000000021000 R14: 0000000000000000 R15: 00007fdde2bd7700 [ 343.296432] </TASK> [ 343.296735] Modules linked in: sch_netem ip6_vti ip_vti ip_gre ipip sit ip_tunnel geneve macsec macvtap tap ipvlan macvlan 8021q garp mrp hsr wireguard libchacha20poly1305 chacha_x86_64 poly1305_x86_64 ip6_udp_tunnel udp_tunnel libblake2s blake2s_x86_64 libblake2s_generic curve25519_x86_64 libcurve25519_generic libchacha xfrm_interface xfrm6_tunnel tunnel4 veth netdevsim psample batman_adv nlmon dummy team bonding tls vcan ip6_gre ip6_tunnel tunnel6 gre tun ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set ebtable_nat ebtable_broute ip6table_nat ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle iptable_security iptable_raw ebtable_filter ebtables rfkill ip6table_filter ip6_tables iptable_filter ppdev bochs drm_vram_helper drm_ttm_helper ttm drm_kms_helper cec parport_pc drm joydev floppy parport sg syscopyarea sysfillrect sysimgblt i2c_piix4 qemu_fw_cfg fb_sys_fops pcspkr [ 343.297459] ip_tables xfs virtio_net net_failover failover sd_mod sr_mod cdrom t10_pi ata_generic pata_acpi ata_piix libata virtio_pci virtio_pci_legacy_dev serio_raw virtio_pci_modern_dev dm_mirror dm_region_hash dm_log dm_mod [ 343.311074] Dumping ftrace buffer: [ 343.311532] (ftrace buffer empty) [ 343.312040] ---[ end trace a2e3db5a6ae05099 ]--- [ 343.312691] RIP: 0010:netem_enqueue+0x1590/0x33c0 [sch_netem] [ 343.313481] Code: 89 85 58 ff ff ff e8 5f 5d e9 d3 48 8b b5 48 ff ff ff 8b 8d 50 ff ff ff 8b 85 58 ff ff ff 48 8b bd 70 ff ff ff 31 d2 2b 4f 74 <f7> f1 48 b8 00 00 00 00 00 fc ff df 49 01 d5 4c 89 e9 48 c1 e9 03 [ 343.315893] RSP: 0018:ffff88800bcd7368 EFLAGS: 00010246 [ 343.316622] RAX: 00000000ba7c0a9c RBX: 0000000000000001 RCX: 0000000000000000 [ 343.317585] RDX: 0000000000000000 RSI: ffff88800f8edb10 RDI: ffff88800f8eda40 [ 343.318549] RBP: ffff88800bcd7458 R08: 0000000000000000 R09: ffffffff94fb8445 [ 343.319503] R10: ffffffff94fb8336 R11: ffffffff94fb8445 R12: 0000000000000000 [ 343.320455] R13: 
ffff88800a5a7000 R14: ffff88800a5b5800 R15: 0000000000000020 [ 343.321414] FS: 00007fdde2bd7700(0000) GS:ffff888109780000(0000) knlGS:0000000000000000 [ 343.322489] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 343.323283] CR2: 00000000200000c0 CR3: 000000000ef4c000 CR4: 00000000000006e0 [ 343.324264] Kernel panic - not syncing: Fatal exception in interrupt [ 343.333717] Dumping ftrace buffer: [ 343.334175] (ftrace buffer empty) [ 343.334653] Kernel Offset: 0x13600000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 343.336027] Rebooting in 86400 seconds..
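A minimal userspace sketch of the now-rejected case (a hypothetical demo, not the syzkaller reproducer; hitting the original crash also required an nlmon tap device with a netem corrupt qdisc attached):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	struct msghdr msg = {
		.msg_name = &addr,
		.msg_namelen = sizeof(addr),
		/* no iovecs at all, so the total message length is 0 */
	};

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* With this patch the zero length message is rejected with ENODATA. */
	if (sendmsg(fd, &msg, 0) < 0)
		perror("sendmsg");
	return 0;
}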
Reported-by: syzkaller syzkaller@googlegroups.com Signed-off-by: Harshit Mogalapalli harshit.m.mogalapalli@oracle.com Link: https://lore.kernel.org/r/20211129175328.55339-1-harshit.m.mogalapalli@oracl... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netlink/af_netlink.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 305248cfb5d2f..6ff327c44c9fa 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1852,6 +1852,11 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP;
+ if (len == 0) { + pr_warn_once("Zero length message leads to an empty skb\n"); + return -ENODATA; + } + err = scm_send(sock, msg, &scm, true); if (err < 0) return err;
From: Joe Thornber ejt@redhat.com
stable inclusion from linux-4.19.222 commit 293f957be5e39720778fb1851ced7f5fba6d51c3
--------------------------------
commit 1b8d2789dad0005fd5e7d35dab26a8e1203fb6da upstream.
Move dm_tm_unlock() after dm_tm_dec().
Cc: stable@vger.kernel.org Signed-off-by: Joe Thornber ejt@redhat.com Signed-off-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/md/persistent-data/dm-btree-remove.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index 9e4d1212f4c16..63f2baed3c8a6 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -423,9 +423,9 @@ static int rebalance_children(struct shadow_spine *s,
memcpy(n, dm_block_data(child), dm_bm_block_size(dm_tm_get_bm(info->tm))); - dm_tm_unlock(info->tm, child);
dm_tm_dec(info->tm, dm_block_location(child)); + dm_tm_unlock(info->tm, child); return 0; }
From: Tom Lendacky thomas.lendacky@amd.com
stable inclusion from linux-4.19.222 commit 4355c9f343f90d879cdde497a1d1adc54ff813aa
--------------------------------
commit 1ff2fc02862d52e18fd3daabcfe840ec27e920a8 upstream.
Reserving memory using efi_mem_reserve() calls into the x86 efi_arch_mem_reserve() function. This function will insert a new EFI memory descriptor into the EFI memory map representing the area of memory to be reserved and marking it as EFI runtime memory. As part of adding this new entry, a new EFI memory map is allocated and mapped. The mapping is where a problem can occur. This new memory map is mapped using early_memremap() and generally mapped encrypted, unless the new memory for the mapping happens to come from an area of memory that is marked as EFI_BOOT_SERVICES_DATA memory. In this case, the new memory will be mapped unencrypted. However, during replacement of the old memory map, efi_mem_type() is disabled, so the new memory map will now be long-term mapped encrypted (in efi.memmap), resulting in the map containing invalid data and causing the kernel boot to crash.
Since it is known that the area will be mapped encrypted going forward, explicitly map the new memory map as encrypted using early_memremap_prot().
Cc: stable@vger.kernel.org # 4.14.x Fixes: 8f716c9b5feb ("x86/mm: Add support to access boot related data in the clear") Link: https://lore.kernel.org/all/ebf1eb2940405438a09d51d121ec0d02c8755558.1634752... Signed-off-by: Tom Lendacky thomas.lendacky@amd.com [ardb: incorporate Kconfig fix by Arnd] Signed-off-by: Ard Biesheuvel ardb@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/Kconfig | 1 + arch/x86/platform/efi/quirks.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e8d2ca2009d8d..4ba0574941d96 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1980,6 +1980,7 @@ config EFI depends on ACPI select UCS2_STRING select EFI_RUNTIME_WRAPPERS + select ARCH_USE_MEMREMAP_PROT ---help--- This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index c9873c9168ad9..006eb09e95879 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -278,7 +278,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) return; }
- new = early_memremap(new_phys, new_size); + new = early_memremap_prot(new_phys, new_size, + pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL))); if (!new) { pr_err("Failed to map new boot services memmap\n"); return;
From: Alyssa Ross hi@alyssa.is
stable inclusion from linux-4.19.222 commit 109627a6d54d2e44d7bbdb6c73be345cbd32511f
--------------------------------
[ Upstream commit 822c9f2b833c53fc67e8adf6f63ecc3ea24d502c ]
modprobe can't handle spaces in aliases.
Fixes: 6b4cd727eaf1 ("dmaengine: st_fdma: Add STMicroelectronics FDMA engine driver support") Signed-off-by: Alyssa Ross hi@alyssa.is Link: https://lore.kernel.org/r/20211125154441.2626214-1-hi@alyssa.is Signed-off-by: Vinod Koul vkoul@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/dma/st_fdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/dma/st_fdma.c b/drivers/dma/st_fdma.c index bfb79bd0c6de5..087d22ba8a2f6 100644 --- a/drivers/dma/st_fdma.c +++ b/drivers/dma/st_fdma.c @@ -886,4 +886,4 @@ MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("STMicroelectronics FDMA engine driver"); MODULE_AUTHOR("Ludovic.barre Ludovic.barre@st.com"); MODULE_AUTHOR("Peter Griffin peter.griffin@linaro.org"); -MODULE_ALIAS("platform: " DRIVER_NAME); +MODULE_ALIAS("platform:" DRIVER_NAME);
From: Willem de Bruijn willemb@google.com
stable inclusion from linux-4.19.222 commit 18c73170de6719491f79b04c727ea8314c246b03
--------------------------------
[ Upstream commit ec6af094ea28f0f2dda1a6a33b14cd57e36a9755 ]
Packet sockets may switch ring versions. Avoid misinterpreting state between versions, whose fields share a union. rx_owner_map is only allocated with a packet ring (pg_vec) and both are swapped together. If pg_vec is NULL, meaning no packet ring was allocated, then neither was rx_owner_map. And the field may be old state from a tpacket_v3.
Fixes: 61fad6816fc1 ("net/packet: tpacket_rcv: avoid a producer race condition") Reported-by: Syzbot syzbot+1ac0994a0a0c55151121@syzkaller.appspotmail.com Signed-off-by: Willem de Bruijn willemb@google.com Reviewed-by: Eric Dumazet edumazet@google.com Link: https://lore.kernel.org/r/20211215143937.106178-1-willemdebruijn.kernel@gmai... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/packet/af_packet.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 743193041ccee..fbf7d5ef83c71 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4439,9 +4439,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, }
out_free_pg_vec: - bitmap_free(rx_owner_map); - if (pg_vec) + if (pg_vec) { + bitmap_free(rx_owner_map); free_pg_vec(pg_vec, order, req->tp_block_nr); + } out: return err; }
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.222 commit e56b65c1e74d7f706d74b51baba15187be2fb4b5
--------------------------------
[ Upstream commit e28587cc491ef0f3c51258fdc87fbc386b1d4c59 ]
ipip6_dev_free() is the sit device's ->priv_destructor and is already called by register_netdevice() if something goes wrong.
An alternative would be to make ipip6_dev_free() robust against multiple invocations, but other drivers do not implement this strategy.
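The resulting error-handling pattern, as a minimal sketch (a hypothetical demo driver, not the sit code): once ->priv_destructor is set, a failed register_netdevice() has already run it, so the caller only calls free_netdev():

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static void demo_priv_destructor(struct net_device *ndev)
{
	/* release per-device private state here */
}

static int demo_create(void)	/* caller holds RTNL, as sit_init_net() does */
{
	struct net_device *ndev;
	int err;

	ndev = alloc_netdev(0, "demo%d", NET_NAME_UNKNOWN, ether_setup);
	if (!ndev)
		return -ENOMEM;
	ndev->priv_destructor = demo_priv_destructor;

	err = register_netdevice(ndev);
	if (err) {
		/*
		 * ->priv_destructor already ran inside the failed
		 * register_netdevice(); calling it again here would be
		 * the double free this patch removes.
		 */
		free_netdev(ndev);
		return err;
	}
	return 0;
}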
syzbot reported:
dst_release underflow WARNING: CPU: 0 PID: 5059 at net/core/dst.c:173 dst_release+0xd8/0xe0 net/core/dst.c:173 Modules linked in: CPU: 1 PID: 5059 Comm: syz-executor.4 Not tainted 5.16.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:dst_release+0xd8/0xe0 net/core/dst.c:173 Code: 4c 89 f2 89 d9 31 c0 5b 41 5e 5d e9 da d5 44 f9 e8 1d 90 5f f9 c6 05 87 48 c6 05 01 48 c7 c7 80 44 99 8b 31 c0 e8 e8 67 29 f9 <0f> 0b eb 85 0f 1f 40 00 53 48 89 fb e8 f7 8f 5f f9 48 83 c3 a8 48 RSP: 0018:ffffc9000aa5faa0 EFLAGS: 00010246 RAX: d6894a925dd15a00 RBX: 00000000ffffffff RCX: 0000000000040000 RDX: ffffc90005e19000 RSI: 000000000003ffff RDI: 0000000000040000 RBP: 0000000000000000 R08: ffffffff816a1f42 R09: ffffed1017344f2c R10: ffffed1017344f2c R11: 0000000000000000 R12: 0000607f462b1358 R13: 1ffffffff1bfd305 R14: ffffe8ffffcb1358 R15: dffffc0000000000 FS: 00007f66c71a2700(0000) GS:ffff8880b9a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f88aaed5058 CR3: 0000000023e0f000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> dst_cache_destroy+0x107/0x1e0 net/core/dst_cache.c:160 ipip6_dev_free net/ipv6/sit.c:1414 [inline] sit_init_net+0x229/0x550 net/ipv6/sit.c:1936 ops_init+0x313/0x430 net/core/net_namespace.c:140 setup_net+0x35b/0x9d0 net/core/net_namespace.c:326 copy_net_ns+0x359/0x5c0 net/core/net_namespace.c:470 create_new_namespaces+0x4ce/0xa00 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0x11e/0x180 kernel/nsproxy.c:226 ksys_unshare+0x57d/0xb50 kernel/fork.c:3075 __do_sys_unshare kernel/fork.c:3146 [inline] __se_sys_unshare kernel/fork.c:3144 [inline] __x64_sys_unshare+0x34/0x40 kernel/fork.c:3144 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f66c882ce99 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f66c71a2168 EFLAGS: 00000246 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 00007f66c893ff60 RCX: 00007f66c882ce99 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000048040200 RBP: 00007f66c8886ff1 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fff6634832f R14: 00007f66c71a2300 R15: 0000000000022000 </TASK>
Fixes: cf124db566e6 ("net: Fix inconsistent teardown and release of private netdev state.") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Link: https://lore.kernel.org/r/20211216111741.1387540-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv6/sit.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index bcd690ccb551e..f2a2065d640ff 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1875,7 +1875,6 @@ static int __net_init sit_init_net(struct net *net) return 0;
 err_reg_dev:
-	ipip6_dev_free(sitn->fb_tunnel_dev);
 	free_netdev(sitn->fb_tunnel_dev);
 err_alloc_dev:
 	return err;
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.222 commit 107883a5bc2084ceb2d2241abcdd4dd450c6eeab
--------------------------------
commit 94185adbfad56815c2c8401e16d81bdb74a79201 upstream.
PCI_MSIX_FLAGS_MASKALL is set in the MSI-X control register at MSI-X interrupt setup time. It's cleared on success, but the error handling path only clears the PCI_MSIX_FLAGS_ENABLE bit.
That's incorrect as the reset state of the PCI_MSIX_FLAGS_MASKALL bit is zero. That can be observed via lspci:
Capabilities: [b0] MSI-X: Enable- Count=67 Masked+
Clear the bit in the error path to restore the reset state.
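For illustration, this is roughly what the pci_msix_clear_and_set_ctrl() helper used in the fix does, sketched here under the assumption of the usual read-modify-write of the MSI-X Message Control word (the demo_ name is hypothetical); with the fix, the error path passes both flag bits in the clear mask so the register returns to its reset state.

#include <linux/pci.h>

static void demo_msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
{
        u16 ctrl;

        /* Message Control word lives at msix_cap + PCI_MSIX_FLAGS */
        pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
        ctrl &= ~clear;
        ctrl |= set;
        pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, ctrl);
}

/* error path after the fix: clear MASKALL as well as ENABLE */
/* demo_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0); */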
Fixes: 438553958ba1 ("PCI/MSI: Enable and mask MSI-X early") Reported-by: Stefan Roese sr@denx.de Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Stefan Roese sr@denx.de Cc: linux-pci@vger.kernel.org Cc: Bjorn Helgaas bhelgaas@google.com Cc: Michal Simek michal.simek@xilinx.com Cc: Marek Vasut marex@denx.de Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87tufevoqx.ffs@tglx Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 9c1ee2ee6cc0d..bf560dcf8dd4b 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -861,7 +861,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, free_msi_irqs(dev);
 out_disable:
-	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0);
return ret; }
From: Miklos Szeredi mszeredi@redhat.com
stable inclusion from linux-4.19.222 commit 55e0c283fa6857e66acfc44984ceea893e1b8451
--------------------------------
commit bda9a71980e083699a0360963c0135657b73f47a upstream.
Add missing inode lock annotation in fuse_reverse_inval_entry(); found by syzbot.
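To illustrate the annotation (a generic sketch with a hypothetical name, not the fuse code): when a parent directory inode is locked on a path where a child inode may also be locked, the parent should be taken with the I_MUTEX_PARENT lockdep subclass so the two i_rwsem acquisitions are distinguished and not reported as a possible self-deadlock.

#include <linux/fs.h>

static void demo_lock_parent_and_child(struct inode *parent, struct inode *child)
{
        inode_lock_nested(parent, I_MUTEX_PARENT);      /* parent subclass */
        inode_lock(child);                              /* default subclass */

        /* ... operate on the child while the parent is held ... */

        inode_unlock(child);
        inode_unlock(parent);
}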
Reported-and-tested-by: syzbot+9f747458f5990eaa8d43@syzkaller.appspotmail.com Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 29d8aa4844ff2..b468ccb296c34 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1002,7 +1002,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!parent) return -ENOENT;
-	inode_lock(parent);
+	inode_lock_nested(parent, I_MUTEX_PARENT);
 	if (!S_ISDIR(parent->i_mode))
 		goto unlock;
From: Miklos Szeredi mszeredi@redhat.com
stable inclusion from linux-4.19.222 commit 445d2dc63e5871d218f21b8f62ab29ac72f2e6b8
--------------------------------
commit 1f5573cfe7a7056e80a92c7a037a3e69f3a13d1c upstream.
Syzbot triggered the following warning in ovl_workdir_create() -> ovl_create_real():
if (!err && WARN_ON(!newdentry->d_inode)) {
The reason is that the cgroup2 filesystem returns from mkdir without instantiating the new dentry.
Weird filesystems such as this will be rejected by overlayfs at a later stage during setup, but to prevent such a warning, call ovl_mkdir_real() directly from ovl_workdir_create() and reject this case early.
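A sketch of the pattern being introduced (simplified, with a hypothetical helper name and the 4.19-era vfs_mkdir() signature; the real change routes ovl_workdir_create() through ovl_mkdir_real()): after a successful mkdir, check whether the filesystem actually instantiated the dentry before using its inode.

#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/err.h>

static struct dentry *demo_mkdir_checked(struct inode *dir, struct dentry *dentry,
                                          umode_t mode)
{
        int err = vfs_mkdir(dir, dentry, mode);

        if (err)
                return ERR_PTR(err);

        /* mkdir reported success but left a hashed negative dentry (e.g. kernfs) */
        if (d_really_is_negative(dentry))
                return ERR_PTR(-EINVAL);

        return dentry;
}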
Reported-and-tested-by: syzbot+75eab84fd0af9e8bf66b@syzkaller.appspotmail.com Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/overlayfs/dir.c | 3 +-- fs/overlayfs/overlayfs.h | 1 + fs/overlayfs/super.c | 12 ++++++++---- 3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 72e95c9c244de..8ee7b77effcdd 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -116,8 +116,7 @@ int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir, goto out; }
-static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry,
-			  umode_t mode)
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
 {
 	int err;
 	struct dentry *d, *dentry = *newdentry;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 9199e16653117..76e096ad2ef90 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -411,6 +411,7 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
+int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode);
 struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
 			       struct ovl_cattr *attr);
 int ovl_cleanup(struct inode *dir, struct dentry *dentry);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 20c20f6210a5f..6729999f4a66c 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -654,10 +654,14 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, goto retry; }
-	work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode));
-	err = PTR_ERR(work);
-	if (IS_ERR(work))
-		goto out_err;
+	err = ovl_mkdir_real(dir, &work, attr.ia_mode);
+	if (err)
+		goto out_dput;
+
+	/* Weird filesystem returning with hashed negative (kernfs)? */
+	err = -EINVAL;
+	if (d_really_is_negative(work))
+		goto out_dput;
/* * Try to remove POSIX ACL xattrs from workdir. We are good if:
From: George Kennedy george.kennedy@oracle.com
stable inclusion from linux-4.19.222 commit 04181973c38f3d6a353f9246dcf7fee08024fd9e
--------------------------------
commit e0a2c28da11e2c2b963fc01d50acbf03045ac732 upstream.
In resp_mode_select(), sanity check the block descriptor length to avoid a use-after-free.
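The layout being validated, as a simplified sketch (hypothetical helper name, not the scsi_debug code): the first mode page starts after the parameter header (4 bytes for MODE SELECT(6), 8 bytes for MODE SELECT(10)) plus bd_len bytes of block descriptors, and that offset must fall inside the bytes actually transferred before arr[off] is read.

#include <linux/types.h>
#include <asm/unaligned.h>

static bool demo_mode_page_off_ok(const unsigned char *arr, int res, bool select6)
{
        int bd_len = select6 ? arr[3] : get_unaligned_be16(arr + 6);
        int off = bd_len + (select6 ? 4 : 8);

        /* otherwise arr[off] would read past the received parameter data */
        return off < res;
}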
BUG: KASAN: use-after-free in resp_mode_select+0xa4c/0xb40 drivers/scsi/scsi_debug.c:2509
Read of size 1 at addr ffff888026670f50 by task scsicmd/15032
CPU: 1 PID: 15032 Comm: scsicmd Not tainted 5.15.0-01d0625 #15
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
Call Trace:
 <TASK>
 dump_stack_lvl+0x89/0xb5 lib/dump_stack.c:107
 print_address_description.constprop.9+0x28/0x160 mm/kasan/report.c:257
 kasan_report.cold.14+0x7d/0x117 mm/kasan/report.c:443
 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report_generic.c:306
 resp_mode_select+0xa4c/0xb40 drivers/scsi/scsi_debug.c:2509
 schedule_resp+0x4af/0x1a10 drivers/scsi/scsi_debug.c:5483
 scsi_debug_queuecommand+0x8c9/0x1e70 drivers/scsi/scsi_debug.c:7537
 scsi_queue_rq+0x16b4/0x2d10 drivers/scsi/scsi_lib.c:1521
 blk_mq_dispatch_rq_list+0xb9b/0x2700 block/blk-mq.c:1640
 __blk_mq_sched_dispatch_requests+0x28f/0x590 block/blk-mq-sched.c:325
 blk_mq_sched_dispatch_requests+0x105/0x190 block/blk-mq-sched.c:358
 __blk_mq_run_hw_queue+0xe5/0x150 block/blk-mq.c:1762
 __blk_mq_delay_run_hw_queue+0x4f8/0x5c0 block/blk-mq.c:1839
 blk_mq_run_hw_queue+0x18d/0x350 block/blk-mq.c:1891
 blk_mq_sched_insert_request+0x3db/0x4e0 block/blk-mq-sched.c:474
 blk_execute_rq_nowait+0x16b/0x1c0 block/blk-exec.c:63
 sg_common_write.isra.18+0xeb3/0x2000 drivers/scsi/sg.c:837
 sg_new_write.isra.19+0x570/0x8c0 drivers/scsi/sg.c:775
 sg_ioctl_common+0x14d6/0x2710 drivers/scsi/sg.c:941
 sg_ioctl+0xa2/0x180 drivers/scsi/sg.c:1166
 __x64_sys_ioctl+0x19d/0x220 fs/ioctl.c:52
 do_syscall_64+0x3a/0x80 arch/x86/entry/common.c:50
 entry_SYSCALL_64_after_hwframe+0x44/0xae arch/x86/entry/entry_64.S:113
Link: https://lore.kernel.org/r/1637262208-28850-1-git-send-email-george.kennedy@o... Reported-by: syzkaller syzkaller@googlegroups.com Acked-by: Douglas Gilbert dgilbert@interlog.com Signed-off-by: George Kennedy george.kennedy@oracle.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/scsi_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index f283905e910f6..e0279b0b5ec85 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -2302,11 +2302,11 @@ static int resp_mode_select(struct scsi_cmnd *scp,
 			    __func__, param_len, res);
 	md_len = mselect6 ? (arr[0] + 1) : (get_unaligned_be16(arr + 0) + 2);
 	bd_len = mselect6 ? arr[3] : get_unaligned_be16(arr + 6);
-	if (md_len > 2) {
+	off = bd_len + (mselect6 ? 4 : 8);
+	if (md_len > 2 || off >= res) {
 		mk_sense_invalid_fld(scp, SDEB_IN_DATA, 0, -1);
 		return check_condition_result;
 	}
-	off = bd_len + (mselect6 ? 4 : 8);
 	mpage = arr[off] & 0x3f;
 	ps = !!(arr[off] & 0x80);
 	if (ps) {