From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.6 commit c54a274 category: bugfix bugzilla: 93967 CVE: NA
-----------------------------------------------
We would like to use hlist_unhashed() from timer_pending(), which runs without protection of a lock.
Note that other callers might also want to use this variant.
Instead of forcing a READ_ONCE() for all hlist_unhashed() callers, add a new helper with an explicit _lockless suffix in the name to better document what is going on.
Also add various WRITE_ONCE() in __hlist_del(), hlist_add_head() and hlist_add_before()/hlist_add_behind() to pair with the READ_ONCE().
Signed-off-by: Eric Dumazet edumazet@google.com Cc: Thomas Gleixner tglx@linutronix.de [ paulmck: Also add WRITE_ONCE() to rculist.h. ] Signed-off-by: Paul E. McKenney paulmck@kernel.org
Conflicts: include/linux/list.h [wangxiongfeng: include patch commit ae325dcd1 ("list: Don't use WRITE_ONCE() in hlist_add_behind()")] Signed-off-by: Xiongfeng Wang wangxiongfeng2@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/list.h | 30 ++++++++++++++++++++---------- include/linux/rculist.h | 24 ++++++++++++------------ 2 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/include/linux/list.h b/include/linux/list.h index 0e540581d52c4..fc0e87f94d286 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -676,6 +676,16 @@ static inline int hlist_unhashed(const struct hlist_node *h) return !h->pprev; }
+/* This variant of hlist_unhashed() must be used in lockless contexts + * to avoid potential load-tearing. + * The READ_ONCE() is paired with the various WRITE_ONCE() in hlist + * helpers that are defined below. + */ +static inline int hlist_unhashed_lockless(const struct hlist_node *h) +{ + return !READ_ONCE(h->pprev); +} + static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); @@ -688,7 +698,7 @@ static inline void __hlist_del(struct hlist_node *n)
WRITE_ONCE(*pprev, next); if (next) - next->pprev = pprev; + WRITE_ONCE(next->pprev, pprev); }
static inline void hlist_del(struct hlist_node *n) @@ -709,32 +719,32 @@ static inline void hlist_del_init(struct hlist_node *n) static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; - n->next = first; + WRITE_ONCE(n->next, first); if (first) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); }
/* next must be != NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { - n->pprev = next->pprev; - n->next = next; - next->pprev = &n->next; + WRITE_ONCE(n->pprev, next->pprev); + WRITE_ONCE(n->next, next); + WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); }
static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { - n->next = prev->next; + WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); - n->pprev = &prev->next; + WRITE_ONCE(n->pprev, &prev->next);
if (n->next) - n->next->pprev = &n->next; + WRITE_ONCE(n->next->pprev, &n->next); }
/* after that we'll appear to be on some hlist and hlist_del will work */ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4786c2235b981..74ba25c858429 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -155,7 +155,7 @@ static inline void hlist_del_init_rcu(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); - n->pprev = NULL; + WRITE_ONCE(n->pprev, NULL); } }
@@ -455,7 +455,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); - n->pprev = LIST_POISON2; + WRITE_ONCE(n->pprev, LIST_POISON2); }
/** @@ -471,11 +471,11 @@ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *next = old->next;
new->next = next; - new->pprev = old->pprev; + WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) - new->next->pprev = &new->next; - old->pprev = LIST_POISON2; + WRITE_ONCE(new->next->pprev, &new->next); + WRITE_ONCE(old->pprev, LIST_POISON2); }
/* @@ -510,10 +510,10 @@ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_node *first = h->first;
n->next = first; - n->pprev = &h->first; + WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_first_rcu(h), n); if (first) - first->pprev = &n->next; + WRITE_ONCE(first->pprev, &n->next); }
/** @@ -546,7 +546,7 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n,
if (last) { n->next = last->next; - n->pprev = &last->next; + WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_next_rcu(last), n); } else { hlist_add_head_rcu(n, h); @@ -574,10 +574,10 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { - n->pprev = next->pprev; + WRITE_ONCE(n->pprev, next->pprev); n->next = next; rcu_assign_pointer(hlist_pprev_rcu(n), n); - next->pprev = &n->next; + WRITE_ONCE(next->pprev, &n->next); }
/** @@ -602,10 +602,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; - n->pprev = &prev->next; + WRITE_ONCE(n->pprev, &prev->next); rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) - n->next->pprev = &n->next; + WRITE_ONCE(n->next->pprev, &n->next); }
#define __hlist_for_each_rcu(pos, head) \
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.6 commit 90c018942c2babab73814648b37808fc3bf2ed1a category: bugfix bugzilla: 93967 CVE: NA
-----------------------------------------------
The timer_pending() function is mostly used in lockless contexts, so Without proper annotations, KCSAN might detect a data-race [1].
Using hlist_unhashed_lockless() instead of hand-coding it seems appropriate (as suggested by Paul E. McKenney).
[1]
BUG: KCSAN: data-race in del_timer / detach_if_pending
write to 0xffff88808697d870 of 8 bytes by task 10 on cpu 0: __hlist_del include/linux/list.h:764 [inline] detach_timer kernel/time/timer.c:815 [inline] detach_if_pending+0xcd/0x2d0 kernel/time/timer.c:832 try_to_del_timer_sync+0x60/0xb0 kernel/time/timer.c:1226 del_timer_sync+0x6b/0xa0 kernel/time/timer.c:1365 schedule_timeout+0x2d2/0x6e0 kernel/time/timer.c:1896 rcu_gp_fqs_loop+0x37c/0x580 kernel/rcu/tree.c:1639 rcu_gp_kthread+0x143/0x230 kernel/rcu/tree.c:1799 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352
read to 0xffff88808697d870 of 8 bytes by task 12060 on cpu 1: del_timer+0x3b/0xb0 kernel/time/timer.c:1198 sk_stop_timer+0x25/0x60 net/core/sock.c:2845 inet_csk_clear_xmit_timers+0x69/0xa0 net/ipv4/inet_connection_sock.c:523 tcp_clear_xmit_timers include/net/tcp.h:606 [inline] tcp_v4_destroy_sock+0xa3/0x3f0 net/ipv4/tcp_ipv4.c:2096 inet_csk_destroy_sock+0xf4/0x250 net/ipv4/inet_connection_sock.c:836 tcp_close+0x6f3/0x970 net/ipv4/tcp.c:2497 inet_release+0x86/0x100 net/ipv4/af_inet.c:427 __sock_release+0x85/0x160 net/socket.c:590 sock_close+0x24/0x30 net/socket.c:1268 __fput+0x1e1/0x520 fs/file_table.c:280 ____fput+0x1f/0x30 fs/file_table.c:313 task_work_run+0xf6/0x130 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop+0x2b4/0x2c0 arch/x86/entry/common.c:163
Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 12060 Comm: syz-executor.5 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine,
Signed-off-by: Eric Dumazet edumazet@google.com Cc: Thomas Gleixner tglx@linutronix.de [ paulmck: Pulled in Eric's later amendments. ] Signed-off-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Xiongfeng Wang wangxiongfeng2@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/timer.h | 2 +- kernel/time/timer.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/include/linux/timer.h b/include/linux/timer.h index b1e4b2e9335b4..4759f2d940efa 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -159,7 +159,7 @@ static inline void destroy_timer_on_stack(struct timer_list *timer) { } */ static inline int timer_pending(const struct timer_list * timer) { - return timer->entry.pprev != NULL; + return !hlist_unhashed_lockless(&timer->entry); }
extern void add_timer_on(struct timer_list *timer, int cpu); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a6e88d9bb931c..8b6b33b81ac4d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -958,6 +958,7 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
#define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 +#define MOD_TIMER_NOTPENDING 0x04
static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) @@ -974,7 +975,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ - if (timer_pending(timer)) { + if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new @@ -1147,7 +1148,7 @@ EXPORT_SYMBOL(timer_reduce); void add_timer(struct timer_list *timer) { BUG_ON(timer_pending(timer)); - mod_timer(timer, timer->expires); + __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer);
@@ -1814,7 +1815,7 @@ signed long __sched schedule_timeout(signed long timeout)
timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, 0); + __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); schedule(); del_singleshot_timer_sync(&timer.timer);