From: Lai Jiangshan <laijs@linux.alibaba.com>
mainline inclusion
from mainline-v5.15-rc1
commit f97a4a1a3f8769e3452885967955e21c88f3f263
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7LRJF
DTS: DTS2023072811781
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
There are two kinds of "delayed" work items in the workqueue subsystem.

One is for timer-delayed work items, which are visible to workqueue users. The other kind is for work items delayed by active management, which are not directly visible to workqueue users. Using the one word "delayed" for both kinds causes ambiguity.

This patch renames the latter kind (delayed by active management) to "inactive", because it is used for workqueue active management and most of its related symbols are already named with "active" or "activate".

All instances of "delayed" and "DELAYED" were checked and renamed one by one, to avoid accidentally renaming the other, timer-delayed kind.
No functional change intended.
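For reference, a minimal, hypothetical sketch of the two kinds from the user's side (the workqueue name, work functions and max_active value are made up for illustration; only the APIs shown are real):

	#include <linux/module.h>
	#include <linux/workqueue.h>

	static void my_work_fn(struct work_struct *work) { }

	static DECLARE_WORK(my_work1, my_work_fn);
	static DECLARE_WORK(my_work2, my_work_fn);
	static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);

	static int __init example_init(void)
	{
		/* max_active = 1: at most one item of this wq runs at a time */
		struct workqueue_struct *wq = alloc_workqueue("example_wq", 0, 1);

		if (!wq)
			return -ENOMEM;

		/* Kind 1: timer-delayed, visible in the API name itself. */
		queue_delayed_work(wq, &my_dwork, HZ);	/* fires ~1s later */

		queue_work(wq, &my_work1);	/* typically becomes active */
		/*
		 * Kind 2: if nr_active is already at max_active, this item
		 * goes onto the internal list formerly called delayed_works
		 * (now inactive_works), invisibly to the user -- the
		 * ambiguity this patch removes.
		 */
		queue_work(wq, &my_work2);
		return 0;
	}
	module_init(example_init);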
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
conflict: kernel/workqueue.c
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 include/linux/workqueue.h |  4 +--
 kernel/workqueue.c        | 58 +++++++++++++++++++--------------------
 2 files changed, 31 insertions(+), 31 deletions(-)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 6f2b042fc44c..3ac0fa0c822d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -30,7 +30,7 @@ void delayed_work_timer_fn(struct timer_list *t);
 enum {
 	WORK_STRUCT_PENDING_BIT	= 0,	/* work item is pending execution */
-	WORK_STRUCT_DELAYED_BIT	= 1,	/* work item is delayed */
+	WORK_STRUCT_INACTIVE_BIT= 1,	/* work item is inactive */
 	WORK_STRUCT_PWQ_BIT	= 2,	/* data points to pwq */
 	WORK_STRUCT_LINKED_BIT	= 3,	/* next work is linked to this one */
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
@@ -43,7 +43,7 @@ enum {
 	WORK_STRUCT_COLOR_BITS	= 4,
 
 	WORK_STRUCT_PENDING	= 1 << WORK_STRUCT_PENDING_BIT,
-	WORK_STRUCT_DELAYED	= 1 << WORK_STRUCT_DELAYED_BIT,
+	WORK_STRUCT_INACTIVE	= 1 << WORK_STRUCT_INACTIVE_BIT,
 	WORK_STRUCT_PWQ		= 1 << WORK_STRUCT_PWQ_BIT,
 	WORK_STRUCT_LINKED	= 1 << WORK_STRUCT_LINKED_BIT,
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 04b558a267ae..fe3c4a292909 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -207,7 +207,7 @@ struct pool_workqueue {
 						/* L: nr of in_flight works */
 	int			nr_active;	/* L: nr of active works */
 	int			max_active;	/* L: max active works */
-	struct list_head	delayed_works;	/* L: delayed works */
+	struct list_head	inactive_works;	/* L: inactive works */
 	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
 	struct list_head	mayday_node;	/* MD: node on wq->maydays */
@@ -1105,7 +1105,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
 	}
 }
 
-static void pwq_activate_delayed_work(struct work_struct *work)
+static void pwq_activate_inactive_work(struct work_struct *work)
 {
 	struct pool_workqueue *pwq = get_work_pwq(work);
 
@@ -1113,16 +1113,16 @@ static void pwq_activate_delayed_work(struct work_struct *work)
 	if (list_empty(&pwq->pool->worklist))
 		pwq->pool->watchdog_ts = jiffies;
 	move_linked_works(work, &pwq->pool->worklist, NULL);
-	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
+	__clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
 	pwq->nr_active++;
 }
 
-static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
+static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
 {
-	struct work_struct *work = list_first_entry(&pwq->delayed_works,
+	struct work_struct *work = list_first_entry(&pwq->inactive_works,
 						    struct work_struct, entry);
 
-	pwq_activate_delayed_work(work);
+	pwq_activate_inactive_work(work);
 }
 
 /**
@@ -1145,10 +1145,10 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
 	pwq->nr_in_flight[color]--;
 
 	pwq->nr_active--;
-	if (!list_empty(&pwq->delayed_works)) {
-		/* one down, submit a delayed one */
+	if (!list_empty(&pwq->inactive_works)) {
+		/* one down, submit an inactive one */
 		if (pwq->nr_active < pwq->max_active)
-			pwq_activate_first_delayed(pwq);
+			pwq_activate_first_inactive(pwq);
 	}
 
 	/* is flush in progress and are we at the flushing tip? */
@@ -1246,14 +1246,14 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 		debug_work_deactivate(work);
 
 		/*
-		 * A delayed work item cannot be grabbed directly because
+		 * An inactive work item cannot be grabbed directly because
 		 * it might have linked NO_COLOR work items which, if left
-		 * on the delayed_list, will confuse pwq->nr_active
+		 * on the inactive_works list, will confuse pwq->nr_active
 		 * management later on and cause stall.  Make sure the work
 		 * item is activated before grabbing.
 		 */
-		if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
-			pwq_activate_delayed_work(work);
+		if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
+			pwq_activate_inactive_work(work);
 
 		list_del_init(&work->entry);
 		pwq_dec_nr_in_flight(pwq, get_work_color(work));
@@ -1451,8 +1451,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 		if (list_empty(worklist))
 			pwq->pool->watchdog_ts = jiffies;
 	} else {
-		work_flags |= WORK_STRUCT_DELAYED;
-		worklist = &pwq->delayed_works;
+		work_flags |= WORK_STRUCT_INACTIVE;
+		worklist = &pwq->inactive_works;
 	}
 
 	debug_work_activate(work);
@@ -2496,7 +2496,7 @@ static int rescuer_thread(void *__rescuer)
 			/*
 			 * The above execution of rescued work items could
 			 * have created more to rescue through
-			 * pwq_activate_first_delayed() or chained
+			 * pwq_activate_first_inactive() or chained
 			 * queueing.  Let's put @pwq back on mayday list so
 			 * that such back-to-back work items, which may be
 			 * being used to relieve memory pressure, don't
@@ -2922,7 +2922,7 @@ void drain_workqueue(struct workqueue_struct *wq)
 		bool drained;
 
 		spin_lock_irq(&pwq->pool->lock);
-		drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
+		drained = !pwq->nr_active && list_empty(&pwq->inactive_works);
 		spin_unlock_irq(&pwq->pool->lock);
 
 		if (drained)
@@ -3703,7 +3703,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
  * @pwq: target pool_workqueue
  *
  * If @pwq isn't freezing, set @pwq->max_active to the associated
- * workqueue's saved_max_active and activate delayed work items
+ * workqueue's saved_max_active and activate inactive work items
  * accordingly.  If @pwq is freezing, clear @pwq->max_active to zero.
  */
 static void pwq_adjust_max_active(struct pool_workqueue *pwq)
@@ -3732,9 +3732,9 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
 
 		pwq->max_active = wq->saved_max_active;
 
-		while (!list_empty(&pwq->delayed_works) &&
+		while (!list_empty(&pwq->inactive_works) &&
 		       pwq->nr_active < pwq->max_active) {
-			pwq_activate_first_delayed(pwq);
+			pwq_activate_first_inactive(pwq);
 			kick = true;
 		}
 
@@ -3765,7 +3765,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
 	pwq->wq = wq;
 	pwq->flush_color = -1;
 	pwq->refcnt = 1;
-	INIT_LIST_HEAD(&pwq->delayed_works);
+	INIT_LIST_HEAD(&pwq->inactive_works);
 	INIT_LIST_HEAD(&pwq->pwqs_node);
 	INIT_LIST_HEAD(&pwq->mayday_node);
 	INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
@@ -4386,7 +4386,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 
 		if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
 		    WARN_ON(pwq->nr_active) ||
-		    WARN_ON(!list_empty(&pwq->delayed_works))) {
+		    WARN_ON(!list_empty(&pwq->inactive_works))) {
 			mutex_unlock(&wq->mutex);
 			show_workqueue_state();
 			return;
@@ -4527,7 +4527,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
 	else
 		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
 
-	ret = !list_empty(&pwq->delayed_works);
+	ret = !list_empty(&pwq->inactive_works);
 	rcu_read_unlock_sched();
 
 	return ret;
@@ -4722,11 +4722,11 @@ static void show_pwq(struct pool_workqueue *pwq)
 		pr_cont("\n");
 	}
 
-	if (!list_empty(&pwq->delayed_works)) {
+	if (!list_empty(&pwq->inactive_works)) {
 		bool comma = false;
 
-		pr_info("    delayed:");
-		list_for_each_entry(work, &pwq->delayed_works, entry) {
+		pr_info("    inactive:");
+		list_for_each_entry(work, &pwq->inactive_works, entry) {
 			pr_cont_work(comma, work);
 			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
 		}
@@ -4756,7 +4756,7 @@ void show_workqueue_state(void)
 		bool idle = true;
 
 		for_each_pwq(pwq, wq) {
-			if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+			if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
 				idle = false;
 				break;
 			}
@@ -4768,7 +4768,7 @@ void show_workqueue_state(void)
 
 		for_each_pwq(pwq, wq) {
 			spin_lock_irqsave(&pwq->pool->lock, flags);
-			if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+			if (pwq->nr_active || !list_empty(&pwq->inactive_works))
 				show_pwq(pwq);
 			spin_unlock_irqrestore(&pwq->pool->lock, flags);
 			/*
@@ -5143,7 +5143,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe);
  * freeze_workqueues_begin - begin freezing workqueues
  *
  * Start freezing workqueues.  After this function returns, all freezable
- * workqueues will queue new works to their delayed_works list instead of
+ * workqueues will queue new works to their inactive_works list instead of
  * pool->worklist.
 *
 * CONTEXT:
From: Lai Jiangshan <laijs@linux.alibaba.com>
mainline inclusion
from mainline-v5.15-rc1
commit c4560c2c88a4c809800ba8e76faabaf80bf6ee89
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7LRJF
DTS: DTS2023072811781
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
Make pwq_dec_nr_in_flight() use work_data rather than just work_color.

Prepare for a later patch, which needs to get the WORK_STRUCT_INACTIVE bit from work_data in pwq_dec_nr_in_flight().
No functional change intended.
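For reference, the color is a small bit field inside the work item's data word; a sketch of the extraction that get_work_color() performs, now on a plain unsigned long snapshot (mirroring the definitions in include/linux/workqueue.h):

	/*
	 * Snapshot the flags+color word once.  The snapshot stays usable
	 * even after the work item itself may have been freed by its own
	 * callback, which is why process_one_work() captures work_data
	 * up front instead of re-reading the work item afterwards.
	 */
	unsigned long work_data = *work_data_bits(work);
	int color = (work_data >> WORK_STRUCT_COLOR_SHIFT) &
		    ((1 << WORK_STRUCT_COLOR_BITS) - 1);	/* get_work_color() */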
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 kernel/workqueue.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fe3c4a292909..f62ce7596640 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -581,9 +581,9 @@ static unsigned int work_color_to_flags(int color)
 	return color << WORK_STRUCT_COLOR_SHIFT;
 }
 
-static int get_work_color(struct work_struct *work)
+static int get_work_color(unsigned long work_data)
 {
-	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
+	return (work_data >> WORK_STRUCT_COLOR_SHIFT) &
 		((1 << WORK_STRUCT_COLOR_BITS) - 1);
 }
 
@@ -1128,7 +1128,7 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
 /**
  * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
  * @pwq: pwq of interest
- * @color: color of work which left the queue
+ * @work_data: work_data of work which left the queue
  *
  * A work either has completed or is removed from pending queue,
  * decrement nr_in_flight of its pwq and handle workqueue flushing.
@@ -1136,8 +1136,10 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
  * CONTEXT:
  * spin_lock_irq(pool->lock).
  */
-static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
+static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
 {
+	int color = get_work_color(work_data);
+
 	/* uncolored work items don't participate in flushing or nr_active */
 	if (color == WORK_NO_COLOR)
 		goto out_put;
@@ -1256,7 +1258,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 			pwq_activate_inactive_work(work);
 
 		list_del_init(&work->entry);
-		pwq_dec_nr_in_flight(pwq, get_work_color(work));
+		pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
 
 		/* work->data points to pwq iff queued, point to pool */
 		set_work_pool_and_keep_pending(work, pool->id);
@@ -2129,7 +2131,7 @@ __acquires(&pool->lock)
 	struct pool_workqueue *pwq = get_work_pwq(work);
 	struct worker_pool *pool = worker->pool;
 	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
-	int work_color;
+	unsigned long work_data;
 	struct worker *collision;
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -2165,7 +2167,7 @@ __acquires(&pool->lock)
 	worker->current_work = work;
 	worker->current_func = work->func;
 	worker->current_pwq = pwq;
-	work_color = get_work_color(work);
+	work_data = *work_data_bits(work);
 
 	/*
 	 * Record wq name for cmdline and debug reporting, may get
@@ -2270,7 +2272,7 @@ __acquires(&pool->lock)
 	 * (!= NO_COLOR) to avoid prematurely restoring the nice level.
 	 */
 	if (unlikely(worker->flags & WORKER_NICED &&
-		     work_color != WORK_NO_COLOR)) {
+		     get_work_color(work_data) != WORK_NO_COLOR)) {
 		set_user_nice(worker->task, worker->pool->attrs->nice);
 		worker_clr_flags(worker, WORKER_NICED);
 	}
@@ -2280,7 +2282,7 @@ __acquires(&pool->lock)
 	worker->current_work = NULL;
 	worker->current_func = NULL;
 	worker->current_pwq = NULL;
-	pwq_dec_nr_in_flight(pwq, work_color);
+	pwq_dec_nr_in_flight(pwq, work_data);
 }
 
 /**
From: Lai Jiangshan <laijs@linux.alibaba.com>
mainline inclusion
from mainline-v5.15-rc1
commit d21cece0dbb424ad3ff9e49bde6954632b8efede
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7LRJF
DTS: DTS2023072811781
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
Add a local variable @work_flags and build up the flags step by step, so that several flags no longer have to be squeezed into the last line of code.

Prepare for the next patch, which adds a bit to the barrier work item's flags. Not squashing this into the next patch makes it clear what that patch changes.
No functional change intended.
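The shape of the refactor, reduced to a sketch (the real change is in the diff below): flags are accumulated as the function learns them, which leaves an obvious place for a later patch to OR in one more bit:

	unsigned int work_flags = work_color_to_flags(WORK_NO_COLOR);
	...
	/* there can already be other linked works, inherit and set */
	work_flags |= *bits & WORK_STRUCT_LINKED;
	...
	/* a later patch can simply add: work_flags |= <new bit>; */
	insert_work(pwq, &barr->work, head, work_flags);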
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 kernel/workqueue.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f62ce7596640..9acd0941401a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2625,8 +2625,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
 			      struct wq_barrier *barr,
 			      struct work_struct *target, struct worker *worker)
 {
+	unsigned int work_flags = work_color_to_flags(WORK_NO_COLOR);
 	struct list_head *head;
-	unsigned int linked = 0;
 
 	/*
 	 * debugobject calls are safe here even with pool->lock locked
@@ -2652,13 +2652,12 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
 
 		head = target->entry.next;
 		/* there can already be other linked works, inherit and set */
-		linked = *bits & WORK_STRUCT_LINKED;
+		work_flags |= *bits & WORK_STRUCT_LINKED;
 		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
 	}
 
 	debug_work_activate(&barr->work);
-	insert_work(pwq, &barr->work, head,
-		    work_color_to_flags(WORK_NO_COLOR) | linked);
+	insert_work(pwq, &barr->work, head, work_flags);
 }
 
 /**
From: Lai Jiangshan <laijs@linux.alibaba.com>
mainline inclusion
from mainline-v5.15-rc1
commit 018f3a13dd6300701103f268b6bfec0a56beea57
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7LRJF
DTS: DTS2023072811781
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
Currently, WORK_NO_COLOR has two meanings:
	Not participate in flushing
	Not participate in nr_active
And only non-barrier work items are marked with WORK_STRUCT_INACTIVE when they are on the inactive_works list. Barrier work items are not marked INACTIVE even when linked into the inactive_works list, since these tail items are always moved together with the head work item.

These definitions are simple, clean and practical. (Except for a small blemish: only the first meaning of WORK_NO_COLOR is documented in include/linux/workqueue.h, while both meanings are relied on in workqueue.c.)

But the dual-purpose WORK_NO_COLOR used for barrier work items has proven to be problematic[1]. Only the second purpose is obligatory. So the plan is to make barrier work items participate in flushing while still keeping them out of nr_active.

This patch marks barrier work items inactive without using WORK_NO_COLOR, so that the next patch can assign them a flushing color.
The reasonable way to do that is to add or reuse a bit in the work data of the work item. But adding a bit would double the size of pool_workqueue.

Currently, WORK_STRUCT_INACTIVE is only used in try_to_grab_pending(), which only operates on user-queued work items, and try_to_grab_pending() can't work on barrier work items anyway. So we can extend WORK_STRUCT_INACTIVE to also mark barrier work items, no matter which list they are on, because we never need to determine which list a barrier work item is on.

The meaning of WORK_STRUCT_INACTIVE thus becomes just "the work item does not participate in nr_active" (no matter whether it is a barrier work item or a user-queued one). For user-queued work items, WORK_STRUCT_INACTIVE additionally means they are on the inactive_works list.
This patch does it by setting WORK_STRUCT_INACTIVE for barrier work items in insert_wq_barrier() and checking WORK_STRUCT_INACTIVE first in pwq_dec_nr_in_flight(). The meaning of WORK_NO_COLOR is thereby reduced to only "not participating in flushing".

There is no functional change intended in this patch: WORK_NO_COLOR+WORK_STRUCT_INACTIVE carries the same meaning as the previous WORK_NO_COLOR, and try_to_grab_pending() is not used on barrier work items, so it cannot be confused by the extended WORK_STRUCT_INACTIVE.

A comment block about nr_active & WORK_STRUCT_INACTIVE is also added, documenting how WORK_STRUCT_INACTIVE works in nr_active management.
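Restated as code for clarity, the invariant this patch establishes could be written as the following hypothetical helper (not part of the patch; the name is made up):

	/*
	 * A work item participates in pwq->nr_active iff it is not marked
	 * WORK_STRUCT_INACTIVE.  Barrier work items are always INACTIVE;
	 * user-queued work items are INACTIVE exactly while they sit on
	 * pwq->inactive_works.
	 */
	static bool counts_toward_nr_active(unsigned long work_data)
	{
		return !(work_data & WORK_STRUCT_INACTIVE);
	}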
[1]: https://lore.kernel.org/lkml/20210812083814.32453-1-lizhe.67@bytedance.com/

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 kernel/workqueue.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9acd0941401a..ef049b6a26a4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -205,6 +205,23 @@ struct pool_workqueue {
 	int			refcnt;		/* L: reference count */
 	int			nr_in_flight[WORK_NR_COLORS];
 						/* L: nr of in_flight works */
+
+	/*
+	 * nr_active management and WORK_STRUCT_INACTIVE:
+	 *
+	 * When pwq->nr_active >= max_active, new work item is queued to
+	 * pwq->inactive_works instead of pool->worklist and marked with
+	 * WORK_STRUCT_INACTIVE.
+	 *
+	 * All work items marked with WORK_STRUCT_INACTIVE do not participate
+	 * in pwq->nr_active and all work items in pwq->inactive_works are
+	 * marked with WORK_STRUCT_INACTIVE.  But not all WORK_STRUCT_INACTIVE
+	 * work items are in pwq->inactive_works.  Some of them are ready to
+	 * run in pool->worklist or worker->scheduled.  Those work items are
+	 * only struct wq_barrier which is used for flush_work() and should
+	 * not participate in pwq->nr_active.  For non-barrier work item, it
+	 * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
+	 */
 	int			nr_active;	/* L: nr of active works */
 	int			max_active;	/* L: max active works */
 	struct list_head	inactive_works;	/* L: inactive works */
@@ -1140,19 +1157,21 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
 {
 	int color = get_work_color(work_data);
 
-	/* uncolored work items don't participate in flushing or nr_active */
+	if (!(work_data & WORK_STRUCT_INACTIVE)) {
+		pwq->nr_active--;
+		if (!list_empty(&pwq->inactive_works)) {
+			/* one down, submit an inactive one */
+			if (pwq->nr_active < pwq->max_active)
+				pwq_activate_first_inactive(pwq);
+		}
+	}
+
+	/* uncolored work items don't participate in flushing */
 	if (color == WORK_NO_COLOR)
 		goto out_put;
 
 	pwq->nr_in_flight[color]--;
 
-	pwq->nr_active--;
-	if (!list_empty(&pwq->inactive_works)) {
-		/* one down, submit an inactive one */
-		if (pwq->nr_active < pwq->max_active)
-			pwq_activate_first_inactive(pwq);
-	}
-
 	/* is flush in progress and are we at the flushing tip? */
@@ -1248,6 +1267,10 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 		debug_work_deactivate(work);
 
 		/*
+		 * A cancelable inactive work item must be in the
+		 * pwq->inactive_works since a queued barrier can't be
+		 * canceled (see the comments in insert_wq_barrier()).
+		 *
 		 * An inactive work item cannot be grabbed directly because
 		 * it might have linked NO_COLOR work items which, if left
 		 * on the inactive_works list, will confuse pwq->nr_active
@@ -2641,6 +2664,9 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
 
 	barr->task = current;
 
+	/* The barrier work item does not participate in pwq->nr_active. */
+	work_flags |= WORK_STRUCT_INACTIVE;
+
 	/*
 	 * If @target is currently being executed, schedule the
 	 * barrier to the worker; otherwise, put it after @target.
From: Lai Jiangshan <laijs@linux.alibaba.com>
mainline inclusion
from mainline-v5.15-rc1
commit d812796eb3906424cd3c0eea530983961e2f88bd
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7LRJF
DTS: DTS2023072811781
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
There was no strong reason either to flush barrier work items in flush_workqueue() or not to. But barrier work items must not participate in nr_active, so WORK_NO_COLOR had been used for them, which also meant they could not be flushed by flush_workqueue().

And the users of flush_workqueue() usually do not intend to wait for barrier work items issued by flush_work(). That made the choice seem perfect.

But barrier work items hold a reference to an internal structure (pool_workqueue), and the worker thread(s) are still busy on the workqueue user's behalf while the barrier work items are not done. So it is reasonable to make flush_workqueue() also watch for flush_work()'s barriers, to make it more robust.
And a problem[1] reported by Li Zhe shows that we need such robustness. The warning logs are listed below:
WARNING: CPU: 0 PID: 19336 at kernel/workqueue.c:4430 destroy_workqueue+0x11a/0x2f0
*****
destroy_workqueue: test_workqueue9 has the following busy pwq
  pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=0/1 refcnt=2
    in-flight: 5658:wq_barrier_func
Showing busy workqueues and worker pools:
*****
It shows that even after drain_workqueue() returns, the barrier work item is still in flight and the pwq (and a worker) is still busy on it.
The problem is caused by flush_workqueue() not watching flush_work():
Thread A                                Worker
                                        /* normal work item with linked */
                                        process_scheduled_works()
destroy_workqueue()                       process_one_work()
  drain_workqueue()                         /* run normal work item */
                                    /--     pwq_dec_nr_in_flight()
    flush_workqueue()       <------/
                /* the last normal work item is done */
  sanity_check                            process_one_work()
                                     /--    raw_spin_unlock_irq(&pool->lock)
    raw_spin_lock_irq(&pool->lock)  <-/     /* maybe preempt */
*WARNING*                                   wq_barrier_func()
                                            /* maybe preempt by cond_resched() */
Thread A can grab the pool lock after the Worker has unlocked it but before wq_barrier_func() runs. And if any preemption happens around wq_barrier_func(), destroy_workqueue()'s sanity check is even more likely to get the lock and catch the window. (Note: preemption is not necessary to cause the bug; the unlocking alone is enough to possibly trigger the WARNING.)

A simple solution might be to just execute all linked barrier work items in one go, without releasing the pool lock, after the head work item's pwq_dec_nr_in_flight(). But this solution has two problems:
1) the head work item might itself be a barrier work item when the user-queued work item has been cancelled. For example:

	thread 1:		thread 2:
	queue_work(wq, &my_work)
	flush_work(&my_work)
				cancel_work_sync(&my_work);
	/* Neither my_work nor the barrier work is scheduled. */
				destroy_workqueue(wq);
	/* This is an easier way to catch the WARNING. */
2) there might be too many linked barrier work items, and running them all in one go without releasing the pool lock just causes trouble.

The only solution is to make flush_workqueue() also watch barrier work items. So we have to assign these barrier work items a color, namely the color of the head (user-queued) work item.

Assigning a color doesn't cause any problem in nr_active management, because the previous patch made barrier work items not participate in nr_active via WORK_STRUCT_INACTIVE, rather than relying on the (old) WORK_NO_COLOR.
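Condensed to a sketch, the color inheritance added to insert_wq_barrier() works as follows (the real code is in the diff below):

	if (worker)		/* @target already running: its color was */
		work_color = worker->current_color;	/* saved at dispatch */
	else			/* @target still queued: read its color */
		work_color = get_work_color(*work_data_bits(target));

	pwq->nr_in_flight[work_color]++;	/* flushing now sees the barrier */
	work_flags |= work_color_to_flags(work_color);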
[1]: https://lore.kernel.org/lkml/20210812083814.32453-1-lizhe.67@bytedance.com/

Reported-by: Li Zhe <lizhe.67@bytedance.com>
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 kernel/workqueue.c          | 20 ++++++++++++--------
 kernel/workqueue_internal.h |  3 ++-
 2 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ef049b6a26a4..3ce68188ad0b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1166,10 +1166,6 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
 		}
 	}
 
-	/* uncolored work items don't participate in flushing */
-	if (color == WORK_NO_COLOR)
-		goto out_put;
-
 	pwq->nr_in_flight[color]--;
 
 	/* is flush in progress and are we at the flushing tip? */
@@ -1272,7 +1268,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
 		 * canceled (see the comments in insert_wq_barrier()).
 		 *
 		 * An inactive work item cannot be grabbed directly because
-		 * it might have linked NO_COLOR work items which, if left
+		 * it might have linked barrier work items which, if left
 		 * on the inactive_works list, will confuse pwq->nr_active
 		 * management later on and cause stall.  Make sure the work
 		 * item is activated before grabbing.
@@ -2191,6 +2187,7 @@ __acquires(&pool->lock)
 	worker->current_func = work->func;
 	worker->current_pwq = pwq;
 	work_data = *work_data_bits(work);
+	worker->current_color = get_work_color(work_data);
 
 	/*
 	 * Record wq name for cmdline and debug reporting, may get
@@ -2305,6 +2302,7 @@ __acquires(&pool->lock)
 	worker->current_work = NULL;
 	worker->current_func = NULL;
 	worker->current_pwq = NULL;
+	worker->current_color = INT_MAX;
 	pwq_dec_nr_in_flight(pwq, work_data);
 }
 
@@ -2648,7 +2646,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
 			      struct wq_barrier *barr,
 			      struct work_struct *target, struct worker *worker)
 {
-	unsigned int work_flags = work_color_to_flags(WORK_NO_COLOR);
+	unsigned int work_flags = 0;
+	unsigned int work_color;
 	struct list_head *head;
 
 	/*
@@ -2671,17 +2670,22 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
 	 * If @target is currently being executed, schedule the
 	 * barrier to the worker; otherwise, put it after @target.
 	 */
-	if (worker)
+	if (worker) {
 		head = worker->scheduled.next;
-	else {
+		work_color = worker->current_color;
+	} else {
 		unsigned long *bits = work_data_bits(target);
 
 		head = target->entry.next;
 		/* there can already be other linked works, inherit and set */
 		work_flags |= *bits & WORK_STRUCT_LINKED;
+		work_color = get_work_color(*bits);
 		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
 	}
 
+	pwq->nr_in_flight[work_color]++;
+	work_flags |= work_color_to_flags(work_color);
+
 	debug_work_activate(&barr->work);
 	insert_work(pwq, &barr->work, head, work_flags);
 }
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 30cfed226b39..8d9f3c225414 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -30,7 +30,8 @@ struct worker {
 
 	struct work_struct	*current_work;	/* L: work being processed */
 	work_func_t		current_func;	/* L: current_work's fn */
-	struct pool_workqueue	*current_pwq; /* L: current_work's pwq */
+	struct pool_workqueue	*current_pwq;	/* L: current_work's pwq */
+	unsigned int		current_color;	/* L: current_work's color */
 	struct list_head	scheduled;	/* L: scheduled works */
 
 	/* 64 bytes boundary on 64bit, 32 on 32bit */