hulk inclusion category: bugfix bugzilla: https://atomgit.com/openeuler/kernel/issues/9375 -------------------------------- When the shrinker walks a journal's checkpoint list via jbd2_journal_shrink_checkpoint_list(), it holds j_list_lock across each call to journal_shrink_one_cp_list() and only releases it when need_resched() or spin_needbreak() becomes true. journal_shrink_one_cp_list() charges the scan budget (nr_to_scan) only for buffers it actually frees. In JBD2_SHRINK_BUSY_SKIP mode, when metadata is heavily contended, most buffers are busy and cannot be freed, so "nr_to_scan" decreases slowly and the loop keeps walking the list for a long time. On large journals under heavy ext4 writeback, a single shrinker invocation can therefore hold j_list_lock long enough to starve other CPUs spinning for the same lock, triggering softlockups. Fix this by introducing a user-configurable per-hold budget that bounds the number of checkpoint buffers examined while j_list_lock is held, preventing the shrinker from walking the list for an unbounded time. The cap is disabled by default (value 0, preserving the previous behaviour) and can be enabled at runtime via /sys/module/jbd2/parameters/shrink_batch. Fixes: 82c2be921b1a ("jbd2,ext4: add a shrinker to release checkpointed buffers") Signed-off-by: Zizhi Wo <wozizhi@huawei.com> --- fs/jbd2/checkpoint.c | 48 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 727f3505280c..ca7d60f6a858 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -19,10 +19,23 @@ #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <trace/events/jbd2.h> +#include <linux/moduleparam.h> + +/* + * Max number of checkpoint buffers to examine while holding j_list_lock + * in a single pass. + * + * Tunable at runtime via /sys/module/jbd2/parameters/shrink_batch. + * A value of 0 disables the per-hold cap (legacy behaviour). + */ +static unsigned int jbd2_shrink_batch; +module_param_named(shrink_batch, jbd2_shrink_batch, uint, 0644); +MODULE_PARM_DESC(shrink_batch, + "Max checkpoint buffers examined per j_list_lock hold during shrink (0 = unlimited)"); /* * Unlink a buffer from a transaction checkpoint list. * * Called with j_list_lock held. @@ -364,11 +377,12 @@ enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; * * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, enum shrink_type type, - bool *released) + bool *released, + unsigned long *scan_budget) { struct journal_head *last_jh; struct journal_head *next_jh = jh; unsigned long nr_freed = 0; int ret; @@ -383,10 +397,22 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, next_jh = jh->b_cpnext; if (type == SHRINK_DESTROY) { ret = __jbd2_journal_remove_checkpoint(jh); } else { + /* + * Charge budget per examined buffer, not per freed one, + * so a run of busy buffers (the 'continue' path below, + * freeing nothing) still bounds how long one j_list_lock + * hold walks the checkpoint list. + */ + if (scan_budget) { + if (*scan_budget == 0) + break; + (*scan_budget)--; + } + ret = jbd2_journal_try_remove_checkpoint(jh); if (ret < 0) { if (type == SHRINK_BUSY_SKIP) continue; break; @@ -422,12 +448,23 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, bool __maybe_unused released; tid_t first_tid = 0, last_tid = 0, next_tid = 0; tid_t tid = 0; unsigned long nr_freed = 0; unsigned long freed; + unsigned long scan_budget; + unsigned int batch = READ_ONCE(jbd2_shrink_batch); again: + /* + * Reset the per-hold examination budget on every (re)acquire of + * j_list_lock: one hold examines at most `batch` buffers, decoupled + * from journal size, list length and busy ratio, so the lock hold + * stays short even when the whole list is busy and nothing is freed. + * batch == 0 means no cap, exactly the legacy behaviour. + */ + scan_budget = batch; + spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) { spin_unlock(&journal->j_list_lock); goto out; } @@ -452,13 +489,17 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, transaction = next_transaction; next_transaction = transaction->t_cpnext; tid = transaction->t_tid; freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, - SHRINK_BUSY_SKIP, &released); + SHRINK_BUSY_SKIP, &released, + batch ? &scan_budget : NULL); nr_freed += freed; (*nr_to_scan) -= min(*nr_to_scan, freed); + + if (batch && scan_budget == 0) + break; if (*nr_to_scan == 0) break; if (need_resched() || spin_needbreak(&journal->j_list_lock)) break; } while (transaction != last_transaction); @@ -505,12 +546,13 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; + /* destroy/clean path: clear everything, no scan budget */ journal_shrink_one_cp_list(transaction->t_checkpoint_list, - type, &released); + type, &released, NULL); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if * preemption requested: */ -- 2.52.0