Baokun Li (1): blk-wbt: don't throttle swap writes in direct reclaim
Christoph Hellwig (1): blk-wbt: move private information from blk-wbt.h to blk-wbt.c
block/blk-mq-sched.c | 1 + block/blk-settings.c | 1 + block/blk-sysfs.c | 1 + block/blk-wbt.c | 91 ++++++++++++++++++++++++++++++++++++++++---- block/blk-wbt.h | 86 ----------------------------------------- block/elevator.c | 1 + 6 files changed, 88 insertions(+), 93 deletions(-)
From: Christoph Hellwig hch@lst.de
mainline inclusion from mainline-v6.3-rc1 commit 0bc65bd41dfd2f75b9f38812326d767db5cd0663 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IA8D5J CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
A large part of blk-wbt.h is only used in blk-wbt.c, so move it there.
Signed-off-by: Christoph Hellwig hch@lst.de Acked-by: Tejun Heo tj@kernel.org Link: https://lore.kernel.org/r/20230203150400.3199230-11-hch@lst.de Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts: block/blk-mq-sched.c block/blk-settings.c block/blk-sysfs.c block/blk-wbt.c block/elevator.c [1. different context for header files; 2. also add blk-rq-qos.h in blk-mq-sched.c and elevator.c to avoid kabi change.] Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-mq-sched.c | 1 + block/blk-settings.c | 1 + block/blk-sysfs.c | 1 + block/blk-wbt.c | 77 +++++++++++++++++++++++++++++++++++++++ block/blk-wbt.h | 86 -------------------------------------------- block/elevator.c | 1 + 6 files changed, 81 insertions(+), 86 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c92d25b71a72..29f8a6df6b18 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -16,6 +16,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-mq-tag.h" +#include "blk-rq-qos.h" #include "blk-wbt.h"
void blk_mq_sched_assign_ioc(struct request *rq) diff --git a/block/blk-settings.c b/block/blk-settings.c index d1a1f963c3eb..7cdf95b6a568 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -15,6 +15,7 @@ #include <linux/dma-mapping.h>
#include "blk.h" +#include "blk-rq-qos.h" #include "blk-wbt.h"
unsigned long blk_max_low_pfn; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 078aace75204..293a4af1e0bc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -16,6 +16,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-rq-qos.h" #include "blk-wbt.h"
struct queue_sysfs_entry { diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6a90d33e6f6a..799caf8e4dcb 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -25,12 +25,79 @@ #include <linux/backing-dev.h> #include <linux/swap.h>
+#include "blk-stat.h" #include "blk-wbt.h" #include "blk-rq-qos.h"
#define CREATE_TRACE_POINTS #include <trace/events/wbt.h>
+enum wbt_flags { + WBT_TRACKED = 1, /* write, tracked for throttling */ + WBT_READ = 2, /* read */ + WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_DISCARD = 8, /* discard */ + + WBT_NR_BITS = 4, /* number of bits */ +}; + +enum { + WBT_RWQ_BG = 0, + WBT_RWQ_KSWAPD, + WBT_RWQ_DISCARD, + WBT_NUM_RWQ, +}; + +/* + * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other + * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered + * to WBT_STATE_OFF/ON_MANUAL. + */ +enum { + WBT_STATE_ON_DEFAULT = 1, /* on by default */ + WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ + WBT_STATE_OFF_DEFAULT = 3, /* off by default */ + WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + + short enable_state; /* WBT_STATE_* */ + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. + */ + unsigned int unknown_cnt; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + struct blk_stat_callback *cb; + + u64 sync_issue; + void *sync_cookie; + + unsigned int wc; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + struct rq_qos rqos; + struct rq_wait rq_wait[WBT_NUM_RWQ]; + struct rq_depth rq_depth; +}; + +static inline struct rq_wb *RQWB(struct rq_qos *rqos) +{ + return container_of(rqos, struct rq_wb, rqos); +} + static inline void wbt_clear_state(struct request *rq) { rq->wbt_flags = 0; @@ -225,6 +292,16 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb) return now - issue; }
+static inline unsigned int wbt_inflight(struct rq_wb *rwb) +{ + unsigned int i, ret = 0; + + for (i = 0; i < WBT_NUM_RWQ; i++) + ret += atomic_read(&rwb->rq_wait[i].inflight); + + return ret; +} + enum { LAT_OK = 1, LAT_UNKNOWN, diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 824047c395ff..22c8025b9cbc 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -2,92 +2,6 @@ #ifndef WB_THROTTLE_H #define WB_THROTTLE_H
-#include <linux/kernel.h> -#include <linux/atomic.h> -#include <linux/wait.h> -#include <linux/timer.h> -#include <linux/ktime.h> - -#include "blk-stat.h" -#include "blk-rq-qos.h" - -enum wbt_flags { - WBT_TRACKED = 1, /* write, tracked for throttling */ - WBT_READ = 2, /* read */ - WBT_KSWAPD = 4, /* write, from kswapd */ - WBT_DISCARD = 8, /* discard */ - - WBT_NR_BITS = 4, /* number of bits */ -}; - -enum { - WBT_RWQ_BG = 0, - WBT_RWQ_KSWAPD, - WBT_RWQ_DISCARD, - WBT_NUM_RWQ, -}; - -/* - * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other - * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered - * to WBT_STATE_OFF/ON_MANUAL. - */ -enum { - WBT_STATE_ON_DEFAULT = 1, /* on by default */ - WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ - WBT_STATE_OFF_DEFAULT = 3, /* off by default */ - WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ -}; - -struct rq_wb { - /* - * Settings that govern how we throttle - */ - unsigned int wb_background; /* background writeback */ - unsigned int wb_normal; /* normal writeback */ - - short enable_state; /* WBT_STATE_* */ - - /* - * Number of consecutive periods where we don't have enough - * information to make a firm scale up/down decision. 
- */ - unsigned int unknown_cnt; - - u64 win_nsec; /* default window size */ - u64 cur_win_nsec; /* current window size */ - - struct blk_stat_callback *cb; - - u64 sync_issue; - void *sync_cookie; - - unsigned int wc; - - unsigned long last_issue; /* last non-throttled issue */ - unsigned long last_comp; /* last non-throttled comp */ - unsigned long min_lat_nsec; - struct rq_qos rqos; - struct rq_wait rq_wait[WBT_NUM_RWQ]; - struct rq_depth rq_depth; -}; - -static inline struct rq_wb *RQWB(struct rq_qos *rqos) -{ - return container_of(rqos, struct rq_wb, rqos); -} - -static inline unsigned int wbt_inflight(struct rq_wb *rwb) -{ - unsigned int i, ret = 0; - - for (i = 0; i < WBT_NUM_RWQ; i++) - ret += atomic_read(&rwb->rq_wait[i].inflight); - - return ret; -} - - #ifdef CONFIG_BLK_WBT
int wbt_init(struct request_queue *); diff --git a/block/elevator.c b/block/elevator.c index 6f7de2ffad0e..87199709e0b5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -43,6 +43,7 @@ #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" +#include "blk-rq-qos.h" #include "blk-wbt.h"
static DEFINE_SPINLOCK(elv_list_lock);
From: Baokun Li libaokun1@huawei.com
mainline inclusion from mainline-v6.11-rc1 commit 4e63aeb5d0101ddada36a2f64f048e2f9d2202fc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IA8D5J CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently we avoid throttling swap writes by checking whether the current process is kswapd (i.e. current_is_kswapd()), but swap writes can come from either kswapd or direct reclaim, so swap writes from direct reclaim are still throttled.
When a process that holds a lock tries to allocate a free page and enters direct reclaim because there is no free memory, the wbt throttling may trigger a hang, because other processes are then unable to acquire the lock.
Both kswapd and direct reclaim set the REQ_SWAP flag, so use REQ_SWAP instead of current_is_kswapd() to avoid throttling swap writes. Also rename WBT_KSWAPD to WBT_SWAP and WBT_RWQ_KSWAPD to WBT_RWQ_SWAP.
Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Christoph Hellwig hch@lst.de Link: https://lore.kernel.org/r/20240604030522.3686177-1-libaokun@huaweicloud.com Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts: block/blk-wbt.c [commit 16458cf3bd15 ("block: Use the new blk_opf_t type") is not backported] Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/blk-wbt.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 799caf8e4dcb..183f26a83347 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -35,7 +35,7 @@ enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ - WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_SWAP = 4, /* write, from swap_writepage() */ WBT_DISCARD = 8, /* discard */
WBT_NR_BITS = 4, /* number of bits */ @@ -43,7 +43,7 @@ enum wbt_flags {
enum { WBT_RWQ_BG = 0, - WBT_RWQ_KSWAPD, + WBT_RWQ_SWAP, WBT_RWQ_DISCARD, WBT_NUM_RWQ, }; @@ -172,8 +172,8 @@ static bool wb_recent_wait(struct rq_wb *rwb) static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, enum wbt_flags wb_acct) { - if (wb_acct & WBT_KSWAPD) - return &rwb->rq_wait[WBT_RWQ_KSWAPD]; + if (wb_acct & WBT_SWAP) + return &rwb->rq_wait[WBT_RWQ_SWAP]; else if (wb_acct & WBT_DISCARD) return &rwb->rq_wait[WBT_RWQ_DISCARD];
@@ -536,7 +536,7 @@ static bool close_io(struct rq_wb *rwb) time_before(now, rwb->last_comp + HZ / 10); }
-#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP)
static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) { @@ -554,13 +554,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
/* * At this point we know it's a buffered write. If this is - * kswapd trying to free memory, or REQ_SYNC is set, then + * swap trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ - if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb)) limit = rwb->rq_depth.max_depth; else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { /* @@ -637,8 +637,8 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; } else if (wbt_should_throttle(rwb, bio)) { - if (current_is_kswapd()) - flags |= WBT_KSWAPD; + if (bio->bi_opf & REQ_SWAP) + flags |= WBT_SWAP; if (bio_op(bio) == REQ_OP_DISCARD) flags |= WBT_DISCARD; flags |= WBT_TRACKED;
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/10721 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/A...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/10721 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/A...