From: Xu Wei <xuwei56@huawei.com>
euleros inclusion
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=327
CVE: NA
When cache available space is low, bcache switches to writethrough mode, so all write IO is sent directly to the backing device, which is usually an HDD. At the same time, the cache device flushes dirty data to the backing device through the bcache writeback process, so user write IO breaks the sequentiality of that writeback. And when there is a lot of writeback IO, the user's write IO may be blocked. This patch adds a traffic policy to bcache to solve the problem and improve bcache performance when cache available space is low.
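The policy is driven from userspace through two new ioctls on the cached device. The sketch below is illustrative only and not part of this patch: it assumes the two status structs and the ioctl codes are copied from the bcache.h hunk in this patch, and uses /dev/bcache0 as a placeholder device node.

    /* Hypothetical userspace example: query the IO statistics and cap
     * front write traffic.  Definitions mirror this patch's bcache.h;
     * /dev/bcache0 is a placeholder.
     */
    #include <fcntl.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    struct get_bcache_status {
            unsigned int writeback_sector_size_per_sec;
            unsigned int writeback_io_num_per_sec;
            unsigned int front_io_num_per_sec;
            uint64_t dirty_rate;
            unsigned int available;
    };

    struct set_bcache_status {
            unsigned int write_token_sector_size;
            unsigned int write_token_io_num;
            bool traffic_policy_start;
            bool force_write_through;
            bool copy_gc_enabled;
            bool trigger_gc;
            unsigned int writeback_state;
            unsigned int gc_sectors;
            unsigned int cutoff_writeback_sync;
    };

    #define BCACHE_MAJOR 'B'
    #define BCACHE_GET_WRITE_STATUS _IOR(BCACHE_MAJOR, 0x0, struct get_bcache_status)
    #define BCACHE_SET_WRITE_STATUS _IOW(BCACHE_MAJOR, 0x1, struct set_bcache_status)

    int main(void)
    {
            struct get_bcache_status gst = {0};
            struct set_bcache_status sst = {0};
            int fd = open("/dev/bcache0", O_RDWR);

            if (fd < 0)
                    return 1;

            if (ioctl(fd, BCACHE_GET_WRITE_STATUS, &gst) == 0)
                    printf("available %u%%, writeback %u sectors/s\n",
                           gst.available, gst.writeback_sector_size_per_sec);

            /* Enable the policy and cap front writes at ~40 MiB/s
             * (81920 sectors/s) and 1000 IOs/s; fields left 0 keep the
             * defaults (WRITEBACK_DEFAULT, gc_sectors 0).
             */
            sst.traffic_policy_start = true;
            sst.write_token_sector_size = 81920;
            sst.write_token_io_num = 1000;
            if (ioctl(fd, BCACHE_SET_WRITE_STATUS, &sst) < 0)
                    perror("BCACHE_SET_WRITE_STATUS");

            close(fd);
            return 0;
    }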
Signed-off-by: qinghaixiang <xuweiqhx@163.com>
Signed-off-by: Xu Wei <xuwei56@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Li Ruilin <liruilin4@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/md/bcache/bcache.h    |  49 ++++++++++++
 drivers/md/bcache/btree.h     |   6 +-
 drivers/md/bcache/request.c   | 145 ++++++++++++++++++++++++++++++++--
 drivers/md/bcache/request.h   |   2 +
 drivers/md/bcache/super.c     |  36 +++++++++
 drivers/md/bcache/sysfs.c     |  56 +++++++++++++
 drivers/md/bcache/writeback.c |  11 ++-
 drivers/md/bcache/writeback.h |   3 +-
 8 files changed, 300 insertions(+), 8 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 3340f59117112..aa9e85dc04872 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -400,6 +400,28 @@ struct cached_dev {
 	unsigned int		offline_seconds;
 
 	char			backing_dev_name[BDEVNAME_SIZE];
+
+	/* Count the front and writeback io bandwidth per second */
+	atomic_t		writeback_sector_size;
+	atomic_t		writeback_io_num;
+	atomic_t		front_io_num;
+	unsigned int		writeback_sector_size_per_sec;
+	unsigned int		writeback_io_num_per_sec;
+	unsigned int		front_io_num_per_sec;
+	struct timer_list	io_stat_timer;
+
+	unsigned int		writeback_state;
+#define WRITEBACK_DEFAULT	0
+#define WRITEBACK_QUICK		1
+#define WRITEBACK_SLOW		2
+
+	/* realize for token bucket */
+	spinlock_t		token_lock;
+	unsigned int		max_sector_size;
+	unsigned int		max_io_num;
+	unsigned int		write_token_sector_size;
+	unsigned int		write_token_io_num;
+	struct timer_list	token_assign_timer;
 };
 
 enum alloc_reserve {
@@ -739,6 +761,10 @@ struct cache_set {
 
 #define BUCKET_HASH_BITS	12
 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
+	unsigned int		cutoff_writeback_sync;
+	bool			traffic_policy_start;
+	bool			force_write_through;
+	unsigned int		gc_sectors;
 };
 
 struct bbio {
@@ -754,6 +780,29 @@ struct bbio {
 	struct bio		bio;
 };
 
+struct get_bcache_status {
+	unsigned int		writeback_sector_size_per_sec;
+	unsigned int		writeback_io_num_per_sec;
+	unsigned int		front_io_num_per_sec;
+	uint64_t		dirty_rate;
+	unsigned int		available;
+};
+
+struct set_bcache_status {
+	unsigned int		write_token_sector_size;
+	unsigned int		write_token_io_num;
+	bool			traffic_policy_start;
+	bool			force_write_through;
+	bool			copy_gc_enabled;
+	bool			trigger_gc;
+	unsigned int		writeback_state;
+	unsigned int		gc_sectors;
+	unsigned int		cutoff_writeback_sync;
+};
+#define BCACHE_MAJOR 'B'
+#define BCACHE_GET_WRITE_STATUS _IOR(BCACHE_MAJOR, 0x0, struct get_bcache_status)
+#define BCACHE_SET_WRITE_STATUS _IOW(BCACHE_MAJOR, 0x1, struct set_bcache_status)
+
 #define BTREE_PRIO		USHRT_MAX
 #define INITIAL_PRIO		32768U
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 76cfd121a4861..7c5f7c235c533 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -193,7 +193,11 @@ static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
 
 static inline void set_gc_sectors(struct cache_set *c)
 {
-	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
+	if (c->gc_sectors == 0)
+		atomic_set(&c->sectors_to_gc,
+			   c->sb.bucket_size * c->nbuckets / 16);
+	else
+		atomic_set(&c->sectors_to_gc, c->gc_sectors);
 }
 
 void bkey_put(struct cache_set *c, struct bkey *k);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 2f3ecd8fc5801..50303841dd45b 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -28,6 +28,7 @@ struct kmem_cache *bch_search_cache;
 
 static void bch_data_insert_start(struct closure *cl);
+static void alloc_token(struct cached_dev *dc, unsigned int sectors);
 
 static unsigned int cache_mode(struct cached_dev *dc)
 {
@@ -384,8 +385,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 		goto skip;
 
 	if (mode == CACHE_MODE_NONE ||
-	    (mode == CACHE_MODE_WRITEAROUND &&
-	     op_is_write(bio_op(bio))))
+	    ((mode == CACHE_MODE_WRITEAROUND ||
+	      c->force_write_through == true) &&
+	     op_is_write(bio_op(bio))))
 		goto skip;
 
 	/*
@@ -863,6 +865,10 @@ static void cached_dev_read_done(struct closure *cl)
 	if (s->iop.bio && (!dc->read_bypass || s->prefetch) &&
 	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
 		BUG_ON(!s->iop.replace);
+		if ((dc->disk.c->traffic_policy_start == true) &&
+		    (dc->disk.c->force_write_through != true)) {
+			alloc_token(dc, bio_sectors(s->iop.bio));
+		}
 		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
 	}
 
@@ -1005,6 +1011,35 @@ static void cached_dev_write_complete(struct closure *cl)
 	continue_at(cl, cached_dev_bio_complete, NULL);
 }
 
+static void alloc_token(struct cached_dev *dc, unsigned int sectors)
+{
+	int count = 0;
+
+	spin_lock_bh(&dc->token_lock);
+
+	while ((dc->write_token_sector_size < sectors) &&
+	       (dc->write_token_io_num == 0)) {
+		spin_unlock_bh(&dc->token_lock);
+		schedule_timeout_interruptible(msecs_to_jiffies(10));
+		count++;
+		if ((dc->disk.c->traffic_policy_start != true) ||
+		    (cache_mode(dc) != CACHE_MODE_WRITEBACK) ||
+		    (count > 100))
+			return;
+		spin_lock_bh(&dc->token_lock);
+	}
+
+	if (dc->write_token_sector_size >= sectors)
+		dc->write_token_sector_size -= sectors;
+	else
+		dc->write_token_sector_size = 0;
+
+	if (dc->write_token_io_num > 0)
+		dc->write_token_io_num--;
+
+	spin_unlock_bh(&dc->token_lock);
+}
+
 static void cached_dev_write(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
@@ -1252,6 +1287,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 					      cached_dev_nodata, bcache_wq);
 		} else {
+			atomic_inc(&dc->front_io_num);
 			s->iop.bypass = check_should_bypass(dc, bio);
 
 			if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) {
@@ -1263,10 +1299,17 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 				save_circ_item(&s->smp);
 			}
 
-			if (rw)
+			if (rw) {
+				if ((s->iop.bypass == false) &&
+				    (dc->disk.c->traffic_policy_start == true) &&
+				    (cache_mode(dc) == CACHE_MODE_WRITEBACK) &&
+				    (bio_op(bio) != REQ_OP_DISCARD)) {
+					alloc_token(dc, bio_sectors(bio));
+				}
 				cached_dev_write(dc, s);
-			else
+			} else {
 				cached_dev_read(dc, s);
+			}
 		}
 	} else
 		/* I/O request sent to backing device */
@@ -1275,6 +1318,65 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 
 	return BLK_QC_T_NONE;
 }
 
+static int bcache_get_write_status(struct cached_dev *dc, unsigned long arg)
+{
+	struct get_bcache_status a;
+	uint64_t cache_sectors;
+	struct cache_set *c = dc->disk.c;
+
+	if (c == NULL)
+		return -ENODEV;
+
+	a.writeback_sector_size_per_sec = dc->writeback_sector_size_per_sec;
+	a.writeback_io_num_per_sec = dc->writeback_io_num_per_sec;
+	a.front_io_num_per_sec = dc->front_io_num_per_sec;
+	cache_sectors = c->nbuckets * c->sb.bucket_size -
+			atomic_long_read(&c->flash_dev_dirty_sectors);
+	a.dirty_rate = div64_u64(bcache_dev_sectors_dirty(&dc->disk) * 100,
+				 cache_sectors);
+	a.available = 100 - c->gc_stats.in_use;
+	if (copy_to_user((struct get_bcache_status *)arg, &a,
+			 sizeof(struct get_bcache_status)))
+		return -EFAULT;
+	return 0;
+}
+
+static int bcache_set_write_status(struct cached_dev *dc, unsigned long arg)
+{
+	struct set_bcache_status a;
+	struct cache_set *c = dc->disk.c;
+
+	if (c == NULL)
+		return -ENODEV;
+	if (copy_from_user(&a, (struct set_bcache_status *)arg,
+			   sizeof(struct set_bcache_status)))
+		return -EFAULT;
+
+	if (c->traffic_policy_start != a.traffic_policy_start)
+		pr_info("%s traffic policy %s", dc->disk.disk->disk_name,
+			(a.traffic_policy_start == true) ? "enable" : "disable");
+	if (c->force_write_through != a.force_write_through)
+		pr_info("%s force write through %s", dc->disk.disk->disk_name,
+			(a.force_write_through == true) ? "enable" : "disable");
+	if (a.trigger_gc) {
+		pr_info("trigger %s gc", dc->disk.disk->disk_name);
+		atomic_set(&c->sectors_to_gc, -1);
+		wake_up_gc(c);
+	}
+	if ((a.cutoff_writeback_sync >= CUTOFF_WRITEBACK_SYNC) &&
+	    (a.cutoff_writeback_sync <= CUTOFF_WRITEBACK_SYNC_MAX)) {
+		c->cutoff_writeback_sync = a.cutoff_writeback_sync;
+	}
+
+	dc->max_sector_size = a.write_token_sector_size;
+	dc->max_io_num = a.write_token_io_num;
+	c->traffic_policy_start = a.traffic_policy_start;
+	c->force_write_through = a.force_write_through;
+	c->gc_sectors = a.gc_sectors;
+	dc->writeback_state = a.writeback_state;
+	return 0;
+}
+
 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
 			    unsigned int cmd, unsigned long arg)
 {
@@ -1283,7 +1385,14 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
 	if (dc->io_disable)
 		return -EIO;
 
-	return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
+	switch (cmd) {
+	case BCACHE_GET_WRITE_STATUS:
+		return bcache_get_write_status(dc, arg);
+	case BCACHE_SET_WRITE_STATUS:
+		return bcache_set_write_status(dc, arg);
+	default:
+		return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
+	}
 }
 
 static int cached_dev_congested(void *data, int bits)
@@ -1443,3 +1552,29 @@ int __init bch_request_init(void)
 
 	return 0;
 }
+
+static void token_assign(struct timer_list *t)
+{
+	struct cached_dev *dc = from_timer(dc, t, token_assign_timer);
+
+	dc->token_assign_timer.expires = jiffies + HZ / 8;
+	add_timer(&dc->token_assign_timer);
+
+	spin_lock(&dc->token_lock);
+	dc->write_token_sector_size = dc->max_sector_size / 8;
+	dc->write_token_io_num = dc->max_io_num / 8;
+	dc->write_token_io_num =
+		(dc->write_token_io_num == 0) ? 1 : dc->write_token_io_num;
+	spin_unlock(&dc->token_lock);
+}
+
+void bch_traffic_policy_init(struct cached_dev *dc)
+{
+	spin_lock_init(&dc->token_lock);
+	dc->write_token_sector_size = 0;
+	dc->write_token_io_num = 0;
+
+	timer_setup(&dc->token_assign_timer, token_assign, 0);
+	dc->token_assign_timer.expires = jiffies + HZ / 8;
+	add_timer(&dc->token_assign_timer);
+}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 7682630db73f6..8e2ab8071b5bf 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -41,6 +41,8 @@ void bch_data_insert(struct closure *cl);
 void bch_cached_dev_request_init(struct cached_dev *dc);
 void bch_flash_dev_request_init(struct bcache_device *d);
 
+void bch_traffic_policy_init(struct cached_dev *dc);
+
 extern struct kmem_cache *bch_search_cache;
 
 struct search {
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 754e88895738b..e2828c762edc5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1260,6 +1260,9 @@ static void cached_dev_free(struct closure *cl)
 {
 	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
+	del_timer_sync(&dc->io_stat_timer);
+	del_timer_sync(&dc->token_assign_timer);
+
 	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
 		cancel_writeback_rate_update_dwork(dc);
@@ -1303,6 +1306,36 @@ static void cached_dev_flush(struct closure *cl)
 	continue_at(cl, cached_dev_free, system_wq);
 }
 
+static void cached_dev_io_stat(struct timer_list *t)
+{
+	struct cached_dev *dc = from_timer(dc, t, io_stat_timer);
+
+	dc->io_stat_timer.expires = jiffies + HZ;
+	add_timer(&dc->io_stat_timer);
+
+	dc->writeback_sector_size_per_sec =
+		atomic_read(&dc->writeback_sector_size);
+	dc->writeback_io_num_per_sec = atomic_read(&dc->writeback_io_num);
+	dc->front_io_num_per_sec = atomic_read(&dc->front_io_num);
+	atomic_set(&dc->writeback_sector_size, 0);
+	atomic_set(&dc->writeback_io_num, 0);
+	atomic_set(&dc->front_io_num, 0);
+}
+
+static void cached_dev_timer_init(struct cached_dev *dc)
+{
+	dc->writeback_sector_size_per_sec = 0;
+	dc->writeback_io_num_per_sec = 0;
+	dc->front_io_num_per_sec = 0;
+	atomic_set(&dc->writeback_sector_size, 0);
+	atomic_set(&dc->writeback_io_num, 0);
+	atomic_set(&dc->front_io_num, 0);
+
+	timer_setup(&dc->io_stat_timer, cached_dev_io_stat, 0);
+	dc->io_stat_timer.expires = jiffies + HZ;
+	add_timer(&dc->io_stat_timer);
+}
+
 static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
 {
 	int ret;
@@ -1319,6 +1352,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
 	INIT_LIST_HEAD(&dc->io_lru);
 	spin_lock_init(&dc->io_lock);
 	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
+	cached_dev_timer_init(dc);
+	bch_traffic_policy_init(dc);
 
 	dc->sequential_cutoff = 4 << 20;
 	dc->inflight_block_enable = 1;
@@ -1837,6 +1872,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	c->congested_read_threshold_us	= 2000;
 	c->congested_write_threshold_us = 20000;
 	c->error_limit	= DEFAULT_IO_ERROR_LIMIT;
+	c->cutoff_writeback_sync = bch_cutoff_writeback_sync;
 	c->idle_max_writeback_rate_enabled = 1;
 	WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e23c426229399..097560389760c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -53,6 +53,13 @@ static const char * const error_actions[] = {
 	NULL
 };
 
+static const char * const writeback_state[] = {
+	"default",
+	"quick",
+	"slow",
+	NULL
+};
+
 write_attribute(attach);
 write_attribute(detach);
 write_attribute(unregister);
@@ -102,6 +109,9 @@ read_attribute(cutoff_writeback);
 read_attribute(cutoff_writeback_sync);
 rw_attribute(congested_read_threshold_us);
 rw_attribute(congested_write_threshold_us);
+rw_attribute(gc_sectors);
+rw_attribute(traffic_policy_start);
+rw_attribute(force_write_through);
 
 rw_attribute(sequential_cutoff);
 rw_attribute(read_bypass);
@@ -120,7 +130,13 @@ rw_attribute(writeback_rate_update_seconds);
 rw_attribute(writeback_rate_i_term_inverse);
 rw_attribute(writeback_rate_p_term_inverse);
 rw_attribute(writeback_rate_minimum);
+rw_attribute(writeback_state);
+read_attribute(writeback_sector_size_per_sec);
+read_attribute(writeback_io_num_per_sec);
+read_attribute(front_io_num_per_sec);
 read_attribute(writeback_rate_debug);
+read_attribute(write_token_sector_size);
+read_attribute(write_token_io_num);
 
 read_attribute(stripe_size);
 read_attribute(partial_stripes_expensive);
@@ -177,6 +193,11 @@ SHOW(__bch_cached_dev)
 					       bch_cache_modes,
 					       BDEV_CACHE_MODE(&dc->sb));
 
+	if (attr == &sysfs_writeback_state)
+		return bch_snprint_string_list(buf, PAGE_SIZE,
+					       writeback_state,
+					       dc->writeback_state);
+
 	if (attr == &sysfs_readahead_cache_policy)
 		return bch_snprint_string_list(buf, PAGE_SIZE,
 					       bch_reada_cache_policies,
@@ -194,6 +215,9 @@ SHOW(__bch_cached_dev)
 	var_printf(writeback_metadata,	"%i");
 	var_printf(writeback_running,	"%i");
 	var_print(writeback_delay);
+	var_print(writeback_sector_size_per_sec);
+	var_print(writeback_io_num_per_sec);
+	var_print(front_io_num_per_sec);
 	var_print(writeback_percent);
 	sysfs_hprint(writeback_rate,
 		     wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
@@ -256,6 +280,8 @@ SHOW(__bch_cached_dev)
 
 	sysfs_print(running,		atomic_read(&dc->running));
 	sysfs_print(state,		states[BDEV_STATE(&dc->sb)]);
+	var_print(write_token_sector_size);
+	var_print(write_token_io_num);
 
 	if (attr == &sysfs_label) {
 		memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
@@ -375,6 +401,15 @@ STORE(__cached_dev)
 		}
 	}
 
+	if (attr == &sysfs_writeback_state) {
+		v = __sysfs_match_string(writeback_state, -1, buf);
+
+		if (v < 0)
+			return v;
+
+		dc->writeback_state = v;
+	}
+
 	if (attr == &sysfs_readahead_cache_policy) {
 		v = __sysfs_match_string(bch_reada_cache_policies, -1, buf);
 		if (v < 0)
@@ -498,11 +533,14 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_data_csum,
 #endif
 	&sysfs_cache_mode,
+	&sysfs_writeback_state,
 	&sysfs_readahead_cache_policy,
 	&sysfs_stop_when_cache_set_failed,
 	&sysfs_writeback_metadata,
 	&sysfs_writeback_running,
 	&sysfs_writeback_delay,
+	&sysfs_writeback_sector_size_per_sec,
+	&sysfs_writeback_io_num_per_sec,
 	&sysfs_writeback_percent,
 	&sysfs_writeback_rate,
 	&sysfs_writeback_rate_update_seconds,
@@ -510,6 +548,9 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_writeback_rate_p_term_inverse,
 	&sysfs_writeback_rate_minimum,
 	&sysfs_writeback_rate_debug,
+	&sysfs_write_token_sector_size,
+	&sysfs_write_token_io_num,
+	&sysfs_front_io_num_per_sec,
 	&sysfs_io_errors,
 	&sysfs_io_error_limit,
 	&sysfs_io_disable,
@@ -770,6 +811,12 @@ SHOW(__bch_cache_set)
 		    c->congested_read_threshold_us);
 	sysfs_print(congested_write_threshold_us,
 		    c->congested_write_threshold_us);
+	sysfs_print(gc_sectors,
+		    c->gc_sectors);
+	sysfs_print(traffic_policy_start,
+		    c->traffic_policy_start);
+	sysfs_print(force_write_through,
+		    c->force_write_through);
 
 	sysfs_print(cutoff_writeback, bch_cutoff_writeback);
 	sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
@@ -855,6 +902,12 @@ STORE(__bch_cache_set)
 	sysfs_strtoul_clamp(congested_write_threshold_us,
 			    c->congested_write_threshold_us,
 			    0, UINT_MAX);
+	sysfs_strtoul(gc_sectors,
+		      c->gc_sectors);
+	sysfs_strtoul(traffic_policy_start,
+		      c->traffic_policy_start);
+	sysfs_strtoul(force_write_through,
+		      c->force_write_through);
 
 	if (attr == &sysfs_errors) {
 		v = __sysfs_match_string(error_actions, -1, buf);
@@ -997,6 +1050,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_idle_max_writeback_rate,
 	&sysfs_gc_after_writeback,
 	&sysfs_io_disable,
+	&sysfs_gc_sectors,
+	&sysfs_traffic_policy_start,
+	&sysfs_force_write_through,
 	&sysfs_cutoff_writeback,
 	&sysfs_cutoff_writeback_sync,
 	NULL
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 25a66e54f7ba5..1e3041f6363c7 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -239,7 +239,13 @@ static unsigned int writeback_delay(struct cached_dev *dc,
 	    !dc->writeback_percent)
 		return 0;
 
-	return bch_next_delay(&dc->writeback_rate, sectors);
+	if (dc->writeback_state == WRITEBACK_DEFAULT) {
+		return bch_next_delay(&dc->writeback_rate, sectors);
+	} else if (dc->writeback_state == WRITEBACK_QUICK) {
+		return 0;
+	} else {
+		return msecs_to_jiffies(1000);
+	}
 }
 
 struct dirty_io {
@@ -304,6 +310,9 @@ static void write_dirty_finish(struct closure *cl)
 			    : &dc->disk.c->writeback_keys_done);
 	}
 
+	atomic_add(KEY_SIZE(&w->key), &dc->writeback_sector_size);
+	atomic_inc(&dc->writeback_io_num);
+
 	bch_keybuf_del(&dc->writeback_keys, w);
 	up(&dc->in_flight);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index c4ff76037227b..912b1a146cea2 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -80,10 +80,11 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned int cache_mode, bool would_skip)
 {
 	unsigned int in_use = dc->disk.c->gc_stats.in_use;
+	unsigned int cutoff = dc->disk.c->cutoff_writeback_sync;
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
-	    in_use > bch_cutoff_writeback_sync)
+	    in_use > cutoff)
 		return false;
 
 	if (bio_op(bio) == REQ_OP_DISCARD)
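
A note on the refill arithmetic in token_assign() above, as a sketch assuming HZ is a multiple of 8: the timer re-arms every HZ/8 jiffies, i.e. eight refills per second, and each refill grants max_sector_size/8 sector tokens and max_io_num/8 IO tokens (at least one), so the sustained caps come out to roughly max_sector_size sectors and max_io_num IOs per second. A small hypothetical userspace check of the numbers:

    /* Hypothetical check of the refill math in token_assign() above;
     * 81920 is an example value for write_token_sector_size.
     */
    #include <stdio.h>

    int main(void)
    {
            unsigned int max_sector_size = 81920; /* example ioctl setting */
            unsigned int refills_per_sec = 8;     /* timer period is HZ/8 jiffies */
            unsigned long sectors_per_sec =
                    (unsigned long)(max_sector_size / 8) * refills_per_sec;

            /* 81920 sectors/s * 512 B = 40 MiB/s of front write bandwidth */
            printf("cap: %lu sectors/s = %lu MiB/s\n", sectors_per_sec,
                   sectors_per_sec * 512 / (1024 * 1024));
            return 0;
    }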