From: Xu Wei <xuwei56@huawei.com>
euleros inclusion
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=327
CVE: NA
When cache available is low, bcache turns to writethrough mode. All write IO is then sent directly to the backing device, which is usually an HDD. At the same time, the writeback process keeps flushing dirty data from the cache device to the backing device. Write IO from users therefore breaks the sequentiality of writeback IO, and when writeback IO is heavy, the user's write IO may be blocked. This patch adds a traffic policy to bcache to solve the problem and to improve bcache performance when cache available is low.
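For illustration, here is a minimal user-space sketch of the new ioctl interface (not part of this patch). The structures and command numbers mirror the definitions added to bcache.h below; the /dev/bcache0 path and all numeric values are assumptions made for the example:

/*
 * Illustrative only: query and set the bcache traffic policy through
 * the new ioctls. Assumes a cached device at /dev/bcache0.
 */
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

struct get_bcache_status {
        unsigned int writeback_sector_size_per_sec;
        unsigned int writeback_io_num_per_sec;
        unsigned int front_io_num_per_sec;
        uint64_t dirty_rate;
        unsigned int available;
};

struct set_bcache_status {
        unsigned int write_token_sector_size;
        unsigned int write_token_io_num;
        bool traffic_policy_start;
        bool force_write_through;
        bool copy_gc_enabled;
        bool trigger_gc;
        unsigned int writeback_state;
        unsigned int gc_sectors;
        unsigned int cutoff_writeback_sync;
};

#define BCACHE_MAJOR 'B'
#define BCACHE_GET_WRITE_STATUS _IOR(BCACHE_MAJOR, 0x0, struct get_bcache_status)
#define BCACHE_SET_WRITE_STATUS _IOW(BCACHE_MAJOR, 0x1, struct set_bcache_status)

int main(void)
{
        struct get_bcache_status gs;
        struct set_bcache_status ss = {
                .write_token_sector_size = 8192, /* ~4 MiB/s at 512 B/sector */
                .write_token_io_num = 64,
                .traffic_policy_start = true,
                .writeback_state = 1,           /* WRITEBACK_QUICK */
                .cutoff_writeback_sync = 80,    /* kept within [70, 90] */
                /* gc_sectors = 0 keeps the default GC threshold */
        };
        int fd = open("/dev/bcache0", O_RDWR);

        if (fd < 0)
                return 1;
        if (ioctl(fd, BCACHE_GET_WRITE_STATUS, &gs) == 0)
                printf("available %u%%, dirty %llu%%, writeback %u sectors/s\n",
                       gs.available, (unsigned long long)gs.dirty_rate,
                       gs.writeback_sector_size_per_sec);
        if (ioctl(fd, BCACHE_SET_WRITE_STATUS, &ss) != 0)
                perror("BCACHE_SET_WRITE_STATUS");
        close(fd);
        return 0;
}

With write_token_sector_size = 8192 and the token bucket refilled eight times per second (see token_assign() below), throttled writes in writeback mode are capped at roughly 8192 sectors per second, i.e. about 4 MiB/s with 512-byte sectors.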
Signed-off-by: qinghaixiang <xuweiqhx@163.com>
Signed-off-by: Xu Wei <xuwei56@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Li Ruilin <liruilin4@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/md/bcache/bcache.h    |  49 ++++++++++++
 drivers/md/bcache/btree.h     |   6 +-
 drivers/md/bcache/request.c   | 143 +++++++++++++++++++++++++++++++++-
 drivers/md/bcache/request.h   |   2 +
 drivers/md/bcache/super.c     |  35 +++++++++
 drivers/md/bcache/sysfs.c    |  56 +++++++++++++
 drivers/md/bcache/writeback.c |  11 ++-
 drivers/md/bcache/writeback.h |   6 +-
 8 files changed, 300 insertions(+), 8 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 99d12fce876b2..70fbde8ca70c9 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -399,6 +399,28 @@ struct cached_dev {
 	unsigned int offline_seconds;
 	char backing_dev_name[BDEVNAME_SIZE];
+
+	/* Count front and writeback IO bandwidth per second */
+	atomic_t writeback_sector_size;
+	atomic_t writeback_io_num;
+	atomic_t front_io_num;
+	unsigned int writeback_sector_size_per_sec;
+	unsigned int writeback_io_num_per_sec;
+	unsigned int front_io_num_per_sec;
+	struct timer_list io_stat_timer;
+
+	unsigned int writeback_state;
+#define WRITEBACK_DEFAULT 0
+#define WRITEBACK_QUICK 1
+#define WRITEBACK_SLOW 2
+
+	/* Token bucket used to throttle front write IO */
+	spinlock_t token_lock;
+	unsigned int max_sector_size;
+	unsigned int max_io_num;
+	unsigned int write_token_sector_size;
+	unsigned int write_token_io_num;
+	struct timer_list token_assign_timer;
 };
 
 enum alloc_reserve {
@@ -717,6 +739,10 @@ struct cache_set {
 
 #define BUCKET_HASH_BITS 12
 	struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
+	unsigned int cutoff_writeback_sync;
+	bool traffic_policy_start;
+	bool force_write_through;
+	unsigned int gc_sectors;
 };
 
 struct bbio {
@@ -732,6 +758,29 @@ struct bbio {
 	struct bio bio;
 };
 
+struct get_bcache_status {
+	unsigned int write​back_sector_size_per_sec;
+	unsigned int writeback_io_num_per_sec;
+	unsigned int front_io_num_per_sec;
+	uint64_t dirty_rate;
+	unsigned int available;
+};
+
+struct set_bcache_status {
+	unsigned int write_token_sector_size;
+	unsigned int write_token_io_num;
+	bool traffic_policy_start;
+	bool force_write_through;
+	bool copy_gc_enabled;
+	bool trigger_gc;
+	unsigned int writeback_state;
+	unsigned int gc_sectors;
+	unsigned int cutoff_writeback_sync;
+};
+#define BCACHE_MAJOR 'B'
+#define BCACHE_GET_WRITE_STATUS _IOR(BCACHE_MAJOR, 0x0, struct get_bcache_status)
+#define BCACHE_SET_WRITE_STATUS _IOW(BCACHE_MAJOR, 0x1, struct set_bcache_status)
+
 #define BTREE_PRIO USHRT_MAX
 #define INITIAL_PRIO 32768U
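token_assign() in request.c (below) refills the token bucket every HZ/8 jiffies, i.e. eight times per second, so write_token_sector_size and write_token_io_num act as per-second budgets handed out in 125 ms slices; alloc_token() makes a front write wait only while both budgets are empty. A standalone user-space model of that arithmetic (illustrative values only, not kernel code):

#include <stdio.h>

static unsigned int token_sectors, token_ios;

/* Mirrors token_assign(): runs 8 times a second in the patch. */
static void refill(unsigned int max_sector_size, unsigned int max_io_num)
{
        token_sectors = max_sector_size / 8;    /* budget per 125 ms slice */
        token_ios = max_io_num / 8;
        if (token_ios == 0)                     /* always grant at least one IO */
                token_ios = 1;
}

/*
 * Returns 1 when a write may proceed; mirrors alloc_token()'s exit
 * condition: enough sector tokens, or at least one IO token left.
 * A 0 return corresponds to the caller sleeping 10 ms and retrying
 * (for up to ~1 s in the patch).
 */
static int consume(unsigned int sectors)
{
        if (token_sectors < sectors && token_ios == 0)
                return 0;
        token_sectors = token_sectors >= sectors ? token_sectors - sectors : 0;
        if (token_ios > 0)
                token_ios--;
        return 1;
}

int main(void)
{
        /* 8192 sectors/s is 4 MiB/s with 512-byte sectors */
        refill(8192, 64);
        printf("slice budget: %u sectors, %u ios\n", token_sectors, token_ios);
        printf("128-sector write admitted: %d\n", consume(128));
        return 0;
}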
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 4d0cca145f699..7ddadcc485ea6 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -193,7 +193,11 @@ static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
 
 static inline void set_gc_sectors(struct cache_set *c)
 {
-	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
+	if (c->gc_sectors == 0)
+		atomic_set(&c->sectors_to_gc,
+			   c->sb.bucket_size * c->nbuckets / 16);
+	else
+		atomic_set(&c->sectors_to_gc, c->gc_sectors);
 }
 
 void bkey_put(struct cache_set *c, struct bkey *k);
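For a sense of the scale this overrides (illustrative numbers, not from the patch): with 1024-sector (512 KiB) buckets and 262144 buckets, the default threshold allows about 8 GiB of writes between GC runs, and a smaller gc_sectors makes GC run proportionally more often:

#include <stdio.h>

int main(void)
{
        unsigned long bucket_size = 1024, nbuckets = 262144;   /* sectors, count */
        unsigned long def = bucket_size * nbuckets / 16;       /* 16777216 sectors */

        printf("default: %lu sectors (%lu MiB) between GC runs\n",
               def, def / 2048);                               /* 2048 sectors/MiB */
        printf("gc_sectors=2097152 -> GC every %u MiB written\n", 2097152 / 2048);
        return 0;
}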
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 6d89e56a4a410..c05544e07722e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -28,6 +28,7 @@ struct kmem_cache *bch_search_cache;
 
 static void bch_data_insert_start(struct closure *cl);
+static void alloc_token(struct cached_dev *dc, unsigned int sectors);
 
 static unsigned int cache_mode(struct cached_dev *dc)
 {
@@ -396,7 +397,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 		goto skip;
 
 	if (mode == CACHE_MODE_NONE ||
-	    (mode == CACHE_MODE_WRITEAROUND &&
+	    ((mode == CACHE_MODE_WRITEAROUND ||
+	      c->force_write_through == true) &&
 	     op_is_write(bio_op(bio))))
 		goto skip;
 
@@ -858,6 +860,10 @@ static void cached_dev_read_done(struct closure *cl)
 	if (s->iop.bio && (!dc->read_bypass || s->prefetch) &&
 	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
 		BUG_ON(!s->iop.replace);
+		if ((dc->disk.c->traffic_policy_start == true) &&
+		    (dc->disk.c->force_write_through != true)) {
+			alloc_token(dc, bio_sectors(s->iop.bio));
+		}
 		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
 	}
 
@@ -1000,6 +1006,35 @@ static void cached_dev_write_complete(struct closure *cl)
 	continue_at(cl, cached_dev_bio_complete, NULL);
 }
 
+static void alloc_token(struct cached_dev *dc, unsigned int sectors)
+{
+	int count = 0;
+
+	spin_lock_bh(&dc->token_lock);
+
+	while ((dc->write_token_sector_size < sectors) &&
+	       (dc->write_token_io_num == 0)) {
+		spin_unlock_bh(&dc->token_lock);
+		schedule_timeout_interruptible(msecs_to_jiffies(10));
+		count++;
+		if ((dc->disk.c->traffic_policy_start != true) ||
+		    (cache_mode(dc) != CACHE_MODE_WRITEBACK) ||
+		    (count > 100))
+			return;
+		spin_lock_bh(&dc->token_lock);
+	}
+
+	if (dc->write_token_sector_size >= sectors)
+		dc->write_token_sector_size -= sectors;
+	else
+		dc->write_token_sector_size = 0;
+
+	if (dc->write_token_io_num > 0)
+		dc->write_token_io_num--;
+
+	spin_unlock_bh(&dc->token_lock);
+}
+
 static void cached_dev_write(struct cached_dev *dc, struct search *s)
 {
 	struct closure *cl = &s->cl;
@@ -1247,6 +1282,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 					cached_dev_nodata,
 					bcache_wq);
 	} else {
+		atomic_inc(&dc->front_io_num);
 		s->iop.bypass = check_should_bypass(dc, bio);
 
 		if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) {
@@ -1258,10 +1294,17 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 			save_circ_item(&s->smp);
 		}
 
-		if (rw)
+		if (rw) {
+			if ((s->iop.bypass == false) &&
+			    (dc->disk.c->traffic_policy_start == true) &&
+			    (cache_mode(dc) == CACHE_MODE_WRITEBACK) &&
+			    (bio_op(bio) != REQ_OP_DISCARD)) {
+				alloc_token(dc, bio_sectors(bio));
+			}
 			cached_dev_write(dc, s);
-		else
+		} else {
 			cached_dev_read(dc, s);
+		}
 		}
 	} else
 		/* I/O request sent to backing device */
@@ -1270,6 +1313,65 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 	return BLK_QC_T_NONE;
 }
+static int bcache_get_write_status(struct cached_dev *dc, unsigned long arg)
+{
+	struct get_bcache_status a;
+	uint64_t cache_sectors;
+	struct cache_set *c = dc->disk.c;
+
+	if (c == NULL)
+		return -ENODEV;
+
+	a.writeback_sector_size_per_sec = dc->writeback_sector_size_per_sec;
+	a.writeback_io_num_per_sec = dc->writeback_io_num_per_sec;
+	a.front_io_num_per_sec = dc->front_io_num_per_sec;
+	cache_sectors = c->nbuckets * c->sb.bucket_size -
+			atomic_long_read(&c->flash_dev_dirty_sectors);
+	a.dirty_rate = div64_u64(bcache_dev_sectors_dirty(&dc->disk) * 100,
+				 cache_sectors);
+	a.available = 100 - c->gc_stats.in_use;
+	if (copy_to_user((struct get_bcache_status *)arg, &a,
+			 sizeof(struct get_bcache_status)))
+		return -EFAULT;
+	return 0;
+}
+
+static int bcache_set_write_status(struct cached_dev *dc, unsigned long arg)
+{
+	struct set_bcache_status a;
+	struct cache_set *c = dc->disk.c;
+
+	if (c == NULL)
+		return -ENODEV;
+	if (copy_from_user(&a, (struct set_bcache_status *)arg,
+			   sizeof(struct set_bcache_status)))
+		return -EFAULT;
+
+	if (c->traffic_policy_start != a.traffic_policy_start)
+		pr_info("%s traffic policy %s\n", dc->disk.disk->disk_name,
+			(a.traffic_policy_start == true) ? "enable" : "disable");
+	if (c->force_write_through != a.force_write_through)
+		pr_info("%s force write through %s\n", dc->disk.disk->disk_name,
+			(a.force_write_through == true) ? "enable" : "disable");
+	if (a.trigger_gc) {
+		pr_info("trigger %s gc\n", dc->disk.disk->disk_name);
+		atomic_set(&c->sectors_to_gc, -1);
+		wake_up_gc(c);
+	}
+	if ((a.cutoff_writeback_sync >= MIN_CUTOFF_WRITEBACK_SYNC) &&
+	    (a.cutoff_writeback_sync <= MAX_CUTOFF_WRITEBACK_SYNC)) {
+		c->cutoff_writeback_sync = a.cutoff_writeback_sync;
+	}
+
+	dc->max_sector_size = a.write_token_sector_size;
+	dc->max_io_num = a.write_token_io_num;
+	c->traffic_policy_start = a.traffic_policy_start;
+	c->force_write_through = a.force_write_through;
+	c->gc_sectors = a.gc_sectors;
+	dc->writeback_state = a.writeback_state;
+	return 0;
+}
+
 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
 			    unsigned int cmd, unsigned long arg)
 {
@@ -1278,7 +1380,14 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
 	if (dc->io_disable)
 		return -EIO;
 
-	return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
+	switch (cmd) {
+	case BCACHE_GET_WRITE_STATUS:
+		return bcache_get_write_status(dc, arg);
+	case BCACHE_SET_WRITE_STATUS:
+		return bcache_set_write_status(dc, arg);
+	default:
+		return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
+	}
 }
 
 static int cached_dev_congested(void *data, int bits)
@@ -1438,3 +1547,29 @@ int __init bch_request_init(void)
 
 	return 0;
 }
+
+static void token_assign(struct timer_list *t)
+{
+	struct cached_dev *dc = from_timer(dc, t, token_assign_timer);
+
+	dc->token_assign_timer.expires = jiffies + HZ / 8;
+	add_timer(&dc->token_assign_timer);
+
+	spin_lock(&dc->token_lock);
+	dc->write_token_sector_size = dc->max_sector_size / 8;
+	dc->write_token_io_num = dc->max_io_num / 8;
+	dc->write_token_io_num =
+		(dc->write_token_io_num == 0) ? 1 : dc->write_token_io_num;
+	spin_unlock(&dc->token_lock);
+}
+
+void bch_traffic_policy_init(struct cached_dev *dc)
+{
+	spin_lock_init(&dc->token_lock);
+	dc->write_token_sector_size = 0;
+	dc->write_token_io_num = 0;
+
+	timer_setup(&dc->token_assign_timer, token_assign, 0);
+	dc->token_assign_timer.expires = jiffies + HZ / 8;
+	add_timer(&dc->token_assign_timer);
+}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 3667bc5390dfe..f677ba8704940 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -41,6 +41,8 @@ void bch_data_insert(struct closure *cl);
 void bch_cached_dev_request_init(struct cached_dev *dc);
 void bch_flash_dev_request_init(struct bcache_device *d);
 
+void bch_traffic_policy_init(struct cached_dev *dc);
+
 extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
 
 struct search {
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e7f7a0f038682..3f858de9e9602 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1210,6 +1210,8 @@ static void cached_dev_free(struct closure *cl)
 {
 	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
+	del_timer_sync(&dc->io_stat_timer);
+	del_timer_sync(&dc->token_assign_timer);
 	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
 		cancel_writeback_rate_update_dwork(dc);
 
@@ -1250,6 +1252,36 @@ static void cached_dev_flush(struct closure *cl)
 	continue_at(cl, cached_dev_free, system_wq);
 }
 
+static void cached_dev_io_stat(struct timer_list *t)
+{
+	struct cached_dev *dc = from_timer(dc, t, io_stat_timer);
+
+	dc->io_stat_timer.expires = jiffies + HZ;
+	add_timer(&dc->io_stat_timer);
+
+	dc->writeback_sector_size_per_sec =
+		atomic_read(&dc->writeback_sector_size);
+	dc->writeback_io_num_per_sec = atomic_read(&dc->writeback_io_num);
+	dc->front_io_num_per_sec = atomic_read(&dc->front_io_num);
+	atomic_set(&dc->writeback_sector_size, 0);
+	atomic_set(&dc->writeback_io_num, 0);
+	atomic_set(&dc->front_io_num, 0);
+}
+
+static void cached_dev_timer_init(struct cached_dev *dc)
+{
+	dc->writeback_sector_size_per_sec = 0;
+	dc->writeback_io_num_per_sec = 0;
+	dc->front_io_num_per_sec = 0;
+	atomic_set(&dc->writeback_sector_size, 0);
+	atomic_set(&dc->writeback_io_num, 0);
+	atomic_set(&dc->front_io_num, 0);
+
+	timer_setup(&dc->io_stat_timer, cached_dev_io_stat, 0);
+	dc->io_stat_timer.expires = jiffies + HZ;
+	add_timer(&dc->io_stat_timer);
+}
+
 static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
 {
 	int ret;
@@ -1266,6 +1298,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
 	INIT_LIST_HEAD(&dc->io_lru);
 	spin_lock_init(&dc->io_lock);
 	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
+	cached_dev_timer_init(dc);
+	bch_traffic_policy_init(dc);
 
 	dc->sequential_cutoff = 4 << 20;
 	dc->inflight_block_enable = 1;
@@ -1774,6 +1808,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	c->congested_read_threshold_us = 2000;
 	c->congested_write_threshold_us = 20000;
 	c->error_limit = DEFAULT_IO_ERROR_LIMIT;
+	c->cutoff_writeback_sync = MIN_CUTOFF_WRITEBACK_SYNC;
 	WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
 	return c;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 706d3a245dba6..4c693ac29b0e0 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -51,6 +51,13 @@ static const char * const error_actions[] = {
 	NULL
 };
 
+static const char * const writeback_state[] = {
+	"default",
+	"quick",
+	"slow",
+	NULL
+};
+
 write_attribute(attach);
 write_attribute(detach);
 write_attribute(unregister);
@@ -96,6 +103,9 @@ read_attribute(io_errors);
 read_attribute(congested);
 rw_attribute(congested_read_threshold_us);
 rw_attribute(congested_write_threshold_us);
+rw_attribute(gc_sectors);
+rw_attribute(traffic_policy_start);
+rw_attribute(force_write_through);
 
 rw_attribute(sequential_cutoff);
 rw_attribute(read_bypass);
@@ -114,7 +124,13 @@ rw_attribute(writeback_rate_update_seconds);
 rw_attribute(writeback_rate_i_term_inverse);
 rw_attribute(writeback_rate_p_term_inverse);
 rw_attribute(writeback_rate_minimum);
+rw_attribute(writeback_state);
+read_attribute(writeback_sector_size_per_sec);
+read_attribute(writeback_io_num_per_sec);
+read_attribute(front_io_num_per_sec);
 read_attribute(writeback_rate_debug);
+read_attribute(write_token_sector_size);
+read_attribute(write_token_io_num);
 
 read_attribute(stripe_size);
 read_attribute(partial_stripes_expensive);
@@ -169,6 +185,11 @@ SHOW(__bch_cached_dev)
 					       bch_cache_modes,
 					       BDEV_CACHE_MODE(&dc->sb));
 
+	if (attr == &sysfs_writeback_state)
+		return bch_snprint_string_list(buf, PAGE_SIZE,
+					       writeback_state,
+					       dc->writeback_state);
+
 	if (attr == &sysfs_readahead_cache_policy)
 		return bch_snprint_string_list(buf, PAGE_SIZE,
 					       bch_reada_cache_policies,
@@ -186,6 +207,9 @@ SHOW(__bch_cached_dev)
 	var_printf(writeback_metadata, "%i");
 	var_printf(writeback_running, "%i");
 	var_print(writeback_delay);
+	var_print(writeback_sector_size_per_sec);
+	var_print(writeback_io_num_per_sec);
+	var_print(front_io_num_per_sec);
 	var_print(writeback_percent);
 	sysfs_hprint(writeback_rate,
 		     wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
@@ -248,6 +272,8 @@ SHOW(__bch_cached_dev)
 
 	sysfs_print(running, atomic_read(&dc->running));
 	sysfs_print(state, states[BDEV_STATE(&dc->sb)]);
+	var_print(write_token_sector_size);
+	var_print(write_token_io_num);
 
 	if (attr == &sysfs_label) {
 		memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
@@ -346,6 +372,15 @@ STORE(__cached_dev)
 		}
 	}
 
+	if (attr == &sysfs_writeback_state) {
+		v = __sysfs_match_string(writeback_state, -1, buf);
+
+		if (v < 0)
+			return v;
+
+		dc->writeback_state = v;
+	}
+
 	if (attr == &sysfs_readahead_cache_policy) {
 		v = __sysfs_match_string(bch_reada_cache_policies, -1, buf);
 		if (v < 0)
@@ -448,11 +483,14 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_data_csum,
 #endif
 	&sysfs_cache_mode,
+	&sysfs_writeback_state,
 	&sysfs_readahead_cache_policy,
 	&sysfs_stop_when_cache_set_failed,
 	&sysfs_writeback_metadata,
 	&sysfs_writeback_running,
 	&sysfs_writeback_delay,
+	&sysfs_writeback_sector_size_per_sec,
+	&sysfs_writeback_io_num_per_sec,
 	&sysfs_writeback_percent,
 	&sysfs_writeback_rate,
 	&sysfs_writeback_rate_update_seconds,
@@ -460,6 +498,9 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_writeback_rate_p_term_inverse,
 	&sysfs_writeback_rate_minimum,
 	&sysfs_writeback_rate_debug,
+	&sysfs_write_token_sector_size,
+	&sysfs_write_token_io_num,
+	&sysfs_front_io_num_per_sec,
 	&sysfs_io_errors,
 	&sysfs_io_error_limit,
 	&sysfs_io_disable,
@@ -714,6 +755,12 @@ SHOW(__bch_cache_set)
 		    c->congested_read_threshold_us);
 	sysfs_print(congested_write_threshold_us,
 		    c->congested_write_threshold_us);
+	sysfs_print(gc_sectors,
+		    c->gc_sectors);
+	sysfs_print(traffic_policy_start,
+		    c->traffic_policy_start);
+	sysfs_print(force_write_through,
+		    c->force_write_through);
 
 	sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
 	sysfs_printf(verify, "%i", c->verify);
@@ -800,6 +847,12 @@ STORE(__bch_cache_set)
 		    c->congested_read_threshold_us);
 	sysfs_strtoul(congested_write_threshold_us,
 		      c->congested_write_threshold_us);
+	sysfs_strtoul(gc_sectors,
+		      c->gc_sectors);
+	sysfs_strtoul(traffic_policy_start,
+		      c->traffic_policy_start);
+	sysfs_strtoul(force_write_through,
+		      c->force_write_through);
 
 	if (attr == &sysfs_errors) {
 		v = __sysfs_match_string(error_actions, -1, buf);
@@ -926,6 +979,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_btree_shrinker_disabled,
 	&sysfs_copy_gc_enabled,
 	&sysfs_io_disable,
+	&sysfs_gc_sectors,
+	&sysfs_traffic_policy_start,
+	&sysfs_force_write_through,
 	NULL
 };
KTYPE(bch_cache_set_internal);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b5fc3c6c7178e..901ad8bae7614 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -222,7 +222,13 @@ static unsigned int writeback_delay(struct cached_dev *dc,
 	    !dc->writeback_percent)
 		return 0;
 
-	return bch_next_delay(&dc->writeback_rate, sectors);
+	if (dc->writeback_state == WRITEBACK_DEFAULT) {
+		return bch_next_delay(&dc->writeback_rate, sectors);
+	} else if (dc->writeback_state == WRITEBACK_QUICK) {
+		return 0;
+	} else {
+		return msecs_to_jiffies(1000);
+	}
 }
 
 struct dirty_io {
@@ -287,6 +293,9 @@ static void write_dirty_finish(struct closure *cl)
 			    : &dc->disk.c->writeback_keys_done);
 	}
 
+	atomic_add(KEY_SIZE(&w->key), &dc->writeback_sector_size);
+	atomic_inc(&dc->writeback_io_num);
+
 	bch_keybuf_del(&dc->writeback_keys, w);
 	up(&dc->in_flight);
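The new attributes are also reachable over sysfs. A user-space sketch follows (the bcache0 device name and the cache-set UUID path are assumptions about a particular setup):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Write a string value to a sysfs attribute; returns 0 on success. */
static int write_attr(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, val, strlen(val)) < 0) {
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        /* Put writeback into "quick" mode: writeback_delay() returns 0. */
        write_attr("/sys/block/bcache0/bcache/writeback_state", "quick");
        /* Enable the traffic policy for the whole cache set
         * (replace <set-uuid> with the real cache-set UUID). */
        write_attr("/sys/fs/bcache/<set-uuid>/internal/traffic_policy_start",
                   "1");
        return 0;
}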
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index e75dc33339f6f..a3151c0e96609 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -3,7 +3,8 @@
 #define _BCACHE_WRITEBACK_H
 
 #define CUTOFF_WRITEBACK		40
-#define CUTOFF_WRITEBACK_SYNC		70
+#define MIN_CUTOFF_WRITEBACK_SYNC	70
+#define MAX_CUTOFF_WRITEBACK_SYNC	90
 
 #define MAX_WRITEBACKS_IN_PASS		5
 #define MAX_WRITESIZE_IN_PASS		5000	/* *512b */
@@ -57,10 +58,11 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned int cache_mode, bool would_skip)
 {
 	unsigned int in_use = dc->disk.c->gc_stats.in_use;
+	unsigned int cutoff = dc->disk.c->cutoff_writeback_sync;
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
-	    in_use > CUTOFF_WRITEBACK_SYNC)
+	    in_use > cutoff)
 		return false;
 
 	if (bio_op(bio) == REQ_OP_DISCARD)