From: Xu Wei <xuwei56@huawei.com>
euleros inclusion
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=327
CVE: NA
When the cache's available space is low, bcache switches to writethrough mode, so all write IO is sent directly to the backing device, which is usually an HDD. At the same time, the cache device flushes dirty data to the backing device through the bcache writeback process, so write IO from users breaks the sequentiality of writeback, and when there is a lot of writeback IO, the user's write IO may be blocked. This patch adds a traffic policy to bcache to solve this problem and improve bcache performance when cache available space is low.
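For illustration, here is a hedged userspace sketch of how a management tool might use the ioctl interface this patch adds (BCACHE_GET_WRITE_STATUS / BCACHE_SET_WRITE_STATUS) to watch the writeback and front IO rates and switch the traffic policy on when available space runs low. The struct layouts and ioctl numbers are copied from the diff below; they are not exported as a UAPI header, so a real tool would have to keep them in sync by hand. The device path /dev/bcache0, the 10% threshold, and the token limits are assumptions for the example, not values from the patch.

/*
 * Userspace sketch (not part of the patch): drive the new traffic-policy
 * ioctls. Struct layouts mirror drivers/md/bcache/bcache.h in this patch.
 */
#include <fcntl.h>
#include <linux/ioctl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

struct get_bcache_status {
	unsigned int writeback_sector_size_per_sec;
	unsigned int writeback_io_num_per_sec;
	unsigned int front_io_num_per_sec;
	uint64_t dirty_rate;
	unsigned int available;
};

struct set_bcache_status {
	unsigned int write_token_sector_size;
	unsigned int write_token_io_num;
	bool traffic_policy_start;
	bool force_write_through;
	bool copy_gc_enabled;
	bool trigger_gc;
	unsigned int writeback_state;
	unsigned int gc_sectors;
	unsigned int cutoff_writeback_sync;
};

#define BCACHE_MAJOR 'B'
#define BCACHE_GET_WRITE_STATUS _IOR(BCACHE_MAJOR, 0x0, struct get_bcache_status)
#define BCACHE_SET_WRITE_STATUS _IOW(BCACHE_MAJOR, 0x1, struct set_bcache_status)

int main(void)
{
	struct get_bcache_status get;
	struct set_bcache_status set;
	int fd = open("/dev/bcache0", O_RDWR);	/* assumed bcache device node */

	if (fd < 0) {
		perror("open /dev/bcache0");
		return 1;
	}

	if (ioctl(fd, BCACHE_GET_WRITE_STATUS, &get)) {
		perror("BCACHE_GET_WRITE_STATUS");
		close(fd);
		return 1;
	}
	printf("available %u%%, writeback %u sectors/s, front %u io/s\n",
	       get.available, get.writeback_sector_size_per_sec,
	       get.front_io_num_per_sec);

	if (get.available < 10) {
		/* cache nearly full: throttle front writes, speed up writeback */
		memset(&set, 0, sizeof(set));
		set.traffic_policy_start = true;
		set.write_token_sector_size = 2048;	/* example: 1 MiB/s of 512B sectors */
		set.write_token_io_num = 64;		/* example IO budget per second */
		set.writeback_state = 1;		/* WRITEBACK_QUICK */
		if (ioctl(fd, BCACHE_SET_WRITE_STATUS, &set))
			perror("BCACHE_SET_WRITE_STATUS");
	}

	close(fd);
	return 0;
}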
Signed-off-by: qinghaixiang <xuweiqhx@163.com>
Signed-off-by: Xu Wei <xuwei56@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Li Ruilin <liruilin4@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/md/bcache/bcache.h | 49 ++++++++++++
 drivers/md/bcache/btree.h | 6 +-
 drivers/md/bcache/request.c | 143 +++++++++++++++++++++++++++++++++-
 drivers/md/bcache/request.h | 2 +
 drivers/md/bcache/super.c | 35 +++++++++
 drivers/md/bcache/sysfs.c | 56 +++++++++++++
 drivers/md/bcache/writeback.c | 11 ++-
 drivers/md/bcache/writeback.h | 6 +-
 8 files changed, 300 insertions(+), 8 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 99d12fce876b2..70fbde8ca70c9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -399,6 +399,28 @@ struct cached_dev { unsigned int offline_seconds;
char backing_dev_name[BDEVNAME_SIZE]; + + /* Count the front and writeback io bandwidth per second */ + atomic_t writeback_sector_size; + atomic_t writeback_io_num; + atomic_t front_io_num; + unsigned int writeback_sector_size_per_sec; + unsigned int writeback_io_num_per_sec; + unsigned int front_io_num_per_sec; + struct timer_list io_stat_timer; + + unsigned int writeback_state; +#define WRITEBACK_DEFAULT 0 +#define WRITEBACK_QUICK 1 +#define WRITEBACK_SLOW 2 + + /* realize for token bucket */ + spinlock_t token_lock; + unsigned int max_sector_size; + unsigned int max_io_num; + unsigned int write_token_sector_size; + unsigned int write_token_io_num; + struct timer_list token_assign_timer; };
enum alloc_reserve { @@ -717,6 +739,10 @@ struct cache_set {
#define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; + unsigned int cutoff_writeback_sync; + bool traffic_policy_start; + bool force_write_through; + unsigned int gc_sectors; };
struct bbio { @@ -732,6 +758,29 @@ struct bbio { struct bio bio; };
+struct get_bcache_status { + unsigned int writeback_sector_size_per_sec; + unsigned int writeback_io_num_per_sec; + unsigned int front_io_num_per_sec; + uint64_t dirty_rate; + unsigned int available; +}; + +struct set_bcache_status { + unsigned int write_token_sector_size; + unsigned int write_token_io_num; + bool traffic_policy_start; + bool force_write_through; + bool copy_gc_enabled; + bool trigger_gc; + unsigned int writeback_state; + unsigned int gc_sectors; + unsigned int cutoff_writeback_sync; +}; +#define BCACHE_MAJOR 'B' +#define BCACHE_GET_WRITE_STATUS _IOR(BCACHE_MAJOR, 0x0, struct get_bcache_status) +#define BCACHE_SET_WRITE_STATUS _IOW(BCACHE_MAJOR, 0x1, struct set_bcache_status) + #define BTREE_PRIO USHRT_MAX #define INITIAL_PRIO 32768U
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 4d0cca145f699..7ddadcc485ea6 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -193,7 +193,11 @@ static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
static inline void set_gc_sectors(struct cache_set *c) { - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); + if (c->gc_sectors == 0) + atomic_set(&c->sectors_to_gc, + c->sb.bucket_size * c->nbuckets / 16); + else + atomic_set(&c->sectors_to_gc, c->gc_sectors); }
void bkey_put(struct cache_set *c, struct bkey *k); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 6d89e56a4a410..c05544e07722e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -28,6 +28,7 @@ struct kmem_cache *bch_search_cache;
static void bch_data_insert_start(struct closure *cl); +static void alloc_token(struct cached_dev *dc, unsigned int sectors);
static unsigned int cache_mode(struct cached_dev *dc) { @@ -396,7 +397,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip;
if (mode == CACHE_MODE_NONE || - (mode == CACHE_MODE_WRITEAROUND && + ((mode == CACHE_MODE_WRITEAROUND || + c->force_write_through == true) && op_is_write(bio_op(bio)))) goto skip;
@@ -858,6 +860,10 @@ static void cached_dev_read_done(struct closure *cl) if (s->iop.bio && (!dc->read_bypass || s->prefetch) && !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { BUG_ON(!s->iop.replace); + if ((dc->disk.c->traffic_policy_start == true) && + (dc->disk.c->force_write_through != true)) { + alloc_token(dc, bio_sectors(s->iop.bio)); + } closure_call(&s->iop.cl, bch_data_insert, NULL, cl); }
@@ -1000,6 +1006,35 @@ static void cached_dev_write_complete(struct closure *cl) continue_at(cl, cached_dev_bio_complete, NULL); }
+static void alloc_token(struct cached_dev *dc, unsigned int sectors) +{ + int count = 0; + + spin_lock_bh(&dc->token_lock); + + while ((dc->write_token_sector_size < sectors) && + (dc->write_token_io_num == 0)) { + spin_unlock_bh(&dc->token_lock); + schedule_timeout_interruptible(msecs_to_jiffies(10)); + count++; + if ((dc->disk.c->traffic_policy_start != true) || + (cache_mode(dc) != CACHE_MODE_WRITEBACK) || + (count > 100)) + return; + spin_lock_bh(&dc->token_lock); + } + + if (dc->write_token_sector_size >= sectors) + dc->write_token_sector_size -= sectors; + else + dc->write_token_sector_size = 0; + + if (dc->write_token_io_num > 0) + dc->write_token_io_num--; + + spin_unlock_bh(&dc->token_lock); +} + static void cached_dev_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; @@ -1247,6 +1282,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, cached_dev_nodata, bcache_wq); } else { + atomic_inc(&dc->front_io_num); s->iop.bypass = check_should_bypass(dc, bio);
if (!s->iop.bypass && bio->bi_iter.bi_size && !rw) { @@ -1258,10 +1294,17 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, save_circ_item(&s->smp); }
- if (rw) + if (rw) { + if ((s->iop.bypass == false) && + (dc->disk.c->traffic_policy_start == true) && + (cache_mode(dc) == CACHE_MODE_WRITEBACK) && + (bio_op(bio) != REQ_OP_DISCARD)) { + alloc_token(dc, bio_sectors(bio)); + } cached_dev_write(dc, s); - else + } else { cached_dev_read(dc, s); + } } } else /* I/O request sent to backing device */ @@ -1270,6 +1313,65 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, return BLK_QC_T_NONE; }
+static int bcache_get_write_status(struct cached_dev *dc, unsigned long arg) +{ + struct get_bcache_status a; + uint64_t cache_sectors; + struct cache_set *c = dc->disk.c; + + if (c == NULL) + return -ENODEV; + + a.writeback_sector_size_per_sec = dc->writeback_sector_size_per_sec; + a.writeback_io_num_per_sec = dc->writeback_io_num_per_sec; + a.front_io_num_per_sec = dc->front_io_num_per_sec; + cache_sectors = c->nbuckets * c->sb.bucket_size - + atomic_long_read(&c->flash_dev_dirty_sectors); + a.dirty_rate = div64_u64(bcache_dev_sectors_dirty(&dc->disk) * 100, + cache_sectors); + a.available = 100 - c->gc_stats.in_use; + if (copy_to_user((struct get_bcache_status *)arg, &a, + sizeof(struct get_bcache_status))) + return -EFAULT; + return 0; +} + +static int bcache_set_write_status(struct cached_dev *dc, unsigned long arg) +{ + struct set_bcache_status a; + struct cache_set *c = dc->disk.c; + + if (c == NULL) + return -ENODEV; + if (copy_from_user(&a, (struct set_bcache_status *)arg, + sizeof(struct set_bcache_status))) + return -EFAULT; + + if (c->traffic_policy_start != a.traffic_policy_start) + pr_info("%s traffic policy %s", dc->disk.disk->disk_name, + (a.traffic_policy_start == true) ? "enable" : "disable"); + if (c->force_write_through != a.force_write_through) + pr_info("%s force write through %s", dc->disk.disk->disk_name, + (a.force_write_through == true) ? "enable" : "disable"); + if (a.trigger_gc) { + pr_info("trigger %s gc", dc->disk.disk->disk_name); + atomic_set(&c->sectors_to_gc, -1); + wake_up_gc(c); + } + if ((a.cutoff_writeback_sync >= MIN_CUTOFF_WRITEBACK_SYNC) && + (a.cutoff_writeback_sync <= MAX_CUTOFF_WRITEBACK_SYNC)) { + c->cutoff_writeback_sync = a.cutoff_writeback_sync; + } + + dc->max_sector_size = a.write_token_sector_size; + dc->max_io_num = a.write_token_io_num; + c->traffic_policy_start = a.traffic_policy_start; + c->force_write_through = a.force_write_through; + c->gc_sectors = a.gc_sectors; + dc->writeback_state = a.writeback_state; + return 0; +} + static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1278,7 +1380,14 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, if (dc->io_disable) return -EIO;
- return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); + switch (cmd) { + case BCACHE_GET_WRITE_STATUS: + return bcache_get_write_status(dc, arg); + case BCACHE_SET_WRITE_STATUS: + return bcache_set_write_status(dc, arg); + default: + return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); + } }
static int cached_dev_congested(void *data, int bits) @@ -1438,3 +1547,29 @@ int __init bch_request_init(void)
return 0; } + +static void token_assign(struct timer_list *t) +{ + struct cached_dev *dc = from_timer(dc, t, token_assign_timer); + + dc->token_assign_timer.expires = jiffies + HZ / 8; + add_timer(&dc->token_assign_timer); + + spin_lock(&dc->token_lock); + dc->write_token_sector_size = dc->max_sector_size / 8; + dc->write_token_io_num = dc->max_io_num / 8; + dc->write_token_io_num = + (dc->write_token_io_num == 0) ? 1 : dc->write_token_io_num; + spin_unlock(&dc->token_lock); +} + +void bch_traffic_policy_init(struct cached_dev *dc) +{ + spin_lock_init(&dc->token_lock); + dc->write_token_sector_size = 0; + dc->write_token_io_num = 0; + + timer_setup(&dc->token_assign_timer, token_assign, 0); + dc->token_assign_timer.expires = jiffies + HZ / 8; + add_timer(&dc->token_assign_timer); +} diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 3667bc5390dfe..f677ba8704940 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -41,6 +41,8 @@ void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); void bch_flash_dev_request_init(struct bcache_device *d);
+void bch_traffic_policy_init(struct cached_dev *dc); + extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
struct search { diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e7f7a0f038682..3f858de9e9602 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1210,6 +1210,8 @@ static void cached_dev_free(struct closure *cl) { struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+ del_timer_sync(&dc->io_stat_timer); + del_timer_sync(&dc->token_assign_timer); if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc);
@@ -1250,6 +1252,36 @@ static void cached_dev_flush(struct closure *cl) continue_at(cl, cached_dev_free, system_wq); }
+static void cached_dev_io_stat(struct timer_list *t) +{ + struct cached_dev *dc = from_timer(dc, t, io_stat_timer); + + dc->io_stat_timer.expires = jiffies + HZ; + add_timer(&dc->io_stat_timer); + + dc->writeback_sector_size_per_sec = + atomic_read(&dc->writeback_sector_size); + dc->writeback_io_num_per_sec = atomic_read(&dc->writeback_io_num); + dc->front_io_num_per_sec = atomic_read(&dc->front_io_num); + atomic_set(&dc->writeback_sector_size, 0); + atomic_set(&dc->writeback_io_num, 0); + atomic_set(&dc->front_io_num, 0); +} + +static void cached_dev_timer_init(struct cached_dev *dc) +{ + dc->writeback_sector_size_per_sec = 0; + dc->writeback_io_num_per_sec = 0; + dc->front_io_num_per_sec = 0; + atomic_set(&dc->writeback_sector_size, 0); + atomic_set(&dc->writeback_io_num, 0); + atomic_set(&dc->front_io_num, 0); + + timer_setup(&dc->io_stat_timer, cached_dev_io_stat, 0); + dc->io_stat_timer.expires = jiffies + HZ; + add_timer(&dc->io_stat_timer); +} + static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) { int ret; @@ -1266,6 +1298,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) INIT_LIST_HEAD(&dc->io_lru); spin_lock_init(&dc->io_lock); bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); + cached_dev_timer_init(dc); + bch_traffic_policy_init(dc);
dc->sequential_cutoff = 4 << 20; dc->inflight_block_enable = 1; @@ -1774,6 +1808,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = DEFAULT_IO_ERROR_LIMIT; + c->cutoff_writeback_sync = MIN_CUTOFF_WRITEBACK_SYNC; WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
return c; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 706d3a245dba6..4c693ac29b0e0 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -51,6 +51,13 @@ static const char * const error_actions[] = { NULL };
+static const char * const writeback_state[] = { + "default", + "quick", + "slow", + NULL +}; + write_attribute(attach); write_attribute(detach); write_attribute(unregister); @@ -96,6 +103,9 @@ read_attribute(io_errors); read_attribute(congested); rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); +rw_attribute(gc_sectors); +rw_attribute(traffic_policy_start); +rw_attribute(force_write_through);
rw_attribute(sequential_cutoff); rw_attribute(read_bypass); @@ -114,7 +124,13 @@ rw_attribute(writeback_rate_update_seconds); rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); rw_attribute(writeback_rate_minimum); +rw_attribute(writeback_state); +read_attribute(writeback_sector_size_per_sec); +read_attribute(writeback_io_num_per_sec); +read_attribute(front_io_num_per_sec); read_attribute(writeback_rate_debug); +read_attribute(write_token_sector_size); +read_attribute(write_token_io_num);
read_attribute(stripe_size); read_attribute(partial_stripes_expensive); @@ -169,6 +185,11 @@ SHOW(__bch_cached_dev) bch_cache_modes, BDEV_CACHE_MODE(&dc->sb));
+ if (attr == &sysfs_writeback_state) + return bch_snprint_string_list(buf, PAGE_SIZE, + writeback_state, + dc->writeback_state); + if (attr == &sysfs_readahead_cache_policy) return bch_snprint_string_list(buf, PAGE_SIZE, bch_reada_cache_policies, @@ -186,6 +207,9 @@ SHOW(__bch_cached_dev) var_printf(writeback_metadata, "%i"); var_printf(writeback_running, "%i"); var_print(writeback_delay); + var_print(writeback_sector_size_per_sec); + var_print(writeback_io_num_per_sec); + var_print(front_io_num_per_sec); var_print(writeback_percent); sysfs_hprint(writeback_rate, wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); @@ -248,6 +272,8 @@ SHOW(__bch_cached_dev)
sysfs_print(running, atomic_read(&dc->running)); sysfs_print(state, states[BDEV_STATE(&dc->sb)]); + var_print(write_token_sector_size); + var_print(write_token_io_num);
if (attr == &sysfs_label) { memcpy(buf, dc->sb.label, SB_LABEL_SIZE); @@ -346,6 +372,15 @@ STORE(__cached_dev) } }
+ if (attr == &sysfs_writeback_state) { + v = __sysfs_match_string(writeback_state, -1, buf); + + if (v < 0) + return v; + + dc->writeback_state = v; + } + if (attr == &sysfs_readahead_cache_policy) { v = __sysfs_match_string(bch_reada_cache_policies, -1, buf); if (v < 0) @@ -448,11 +483,14 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_data_csum, #endif &sysfs_cache_mode, + &sysfs_writeback_state, &sysfs_readahead_cache_policy, &sysfs_stop_when_cache_set_failed, &sysfs_writeback_metadata, &sysfs_writeback_running, &sysfs_writeback_delay, + &sysfs_writeback_sector_size_per_sec, + &sysfs_writeback_io_num_per_sec, &sysfs_writeback_percent, &sysfs_writeback_rate, &sysfs_writeback_rate_update_seconds, @@ -460,6 +498,9 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_minimum, &sysfs_writeback_rate_debug, + &sysfs_write_token_sector_size, + &sysfs_write_token_io_num, + &sysfs_front_io_num_per_sec, &sysfs_io_errors, &sysfs_io_error_limit, &sysfs_io_disable, @@ -714,6 +755,12 @@ SHOW(__bch_cache_set) c->congested_read_threshold_us); sysfs_print(congested_write_threshold_us, c->congested_write_threshold_us); + sysfs_print(gc_sectors, + c->gc_sectors); + sysfs_print(traffic_policy_start, + c->traffic_policy_start); + sysfs_print(force_write_through, + c->force_write_through);
sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); @@ -800,6 +847,12 @@ STORE(__bch_cache_set) c->congested_read_threshold_us); sysfs_strtoul(congested_write_threshold_us, c->congested_write_threshold_us); + sysfs_strtoul(gc_sectors, + c->gc_sectors); + sysfs_strtoul(traffic_policy_start, + c->traffic_policy_start); + sysfs_strtoul(force_write_through, + c->force_write_through);
if (attr == &sysfs_errors) { v = __sysfs_match_string(error_actions, -1, buf); @@ -926,6 +979,9 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, &sysfs_io_disable, + &sysfs_gc_sectors, + &sysfs_traffic_policy_start, + &sysfs_force_write_through, NULL }; KTYPE(bch_cache_set_internal); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b5fc3c6c7178e..901ad8bae7614 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -222,7 +222,13 @@ static unsigned int writeback_delay(struct cached_dev *dc, !dc->writeback_percent) return 0;
- return bch_next_delay(&dc->writeback_rate, sectors); + if (dc->writeback_state == WRITEBACK_DEFAULT) { + return bch_next_delay(&dc->writeback_rate, sectors); + } else if (dc->writeback_state == WRITEBACK_QUICK) { + return 0; + } else { + return msecs_to_jiffies(1000); + } }
struct dirty_io { @@ -287,6 +293,9 @@ static void write_dirty_finish(struct closure *cl) : &dc->disk.c->writeback_keys_done); }
+ atomic_add(KEY_SIZE(&w->key), &dc->writeback_sector_size); + atomic_inc(&dc->writeback_io_num); + bch_keybuf_del(&dc->writeback_keys, w); up(&dc->in_flight);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index e75dc33339f6f..a3151c0e96609 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -3,7 +3,8 @@ #define _BCACHE_WRITEBACK_H
#define CUTOFF_WRITEBACK 40 -#define CUTOFF_WRITEBACK_SYNC 70 +#define MIN_CUTOFF_WRITEBACK_SYNC 70 +#define MAX_CUTOFF_WRITEBACK_SYNC 90
#define MAX_WRITEBACKS_IN_PASS 5 #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ @@ -57,10 +58,11 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { unsigned int in_use = dc->disk.c->gc_stats.in_use; + unsigned int cutoff = dc->disk.c->cutoff_writeback_sync;
if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - in_use > CUTOFF_WRITEBACK_SYNC) + in_use > cutoff) return false;
if (bio_op(bio) == REQ_OP_DISCARD)
From: Xu Wei <xuwei56@huawei.com>
euleros inclusion
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=327
CVE: NA
When GC runs, bcache moves all data in a bucket, both clean and dirty. This causes large write amplification, which may shorten the cache device's life. This patch provides a switch that lets GC move only dirty data, which reduces write amplification.
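As a rough illustration of the selection change, the self-contained sketch below mimics how moving GC ranks buckets when the new gc_only_dirty_data switch is on: buckets are weighed by their dirty sectors (the new GC_DIRTY_SECTORS field) instead of total used sectors, so clean-only buckets are skipped and not rewritten. It deliberately leaves out the heap ordering and reserve accounting of the real bch_moving_gc(); the struct layout and numbers are made up for the example.

/*
 * Simplified sketch of the bucket selection change, not the kernel's real
 * bucket layout: with only_dirty set, a bucket with no dirty sectors is
 * never picked, so clean data stays in place and is not copied by GC.
 */
#include <stdbool.h>
#include <stdio.h>

struct bucket {
	unsigned int sectors_used;   /* clean + dirty sectors in the bucket */
	unsigned int dirty_sectors;  /* dirty sectors only (new GC_DIRTY_SECTORS) */
};

static unsigned int bucket_sectors(const struct bucket *b, bool only_dirty)
{
	return only_dirty ? b->dirty_sectors : b->sectors_used;
}

static void pick_buckets(const struct bucket *buckets, int n, bool only_dirty)
{
	for (int i = 0; i < n; i++) {
		/* mirrors the skip condition in bch_moving_gc(): nothing
		 * worth moving in this mode, so the bucket is left alone */
		if (!bucket_sectors(&buckets[i], only_dirty))
			continue;
		printf("bucket %d: move %u sectors\n", i,
		       bucket_sectors(&buckets[i], only_dirty));
	}
}

int main(void)
{
	struct bucket buckets[] = {
		{ .sectors_used = 1024, .dirty_sectors = 0 },   /* clean only */
		{ .sectors_used = 512,  .dirty_sectors = 256 }, /* mixed      */
		{ .sectors_used = 128,  .dirty_sectors = 128 }, /* dirty only */
	};

	printf("default gc (moves clean + dirty):\n");
	pick_buckets(buckets, 3, false);
	printf("gc_only_dirty_data=1 (moves dirty only):\n");
	pick_buckets(buckets, 3, true);
	return 0;
}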
Signed-off-by: qinghaixiang <xuweiqhx@163.com>
Signed-off-by: Xu Wei <xuwei56@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Li Ruilin <liruilin4@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/md/bcache/alloc.c | 1 +
 drivers/md/bcache/bcache.h | 4 ++-
 drivers/md/bcache/btree.c | 13 +++++---
 drivers/md/bcache/btree.h | 2 +-
 drivers/md/bcache/movinggc.c | 57 +++++++++++++++++++++++++-----------
 drivers/md/bcache/sysfs.c | 4 +++
 6 files changed, 58 insertions(+), 23 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 46794cac167e7..a6ce0636f3237 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -471,6 +471,7 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); + SET_GC_DIRTY_SECTORS(b, 0);
if (ca->set->avail_nbuckets < ca->set->nbuckets) { ca->set->avail_nbuckets++; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 70fbde8ca70c9..76d5026c924a4 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -202,7 +202,7 @@ struct bucket { uint16_t prio; uint8_t gen; uint8_t last_gc; /* Most out of date gen in the btree */ - uint16_t gc_mark; /* Bitfield used by GC. See below for field */ + uint32_t gc_mark; /* Bitfield used by GC. See below for field */ };
/* @@ -218,6 +218,7 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); +BITMASK(GC_DIRTY_SECTORS, struct bucket, gc_mark, 16, GC_SECTORS_USED_SIZE);
#include "journal.h" #include "stats.h" @@ -736,6 +737,7 @@ struct cache_set { unsigned int gc_always_rewrite:1; unsigned int shrinker_disabled:1; unsigned int copy_gc_enabled:1; + unsigned int gc_only_dirty_data:1;
#define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 313a2f76213fd..f7c76efc97cdc 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1257,12 +1257,16 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, c, "inconsistent ptrs: mark = %llu, level = %i", GC_MARK(g), level);
- if (level) + if (level) { SET_GC_MARK(g, GC_MARK_METADATA); - else if (KEY_DIRTY(k)) + } else if (KEY_DIRTY(k)) { SET_GC_MARK(g, GC_MARK_DIRTY); - else if (!GC_MARK(g)) + SET_GC_DIRTY_SECTORS(g, min_t(unsigned int, + GC_DIRTY_SECTORS(g) + KEY_SIZE(k), + MAX_GC_SECTORS_USED)); + } else if (!GC_MARK(g)) { SET_GC_MARK(g, GC_MARK_RECLAIMABLE); + }
/* guard against overflow */ SET_GC_SECTORS_USED(g, min_t(unsigned int, @@ -1746,6 +1750,7 @@ static void btree_gc_start(struct cache_set *c) if (!atomic_read(&b->pin)) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); + SET_GC_DIRTY_SECTORS(b, 0); } }
@@ -1860,7 +1865,7 @@ static void bch_btree_gc(struct cache_set *c)
trace_bcache_gc_end(c);
- bch_moving_gc(c); + bch_moving_gc(c, c->gc_only_dirty_data); }
static bool gc_should_run(struct cache_set *c) diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 7ddadcc485ea6..8bcca2beca986 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -263,7 +263,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys,
int bch_gc_thread_start(struct cache_set *c); void bch_initial_gc_finish(struct cache_set *c); -void bch_moving_gc(struct cache_set *c); +void bch_moving_gc(struct cache_set *c, bool only_move_dirty); int bch_btree_check(struct cache_set *c); void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 7891fb512736d..749422b927394 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -123,7 +123,7 @@ static void read_moving_submit(struct closure *cl) continue_at(cl, write_moving, io->op.wq); }
-static void read_moving(struct cache_set *c) +static void read_moving(struct cache_set *c, bool only_move_dirty) { struct keybuf_key *w; struct moving_io *io; @@ -140,7 +140,8 @@ static void read_moving(struct cache_set *c) if (!w) break;
- if (ptr_stale(c, &w->key, 0)) { + if (ptr_stale(c, &w->key, 0) || + (only_move_dirty && (!KEY_DIRTY(&w->key)))) { bch_keybuf_del(&c->moving_gc_keys, w); continue; } @@ -187,22 +188,43 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r) return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); }
-static unsigned int bucket_heap_top(struct cache *ca) +static bool bucket_cmp_dirty(struct bucket *l, struct bucket *r) +{ + return GC_DIRTY_SECTORS(l) < GC_DIRTY_SECTORS(r); +} + +static unsigned int bucket_heap_top(struct cache *ca, bool only_dirty) { struct bucket *b; + if (only_dirty) + return (b = heap_peek(&ca->heap)) ? GC_DIRTY_SECTORS(b) : 0; + else + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; +}
- return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; +static unsigned bucket_sectors(struct bucket *b, bool only_dirty) +{ + if (only_dirty) + return GC_DIRTY_SECTORS(b); + else + return GC_SECTORS_USED(b); }
-void bch_moving_gc(struct cache_set *c) +void bch_moving_gc(struct cache_set *c, bool only_move_dirty) { struct cache *ca; struct bucket *b; unsigned int i; + bool (*cmp)(struct bucket*, struct bucket*);
if (!c->copy_gc_enabled) return;
+ if (only_move_dirty) + cmp = &bucket_cmp_dirty; + else + cmp = &bucket_cmp; + mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i) { @@ -214,29 +236,30 @@ void bch_moving_gc(struct cache_set *c)
for_each_bucket(b, ca) { if (GC_MARK(b) == GC_MARK_METADATA || - !GC_SECTORS_USED(b) || - GC_SECTORS_USED(b) == ca->sb.bucket_size || + !bucket_sectors(b, only_move_dirty) || + ((!only_move_dirty) && + (GC_SECTORS_USED(b) == ca->sb.bucket_size)) || atomic_read(&b->pin)) continue;
if (!heap_full(&ca->heap)) { - sectors_to_move += GC_SECTORS_USED(b); - heap_add(&ca->heap, b, bucket_cmp); - } else if (bucket_cmp(b, heap_peek(&ca->heap))) { - sectors_to_move -= bucket_heap_top(ca); - sectors_to_move += GC_SECTORS_USED(b); + sectors_to_move += bucket_sectors(b, only_move_dirty); + heap_add(&ca->heap, b, (*cmp)); + } else if ((*cmp)(b, heap_peek(&ca->heap))) { + sectors_to_move -= bucket_heap_top(ca, only_move_dirty); + sectors_to_move += bucket_sectors(b, only_move_dirty);
ca->heap.data[0] = b; - heap_sift(&ca->heap, 0, bucket_cmp); + heap_sift(&ca->heap, 0, (*cmp)); } }
while (sectors_to_move > reserve_sectors) { - heap_pop(&ca->heap, b, bucket_cmp); - sectors_to_move -= GC_SECTORS_USED(b); + heap_pop(&ca->heap, b, (*cmp)); + sectors_to_move -= bucket_sectors(b, only_move_dirty); }
- while (heap_pop(&ca->heap, b, bucket_cmp)) + while (heap_pop(&ca->heap, b, (*cmp))) SET_GC_MOVE(b, 1); }
@@ -244,7 +267,7 @@ void bch_moving_gc(struct cache_set *c)
c->moving_gc_keys.last_scanned = ZERO_KEY;
- read_moving(c); + read_moving(c, only_move_dirty); }
void bch_moving_init_cache_set(struct cache_set *c) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4c693ac29b0e0..178a66455481e 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -153,6 +153,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(gc_only_dirty_data); rw_attribute(size);
static ssize_t bch_snprint_string_list(char *buf, @@ -770,6 +771,7 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(gc_only_dirty_data, "%i", c->gc_only_dirty_data); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -898,6 +900,7 @@ STORE(__bch_cache_set) sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul(gc_only_dirty_data, c->gc_only_dirty_data);
return size; } @@ -978,6 +981,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_gc_only_dirty_data, &sysfs_io_disable, &sysfs_gc_sectors, &sysfs_traffic_policy_start,
From: Xu Wei <xuwei56@huawei.com>
euleros inclusion
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=327
CVE: NA
When the kernel config does not enable CONFIG_BCACHE, compiling the bcache module fails. This patch adds checks on the CONFIG_BCACHE macro to make sure the bcache module compiles successfully.
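A minimal, self-contained illustration of the compile-out pattern (with made-up names; the kernel's real IS_ENABLED() is more elaborate and also handles modules): when the option is disabled, both the config-gated field and the code touching it disappear at preprocessing time, which is how the added #if IS_ENABLED(CONFIG_BCACHE) guards avoid referencing the task_struct sequential_io fields that only exist when bcache is enabled.

/*
 * Standalone sketch of the guard pattern used in this patch. MY_CONFIG_FOO,
 * MY_IS_ENABLED() and struct stats are made-up stand-ins, not kernel APIs.
 */
#include <stdio.h>

#define MY_CONFIG_FOO 1			/* flip to 0 to compile the guarded code out */
#define MY_IS_ENABLED(option) (option)	/* simplified stand-in for the kernel macro */

struct stats {
#if MY_IS_ENABLED(MY_CONFIG_FOO)
	unsigned long sequential_io;	/* only exists when the option is enabled */
#endif
	unsigned long total_io;
};

int main(void)
{
	struct stats s = { .total_io = 8 };

#if MY_IS_ENABLED(MY_CONFIG_FOO)
	s.sequential_io = 4;		/* guarded access, removed when disabled */
	printf("sequential_io = %lu\n", s.sequential_io);
#endif
	printf("total_io = %lu\n", s.total_io);
	return 0;
}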
Signed-off-by: qinghaixiang <xuweiqhx@163.com>
Signed-off-by: Xu Wei <xuwei56@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Li Ruilin <liruilin4@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/md/bcache/request.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index c05544e07722e..d9c92ed52d49b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -370,6 +370,7 @@ unsigned int bch_get_congested(struct cache_set *c) return i > 0 ? i : 1; }
+#if IS_ENABLED(CONFIG_BCACHE) static void add_sequential(struct task_struct *t) { ewma_add(t->sequential_io_avg, @@ -377,6 +378,7 @@ static void add_sequential(struct task_struct *t)
t->sequential_io = 0; } +#endif
static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) { @@ -388,7 +390,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) struct cache_set *c = dc->disk.c; unsigned int mode = cache_mode(dc); unsigned int sectors, congested = bch_get_congested(c); +#if IS_ENABLED(CONFIG_BCACHE) struct task_struct *task = current; +#endif struct io *i;
if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || @@ -443,7 +447,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
i = list_first_entry(&dc->io_lru, struct io, lru);
+#if IS_ENABLED(CONFIG_BCACHE) add_sequential(task); +#endif i->sequential = 0; found: if (i->sequential + bio->bi_iter.bi_size > i->sequential) @@ -451,7 +457,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
i->last = bio_end_sector(bio); i->jiffies = jiffies + msecs_to_jiffies(5000); +#if IS_ENABLED(CONFIG_BCACHE) task->sequential_io = i->sequential; +#endif
hlist_del(&i->hash); hlist_add_head(&i->hash, iohash(dc, i->last)); @@ -459,8 +467,12 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
spin_unlock(&dc->io_lock);
+#if IS_ENABLED(CONFIG_BCACHE) sectors = max(task->sequential_io, task->sequential_io_avg) >> 9; +#else + sectors = i->sequential >> 9; +#endif
if (dc->sequential_cutoff && sectors >= dc->sequential_cutoff >> 9) {
From: Coly Li <colyli@suse.de>
mainline inclusion
from mainline-v5.7-rc1
commit 8e7102273f597dbb38af43da874f8c123f8e6dbe
category: performance
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=327
CVE: NA
When registering a cache device, bch_btree_check() is called to check all btree nodes, to make sure the btree is consistent and not corrupted.
bch_btree_check() is executed recursively in a single thread. When a lot of data is cached and the btree is huge, it may take a very long time to check all the btree nodes. In my testing, I observed it took around 50 minutes to finish bch_btree_check().
When the bcache btree nodes are being checked, the cache set is not running yet and the whole tree is in a read-only state, so it is safe to create multiple threads to check the btree in parallel.
This patch creates multiple threads, and each thread checks, one by one, the sub-trees indexed by keys fetched from the btree root node. The number of parallel threads depends on how many keys are in the btree root node: at most BCH_BTR_CHKTHREAD_MAX (64) threads can be created, but in practice it should be min(cpu-number/2, root-node-keys-number).
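The sketch below is a simplified userspace analogue (pthreads instead of kthreads, without the wait queue, the "enough" flag, or error propagation) of the work-distribution pattern described above: each worker thread claims the next root-node key index under a lock and checks the sub-tree below it, with the thread count capped at min(cpu-number/2, BCH_BTR_CHKTHREAD_MAX). The fixed key count and the check_subtree() body are placeholders, not the kernel code.

/*
 * Userspace analogue of the parallel btree check: threads pull root-key
 * indices from a shared counter so each sub-tree is checked exactly once.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define BCH_BTR_CHKTHREAD_MAX 64
#define ROOT_KEYS 10                     /* placeholder number of root-node keys */

static pthread_mutex_t idx_lock = PTHREAD_MUTEX_INITIALIZER;
static int key_idx;                      /* next root key to hand out */

static void check_subtree(int key)       /* placeholder for btree(check_recurse, ...) */
{
	printf("thread %lu checks sub-tree under root key %d\n",
	       (unsigned long)pthread_self(), key);
}

static void *checker(void *arg)
{
	(void)arg;

	for (;;) {
		int cur;

		pthread_mutex_lock(&idx_lock);
		cur = key_idx++;             /* claim the next unchecked key index */
		pthread_mutex_unlock(&idx_lock);

		if (cur >= ROOT_KEYS)        /* no more keys: this thread is done */
			return NULL;
		check_subtree(cur);
	}
}

int main(void)
{
	long ncpu = sysconf(_SC_NPROCESSORS_ONLN);
	int nthreads = (int)(ncpu / 2);
	pthread_t tids[BCH_BTR_CHKTHREAD_MAX];

	if (nthreads < 1)
		nthreads = 1;
	if (nthreads > BCH_BTR_CHKTHREAD_MAX)
		nthreads = BCH_BTR_CHKTHREAD_MAX;

	for (int i = 0; i < nthreads; i++)
		pthread_create(&tids[i], NULL, checker, NULL);
	for (int i = 0; i < nthreads; i++)
		pthread_join(tids[i], NULL);
	return 0;
}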
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: qinghaixiang <xuweiqhx@163.com>
Signed-off-by: Xu Wei <xuwei56@huawei.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Li Ruilin <liruilin4@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/md/bcache/btree.c | 224 +++++++++++++++++++++++++++-----------
 drivers/md/bcache/btree.h | 85 +++++++++++++++
 2 files changed, 246 insertions(+), 63 deletions(-)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f7c76efc97cdc..24e70ee342f0d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -101,65 +101,6 @@
#define insert_lock(s, b) ((b)->level <= (s)->lock)
-/* - * These macros are for recursing down the btree - they handle the details of - * locking and looking up nodes in the cache for you. They're best treated as - * mere syntax when reading code that uses them. - * - * op->lock determines whether we take a read or a write lock at a given depth. - * If you've got a read lock and find that you need a write lock (i.e. you're - * going to have to split), set op->lock and return -EINTR; btree_root() will - * call you again and you'll have the correct lock. - */ - -/** - * btree - recurse down the btree on a specified key - * @fn: function to call, which will be passed the child node - * @key: key to recurse on - * @b: parent btree node - * @op: pointer to struct btree_op - */ -#define btree(fn, key, b, op, ...) \ -({ \ - int _r, l = (b)->level - 1; \ - bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ - _w, b); \ - if (!IS_ERR(_child)) { \ - _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ - rw_unlock(_w, _child); \ - } else \ - _r = PTR_ERR(_child); \ - _r; \ -}) - -/** - * btree_root - call a function on the root of the btree - * @fn: function to call, which will be passed the child node - * @c: cache set - * @op: pointer to struct btree_op - */ -#define btree_root(fn, c, op, ...) \ -({ \ - int _r = -EINTR; \ - do { \ - struct btree *_b = (c)->root; \ - bool _w = insert_lock(op, _b); \ - rw_lock(_w, _b, _b->level); \ - if (_b == (c)->root && \ - _w == insert_lock(op, _b)) { \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - } \ - rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c); \ - if (_r == -EINTR) \ - schedule(); \ - } while (_r == -EINTR); \ - \ - finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ - _r; \ -}) - static inline struct bset *write_block(struct btree *b) { return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); @@ -1949,13 +1890,170 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) return ret; }
+static int bch_btree_check_thread(void *arg) +{ + int ret; + struct btree_check_info *info = arg; + struct btree_check_state *check_state = info->state; + struct cache_set *c = check_state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + + k = p = NULL; + cur_idx = prev_idx = 0; + ret = 0; + + /* root node keys are checked before thread created */ + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + WARN_ON(!k); + + p = k; + while (k) { + /* + * Fetch a root node key index, skip the keys which + * should be fetched by other threads, then check the + * sub-tree indexed by the fetched key. + */ + spin_lock(&check_state->idx_lock); + cur_idx = check_state->key_idx; + check_state->key_idx++; + spin_unlock(&check_state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + /* + * No more keys to check in root node, + * current checking threads are enough, + * stop creating more. + */ + atomic_set(&check_state->enough, 1); + /* Update check_state->enough earlier */ + smp_mb(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + struct btree_op op; + + btree_node_prefetch(c->root, p); + c->gc_stats.nodes++; + bch_btree_op_init(&op, 0); + ret = btree(check_recurse, p, c->root, &op); + if (ret) + goto out; + } + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + info->result = ret; + /* update check_state->started among all CPUs */ + smp_mb(); + if (atomic_dec_and_test(&check_state->started)) + wake_up(&check_state->wait); + + return ret; +} + +static int bch_btree_chkthread_nr(void) +{ + int n = num_online_cpus() / 2; + + if (n == 0) + n = 1; + else if (n > BCH_BTR_CHKTHREAD_MAX) + n = BCH_BTR_CHKTHREAD_MAX; + + return n; +} + int bch_btree_check(struct cache_set *c) { - struct btree_op op; + int ret = 0; + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct btree_check_state *check_state; + char name[32];
- bch_btree_op_init(&op, SHRT_MAX); + /* check and mark root node keys */ + for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(c, c->root->level, k); + + bch_initial_mark_key(c, c->root->level + 1, &c->root->key); + + if (c->root->level == 0) + return 0; + + check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL); + if (!check_state) + return -ENOMEM;
- return btree_root(check_recurse, c, &op); + check_state->c = c; + check_state->total_threads = bch_btree_chkthread_nr(); + check_state->key_idx = 0; + spin_lock_init(&check_state->idx_lock); + atomic_set(&check_state->started, 0); + atomic_set(&check_state->enough, 0); + init_waitqueue_head(&check_state->wait); + + /* + * Run multiple threads to check btree nodes in parallel, + * if check_state->enough is non-zero, it means current + * running check threads are enough, unncessary to create + * more. + */ + for (i = 0; i < check_state->total_threads; i++) { + /* fetch latest check_state->enough earlier */ + smp_mb(); + if (atomic_read(&check_state->enough)) + break; + + check_state->infos[i].result = 0; + check_state->infos[i].state = check_state; + snprintf(name, sizeof(name), "bch_btrchk[%u]", i); + atomic_inc(&check_state->started); + + check_state->infos[i].thread = + kthread_run(bch_btree_check_thread, + &check_state->infos[i], + name); + if (IS_ERR(check_state->infos[i].thread)) { + pr_err("fails to run thread bch_btrchk[%d]\n", i); + for (--i; i >= 0; i--) + kthread_stop(check_state->infos[i].thread); + ret = -ENOMEM; + goto out; + } + } + + wait_event_interruptible(check_state->wait, + atomic_read(&check_state->started) == 0); + + for (i = 0; i < check_state->total_threads; i++) { + if (check_state->infos[i].result) { + ret = check_state->infos[i].result; + goto out; + } + } + +out: + kfree(check_state); + return ret; }
void bch_initial_gc_finish(struct cache_set *c) @@ -2416,7 +2514,7 @@ int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, return btree_root(map_nodes_recurse, c, op, from, fn, flags); }
-static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, struct bkey *from, btree_map_keys_fn *fn, int flags) { diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 8bcca2beca986..42954927abd09 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -222,6 +222,25 @@ struct btree_op { unsigned int insert_collision:1; };
+struct btree_check_state; +struct btree_check_info { + struct btree_check_state *state; + struct task_struct *thread; + int result; +}; + +#define BCH_BTR_CHKTHREAD_MAX 64 +struct btree_check_state { + struct cache_set *c; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct btree_check_info infos[BCH_BTR_CHKTHREAD_MAX]; +}; + static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) { memset(op, 0, sizeof(struct btree_op)); @@ -266,12 +285,78 @@ void bch_initial_gc_finish(struct cache_set *c); void bch_moving_gc(struct cache_set *c, bool only_move_dirty); int bch_btree_check(struct cache_set *c); void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k); +typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, + struct bkey *k); +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags); +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags);
static inline void wake_up_gc(struct cache_set *c) { wake_up(&c->gc_wait); }
+/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ + _w, b); \ + if (!IS_ERR(_child)) { \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -EINTR) \ + schedule(); \ + } while (_r == -EINTR); \ + \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ + _r; \ +}) + #define MAP_DONE 0 #define MAP_CONTINUE 1