Hi openEuler kernel maintainers,
This is the bcache backport from Linux v4.19 to v5.5 for the openEuler-1.0 kernel. All the patches are based on openEuler-1.0 commit cf975ca6a64a ("Document: add guideline to submitting patches to openEuler").
This patch set survived a 2+ hour smoke test on a 2x48-core Taishan machine. Please review and consider picking them.
Thanks in advance.
Coly Li
---
Andrea Righi (1):
  bcache: fix deadlock in bcache_allocator

Christoph Hellwig (2):
  bcache: remove the extra cflags for request.o
  bcache: don't export symbols

Colin Ian King (1):
  bcache: fix indentation issue, remove tabs on a hunk of code

Coly Li (50):
  bcache: fix typo in code comments of closure_return_with_destructor()
  bcache: introduce force_wake_up_gc()
  bcache: option to automatically run gc thread after writeback
  bcache: add MODULE_DESCRIPTION information
  bcache: make cutoff_writeback and cutoff_writeback_sync tunable
  bcache: set writeback_percent in a flexible range
  bcache: not use hard coded memset size in bch_cache_accounting_clear()
  bcache: export backing_dev_name via sysfs
  bcache: export backing_dev_uuid via sysfs
  bcache: fix input integer overflow of congested threshold
  bcache: add sysfs_strtoul_bool() for setting bit-field variables
  bcache: use sysfs_strtoul_bool() to set bit-field variables
  bcache: fix input overflow to writeback_delay
  bcache: fix input overflow to journal_delay_ms
  bcache: fix input overflow to cache set io_error_limit
  bcache: move definition of 'int ret' out of macro read_bucket()
  bcache: add comments for kobj release callback routine
  bcache: add error check for calling register_bdev()
  bcache: Add comments for blkdev_put() in registration code path
  bcache: add comments for closure_fn to be called in closure_queue()
  bcache: improve bcache_reboot()
  bcache: don't set max writeback rate if gc is running
  bcache: fix return value error in bch_journal_read()
  bcache: avoid flushing btree node in cache_set_flush() if io disabled
  bcache: add io error counting in write_bdev_super_endio()
  bcache: remove unnecessary prefetch() in bset_search_tree()
  bcache: add return value check to bch_cached_dev_run()
  bcache: remove unncessary code in bch_btree_keys_init()
  bcache: more detailed error message to bcache_device_link()
  bcache: add more error message in bch_cached_dev_attach()
  bcache: improve error message in bch_cached_dev_run()
  bcache: remove "XXX:" comment line from run_cache_set()
  bcache: make bset_search_tree() be more understandable
  bcache: add pendings_cleanup to stop pending bcache device
  bcache: stop writeback kthread and kworker when bch_cached_dev_run() failed
  bcache: avoid a deadlock in bcache_reboot()
  bcache: acquire bch_register_lock later in cached_dev_detach_finish()
  bcache: add code comments for journal_read_bucket()
  bcache: set largest seq to ja->seq[bucket_index] in journal_read_bucket()
  bcache: shrink btree node cache after bch_btree_check()
  bcache: remove retry_flush_write from struct cache_set
  bcache: performance improvement for btree_flush_write()
  bcache: add reclaimed_journal_buckets to struct cache_set
  bcache: fix static checker warning in bcache_device_free()
  bcache: add more accurate error messages in read_super()
  bcache: deleted code comments for dead code in bch_data_insert_keys()
  bcache: add code comment bch_keylist_pop() and bch_keylist_pop_front()
  bcache: add code comments in bch_btree_leaf_dirty()
  bcache: add idle_max_writeback_rate sysfs interface
  bcache: at least try to shrink 1 node in bch_mca_scan()

Dan Carpenter (1):
  bcache: Fix an error code in bch_dump_read()

Dongbo Cao (3):
  bcache: remove useless parameter of bch_debug_init()
  bcache: split combined if-condition code into separate ones
  bcache: panic fix for making cache device

Geliang Tang (1):
  bcache: use kmemdup_nul for CACHED_LABEL buffer

George Spelvin (1):
  bcache: Clean up bch_get_congested()

Guoju Fang (4):
  bcache: print number of keys in trace_bcache_journal_write
  bcache: fix crashes stopping bcache device before read miss done
  bcache: fix inaccurate result of unused buckets
  bcache: fix a lost wake-up problem caused by mca_cannibalize_lock

Jens Axboe (1):
  bcache: make is_discard_enabled() static

Ming Lei (1):
  bcache: avoid to use bio_for_each_segment_all() in bch_bio_alloc_pages()

Shenghui Wang (7):
  bcache: remove unused bch_passthrough_cache
  bcache: use MAX_CACHES_PER_SET instead of magic number 8 in __bch_bucket_alloc_set
  bcache: add comment for cache_set->fill_iter
  bcache: update comment for bch_data_insert
  bcache: update comment in sysfs.c
  bcache: cannot set writeback_running via sysfs if no writeback kthread created
  bcache: fix wrong usage use-after-freed on keylist in out_nocoalesce branch of btree_gc_coalesce

Shile Zhang (1):
  bcache: add cond_resched() in __bch_cache_cmp()

Wei Yongjun (1):
  bcache: fix possible memory leak in bch_cached_dev_run()
 drivers/md/bcache/Makefile    |   2 -
 drivers/md/bcache/alloc.c     |   7 +-
 drivers/md/bcache/bcache.h    |  30 ++-
 drivers/md/bcache/bset.c      |  78 ++-----
 drivers/md/bcache/btree.c     |  30 ++-
 drivers/md/bcache/btree.h     |  18 ++
 drivers/md/bcache/closure.c   |   7 -
 drivers/md/bcache/closure.h   |   3 +-
 drivers/md/bcache/debug.c     |   7 +-
 drivers/md/bcache/journal.c   | 121 ++++++++---
 drivers/md/bcache/journal.h   |   4 +
 drivers/md/bcache/request.c   |  59 ++---
 drivers/md/bcache/request.h   |   4 +-
 drivers/md/bcache/stats.c     |   2 +-
 drivers/md/bcache/super.c     | 486 +++++++++++++++++++++++++++++++++++-------
 drivers/md/bcache/sysfs.c     | 166 +++++++++++----
 drivers/md/bcache/sysfs.h     |  10 +
 drivers/md/bcache/util.c      |   6 +-
 drivers/md/bcache/util.h      |  26 ++-
 drivers/md/bcache/writeback.c |  34 +++
 drivers/md/bcache/writeback.h |  12 +-
 include/trace/events/bcache.h |  27 ++-
 22 files changed, 861 insertions(+), 278 deletions(-)
mainline inclusion
from mainline-4.20-rc1
commit 4516da427fcf214af7d826273ab275f1d6e56ef0
category: backport
The code comments of closure_return_with_destructor() in closure.h mark the function name as closure_return(). This patch fixes the typo with the correct name, closure_return_with_destructor.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/closure.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index eca0d496b686..c88cdc4ae4ec 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -345,7 +345,8 @@ do {							\
 } while (0)
 
 /**
- * closure_return - finish execution of a closure, with destructor
+ * closure_return_with_destructor - finish execution of a closure,
+ * with destructor
  *
  * Works like closure_return(), except @destructor will be called when all
  * outstanding refs on @cl have been dropped; @destructor may be used to safely
From: Shenghui Wang <shhuiw@foxmail.com>
mainline inclusion
from mainline-4.20-rc1
commit 3fd3c5c02b2865e506bee1212c8423eac0d4cddf
category: backport
struct kmem_cache *bch_passthrough_cache is not used in bcache code. Remove it.
Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/request.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index aa055cfeb099..721bf336ed1a 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -39,6 +39,6 @@ void bch_data_insert(struct closure *cl);
 void bch_cached_dev_request_init(struct cached_dev *dc);
 void bch_flash_dev_request_init(struct bcache_device *d);
 
-extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
+extern struct kmem_cache *bch_search_cache;
 
 #endif /* _BCACHE_REQUEST_H_ */
From: Dongbo Cao <cdbdyx@163.com>
mainline inclusion
from mainline-4.20-rc1
commit 91bafdf081b8ad8ab4977918ee45dffe3d744060
category: backport
Parameter "struct kobject *kobj" in bch_debug_init() is useless, remove it in this patch.
Signed-off-by: Dongbo Cao <cdbdyx@163.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h | 2 +-
 drivers/md/bcache/debug.c  | 2 +-
 drivers/md/bcache/super.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 83f0b91aeb90..d256231d9915 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -1002,7 +1002,7 @@ void bch_open_buckets_free(struct cache_set *c);
 int bch_cache_allocator_start(struct cache *ca);
 
 void bch_debug_exit(void);
-void bch_debug_init(struct kobject *kobj);
+void bch_debug_init(void);
 void bch_request_exit(void);
 int bch_request_init(void);
 
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8c53d874ada4..8b123be05254 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -252,7 +252,7 @@ void bch_debug_exit(void)
 		debugfs_remove_recursive(bcache_debug);
 }
 
-void __init bch_debug_init(struct kobject *kobj)
+void __init bch_debug_init(void)
 {
 	/*
	 * it is unnecessary to check return value of
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 14d381cc6d74..65e83e53449f 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2383,7 +2383,7 @@ static int __init bcache_init(void)
 	    sysfs_create_files(bcache_kobj, files))
 		goto err;
 
-	bch_debug_init(bcache_kobj);
+	bch_debug_init();
 	closure_debug_init();
 
 	return 0;
From: Shenghui Wang <shhuiw@foxmail.com>
mainline inclusion
from mainline-4.20-rc1
commit 8792099f9ad487cf381f4e8199ff2158ba0f6eb5
category: backport
A cache_set has at most MAX_CACHES_PER_SET caches, and the macro is used for "struct cache *cache_by_alloc[MAX_CACHES_PER_SET];" in the definition of struct cache_set.
Use MAX_CACHES_PER_SET instead of magic number 8 in __bch_bucket_alloc_set.
Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 9c3beb1e382b..6f776823b9ba 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -494,7 +494,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
 		return -1;
 
 	lockdep_assert_held(&c->bucket_lock);
-	BUG_ON(!n || n > c->caches_loaded || n > 8);
+	BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET);
 
 	bkey_init(k);
From: Dongbo Cao <cdbdyx@163.com>
mainline inclusion
from mainline-4.20-rc1
commit f6027bca9e382efc4d4f28a2d9678e0a07428363
category: backport
Split the combined '||' statements in the if() check, to make the code easier to debug.
Signed-off-by: Dongbo Cao <cdbdyx@163.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 90 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 76 insertions(+), 14 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 65e83e53449f..1d6b74854bc8 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2046,6 +2046,8 @@ static int cache_alloc(struct cache *ca)
 	size_t free;
 	size_t btree_buckets;
 	struct bucket *b;
+	int ret = -ENOMEM;
+	const char *err = NULL;
 
 	__module_get(THIS_MODULE);
 	kobject_init(&ca->kobj, &bch_cache_ktype);
@@ -2064,26 +2066,86 @@ static int cache_alloc(struct cache *ca)
 	btree_buckets = ca->sb.njournal_buckets ?: 8;
 	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
 
-	if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
-	    !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
-	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
-	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
-	    !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
-	    !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
-	    !(ca->buckets = vzalloc(array_size(sizeof(struct bucket),
-					       ca->sb.nbuckets))) ||
-	    !(ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
-						     prio_buckets(ca), 2),
-					 GFP_KERNEL)) ||
-	    !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
-		return -ENOMEM;
+	if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
+						GFP_KERNEL)) {
+		err = "ca->free[RESERVE_BTREE] alloc failed";
+		goto err_btree_alloc;
+	}
+
+	if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
+							GFP_KERNEL)) {
+		err = "ca->free[RESERVE_PRIO] alloc failed";
+		goto err_prio_alloc;
+	}
+
+	if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
+		err = "ca->free[RESERVE_MOVINGGC] alloc failed";
+		goto err_movinggc_alloc;
+	}
+
+	if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
+		err = "ca->free[RESERVE_NONE] alloc failed";
+		goto err_none_alloc;
+	}
+
+	if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
+		err = "ca->free_inc alloc failed";
+		goto err_free_inc_alloc;
+	}
+
+	if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
+		err = "ca->heap alloc failed";
+		goto err_heap_alloc;
+	}
+
+	ca->buckets = vzalloc(array_size(sizeof(struct bucket),
+			      ca->sb.nbuckets));
+	if (!ca->buckets) {
+		err = "ca->buckets alloc failed";
+		goto err_buckets_alloc;
+	}
+
+	ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
+				   prio_buckets(ca), 2),
+				   GFP_KERNEL);
+	if (!ca->prio_buckets) {
+		err = "ca->prio_buckets alloc failed";
+		goto err_prio_buckets_alloc;
+	}
+
+	ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
+	if (!ca->disk_buckets) {
+		err = "ca->disk_buckets alloc failed";
+		goto err_disk_buckets_alloc;
+	}
 
 	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
 
 	for_each_bucket(b, ca)
 		atomic_set(&b->pin, 0);
-
 	return 0;
+
+err_disk_buckets_alloc:
+	kfree(ca->prio_buckets);
+err_prio_buckets_alloc:
+	vfree(ca->buckets);
+err_buckets_alloc:
+	free_heap(&ca->heap);
+err_heap_alloc:
+	free_fifo(&ca->free_inc);
+err_free_inc_alloc:
+	free_fifo(&ca->free[RESERVE_NONE]);
+err_none_alloc:
+	free_fifo(&ca->free[RESERVE_MOVINGGC]);
+err_movinggc_alloc:
+	free_fifo(&ca->free[RESERVE_PRIO]);
+err_prio_alloc:
+	free_fifo(&ca->free[RESERVE_BTREE]);
+err_btree_alloc:
+	module_put(THIS_MODULE);
+	if (err)
+		pr_notice("error %s: %s", ca->cache_dev_name, err);
+	return ret;
 }
 
 static int register_cache(struct cache_sb *sb, struct page *sb_page,
From: Dongbo Cao <cdbdyx@163.com>
mainline inclusion
from mainline-4.20-rc1
commit 3a646fd77684dd5fbe20748bb04e12077bbecddc
category: backport
When the nbuckets of a cache device is smaller than 1024, making the cache device triggers a BUG_ON in the kernel. Add a condition to avoid this.
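A quick sanity check of the failing case (the numbers are illustrative): with ca->sb.nbuckets = 500,

	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;	/* 512 >> 10 == 0 */

so all the following fifo/heap allocations would be sized from 0, and registration eventually hits a BUG_ON. With this patch the zero is caught right here and reported as -EPERM instead.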
Reported-by: nitroxis <n@nxs.re>
Signed-off-by: Dongbo Cao <cdbdyx@163.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1d6b74854bc8..b72459d340cf 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2065,6 +2065,11 @@ static int cache_alloc(struct cache *ca)
 	 */
 	btree_buckets = ca->sb.njournal_buckets ?: 8;
 	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
+	if (!free) {
+		ret = -EPERM;
+		err = "ca->sb.nbuckets is too small";
+		goto err_free;
+	}
 
 	if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
 						GFP_KERNEL)) {
@@ -2142,6 +2147,7 @@ static int cache_alloc(struct cache *ca)
 err_prio_alloc:
 	free_fifo(&ca->free[RESERVE_BTREE]);
 err_btree_alloc:
+err_free:
 	module_put(THIS_MODULE);
 	if (err)
 		pr_notice("error %s: %s", ca->cache_dev_name, err);
@@ -2171,6 +2177,8 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
 		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 	if (ret == -ENOMEM)
 		err = "cache_alloc(): -ENOMEM";
+	else if (ret == -EPERM)
+		err = "cache_alloc(): cache device is too small";
 	else
 		err = "cache_alloc(): unknown error";
 	goto err;
From: Shenghui Wang <shhuiw@foxmail.com>
mainline inclusion
from mainline-5.0-rc1
commit d2f96f487f471b4c718324ecd1540c89d2be52ac
category: backport
We have the following define for btree iterator:

	struct btree_iter {
		size_t size, used;
	#ifdef CONFIG_BCACHE_DEBUG
		struct btree_keys *b;
	#endif
		struct btree_iter_set {
			struct bkey *k, *end;
		} data[MAX_BSETS];
	};
We can see that the length of the data[] field is the static MAX_BSETS, which is currently defined as 4.

But a btree node on disk could have too many bsets for an iterator to fit on the stack - maybe far more than MAX_BSETS. We have to dynamically allocate space to host more btree_iter_sets.
bch_cache_set_alloc() will make sure the pool cache_set->fill_iter can allocate an iterator equipped with enough room that can host (sb.bucket_size / sb.block_size) btree_iter_sets, which is more than static MAX_BSETS.
bch_btree_node_read_done() will use that pool to allocate one iterator, to host many bsets in one btree node.
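A worked example of the sizing (illustrative numbers, not part of this patch): with a 512KiB bucket (1024 sectors) and a 4KiB block (8 sectors), an iterator allocated from the pool gets

	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
	/* = 1024 / 8 = 128 btree_iter_sets, vs. static MAX_BSETS == 4 */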
Add more comment around cache_set->fill_iter to make code less confusing.
Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h | 6 +++++-
 drivers/md/bcache/btree.c  | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d256231d9915..603d1c4d1d45 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -658,7 +658,11 @@ struct cache_set {
 
 	/*
	 * A btree node on disk could have too many bsets for an iterator to fit
-	 * on the stack - have to dynamically allocate them
+	 * on the stack - have to dynamically allocate them.
+	 * bch_cache_set_alloc() will make sure the pool can allocate iterators
+	 * equipped with enough room that can host
+	 * (sb.bucket_size / sb.block_size)
+	 * btree_iter_sets, which is more than static MAX_BSETS.
 	 */
 	mempool_t		fill_iter;
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 45f684689c35..6e04f4333495 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b)
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
+	/*
+	 * c->fill_iter can allocate an iterator with more memory space
+	 * than static MAX_BSETS.
+	 * See the comment arount cache_set->fill_iter.
+	 */
 	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
From: Shenghui Wang <shhuiw@foxmail.com>
mainline inclusion
from mainline-5.0-rc1
commit 3db4d0783eaf2a2537f6b83679ad57cba0ad0481
category: backport
commit 220bb38c21b8 ("bcache: Break up struct search") introduced changes to struct search and s->iop. bypass/bio are fields of struct data_insert_op now. Update the comment.
Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/request.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 4ca3e3d3f9c7..f101bfe8657a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,11 +311,11 @@ static void bch_data_insert_start(struct closure *cl)
  * data is written it calls bch_journal, and after the keys have been added to
  * the next journal write they're inserted into the btree.
  *
- * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * It inserts the data in op->bio; bi_sector is used for the key offset,
  * and op->inode is used for the key inode.
  *
- * If s->bypass is true, instead of inserting the data it invalidates the
- * region of the cache represented by s->cache_bio and op->inode.
+ * If op->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
  */
 void bch_data_insert(struct closure *cl)
 {
From: Shenghui Wang <shhuiw@foxmail.com>
mainline inclusion
from mainline-5.0-rc1
commit 4e361e020e7220fa8fc812acdca4d08814633f49
category: backport
We have struct cached_dev allocated by kzalloc in register_bcache(), which initializes all the fields of cached_dev with 0s. And commit ce4c3e19e520 ("bcache: Replace bch_read_string_list() by __sysfs_match_string()") has removed the string "default".
Update the comment.
Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 3e8d1f1b562f..2bc4d90a2d0b 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,7 +16,7 @@
 #include <linux/sort.h>
 #include <linux/sched/clock.h>
 
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
+/* Default is 0 ("writethrough") */
 static const char * const bch_cache_modes[] = {
 	"writethrough",
 	"writeback",
@@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = {
 	NULL
 };
 
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
+/* Default is 0 ("auto") */
 static const char * const bch_stop_on_failure_modes[] = {
 	"auto",
 	"always",
From: Shenghui Wang <shhuiw@foxmail.com>
mainline inclusion
from mainline-5.0-rc1
commit f383ae300c4b6466824fd5c8cbd0a6c4ed2b53d3
category: backport
"echo 1 > writeback_running" marks writeback_running even if no writeback kthread created as "d_strtoul(writeback_running)" will simply set dc-> writeback_running without checking the existence of dc->writeback_thread.
Add check for setting writeback_running via sysfs: if no writeback kthread available, reject setting to 1.
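For example (device name illustrative, exact log prefix depends on pr_fmt), writing 1 while no writeback kthread exists is now reverted and complained about:

	# echo 1 > /sys/block/bcache0/bcache/writeback_running
	bcache0: failed to run non-existent writeback thread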
v2 -> v3:
 * Make message on wrong assignment more clear.
 * Print name of bcache device instead of name of backing device.
Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 2bc4d90a2d0b..419939c39fd9 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -392,8 +392,25 @@ STORE(bch_cached_dev)
 	mutex_lock(&bch_register_lock);
 	size = __cached_dev_store(kobj, attr, buf, size);
 
-	if (attr == &sysfs_writeback_running)
-		bch_writeback_queue(dc);
+	if (attr == &sysfs_writeback_running) {
+		/* dc->writeback_running changed in __cached_dev_store() */
+		if (IS_ERR_OR_NULL(dc->writeback_thread)) {
+			/*
+			 * reject setting it to 1 via sysfs if writeback
+			 * kthread is not created yet.
+			 */
+			if (dc->writeback_running) {
+				dc->writeback_running = false;
+				pr_err("%s: failed to run non-existent writeback thread",
+						dc->disk.disk->disk_name);
+			}
+		} else
+			/*
+			 * writeback kthread will check if dc->writeback_running
+			 * is true or false.
+			 */
+			bch_writeback_queue(dc);
+	}
 
 	/*
	 * Only set BCACHE_DEV_WB_RUNNING when cached device attached to
mainline inclusion
from mainline-5.0-rc1
commit cb07ad63682ffcce10e9beaed7828a26780e3df9
category: backport
The garbage collection thread starts to work when c->sectors_to_gc is a negative value; otherwise nothing happens even if the gc thread is woken up by wake_up_gc().

force_wake_up_gc() sets c->sectors_to_gc to -1 before calling wake_up_gc(), so the gc thread has a chance to run, provided no one else sets c->sectors_to_gc to a positive value before gc_should_run() checks it.

This routine can be called wherever the gc thread needs to be woken up and forced to run.
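From user space, the forced wake-up is reachable through the existing sysfs trigger, which this patch converts to call the new helper (cache set UUID illustrative):

	echo 1 > /sys/fs/bcache/<cache-set-uuid>/internal/trigger_gc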
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/btree.h | 18 ++++++++++++++++++
 drivers/md/bcache/sysfs.c | 17 ++---------------
 2 files changed, 20 insertions(+), 15 deletions(-)
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 4d0cca145f69..76cfd121a486 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -268,6 +268,24 @@ static inline void wake_up_gc(struct cache_set *c)
 	wake_up(&c->gc_wait);
 }
 
+static inline void force_wake_up_gc(struct cache_set *c)
+{
+	/*
+	 * Garbage collection thread only works when sectors_to_gc < 0,
+	 * calling wake_up_gc() won't start gc thread if sectors_to_gc is
+	 * not a nagetive value.
+	 * Therefore sectors_to_gc is set to -1 here, before waking up
+	 * gc thread by calling wake_up_gc(). Then gc_should_run() will
+	 * give a chance to permit gc thread to run. "Give a chance" means
+	 * before going into gc_should_run(), there is still possibility
+	 * that c->sectors_to_gc being set to other positive value. So
+	 * this routine won't 100% make sure gc thread will be woken up
+	 * to run.
+	 */
+	atomic_set(&c->sectors_to_gc, -1);
+	wake_up_gc(c);
+}
+
 #define MAP_DONE	0
 #define MAP_CONTINUE	1
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 419939c39fd9..1c709c6528c9 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -755,21 +755,8 @@ STORE(__bch_cache_set)
 		bch_cache_accounting_clear(&c->accounting);
 	}
 
-	if (attr == &sysfs_trigger_gc) {
-		/*
-		 * Garbage collection thread only works when sectors_to_gc < 0,
-		 * when users write to sysfs entry trigger_gc, most of time
-		 * they want to forcibly triger gargage collection. Here -1 is
-		 * set to c->sectors_to_gc, to make gc_should_run() give a
-		 * chance to permit gc thread to run. "give a chance" means
-		 * before going into gc_should_run(), there is still chance
-		 * that c->sectors_to_gc being set to other positive value. So
-		 * writing sysfs entry trigger_gc won't always make sure gc
-		 * thread takes effect.
-		 */
-		atomic_set(&c->sectors_to_gc, -1);
-		wake_up_gc(c);
-	}
+	if (attr == &sysfs_trigger_gc)
+		force_wake_up_gc(c);
 
 	if (attr == &sysfs_prune_cache) {
 		struct shrink_control sc;
mainline inclusion
from mainline-5.0-rc1
commit 7a671d8ef821bf5743fdff17fae0600648345b03
category: backport
The option gc_after_writeback is disabled by default, because garbage collection discards SSD data, which drops cached data.

Echoing 1 into /sys/fs/bcache/<UUID>/internal/gc_after_writeback enables this option, which wakes up the gc thread when writeback is accomplished and all cached data is clean.

This option is helpful for people who care more about write performance. In a heavy write workload, all cached data becoming clean only happens when the writeback thread cleans all cached data during I/O idle time. In such a situation, a following gc run may help to shrink the bcache B+ tree and discard more clean data, which may be helpful for future write requests.
If you are not sure whether this is helpful for your own workload, please leave it as disabled by default.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h    | 14 ++++++++++++++
 drivers/md/bcache/sysfs.c     |  9 +++++++++
 drivers/md/bcache/writeback.c | 27 +++++++++++++++++++++++++++
 drivers/md/bcache/writeback.h |  2 ++
 4 files changed, 52 insertions(+)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 603d1c4d1d45..e30a983a68cd 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -626,6 +626,20 @@ struct cache_set {
 	/* Where in the btree gc currently is */
 	struct bkey		gc_done;
 
+	/*
+	 * For automatical garbage collection after writeback completed, this
+	 * varialbe is used as bit fields,
+	 * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
+	 * - 0000 0010b (BCH_DO_AUTO_GC):     do gc after writeback
+	 * This is an optimization for following write request after writeback
+	 * finished, but read hit rate dropped due to clean data on cache is
+	 * discarded. Unless user explicitly sets it via sysfs, it won't be
+	 * enabled.
+	 */
+#define BCH_ENABLE_AUTO_GC	1
+#define BCH_DO_AUTO_GC		2
+	uint8_t			gc_after_writeback;
+
 	/*
	 * The allocation code needs gc_mark in struct bucket to be correct, but
	 * it's not while a gc is in progress. Protected by bucket_lock.
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 1c709c6528c9..1e8fffa107fb 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -128,6 +128,7 @@ rw_attribute(expensive_debug_checks);
 rw_attribute(cache_replacement_policy);
 rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
+rw_attribute(gc_after_writeback);
 rw_attribute(size);
 
 static ssize_t bch_snprint_string_list(char *buf,
@@ -706,6 +707,7 @@ SHOW(__bch_cache_set)
 	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite);
 	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled);
 	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled);
+	sysfs_printf(gc_after_writeback,	"%i", c->gc_after_writeback);
 	sysfs_printf(io_disable,		"%i",
 		     test_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
@@ -815,6 +817,12 @@ STORE(__bch_cache_set)
 	sysfs_strtoul(gc_always_rewrite,	c->gc_always_rewrite);
 	sysfs_strtoul(btree_shrinker_disabled,	c->shrinker_disabled);
 	sysfs_strtoul(copy_gc_enabled,		c->copy_gc_enabled);
+	/*
+	 * write gc_after_writeback here may overwrite an already set
+	 * BCH_DO_AUTO_GC, it doesn't matter because this flag will be
+	 * set in next chance.
+	 */
+	sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
 
 	return size;
 }
@@ -895,6 +903,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_gc_always_rewrite,
 	&sysfs_btree_shrinker_disabled,
 	&sysfs_copy_gc_enabled,
+	&sysfs_gc_after_writeback,
 	&sysfs_io_disable,
 	NULL
 };
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b5fc3c6c7178..e9ffcea1ca50 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
 #include <linux/sched/clock.h>
 #include <trace/events/bcache.h>
 
+static void update_gc_after_writeback(struct cache_set *c)
+{
+	if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+	    c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+		return;
+
+	c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
 /* Rate limiting */
 static uint64_t __calc_target_rate(struct cached_dev *dc)
 {
@@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work)
 		if (!set_at_max_writeback_rate(c, dc)) {
 			down_read(&dc->writeback_lock);
 			__update_writeback_rate(dc);
+			update_gc_after_writeback(c);
 			up_read(&dc->writeback_lock);
 		}
 	}
@@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg)
 				up_write(&dc->writeback_lock);
 				break;
 			}
+
+			/*
+			 * When dirty data rate is high (e.g. 50%+), there might
+			 * be heavy buckets fragmentation after writeback
+			 * finished, which hurts following write performance.
+			 * If users really care about write performance they
+			 * may set BCH_ENABLE_AUTO_GC via sysfs, then when
+			 * BCH_DO_AUTO_GC is set, garbage collection thread
+			 * will be wake up here. After moving gc, the shrunk
+			 * btree and discarded free buckets SSD space may be
+			 * helpful for following write requests.
+			 */
+			if (c->gc_after_writeback ==
+			    (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
+				c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
+				force_wake_up_gc(c);
+			}
 		}
 
 		up_write(&dc->writeback_lock);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index e75dc33339f6..f3bd888a4c7d 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -11,6 +11,8 @@
 #define WRITEBACK_RATE_UPDATE_SECS_MAX		60
 #define WRITEBACK_RATE_UPDATE_SECS_DEFAULT	5
 
+#define BCH_AUTO_GC_DIRTY_THRESHOLD	50
+
 /*
 * 14 (16384ths) is chosen here as something that each backing device
 * should be a reasonable fraction of the share, and not to blow up
mainline inclusion
from mainline-5.0-rc1
commit 009673d02fa92acaa7ed0b1e1389610e4390ba49
category: backport
This patch moves MODULE_AUTHOR and MODULE_LICENSE to the end of super.c, and adds MODULE_DESCRIPTION("Bcache: a Linux block layer cache").
This is preparation for adding module parameters.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b72459d340cf..ac54b61ac283 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -25,9 +25,6 @@
 #include <linux/reboot.h>
 #include <linux/sysfs.h>
 
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
-
 static const char bcache_magic[] = {
 	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
 	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
@@ -2464,3 +2461,7 @@ static int __init bcache_init(void)
 
 module_exit(bcache_exit);
 module_init(bcache_init);
+
+MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_LICENSE("GPL");
mainline inclusion
from mainline-5.0-rc1
commit 9aaf51654672b16566c5fe787da3ca41ebf6d297
category: backport
Currently the cutoff writeback and cutoff writeback sync thresholds are defined by CUTOFF_WRITEBACK (40) and CUTOFF_WRITEBACK_SYNC (70) as static values. Most of the time they work fine, but when people want to do research on bcache writeback mode performance tuning, there is no way to modify the soft and hard cutoff writeback values.

This patch introduces two module parameters, bch_cutoff_writeback_sync and bch_cutoff_writeback, which permit people to tune the values when loading bcache.ko. If they are not specified at module load time, the current values CUTOFF_WRITEBACK_SYNC and CUTOFF_WRITEBACK are used as defaults and nothing changes.

When people want to tune these two values,
- cutoff_writeback can be set in range [1, 70]
- cutoff_writeback_sync can be set in range [1, 90]
- cutoff_writeback is always <= cutoff_writeback_sync

The default values are strongly recommended for most users and most workloads. Anyway, if people want to take the risk of researching new writeback cutoff tuning for their own workload, now they can do it.
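For example, a research setup may load the module like this (values illustrative, within the permitted ranges):

	modprobe bcache bch_cutoff_writeback=50 bch_cutoff_writeback_sync=80

Out-of-range values are fixed up with a pr_warn() by check_module_parameters() at module init time.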
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c     | 40 ++++++++++++++++++++++++++++++++++++++++
 drivers/md/bcache/sysfs.c     |  7 +++++++
 drivers/md/bcache/writeback.h | 10 ++++++++--
 3 files changed, 55 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index ac54b61ac283..2ad333245674 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -25,6 +25,9 @@
 #include <linux/reboot.h>
 #include <linux/sysfs.h>
 
+unsigned int bch_cutoff_writeback;
+unsigned int bch_cutoff_writeback_sync;
+
 static const char bcache_magic[] = {
 	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
 	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
@@ -2415,6 +2418,32 @@ static void bcache_exit(void)
 	mutex_destroy(&bch_register_lock);
 }
 
+/* Check and fixup module parameters */
+static void check_module_parameters(void)
+{
+	if (bch_cutoff_writeback_sync == 0)
+		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
+	else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
+		pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
+			bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
+		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
+	}
+
+	if (bch_cutoff_writeback == 0)
+		bch_cutoff_writeback = CUTOFF_WRITEBACK;
+	else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
+		pr_warn("set bch_cutoff_writeback (%u) to max value %u",
+			bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
+		bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
+	}
+
+	if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
+		pr_warn("set bch_cutoff_writeback (%u) to %u",
+			bch_cutoff_writeback, bch_cutoff_writeback_sync);
+		bch_cutoff_writeback = bch_cutoff_writeback_sync;
+	}
+}
+
 static int __init bcache_init(void)
 {
 	static const struct attribute *files[] = {
@@ -2423,6 +2452,8 @@ static int __init bcache_init(void)
 		NULL
 	};
 
+	check_module_parameters();
+
 	mutex_init(&bch_register_lock);
 	init_waitqueue_head(&unregister_wait);
 	register_reboot_notifier(&reboot);
@@ -2459,9 +2490,18 @@ static int __init bcache_init(void)
 	return -ENOMEM;
 }
 
+/*
+ * Module hooks
+ */
 module_exit(bcache_exit);
 module_init(bcache_init);
 
+module_param(bch_cutoff_writeback, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
+
+module_param(bch_cutoff_writeback_sync, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
+
 MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 1e8fffa107fb..7cecb17e3798 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -88,6 +88,8 @@ read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
 read_attribute(io_errors);
 read_attribute(congested);
+read_attribute(cutoff_writeback);
+read_attribute(cutoff_writeback_sync);
 rw_attribute(congested_read_threshold_us);
 rw_attribute(congested_write_threshold_us);
 
@@ -699,6 +701,9 @@ SHOW(__bch_cache_set)
 	sysfs_print(congested_write_threshold_us,
 		    c->congested_write_threshold_us);
 
+	sysfs_print(cutoff_writeback, bch_cutoff_writeback);
+	sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
+
 	sysfs_print(active_journal_entries,	fifo_used(&c->journal.pin));
 	sysfs_printf(verify,			"%i", c->verify);
 	sysfs_printf(key_merging_disabled,	"%i", c->key_merging_disabled);
@@ -905,6 +910,8 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_copy_gc_enabled,
 	&sysfs_gc_after_writeback,
 	&sysfs_io_disable,
+	&sysfs_cutoff_writeback,
+	&sysfs_cutoff_writeback_sync,
 	NULL
 };
 KTYPE(bch_cache_set_internal);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index f3bd888a4c7d..4e4c6810dc3c 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,6 +5,9 @@
 #define CUTOFF_WRITEBACK	40
 #define CUTOFF_WRITEBACK_SYNC	70
 
+#define CUTOFF_WRITEBACK_MAX		70
+#define CUTOFF_WRITEBACK_SYNC_MAX	90
+
 #define MAX_WRITEBACKS_IN_PASS  5
 #define MAX_WRITESIZE_IN_PASS   5000	/* *512b */
 
@@ -55,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
 	}
 }
 
+extern unsigned int bch_cutoff_writeback;
+extern unsigned int bch_cutoff_writeback_sync;
+
 static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned int cache_mode, bool would_skip)
 {
@@ -62,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
-	    in_use > CUTOFF_WRITEBACK_SYNC)
+	    in_use > bch_cutoff_writeback_sync)
 		return false;
 
 	if (bio_op(bio) == REQ_OP_DISCARD)
@@ -78,7 +84,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 	return (op_is_sync(bio->bi_opf) ||
 		bio->bi_opf & (REQ_META|REQ_PRIO) ||
-		in_use <= CUTOFF_WRITEBACK);
+		in_use <= bch_cutoff_writeback);
 }
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
mainline inclusion
from mainline-5.0-rc1
commit cc38ca7ed54a1df04ae9f23614b348ebfc918029
category: backport
Because CUTOFF_WRITEBACK is defined as 40, before the change to dynamic cutoff writeback values writeback_percent was limited to [0, CUTOFF_WRITEBACK]. Any value larger than CUTOFF_WRITEBACK was fixed up to 40.

Now the cutoff writeback limit is the dynamic value bch_cutoff_writeback, so the range of writeback_percent can be the more flexible [0, bch_cutoff_writeback]. The flexibility is that it can be expanded to a larger or smaller range than [0, 40], depending on how bch_cutoff_writeback is specified.

The default value is still strongly recommended for most users and most workloads. But people who want to do research on bcache writeback performance tuning now have a chance to specify a more flexible writeback_percent in the range [0, 70].
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 7cecb17e3798..b5d54df7dc2e 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -267,7 +267,8 @@ STORE(__cached_dev)
 	d_strtoul(writeback_running);
 	d_strtoul(writeback_delay);
 
-	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
+			    0, bch_cutoff_writeback);
 
 	if (attr == &sysfs_writeback_rate) {
 		ssize_t ret;
From: Guoju Fang <fangguoju@gmail.com>
mainline inclusion
from mainline-5.0-rc1
commit e78bd0d26f7396c7bd17be60d9686be8ef8e453a
category: backport
Sometimes journal flushes may be very frequent, so it's useful to dump the number of keys every time the journal is written.
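With this change a bcache_journal_write event carries the key count at the end of the trace line, e.g. (all field values illustrative):

	bcache_journal_write: 8,16 FWFSM 1024 + 8 keys 37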
Signed-off-by: Guoju Fang <fangguoju@gmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/journal.c   |  2 +-
 include/trace/events/bcache.h | 27 ++++++++++++++++++++++++---
 2 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7bb15cddca5e..5f11405e6193 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -674,7 +674,7 @@ static void journal_write_unlocked(struct closure *cl)
 			  REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
 		bch_bio_map(bio, w->data);
 
-		trace_bcache_journal_write(bio);
+		trace_bcache_journal_write(bio, w->data->keys);
 		bio_list_add(&list, bio);
 
 		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 2cbd6e42ad83..e4526f85c19d 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -221,9 +221,30 @@ DEFINE_EVENT(cache_set, bcache_journal_entry_full,
 	TP_ARGS(c)
 );
 
-DEFINE_EVENT(bcache_bio, bcache_journal_write,
-	TP_PROTO(struct bio *bio),
-	TP_ARGS(bio)
+TRACE_EVENT(bcache_journal_write,
+	TP_PROTO(struct bio *bio, u32 keys),
+	TP_ARGS(bio, keys),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(sector_t,	sector			)
+		__field(unsigned int,	nr_sector		)
+		__array(char,		rwbs,	6		)
+		__field(u32,		nr_keys			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio_dev(bio);
+		__entry->sector		= bio->bi_iter.bi_sector;
+		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
+		__entry->nr_keys	= keys;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+	),
+
+	TP_printk("%d,%d  %s  %llu + %u keys %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector, __entry->nr_sector,
+		  __entry->nr_keys)
 );
 
 /* Btree */
mainline inclusion
from mainline-5.1-rc1
commit 83ff9318c44babe32da0947d81443bad82da2d44
category: backport
In stats.c:bch_cache_accounting_clear(), a hard coded number '7' is used in memset(). It is there because struct cache_stats has 7 atomic_t members. This is not good: when new members are added into struct cache_stats, the hard coded number will only clear part of the memory.

This patch replaces 'sizeof(unsigned long) * 7' by the more generic 'sizeof(struct cache_stats)', to avoid potential errors if a new member is added into struct cache_stats.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/stats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 894410f3f829..ba1c93791d8d 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -111,7 +111,7 @@ void bch_cache_accounting_clear(struct cache_accounting *acc)
 {
 	memset(&acc->total.cache_hits,
 	       0,
-	       sizeof(unsigned long) * 7);
+	       sizeof(struct cache_stats));
 }
 
 void bch_cache_accounting_destroy(struct cache_accounting *acc)
mainline inclusion
from mainline-5.1-rc1
commit 926d19465b66cb6bde4ca28fde16de775af4e357
category: backport
This patch exports dc->backing_dev_name to the sysfs file /sys/block/bcache<?>/bcache/backing_dev_name, so people or user space tools may know the backing device name of this bcache device.

Of course it can be done by parsing sysfs links, but this method is much simpler for finding the link between a bcache device and its backing device.
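For example (device and disk names illustrative):

	# cat /sys/block/bcache0/bcache/backing_dev_name
	sdb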
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b5d54df7dc2e..96240060038a 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -67,6 +67,7 @@ read_attribute(written);
 read_attribute(btree_written);
 read_attribute(metadata_written);
 read_attribute(active_journal_entries);
+read_attribute(backing_dev_name);
 
 sysfs_time_stats_attribute(btree_gc, sec, ms);
 sysfs_time_stats_attribute(btree_split, sec, us);
@@ -243,6 +244,12 @@ SHOW(__bch_cached_dev)
 		return strlen(buf);
 	}
 
+	if (attr == &sysfs_backing_dev_name) {
+		snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name);
+		strcat(buf, "\n");
+		return strlen(buf);
+	}
+
 #undef var
 	return 0;
 }
@@ -465,6 +472,7 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_verify,
 	&sysfs_bypass_torture_test,
 #endif
+	&sysfs_backing_dev_name,
 	NULL
 };
 KTYPE(bch_cached_dev);
mainline inclusion
from mainline-5.1-rc1
commit d4610456cfa412811b749f6215b9adae976ab4c3
category: backport
When there are multiple bcache devices, after a reboot the name of bcache devices may change (e.g. current /dev/bcache1 was /dev/bcache0 before reboot). Therefore we need the backing device UUID (sb.uuid) to identify each bcache device.
The backing device uuid can be found with the bcache-super-show program, but directly exporting backing_dev_uuid via the sysfs file /sys/block/bcache<?>/bcache/backing_dev_uuid is a much simpler method.
With backing_dev_uuid, and partition uuids from /dev/disk/by-partuuid/, now we can identify each bcache device and its partitions conveniently.
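For example (UUID illustrative):

	# cat /sys/block/bcache0/bcache/backing_dev_uuid
	0226553a-37cf-41d5-b3ce-8b1e944543a8

The printed UUID can then be matched against the entries under /dev/disk/by-partuuid/ as described above.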
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 9 +++++++++
 1 file changed, 9 insertions(+)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 96240060038a..4e27a91a6d2f 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -68,6 +68,7 @@ read_attribute(btree_written);
 read_attribute(metadata_written);
 read_attribute(active_journal_entries);
 read_attribute(backing_dev_name);
+read_attribute(backing_dev_uuid);
 
 sysfs_time_stats_attribute(btree_gc, sec, ms);
 sysfs_time_stats_attribute(btree_split, sec, us);
@@ -250,6 +251,13 @@ SHOW(__bch_cached_dev)
 		return strlen(buf);
 	}
 
+	if (attr == &sysfs_backing_dev_uuid) {
+		/* convert binary uuid into 36-byte string plus '\0' */
+		snprintf(buf, 36+1, "%pU", dc->sb.uuid);
+		strcat(buf, "\n");
+		return strlen(buf);
+	}
+
 #undef var
 	return 0;
 }
@@ -473,6 +481,7 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_bypass_torture_test,
 #endif
 	&sysfs_backing_dev_name,
+	&sysfs_backing_dev_uuid,
 	NULL
 };
 KTYPE(bch_cached_dev);
From: Colin Ian King <colin.king@canonical.com>
mainline inclusion
from mainline-5.1-rc1
commit e8cf978dffb2c603340d4615eec2e5358c9df06d
category: backport
There is a hunk of code that is indented one level too deep, fix this by removing the extra tabs.
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 2ad333245674..a02dacfe562a 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1596,21 +1596,21 @@ static void conditional_stop_bcache_device(struct cache_set *c,
 		 */
 		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
 			d->disk->disk_name);
-			/*
-			 * There might be a small time gap that cache set is
-			 * released but bcache device is not. Inside this time
-			 * gap, regular I/O requests will directly go into
-			 * backing device as no cache set attached to. This
-			 * behavior may also introduce potential inconsistence
-			 * data in writeback mode while cache is dirty.
-			 * Therefore before calling bcache_device_stop() due
-			 * to a broken cache device, dc->io_disable should be
-			 * explicitly set to true.
-			 */
-			dc->io_disable = true;
-			/* make others know io_disable is true earlier */
-			smp_mb();
-			bcache_device_stop(d);
+		/*
+		 * There might be a small time gap that cache set is
+		 * released but bcache device is not. Inside this time
+		 * gap, regular I/O requests will directly go into
+		 * backing device as no cache set attached to. This
+		 * behavior may also introduce potential inconsistence
+		 * data in writeback mode while cache is dirty.
+		 * Therefore before calling bcache_device_stop() due
+		 * to a broken cache device, dc->io_disable should be
+		 * explicitly set to true.
+		 */
+		dc->io_disable = true;
+		/* make others know io_disable is true earlier */
+		smp_mb();
+		bcache_device_stop(d);
 	} else {
 		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
mainline inclusion
from mainline-5.1-rc1
commit f54478c6e226bb1540a3e58366601039dfd778e2
category: backport
Cache set congested threshold values congested_read_threshold_us and congested_write_threshold_us can be set via the sysfs interface. These two values are of 'unsigned int' type, but the sysfs interface uses strtoul to convert the input string. So if people input a large number like 9999999999, the value actually set is 1410065407, which is not the expected behavior.

This patch replaces sysfs_strtoul() by sysfs_strtoul_clamp() when converting the input string to an unsigned int value, and sets the value range to [0, UINT_MAX], to avoid the above integer overflow errors.
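The quoted number follows directly from unsigned truncation. A minimal stand-alone demonstration (ordinary user space C on a 64-bit machine, not bcache code):

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		unsigned long v = strtoul("9999999999", NULL, 10);
		unsigned int t = v;	/* truncated modulo 2^32 */

		printf("%lu -> %u\n", v, t);	/* 9999999999 -> 1410065407 */
		return 0;
	}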
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4e27a91a6d2f..738c097038cd 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -791,10 +791,12 @@ STORE(__bch_cache_set)
 		c->shrink.scan_objects(&c->shrink, &sc);
 	}
 
-	sysfs_strtoul(congested_read_threshold_us,
-		      c->congested_read_threshold_us);
-	sysfs_strtoul(congested_write_threshold_us,
-		      c->congested_write_threshold_us);
+	sysfs_strtoul_clamp(congested_read_threshold_us,
+			    c->congested_read_threshold_us,
+			    0, UINT_MAX);
+	sysfs_strtoul_clamp(congested_write_threshold_us,
+			    c->congested_write_threshold_us,
+			    0, UINT_MAX);
 
 	if (attr == &sysfs_errors) {
 		v = __sysfs_match_string(error_actions, -1, buf);
mainline inclusion
from mainline-5.1-rc1
commit e4db37fb69d56d9523bb540bd8e07bf221aa9f6d
category: backport
When setting bool values via the sysfs interface, e.g. writeback_metadata, writing 1 into the writeback_metadata file sets dc->writeback_metadata to 1, but writing 2 into the file leaves dc->writeback_metadata at 0. This is misleading; a better result would be 1 for any non-zero input value.

It happens because dc->writeback_metadata is a bit-field variable, and the current code simply uses d_strtoul() to convert a string into an integer and takes the lowest bit value. To fix such errors, we need a routine that converts the input string into an unsigned integer and sets the target variable to 1 if the converted integer is non-zero.

This patch introduces a new macro called sysfs_strtoul_bool(); it can be used to convert an input string into a bool value, which we can use to set bool values for bit-field variables.
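The underlying bit-field truncation is easy to reproduce in isolation (a minimal sketch, not bcache code):

	#include <stdio.h>

	struct flags {
		unsigned int writeback_metadata:1;	/* 1-bit field */
	};

	int main(void)
	{
		struct flags f;

		f.writeback_metadata = 1;	/* keeps bit 0: stores 1 */
		printf("%u\n", f.writeback_metadata);
		f.writeback_metadata = 2;	/* bit 0 of 2 is 0: stores 0 */
		printf("%u\n", f.writeback_metadata);
		return 0;
	}

sysfs_strtoul_bool() avoids this by collapsing any non-zero input to 1 before the assignment.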
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.h | 10 ++++++++++
 1 file changed, 10 insertions(+)
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
index 0ad2715a884e..215df32f567b 100644
--- a/drivers/md/bcache/sysfs.h
+++ b/drivers/md/bcache/sysfs.h
@@ -79,6 +79,16 @@ do {									\
 		return strtoul_safe(buf, var) ?: (ssize_t) size;	\
 } while (0)
 
+#define sysfs_strtoul_bool(file, var)					\
+do {									\
+	if (attr == &sysfs_ ## file) {					\
+		unsigned long v = strtoul_or_return(buf);		\
+									\
+		var = v ? 1 : 0;					\
+		return size;						\
+	}								\
+} while (0)
+
 #define sysfs_strtoul_clamp(file, var, min, max)			\
 do {									\
 	if (attr == &sysfs_ ## file) {					\
mainline inclusion
from mainline-5.1-rc1
commit f5c0b95d2eeb17cf8a81fde0461938d2a79303ab
category: backport
When setting bcache parameters via sysfs, some variables are defined as bit-field values. Current bcache code in sysfs.c uses either d_strtoul() or sysfs_strtoul() to convert the input string to an unsigned integer value and set it to the corresponding bit-field value.

The problem is, the bit-field value only takes the lowest bit of the converted value. If the input is 2, the expected value (like a bool value) of the bit-field should be 1, but indeed it is 0.

The following sysfs files for bit-field variables have such problem,
  bypass_torture_test,     for dc->bypass_torture_test
  writeback_metadata,      for dc->writeback_metadata
  writeback_running,       for dc->writeback_running
  verify,                  for c->verify
  key_merging_disabled,    for c->key_merging_disabled
  gc_always_rewrite,       for c->gc_always_rewrite
  btree_shrinker_disabled, for c->shrinker_disabled
  copy_gc_enabled,         for c->copy_gc_enabled

This patch uses sysfs_strtoul_bool() to set such bit-field variables, so if the converted value is non-zero, the bit-field variables will be set to 1, like setting a bool value such as expensive_debug_checks.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 738c097038cd..ba7e525b45d6 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -277,9 +277,9 @@ STORE(__cached_dev)
 
 	sysfs_strtoul(data_csum,	dc->disk.data_csum);
 	d_strtoul(verify);
-	d_strtoul(bypass_torture_test);
-	d_strtoul(writeback_metadata);
-	d_strtoul(writeback_running);
+	sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
+	sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
+	sysfs_strtoul_bool(writeback_running, dc->writeback_running);
 	d_strtoul(writeback_delay);
 
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
@@ -836,12 +836,12 @@ STORE(__bch_cache_set)
 	}
 
 	sysfs_strtoul(journal_delay_ms,		c->journal_delay_ms);
-	sysfs_strtoul(verify,			c->verify);
-	sysfs_strtoul(key_merging_disabled,	c->key_merging_disabled);
+	sysfs_strtoul_bool(verify,		c->verify);
+	sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled);
 	sysfs_strtoul(expensive_debug_checks,	c->expensive_debug_checks);
-	sysfs_strtoul(gc_always_rewrite,	c->gc_always_rewrite);
-	sysfs_strtoul(btree_shrinker_disabled,	c->shrinker_disabled);
-	sysfs_strtoul(copy_gc_enabled,		c->copy_gc_enabled);
+	sysfs_strtoul_bool(gc_always_rewrite,	c->gc_always_rewrite);
+	sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
+	sysfs_strtoul_bool(copy_gc_enabled,	c->copy_gc_enabled);
 	/*
	 * write gc_after_writeback here may overwrite an already set
	 * BCH_DO_AUTO_GC, it doesn't matter because this flag will be
mainline inclusion
from mainline-5.1-rc1
commit 369d21a73a241682de019ac5c5209ce3ec627743
category: backport
Sysfs file writeback_delay is used to configure dc->writeback_delay, which is of type unsigned int. But bcache code uses sysfs_strtoul() to convert the input string, therefore it might overflow if the input value is too large. E.g. the input value is 4294967296 but indeed 0 is set to dc->writeback_delay.
This patch uses sysfs_strtoul_clamp() to convert the input string and set the result value range in [0, UINT_MAX] to avoid such unsigned integer overflow.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index ba7e525b45d6..c6869d80adf1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -280,7 +280,7 @@ STORE(__cached_dev)
 	sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
 	sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
 	sysfs_strtoul_bool(writeback_running, dc->writeback_running);
-	d_strtoul(writeback_delay);
+	sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX);
 
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
 			    0, bch_cutoff_writeback);
mainline inclusion
from mainline-5.1-rc1
commit 453745fbbebecf7e459785db7e29e11563908525
category: backport
c->journal_delay_ms is of type unsigned short. It is set via the sysfs interface and converted by sysfs_strtoul() from an input string to an unsigned short value. Therefore an overflow of the unsigned short might happen when the converted value exceeds USHRT_MAX, e.g. writing 65536 into the sysfs file journal_delay_ms sets c->journal_delay_ms to 0.
This patch uses sysfs_strtoul_clamp() to convert the input string and limit value range in [0, USHRT_MAX], to avoid the input overflow.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c6869d80adf1..48234481cd50 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -835,7 +835,9 @@ STORE(__bch_cache_set) } }
- sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); + sysfs_strtoul_clamp(journal_delay_ms, + c->journal_delay_ms, + 0, USHRT_MAX); sysfs_strtoul_bool(verify, c->verify); sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled); sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks);
mainline inclusion from mainline-5.1-rc1 commit b15008403b59955c9fa0c8b55cadd6dae991a4e9 category: backport
c->error_limit is of type unsigned int; it is set via the cache set sysfs file io_error_limit. Inside the bcache code, the input string is converted by strtoul_or_return() and the converted value is assigned to c->error_limit.
Because the converted value is unsigned long while c->error_limit is unsigned int, c->error_limit will overflow if the input is large enough.
This patch uses sysfs_strtoul_clamp() to convert the input string and clamps the result to [0, UINT_MAX], to avoid the potential overflow.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 48234481cd50..3ae271f95ffe 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -806,8 +806,7 @@ STORE(__bch_cache_set) c->on_error = v; }
- if (attr == &sysfs_io_error_limit) - c->error_limit = strtoul_or_return(buf); + sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX);
/* See count_io_errors() for why 88 */ if (attr == &sysfs_io_error_halflife) {
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-5.1-rc1 commit 2e1f4f4d2481d8bf111904c3e45fc0c4c94bf76e category: backport
bch_bio_alloc_pages() is always called on a new bio, so it is safe to access the bvec table directly. Given this is the only case of its kind, open code the bvec table access, since bio_for_each_segment_all() will be changed to support iterating over a multipage bvec.
Acked-by: Coly Li colyli@suse.de Reviewed-by: Omar Sandoval osandov@fb.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/util.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 20eddeac1531..62fb917f7a4f 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) int i; struct bio_vec *bv;
- bio_for_each_segment_all(bv, bio, i) { + /* + * This is called on freshly new bio, so it is safe to access the + * bvec table directly. + */ + for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) { bv->bv_page = alloc_page(gfp_mask); if (!bv->bv_page) { while (--bv >= bio->bi_io_vec)
From: Guoju Fang fangguoju@gmail.com
mainline inclusion from mainline-5.2-rc1 commit 1568ee7e3c6305d9fbb2414bfd4b56e52853d42d category: backport
The bio from the upper layer is considered completed once bio_complete() returns. In most scenarios bio_complete() is called in search_free(), but when a read miss happens, bio_complete() is called when the backing device read completes, while the struct search is still in use until the cache insertion finishes.
If someone stops the bcache device just then, the device may be closed and released, and after the cache insertion finishes, the struct search will access a freed struct cached_dev.
This patch takes a reference on the bcache device before bio_complete() when a read miss happens, and puts it once the struct search is no longer used.
Signed-off-by: Guoju Fang fangguoju@gmail.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/request.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f101bfe8657a..f11123079fe0 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -706,14 +706,14 @@ static void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl);
- atomic_dec(&s->d->c->search_inflight); + atomic_dec(&s->iop.c->search_inflight);
if (s->iop.bio) bio_put(s->iop.bio);
bio_complete(s); closure_debug_destroy(cl); - mempool_free(s, &s->d->c->search); + mempool_free(s, &s->iop.c->search); }
static inline struct search *search_alloc(struct bio *bio, @@ -756,13 +756,13 @@ static void cached_dev_bio_complete(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
- search_free(cl); cached_dev_put(dc); + search_free(cl); }
/* Process reads */
-static void cached_dev_cache_miss_done(struct closure *cl) +static void cached_dev_read_error_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl);
@@ -800,7 +800,22 @@ static void cached_dev_read_error(struct closure *cl) closure_bio_submit(s->iop.c, bio, cl); }
- continue_at(cl, cached_dev_cache_miss_done, NULL); + continue_at(cl, cached_dev_read_error_done, NULL); +} + +static void cached_dev_cache_miss_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bcache_device *d = s->d; + + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); + + if (s->iop.bio) + bio_free_pages(s->iop.bio); + + cached_dev_bio_complete(cl); + closure_put(&d->cl); }
static void cached_dev_read_done(struct closure *cl) @@ -833,6 +848,7 @@ static void cached_dev_read_done(struct closure *cl) if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio);
+ closure_get(&dc->disk.cl); bio_complete(s);
if (s->iop.bio &&
From: Guoju Fang fangguoju@gmail.com
mainline inclusion from mainline-5.2-rc1 commit 4e0c04ec3a304490a83d5c0355e64176acc9b4ba category: backport
To get the number of unused buckets in sysfs_priority_stats, the code counts the buckets whose GC_SECTORS_USED is zero. That count is correct and should not be overwritten by the count of buckets whose prio is zero.
Signed-off-by: Guoju Fang fangguoju@gmail.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/sysfs.c | 2 -- 1 file changed, 2 deletions(-)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 3ae271f95ffe..d0b1ebf4eb56 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1001,8 +1001,6 @@ SHOW(__bch_cache) !cached[n - 1]) --n;
- unused = ca->sb.nbuckets - n; - while (cached < p + n && *cached == BTREE_PRIO) cached++, n--;
From: Geliang Tang geliangtang@gmail.com
mainline inclusion from mainline-5.2-rc1 commit 792732d9852c0e4505aceff4631ea2168fd02480 category: backport
This patch uses kmemdup_nul to create a NUL-terminated string from dc->sb.label. This is better than open coding it.
With this, we can move env[2] initialization into env[] array to make code more elegant.
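For readers unfamiliar with the helper: kmemdup_nul(s, len, gfp) copies len bytes and appends a trailing NUL. A rough sketch of what it does internally (not the actual kernel implementation):

	char *kmemdup_nul_sketch(const char *s, size_t len, gfp_t gfp)
	{
		char *buf = kmalloc(len + 1, gfp);	/* room for the NUL */

		if (!buf)
			return NULL;
		memcpy(buf, s, len);
		buf[len] = '\0';
		return buf;
	}

This is exactly the memcpy-plus-terminator pattern the patch removes from bch_cached_dev_run().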
Signed-off-by: Geliang Tang geliangtang@gmail.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a02dacfe562a..70916ad816b4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -906,21 +906,18 @@ static int cached_dev_status_update(void *arg) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; - char buf[SB_LABEL_SIZE + 1]; + char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); char *env[] = { "DRIVER=bcache", kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), - NULL, + kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""), NULL, };
- memcpy(buf, dc->sb.label, SB_LABEL_SIZE); - buf[SB_LABEL_SIZE] = '\0'; - env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); + kfree(buf); return; }
@@ -944,6 +941,7 @@ void bch_cached_dev_run(struct cached_dev *dc) kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); kfree(env[1]); kfree(env[2]); + kfree(buf);
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
From: George Spelvin lkml@sdf.org
mainline inclusion from mainline-5.2-rc1 commit 3a3947271cd6ce05c5b30c1250fa99de57410500 category: backport
There are a few nits in this function. They could in theory all be separate patches, but that's probably taking small commits too far.
1) I added a brief comment saying what it does.
2) I like to declare pointer parameters "const" where possible for documentation reasons.
3) It uses bitmap_weight(&rand, BITS_PER_LONG) to compute the Hamming weight of a 32-bit random number (giving a random integer with mean 16 and variance 8). Passing by reference in a 64-bit variable is silly; just use hweight32().
4) Its helper function fract_exp_two is unnecessarily tangled. Gcc can optimize the multiply by (1 << x) to a shift, but it can be written in a much more straightforward way at the cost of one more bit of internal precision. Some analysis reveals that this bit is always available.
This shrinks the object code for fract_exp_two(x, 6) from 23 bytes:
0000000000000000 <foo1>:
   0:	89 f9                	mov    %edi,%ecx
   2:	c1 e9 06             	shr    $0x6,%ecx
   5:	b8 01 00 00 00       	mov    $0x1,%eax
   a:	d3 e0                	shl    %cl,%eax
   c:	83 e7 3f             	and    $0x3f,%edi
   f:	d3 e7                	shl    %cl,%edi
  11:	c1 ef 06             	shr    $0x6,%edi
  14:	01 f8                	add    %edi,%eax
  16:	c3                   	retq
To 19:
0000000000000017 <foo2>:
  17:	89 f8                	mov    %edi,%eax
  19:	83 e0 3f             	and    $0x3f,%eax
  1c:	83 c0 40             	add    $0x40,%eax
  1f:	89 f9                	mov    %edi,%ecx
  21:	c1 e9 06             	shr    $0x6,%ecx
  24:	d3 e0                	shl    %cl,%eax
  26:	c1 e8 06             	shr    $0x6,%eax
  29:	c3                   	retq
(Verified with 0 <= frac_bits <= 8, 0 <= x < 16<<frac_bits; both versions produce the same output. A user-space check along these lines is sketched after this list.)
5) And finally, the call to bch_get_congested() in check_should_bypass() is separated from the use of the value by multiple tests which could moot the need to compute it. Move the computation down to where it's needed. This also saves a local register to hold the computed value.
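Regarding point 3, the popcount of a uniform random 32-bit word is Binomial(32, 1/2), so mean 16 and variance 32/4 = 8 check out. Regarding point 4, a user-space equivalence check might look like the sketch below, with hypothetical foo1/foo2 mirroring the object-code listings above and frac_bits fixed at 6:

	#include <assert.h>

	/* Original: linear interpolation between powers of two */
	static unsigned int foo1(unsigned int x)
	{
		unsigned int fract = x & 63;

		x >>= 6;
		x = 1 << x;
		x += (x * fract) >> 6;
		return x;
	}

	/* Rewrite: implicit-leading-1 mantissa, one extra bit of precision */
	static unsigned int foo2(unsigned int x)
	{
		unsigned int mantissa = 64 + (x & 63);	/* implicit bit */

		return mantissa << (x >> 6) >> 6;
	}

	int main(void)
	{
		unsigned int x;

		for (x = 0; x < (16 << 6); x++)
			assert(foo1(x) == foo2(x));
		return 0;
	}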
Signed-off-by: George Spelvin lkml@sdf.org Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/request.c | 15 ++++++++------- drivers/md/bcache/request.h | 2 +- drivers/md/bcache/util.h | 26 +++++++++++++++++++------- 3 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f11123079fe0..41adcd1546f1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -329,12 +329,13 @@ void bch_data_insert(struct closure *cl) bch_data_insert_start(cl); }
-/* Congested? */ - -unsigned int bch_get_congested(struct cache_set *c) +/* + * Congested? Return 0 (not congested) or the limit (in sectors) + * beyond which we should bypass the cache due to congestion. + */ +unsigned int bch_get_congested(const struct cache_set *c) { int i; - long rand;
if (!c->congested_read_threshold_us && !c->congested_write_threshold_us) @@ -353,8 +354,7 @@ unsigned int bch_get_congested(struct cache_set *c) if (i > 0) i = fract_exp_two(i, 6);
- rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); + i -= hweight32(get_random_u32());
return i > 0 ? i : 1; } @@ -376,7 +376,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; unsigned int mode = cache_mode(dc); - unsigned int sectors, congested = bch_get_congested(c); + unsigned int sectors, congested; struct task_struct *task = current; struct io *i;
@@ -412,6 +412,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto rescale; }
+ congested = bch_get_congested(c); if (!congested && !dc->sequential_cutoff) goto rescale;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 721bf336ed1a..c64dbd7a91aa 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -33,7 +33,7 @@ struct data_insert_op { BKEY_PADDED(replace_key); };
-unsigned int bch_get_congested(struct cache_set *c); +unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl);
void bch_cached_dev_request_init(struct cached_dev *dc); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index b1f5b7aea872..c029f7443190 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -558,17 +558,29 @@ static inline uint64_t bch_crc64_update(uint64_t crc, return crc; }
-/* Does linear interpolation between powers of two */ +/* + * A stepwise-linear pseudo-exponential. This returns 1 << (x >> + * frac_bits), with the less-significant bits filled in by linear + * interpolation. + * + * This can also be interpreted as a floating-point number format, + * where the low frac_bits are the mantissa (with implicit leading + * 1 bit), and the more significant bits are the exponent. + * The return value is 1.mantissa * 2^exponent. + * + * The way this is used, fract_bits is 6 and the largest possible + * input is CONGESTED_MAX-1 = 1023 (exponent 16, mantissa 0x1.fc), + * so the maximum output is 0x1fc00. + */ static inline unsigned int fract_exp_two(unsigned int x, unsigned int fract_bits) { - unsigned int fract = x & ~(~0 << fract_bits); - - x >>= fract_bits; - x = 1 << x; - x += (x * fract) >> fract_bits; + unsigned int mantissa = 1 << fract_bits; /* Implicit bit */
- return x; + mantissa += x & (mantissa - 1); + x >>= fract_bits; /* The exponent */ + /* Largest intermediate value 0x7f0000 */ + return mantissa << x >> fract_bits; }
void bch_bio_map(struct bio *bio, void *base);
mainline inclusion from mainline-5.2-rc1 commit 14215ee01f6377c81c25c2cecda729e8811d2826 category: backport
'int ret' is defined as a local variable inside macro read_bucket(). Since this macro is called multiple times, and following patches will use an 'int ret' variable in bch_journal_read(), this patch moves the definition of 'int ret' from macro read_bucket() into the scope of function bch_journal_read().
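The underlying C subtlety: a declaration inside a GCC statement expression shadows any same-named variable in the enclosing function, so a future 'int ret' in bch_journal_read() would never receive the macro's value. A contrived user-space sketch (names made up, relies on the gcc statement-expression extension):

	#include <stdio.h>

	/* the inner 'ret' shadows any 'ret' in the enclosing function */
	#define read_bucket_old(b) ({ int ret = (b); ret; })

	int main(void)
	{
		int ret = 0;

		(void)read_bucket_old(1);
		printf("%d\n", ret);	/* prints 0: outer ret was never assigned */
		return 0;
	}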
Signed-off-by: Coly Li colyli@suse.de Reviewed-by: Hannes Reinecke hare@suse.com Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/journal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 5f11405e6193..a2a093f0fc76 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -147,7 +147,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) { #define read_bucket(b) \ ({ \ - int ret = journal_read_bucket(ca, list, b); \ + ret = journal_read_bucket(ca, list, b); \ __set_bit(b, bitmap); \ if (ret < 0) \ return ret; \ @@ -156,6 +156,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
struct cache *ca; unsigned int iter; + int ret = 0;
for_each_cache(ca, c, iter) { struct journal_device *ja = &ca->journal; @@ -267,7 +268,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) struct journal_replay, list)->j.seq;
- return 0; + return ret; #undef read_bucket }
mainline inclusion from mainline-5.2-rc1 commit 2d17456eb1cc78803b999fdd503c2dbd42a7d3da category: backport
Bcache has several routines that release resources in an implicit way; they are called when the associated kobj is released. This patch adds code comments noting when and which release callback will be called:
- When dc->disk.kobj is released: void bch_cached_dev_release(struct kobject *kobj)
- When d->kobj is released: void bch_flash_dev_release(struct kobject *kobj)
- When c->kobj is released: void bch_cache_set_release(struct kobject *kobj)
- When ca->kobj is released: void bch_cache_release(struct kobject *kobj)
Signed-off-by: Coly Li colyli@suse.de Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Reviewed-by: Hannes Reinecke hare@suse.com Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 70916ad816b4..ee794a8afaea 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1172,6 +1172,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, return 0; }
+/* when dc->disk.kobj released */ void bch_cached_dev_release(struct kobject *kobj) { struct cached_dev *dc = container_of(kobj, struct cached_dev, @@ -1322,6 +1323,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
/* Flash only volumes */
+/* When d->kobj released */ void bch_flash_dev_release(struct kobject *kobj) { struct bcache_device *d = container_of(kobj, struct bcache_device, @@ -1475,6 +1477,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) return true; }
+/* When c->kobj released */ void bch_cache_set_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); @@ -2009,6 +2012,7 @@ static const char *register_cache_set(struct cache *ca)
/* Cache device */
+/* When ca->kobj released */ void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj);
mainline inclusion from mainline-5.2-rc1 commit 88c12d42d2bb6e05deb3cfd24d12f6fe80544575 category: backport
This patch adds a return value to register_bdev(). Then if a failure happens inside register_bdev(), its caller register_bcache() may detect and handle the failure more properly.
Signed-off-by: Coly Li colyli@suse.de Reviewed-by: Hannes Reinecke hare@suse.com Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ee794a8afaea..e61abdce9a23 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1277,7 +1277,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
/* Cached device - bcache superblock */
-static void register_bdev(struct cache_sb *sb, struct page *sb_page, +static int register_bdev(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cached_dev *dc) { @@ -1315,10 +1315,11 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) bch_cached_dev_run(dc);
- return; + return 0; err: pr_notice("error %s: %s", dc->backing_dev_name, err); bcache_device_stop(&dc->disk); + return -EIO; }
/* Flash only volumes */ @@ -2259,7 +2260,7 @@ static bool bch_is_open(struct block_device *bdev) static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { - ssize_t ret = size; + ssize_t ret = -EINVAL; const char *err = "cannot allocate memory"; char *path = NULL; struct cache_sb *sb = NULL; @@ -2293,7 +2294,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!IS_ERR(bdev)) bdput(bdev); if (attr == &ksysfs_register_quiet) - goto out; + goto quiet_out; } goto err; } @@ -2314,8 +2315,10 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto err_close;
mutex_lock(&bch_register_lock); - register_bdev(sb, sb_page, bdev, dc); + ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); + if (ret < 0) + goto err; } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -2325,6 +2328,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (register_cache(sb, sb_page, bdev, ca) != 0) goto err; } +quiet_out: + ret = size; out: if (sb_page) put_page(sb_page); @@ -2337,7 +2342,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); err: pr_info("error %s: %s", path, err); - ret = -EINVAL; goto out; }
mainline inclusion from mainline-5.2-rc1 commit bb6d355c2aff42d4075a8e7428dd72cb009d6143 category: backport
Add comments to explain why in register_bcache() blkdev_put() won't be called in two locations. Add comments to explain why blkdev_put() must be called in register_cache() when cache_alloc() fails.
Signed-off-by: Coly Li colyli@suse.de Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Reviewed-by: Hannes Reinecke hare@suse.com Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e61abdce9a23..32dd81a6e57d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2177,6 +2177,12 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
ret = cache_alloc(ca); if (ret != 0) { + /* + * If we failed here, it means ca->kobj is not initialized yet, + * kobject_put() won't be called and there is no chance to + * call blkdev_put() to bdev in bch_cache_release(). So we + * explicitly call blkdev_put() here. + */ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; @@ -2317,6 +2323,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, mutex_lock(&bch_register_lock); ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); + /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) goto err; } else { @@ -2325,6 +2332,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!ca) goto err_close;
+ /* blkdev_put() will be called in bch_cache_release() */ if (register_cache(sb, sb_page, bdev, ca) != 0) goto err; }
mainline inclusion from mainline-5.2-rc1 commit 63d63b51d70fb5155754dcf0baa2c1700bcafcb0 category: backport
Add code comments to explain which callback function might be called by closure_queue(). This is an effort to make the code more understandable for readers.
Signed-off-by: Coly Li colyli@suse.de Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Reviewed-by: Hannes Reinecke hare@suse.com Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 32dd81a6e57d..6010881c706e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -662,6 +662,11 @@ static const struct block_device_operations bcache_ops = { void bcache_device_stop(struct bcache_device *d) { if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) + /* + * closure_fn set to + * - cached device: cached_dev_flush() + * - flash dev: flash_dev_flush() + */ closure_queue(&d->cl); }
@@ -1656,6 +1661,7 @@ static void __cache_set_unregister(struct closure *cl) void bch_cache_set_stop(struct cache_set *c) { if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) + /* closure_fn set to __cache_set_unregister() */ closure_queue(&c->caching); }
mainline inclusion from mainline-5.2-rc1 commit eb8cbb6df38f6e5124a3d5f1f8a3dbf519537c60 category: backport
This patch tries to release the mutex bch_register_lock early, to give a chance to stop the cache set and bcache device early.
This patch also extends the timeout for stopping all bcache devices from 2 seconds to 10 seconds, because stopping the writeback rate update worker may be delayed for up to 5 seconds; 2 seconds is not enough.
After this patch is applied, a hang while stopping bcache devices during system reboot or shutdown is very hard to observe any more.
Signed-off-by: Coly Li colyli@suse.de Reviewed-by: Hannes Reinecke hare@suse.com Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6010881c706e..e5b29aa7e18a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2385,10 +2385,19 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) list_for_each_entry_safe(dc, tdc, &uncached_devices, list) bcache_device_stop(&dc->disk);
+ mutex_unlock(&bch_register_lock); + + /* + * Give an early chance for other kthreads and + * kworkers to stop themselves + */ + schedule(); + /* What's a condition variable? */ while (1) { - long timeout = start + 2 * HZ - jiffies; + long timeout = start + 10 * HZ - jiffies;
+ mutex_lock(&bch_register_lock); stopped = list_empty(&bch_cache_sets) && list_empty(&uncached_devices);
@@ -2400,7 +2409,6 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
mutex_unlock(&bch_register_lock); schedule_timeout(timeout); - mutex_lock(&bch_register_lock); }
finish_wait(&unregister_wait, &wait);
From: Shenghui Wang shhuiw@foxmail.com
mainline inclusion from mainline-5.2-rc1 commit f16277ca20acf2c213fcd4b645f4c1cffcadf533 category: backport
Elements of the keylist should be accessed before the list is freed. Move the bch_keylist_free() call to after the while loop, to avoid accessing freed content.
Signed-off-by: Shenghui Wang shhuiw@foxmail.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6e04f4333495..572dcc7651b1 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1512,11 +1512,11 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
out_nocoalesce: closure_sync(&cl); - bch_keylist_free(&keylist);
while ((k = bch_keylist_pop(&keylist))) if (!bkey_cmp(k, &ZERO_KEY)) atomic_dec(&b->c->prio_blocked); + bch_keylist_free(&keylist);
for (i = 0; i < nodes; i++) if (!IS_ERR_OR_NULL(new_nodes[i])) {
From: Jens Axboe axboe@kernel.dk
mainline inclusion from mainline-5.2-rc1 commit 2d5abb9a1e8e92b25e781f0c3537a5b3b4b2f033 category: backport
It's not used outside this file.
Fixes: 631207314d88 ("bcache: fix failure in journal relplay") Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index a2a093f0fc76..cc6f21061ed7 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -318,7 +318,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } }
-bool is_discard_enabled(struct cache_set *s) +static bool is_discard_enabled(struct cache_set *s) { struct cache *ca; unsigned int i;
mainline inclusion from mainline-5.3-rc1 commit 141df8bb5dc052f605de8f48a7aa10290e1384ae category: backport
When gc is running, user space I/O processes may wait inside the bcache code, so no new I/O is coming. Indeed this is not real idle time, and the maximum writeback rate should not be set in such a situation. Otherwise a faster writeback thread may compete for locks with the gc thread and make garbage collection slower, which results in a longer I/O freeze period.
This patch checks c->gc_mark_valid in set_at_max_writeback_rate(). If c->gc_mark_valid is 0 (gc running), set_at_max_writeback_rate() returns false, and update_writeback_rate() will not set the writeback rate to the maximum value even if c->idle_counter reaches the idle threshold.
Now the writeback thread won't interfere with gc thread performance.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/writeback.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e9ffcea1ca50..d60268fe49e1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,9 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't set max writeback rate if gc is running */ + if (!c->gc_mark_valid) + return false; /* * Idle_counter is increased everytime when update_writeback_rate() is * called. If all backing devices attached to the same cache set have
mainline inclusion from mainline-5.3-rc1 commit 0ae49cb7aa005ed18fe8f4d6ccf73019b78ac7b2 category: backport
When everything is OK in bch_journal_read(), the final return value is produced by, return ret; which assumes ret will be 0 here. This assumption is wrong when all journal buckets are full and filled with valid journal entries. In such a case the last call to read_bucket() sets 'ret' to 1, which means a new jset was added into the jset list. The jset list is the list 'journal' in caller run_cache_set().
Returning 1 to run_cache_set() means something is wrong and the cache set won't start, but indeed everything is OK.
This patch changes the line at the end of bch_journal_read() to directly return 0 when everything is good. Then a bogus error is fixed.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index cc6f21061ed7..33556acdcf9c 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -268,7 +268,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) struct journal_replay, list)->j.seq;
- return ret; + return 0; #undef read_bucket }
mainline inclusion from mainline-5.3-rc1 commit 08ec1e6282f271698f0053983fab89de6e1a8217 category: backport
When the backing device super block is written by bch_write_bdev_super(), the bio completion callback write_bdev_super_endio() simply ignores the I/O status. Indeed such a write request should also contribute to the backing device health status if the request fails.
This patch checks bio->bi_status in write_bdev_super_endio(); if there is an error, bch_count_backing_io_errors() will be called to count an I/O error in dc->io_errors.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4b7c5567576b..f73b75a4898b 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -197,7 +197,9 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, static void write_bdev_super_endio(struct bio *bio) { struct cached_dev *dc = bio->bi_private; - /* XXX: error checking */ + + if (bio->bi_status) + bch_count_backing_io_errors(dc, bio);
closure_put(&dc->sb_write); }
mainline inclusion from mainline-5.3-rc1 commit f960facb399ece6ff88a7a2d4b4a5515e3a467a0 category: backport
In function bset_search_tree(), when p >= t->size, t->tree[0] will be prefetched by the following code piece,
 974                 unsigned int p = n << 4;
 975
 976                 p &= ((int) (p - t->size)) >> 31;
 977
 978                 prefetch(&t->tree[p]);
The purpose of the above code is to avoid a branch instruction, but when p >= t->size, prefetch(&t->tree[0]) has no positive performance contribution at all. This patch avoids the unnecessary prefetch by only calling prefetch() when p < t->size.
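For reference, the removed mask line relied on arithmetic right shift of a negative int; a small user-space sketch (assuming gcc's two's-complement, sign-extending shift behavior) shows the branchless clamp it implemented:

	#include <stdio.h>

	static unsigned int clamp_to_zero(unsigned int p, unsigned int size)
	{
		/* (p - size) is negative iff p < size; >> 31 smears the sign bit */
		p &= ((int)(p - size)) >> 31;
		return p;	/* unchanged if p < size, 0 otherwise */
	}

	int main(void)
	{
		printf("%u\n", clamp_to_zero(8, 16));	/* 8: mask is all ones */
		printf("%u\n", clamp_to_zero(32, 16));	/* 0: mask is zero */
		return 0;
	}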
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bset.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-)
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 268f1b685084..e36a108d3648 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -970,22 +970,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, unsigned int inorder, j, n = 1;
do { - /* - * A bit trick here. - * If p < t->size, (int)(p - t->size) is a minus value and - * the most significant bit is set, right shifting 31 bits - * gets 1. If p >= t->size, the most significant bit is - * not set, right shifting 31 bits gets 0. - * So the following 2 lines equals to - * if (p >= t->size) - * p = 0; - * but a branch instruction is avoided. - */ unsigned int p = n << 4;
- p &= ((int) (p - t->size)) >> 31; - - prefetch(&t->tree[p]); + if (p < t->size) + prefetch(&t->tree[p]);
j = n; f = &t->tree[j];
mainline inclusion from mainline-5.3-rc1 commit 0b13efecf5f25ce5e31f2ab3930335015cb65a7d category: backport
This patch adds a return value check to bch_cached_dev_run(), so that if an error happens inside bch_cached_dev_run(), it can be caught.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 33 ++++++++++++++++++++++++++------- drivers/md/bcache/sysfs.c | 7 +++++-- 3 files changed, 32 insertions(+), 10 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index e30a983a68cd..cb268d7c6cea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1004,7 +1004,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size); int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint8_t *set_uuid); void bch_cached_dev_detach(struct cached_dev *dc); -void bch_cached_dev_run(struct cached_dev *dc); +int bch_cached_dev_run(struct cached_dev *dc); void bcache_device_stop(struct bcache_device *d);
void bch_cache_set_unregister(struct cache_set *c); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f73b75a4898b..7bb9354e9637 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -910,7 +910,7 @@ static int cached_dev_status_update(void *arg) }
-void bch_cached_dev_run(struct cached_dev *dc) +int bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); @@ -921,11 +921,14 @@ void bch_cached_dev_run(struct cached_dev *dc) NULL, };
+ if (dc->io_disable) + return -EIO; + if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); kfree(buf); - return; + return -EBUSY; }
if (!d->c && @@ -951,8 +954,11 @@ void bch_cached_dev_run(struct cached_dev *dc) kfree(buf);
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) + sysfs_create_link(&disk_to_dev(d->disk)->kobj, + &d->kobj, "bcache")) { pr_debug("error creating sysfs link"); + return -ENOMEM; + }
dc->status_update_thread = kthread_run(cached_dev_status_update, dc, "bcache_status_update"); @@ -961,6 +967,8 @@ void bch_cached_dev_run(struct cached_dev *dc) "continue to run without monitoring backing " "device status"); } + + return 0; }
/* @@ -1056,6 +1064,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); struct uuid_entry *u; struct cached_dev *exist_dc, *t; + int ret = 0;
if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) @@ -1165,7 +1174,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
bch_sectors_dirty_init(&dc->disk);
- bch_cached_dev_run(dc); + ret = bch_cached_dev_run(dc); + if (ret && (ret != -EBUSY)) { + up_write(&dc->writeback_lock); + return ret; + } + bcache_device_link(&dc->disk, c, "bdev"); atomic_inc(&c->attached_dev_nr);
@@ -1290,6 +1304,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, { const char *err = "cannot allocate memory"; struct cache_set *c; + int ret = -ENOMEM;
bdevname(bdev, dc->backing_dev_name); memcpy(&dc->sb, sb, sizeof(struct cache_sb)); @@ -1319,14 +1334,18 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, bch_cached_dev_attach(dc, c, NULL);
if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || - BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) - bch_cached_dev_run(dc); + BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) { + err = "failed to run cached device"; + ret = bch_cached_dev_run(dc); + if (ret) + goto err; + }
return 0; err: pr_notice("error %s: %s", dc->backing_dev_name, err); bcache_device_stop(&dc->disk); - return -EIO; + return ret; }
/* Flash only volumes */ diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d0b1ebf4eb56..a9cf16896244 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -329,8 +329,11 @@ STORE(__cached_dev) bch_cache_accounting_clear(&dc->accounting);
if (attr == &sysfs_running && - strtoul_or_return(buf)) - bch_cached_dev_run(dc); + strtoul_or_return(buf)) { + v = bch_cached_dev_run(dc); + if (v) + return v; + }
if (attr == &sysfs_cache_mode) { v = __sysfs_match_string(bch_cache_modes, -1, buf);
mainline inclusion from mainline-5.3-rc1 commit bd9026c8a7f33ebe25543b7b7e6276b49db60f7e category: backport
Function bch_btree_keys_init() initializes b->set[].size and b->set[].data to zero. As the code comment indicates, this code is indeed unnecessary, because both struct btree_keys and struct bset_tree are nested and embedded in struct btree; when struct btree is zero-filled by kzalloc() in mca_bucket_alloc(), b->set[].size and b->set[].data are already initialized to 0 (a.k.a. NULL).
This patch removes the redundant code, and adds comments in bch_btree_keys_init() and mca_bucket_alloc() to explain why it's safe.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bset.c | 15 ++++++--------- drivers/md/bcache/btree.c | 4 ++++ 2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e36a108d3648..8af9509e78bd 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -347,22 +347,19 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) { - unsigned int i; - b->ops = ops; b->expensive_debug_checks = expensive_debug_checks; b->nsets = 0; b->last_set_unwritten = 0;
- /* XXX: shouldn't be needed */ - for (i = 0; i < MAX_BSETS; i++) - b->set[i].size = 0; /* - * Second loop starts at 1 because b->keys[0]->data is the memory we - * allocated + * struct btree_keys in embedded in struct btree, and struct + * bset_tree is embedded into struct btree_keys. They are all + * initialized as 0 by kzalloc() in mca_bucket_alloc(), and + * b->set[0].data is allocated in bch_btree_keys_alloc(), so we + * don't have to initiate b->set[].size and b->set[].data here + * any more. */ - for (i = 1; i < MAX_BSETS; i++) - b->set[i].data = NULL; } EXPORT_SYMBOL(bch_btree_keys_init);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 572dcc7651b1..1f8b8532e179 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -612,6 +612,10 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) static struct btree *mca_bucket_alloc(struct cache_set *c, struct bkey *k, gfp_t gfp) { + /* + * kzalloc() is necessary here for initialization, + * see code comments in bch_btree_keys_init(). + */ struct btree *b = kzalloc(sizeof(struct btree), gfp);
if (!b)
mainline inclusion from mainline-5.3-rc1 commit 4b6efb4bdbce25097f1a6329e18c2b77c4f27722 category: backport
This patch adds a more accurate error message for each specific sysfs_create_link() call, to help debug failures during bcache device start up.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7bb9354e9637..5dce7027f1cd 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -693,6 +693,7 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, { unsigned int i; struct cache *ca; + int ret;
for_each_cache(ca, d->c, i) bd_link_disk_holder(ca->bdev, d->disk); @@ -700,9 +701,13 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, snprintf(d->name, BCACHEDEVNAME_SIZE, "%s%u", name, d->id);
- WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || - sysfs_create_link(&c->kobj, &d->kobj, d->name), - "Couldn't create device <-> cache set symlinks"); + ret = sysfs_create_link(&d->kobj, &c->kobj, "cache"); + if (ret < 0) + pr_err("Couldn't create device -> cache set symlink"); + + ret = sysfs_create_link(&c->kobj, &d->kobj, d->name); + if (ret < 0) + pr_err("Couldn't create cache set -> device symlink");
clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); }
mainline inclusion from mainline-5.3-rc1 commit 633bb2ce60b949e2990c15324be162c54788c027 category: backport
This patch adds more error messages for attaching a cached device; this is helpful for debugging failures during bcache device start up.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5dce7027f1cd..dcfb96c6e596 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1169,6 +1169,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, down_write(&dc->writeback_lock); if (bch_cached_dev_writeback_start(dc)) { up_write(&dc->writeback_lock); + pr_err("Couldn't start writeback facilities for %s", + dc->disk.disk->disk_name); return -ENOMEM; }
@@ -1182,6 +1184,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, ret = bch_cached_dev_run(dc); if (ret && (ret != -EBUSY)) { up_write(&dc->writeback_lock); + pr_err("Couldn't run cached device %s", + dc->backing_dev_name); return ret; }
mainline inclusion from mainline-5.3-rc1 commit e0faa3d7f79f7e1abb43de168e88c76061518ea4 category: backport
This patch adds more error messages in bch_cached_dev_run() to indicate the exact reason why an error value is returned. Please notice that the "is running already" message is printed with pr_info(), because although -EBUSY is returned in this case, the bcache device can continue to attach to the cache device and run, so it should not be an error-level message in the kernel log.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dcfb96c6e596..e0c951b7cf12 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -926,13 +926,18 @@ int bch_cached_dev_run(struct cached_dev *dc) NULL, };
- if (dc->io_disable) + if (dc->io_disable) { + pr_err("I/O disabled on cached dev %s", + dc->backing_dev_name); return -EIO; + }
if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); kfree(buf); + pr_info("cached dev %s is running already", + dc->backing_dev_name); return -EBUSY; }
@@ -961,7 +966,7 @@ int bch_cached_dev_run(struct cached_dev *dc) if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { - pr_debug("error creating sysfs link"); + pr_err("Couldn't create bcache dev <-> disk sysfs symlinks"); return -ENOMEM; }
mainline inclusion from mainline-5.3-rc1 commit 68a53c95a0fce541321fbca74a7f72c71361f496 category: backport
In previous bcache patches for Linux v5.2, the failure code path of run_cache_set() was tested and fixed. So now the following comment line can be removed from run_cache_set(),
	/* XXX: test this, it's broken */
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e0c951b7cf12..0cdf66f9dfd9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1977,7 +1977,7 @@ static int run_cache_set(struct cache_set *c) }
closure_sync(&cl); - /* XXX: test this, it's broken */ + bch_cache_set_error(c, "%s", err);
return -EIO;
mainline inclusion from mainline-5.3-rc1 commit 944a4f340a65c21ee311d2d3e617034bef9d0b25 category: backport
The purpose of the following code in bset_search_tree() is to avoid a branch instruction,
 994                 if (likely(f->exponent != 127))
 995                         n = j * 2 + (((unsigned int)
 996                                       (f->mantissa -
 997                                        bfloat_mantissa(search, f))) >> 31);
 998                 else
 999                         n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
1000                                 ? j * 2
1001                                 : j * 2 + 1;
This piece of code is not easy to understand; even when I tried to add a code comment for it, I made a mistake. This patch removes the implicit bit operation and uses an explicit branch to calculate the next location in the binary tree search.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bset.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-)
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 8af9509e78bd..08768796b543 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -975,25 +975,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, j = n; f = &t->tree[j];
- /* - * Similar bit trick, use subtract operation to avoid a branch - * instruction. - * - * n = (f->mantissa > bfloat_mantissa()) - * ? j * 2 - * : j * 2 + 1; - * - * We need to subtract 1 from f->mantissa for the sign bit trick - * to work - that's done in make_bfloat() - */ - if (likely(f->exponent != 127)) - n = j * 2 + (((unsigned int) - (f->mantissa - - bfloat_mantissa(search, f))) >> 31); - else - n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) - ? j * 2 - : j * 2 + 1; + if (likely(f->exponent != 127)) { + if (f->mantissa >= bfloat_mantissa(search, f)) + n = j * 2; + else + n = j * 2 + 1; + } else { + if (bkey_cmp(tree_to_bkey(t, j), search) > 0) + n = j * 2; + else + n = j * 2 + 1; + } } while (n < t->size);
inorder = to_inorder(j, t);
mainline inclusion from mainline-5.3-rc1 commit 0c277e211aae056b26513358fc060291d8523747 category: backport
If a bcache device is in dirty state and its cache set is not registered, this bcache device will not appear in /dev/bcache<N>, and there is no way to stop it or remove the bcache kernel module.
This is as-designed behavior, but sometimes people have to reboot the whole system to release or stop the pending backing device.
This patch adds a sysfs interface that removes such pending bcache devices when anything is written into the sysfs file manually.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0cdf66f9dfd9..73e6505a53e0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2271,9 +2271,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size); +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size);
kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); +kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
static bool bch_is_open_backing(struct block_device *bdev) { @@ -2398,6 +2402,56 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out; }
+ +struct pdev { + struct list_head list; + struct cached_dev *dc; +}; + +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, + size_t size) +{ + LIST_HEAD(pending_devs); + ssize_t ret = size; + struct cached_dev *dc, *tdc; + struct pdev *pdev, *tpdev; + struct cache_set *c, *tc; + + mutex_lock(&bch_register_lock); + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { + pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); + if (!pdev) + break; + pdev->dc = dc; + list_add(&pdev->list, &pending_devs); + } + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { + char *pdev_set_uuid = pdev->dc->sb.set_uuid; + char *set_uuid = c->sb.uuid; + + if (!memcmp(pdev_set_uuid, set_uuid, 16)) { + list_del(&pdev->list); + kfree(pdev); + break; + } + } + } + mutex_unlock(&bch_register_lock); + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + pr_info("delete pdev %p", pdev); + list_del(&pdev->list); + bcache_device_stop(&pdev->dc->disk); + kfree(pdev); + } + + return ret; +} + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) { if (code == SYS_DOWN || @@ -2516,6 +2570,7 @@ static int __init bcache_init(void) static const struct attribute *files[] = { &ksysfs_register.attr, &ksysfs_register_quiet.attr, + &ksysfs_pendings_cleanup.attr, NULL };
mainline inclusion from mainline-5.3-rc1 commit 5c2a634cbfaf1971cb6453fe5f86d83585257790 category: backport
In bch_cached_dev_attach(), after bch_cached_dev_writeback_start() is called, the writeback kthread and the writeback rate update kworker of the cached device are created. If the following bch_cached_dev_run() fails, bch_cached_dev_attach() returns with -ENOMEM without stopping the writeback-related kthread and kworker.
This patch stops the writeback kthread and the writeback rate update kworker before returning -ENOMEM when bch_cached_dev_run() returns an error.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 73e6505a53e0..c67dc192d292 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1189,6 +1189,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, ret = bch_cached_dev_run(dc); if (ret && (ret != -EBUSY)) { up_write(&dc->writeback_lock); + /* + * bch_register_lock is held, bcache_device_stop() is not + * able to be directly called. The kthread and kworker + * created previously in bch_cached_dev_writeback_start() + * have to be stopped manually here. + */ + kthread_stop(dc->writeback_thread); + cancel_writeback_rate_update_dwork(dc); pr_err("Couldn't run cached device %s", dc->backing_dev_name); return ret;
mainline inclusion from mainline-5.3-rc1 commit a59ff6ccc2bf2e2934b31bbf734f0bc04b5ec78a category: backport
It is quite common to observe a deadlock in bcache_reboot() that hangs the system reboot process. The reason is that in bcache_reboot(), the mutex bch_register_lock is held while calling bch_cache_set_stop() and bcache_device_stop(). But the process of stopping a cache set or bcache device acquires bch_register_lock again; if this mutex is already held here, a deadlock happens inside the stopping process. The aftermath of the deadlock is that the whole system reboot hangs.
The fix is to avoid holding bch_register_lock for the following loops in bcache_reboot(),
	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		bch_cache_set_stop(c);

	list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
		bcache_device_stop(&dc->disk);
A module-scope variable 'bcache_is_reboot' is added; it is set to true in bcache_reboot(). In register_bcache(), if bcache_is_reboot is found to be true, the registration is rejected by returning -EBUSY immediately.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 40 +++++++++++++++++++++++++++++++++++++++- drivers/md/bcache/sysfs.c | 26 ++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c67dc192d292..ec75ac2ca5d0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -40,6 +40,7 @@ static const char invalid_uuid[] = {
static struct kobject *bcache_kobj; struct mutex bch_register_lock; +bool bcache_is_reboot; LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices);
@@ -49,6 +50,7 @@ static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; struct workqueue_struct *bch_journal_wq;
+ #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) /* limitation of partitions number on single bcache device */ #define BCACHE_MINORS 128 @@ -2333,6 +2335,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY;
+ /* For latest state of bcache_is_reboot */ + smp_mb(); + if (bcache_is_reboot) + return -EBUSY; + path = kstrndup(buffer, size, GFP_KERNEL); if (!path) goto err; @@ -2462,6 +2469,9 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) { + if (bcache_is_reboot) + return NOTIFY_DONE; + if (code == SYS_DOWN || code == SYS_HALT || code == SYS_POWER_OFF) { @@ -2474,19 +2484,45 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
mutex_lock(&bch_register_lock);
+ if (bcache_is_reboot) + goto out; + + /* New registration is rejected since now */ + bcache_is_reboot = true; + /* + * Make registering caller (if there is) on other CPU + * core know bcache_is_reboot set to true earlier + */ + smp_mb(); + if (list_empty(&bch_cache_sets) && list_empty(&uncached_devices)) goto out;
+ mutex_unlock(&bch_register_lock); + pr_info("Stopping all devices:");
+ /* + * The reason bch_register_lock is not held to call + * bch_cache_set_stop() and bcache_device_stop() is to + * avoid potential deadlock during reboot, because cache + * set or bcache device stopping process will acqurie + * bch_register_lock too. + * + * We are safe here because bcache_is_reboot sets to + * true already, register_bcache() will reject new + * registration now. bcache_is_reboot also makes sure + * bcache_reboot() won't be re-entered on by other thread, + * so there is no race in following list iteration by + * list_for_each_entry_safe(). + */ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list) bcache_device_stop(&dc->disk);
- mutex_unlock(&bch_register_lock);
/* * Give an early chance for other kthreads and @@ -2614,6 +2650,8 @@ static int __init bcache_init(void) bch_debug_init(); closure_debug_init();
+ bcache_is_reboot = false; + return 0; err: bcache_exit(); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index a9cf16896244..0734ca4fe5db 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -16,6 +16,8 @@ #include <linux/sort.h> #include <linux/sched/clock.h>
+extern bool bcache_is_reboot; + /* Default is 0 ("writethrough") */ static const char * const bch_cache_modes[] = { "writethrough", @@ -271,6 +273,10 @@ STORE(__cached_dev) struct cache_set *c; struct kobj_uevent_env *env;
+ /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + #define d_strtoul(var) sysfs_strtoul(var, dc->var) #define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) @@ -411,6 +417,10 @@ STORE(bch_cached_dev) struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj);
+ /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + mutex_lock(&bch_register_lock); size = __cached_dev_store(kobj, attr, buf, size);
@@ -514,6 +524,10 @@ STORE(__bch_flash_dev) kobj); struct uuid_entry *u = &d->c->uuids[d->id];
+ /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + sysfs_strtoul(data_csum, d->data_csum);
if (attr == &sysfs_size) { @@ -749,6 +763,10 @@ STORE(__bch_cache_set) struct cache_set *c = container_of(kobj, struct cache_set, kobj); ssize_t v;
+ /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + if (attr == &sysfs_unregister) bch_cache_set_unregister(c);
@@ -868,6 +886,10 @@ STORE(bch_cache_set_internal) { struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + return bch_cache_set_store(&c->kobj, attr, buf, size); }
@@ -1053,6 +1075,10 @@ STORE(__bch_cache) struct cache *ca = container_of(kobj, struct cache, kobj); ssize_t v;
+ /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf);
mainline inclusion from mainline-5.3-rc1 commit 97ba3b816e2cdea798398bc8486125f79f2453c1 category: backport
Now that the variable bcache_is_reboot prevents device registration and unregistration during reboot, it is unnecessary to keep holding the mutex bch_register_lock while stopping the writeback_rate_update kworker and the writeback kthread. And if the kworker or kthread being stopped takes bch_register_lock inside its routine (we used to have such a problem in the writeback thread; thanks to Junhui Wang for fixing it), it is very easy to introduce a deadlock during the reboot/shutdown procedure.
Therefore in this patch, the location to acquire bch_register_lock is moved to the location before calling calc_cached_dev_sectors(). Which is later then original location in cached_dev_detach_finish().
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ec75ac2ca5d0..a8ea4e2086a9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1018,7 +1018,6 @@ static void cached_dev_detach_finish(struct work_struct *w) BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(refcount_read(&dc->count));
- mutex_lock(&bch_register_lock);
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc); @@ -1034,6 +1033,8 @@ static void cached_dev_detach_finish(struct work_struct *w) bch_write_bdev_super(dc, &cl); closure_sync(&cl);
+ mutex_lock(&bch_register_lock); + calc_cached_dev_sectors(dc->disk.c); bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices);
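To see why the old lock placement was dangerous, here is a deliberately broken stand-alone pthread analogue of the scenario described above (all names hypothetical, pthreads standing in for kthreads; build with -pthread; the program deadlocks by design):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t register_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Stand-in for a kthread whose routine takes the lock. */
    static void *stopping_thread(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&register_lock);
        pthread_mutex_unlock(&register_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_mutex_lock(&register_lock);   /* old code: lock taken too early */
        pthread_create(&t, NULL, stopping_thread, NULL);
        pthread_join(t, NULL);                /* waits for a thread that waits on
                                               * the lock we hold: deadlock */
        pthread_mutex_unlock(&register_lock); /* never reached; taking the lock
                                               * only after the join, as the
                                               * patch does, avoids this */
        return 0;
    }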
mainline inclusion from mainline-5.3-rc1 commit 2464b693148e5d5ca42b6064bb40c1a275ea61cd category: backport
This patch adds more code comments in journal_read_bucket(), in an effort to make the code easier to understand.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/journal.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 33556acdcf9c..749de7a56fef 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -100,6 +100,20 @@ reread: left = ca->sb.bucket_size - offset;
blocks = set_blocks(j, block_bytes(ca->set));
+ /* + * Nodes in 'list' are in linear increasing order of + * i->j.seq, the node on head has the smallest (oldest) + * journal seq, the node on tail has the biggest + * (latest) journal seq. + */ + + /* + * Check from the oldest jset for last_seq. If + * i->j.seq < j->last_seq, it means the oldest jset + * in list is expired and useless, remove it from + * this list. Otherwise, j is a candidate jset for + * further checks. + */ while (!list_empty(list)) { i = list_first_entry(list, struct journal_replay, list); @@ -109,13 +123,22 @@ reread: left = ca->sb.bucket_size - offset; kfree(i); }
+ /* iterate list in reverse order (from latest jset) */ list_for_each_entry_reverse(i, list, list) { if (j->seq == i->j.seq) goto next_set;
+ /* + * if j->seq is less than any i->j.last_seq + * in list, j is an expired and useless jset. + */ if (j->seq < i->j.last_seq) goto next_set;
+ /* + * 'where' points to first jset in list which + * is older than j. + */ if (j->seq > i->j.seq) { where = &i->list; goto add; @@ -129,6 +152,7 @@ reread: left = ca->sb.bucket_size - offset; if (!i) return -ENOMEM; memcpy(&i->j, j, bytes); + /* Add to the location after 'where' points to */ list_add(&i->list, where); ret = 1;
mainline inclusion from mainline-5.3-rc1 commit a231f07a5fe30a522b402011c5190cb936641a66 category: backport
In journal_read_bucket(), when setting ja->seq[bucket_index] there is a potential case where a later, non-maximum value overwrites a larger sequence number already stored in ja->seq[bucket_index]. This patch adds a check to make sure ja->seq[bucket_index] is only set to a new value if that value is bigger than the current one.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/journal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 749de7a56fef..7fd22855fd47 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -156,7 +156,8 @@ reread: left = ca->sb.bucket_size - offset; list_add(&i->list, where); ret = 1;
- ja->seq[bucket_index] = j->seq; + if (j->seq > ja->seq[bucket_index]) + ja->seq[bucket_index] = j->seq; next_set: offset += blocks * ca->sb.block_size; len -= blocks * ca->sb.block_size;
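The one-line fix above reduces to a guarded maximum update. A tiny stand-alone sketch of the invariant, with a hypothetical seq[] array standing in for ja->seq[]:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t seq[4]; /* stand-in for ja->seq[], indexed by bucket */

    /* Only remember a jset seq if it is bigger than what we already saw. */
    static void record_seq(unsigned int bucket, uint64_t jseq)
    {
        if (jseq > seq[bucket])
            seq[bucket] = jseq;
    }

    int main(void)
    {
        record_seq(0, 42); /* first jset found in bucket 0 */
        record_seq(0, 17); /* later, non-maximum seq must not overwrite 42 */
        printf("seq[0] = %llu\n", (unsigned long long)seq[0]); /* prints 42 */
        return 0;
    }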
mainline inclusion from mainline-5.3-rc1 commit 1df3877ff6a4810054237c3259d900ded4468969 category: backport
When a cache set starts, bch_btree_check() checks all bkeys on the cache device by calculating their checksums. This operation consumes a huge amount of system memory if there is a lot of cached data, because bcache uses its own mca cache to maintain all read-in btree nodes and only releases the cache space when the memory management code starts to shrink caches. Before the memory manager calls the mca cache shrinker callback, the bcache mca cache competes for memory with user space applications, which may have a negative effect on the performance of user space workloads (e.g. a database, or the I/O service of a distributed storage node).
This patch calls the bcache mca shrinker routine to proactively release mca cache memory, to decrease system memory pressure and avoid a negative effect on overall system I/O performance.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a8ea4e2086a9..26e374fbf57c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1880,6 +1880,23 @@ static int run_cache_set(struct cache_set *c) if (bch_btree_check(c)) goto err;
+ /* + * bch_btree_check() may occupy too much system memory which + * has negative effects to user space application (e.g. data + * base) performance. Shrink the mca cache memory proactively + * here to avoid competing memory with user space workloads.. + */ + if (!c->shrinker_disabled) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = c->btree_cache_used * c->btree_pages; + /* first run to clear b->accessed tag */ + c->shrink.scan_objects(&c->shrink, &sc); + /* second run to reap non-accessed nodes */ + c->shrink.scan_objects(&c->shrink, &sc); + } + bch_journal_mark(c, &journal); bch_initial_gc_finish(c); pr_debug("btree_check() done");
mainline inclusion from mainline-5.3-rc1 commit d91ce7574daf48a4567ba62733d43284f5d2a3f4 category: backport
In struct cache_set, retry_flush_write was added by commit c4dc2497d50d ("bcache: fix high CPU occupancy during journal"), which was reverted in a previous patch.
Now it is useless, and this patch removes it from the bcache code.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/journal.c | 1 - drivers/md/bcache/sysfs.c | 5 ----- 3 files changed, 7 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index cb268d7c6cea..35396248a7d5 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -706,7 +706,6 @@ struct cache_set {
atomic_long_t reclaim; atomic_long_t flush_write; - atomic_long_t retry_flush_write;
enum { ON_ERROR_UNREGISTER, diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7fd22855fd47..a1e3e1fcea6e 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -453,7 +453,6 @@ static void btree_flush_write(struct cache_set *c) clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); /* We raced */ - atomic_long_inc(&c->retry_flush_write); goto retry; }
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 0734ca4fe5db..47c846b4e72c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -87,7 +87,6 @@ read_attribute(state); read_attribute(cache_read_races); read_attribute(reclaim); read_attribute(flush_write); -read_attribute(retry_flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); @@ -713,9 +712,6 @@ SHOW(__bch_cache_set) sysfs_print(flush_write, atomic_long_read(&c->flush_write));
- sysfs_print(retry_flush_write, - atomic_long_read(&c->retry_flush_write)); - sysfs_print(writeback_keys_done, atomic_long_read(&c->writeback_keys_done)); sysfs_print(writeback_keys_failed, @@ -940,7 +936,6 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_cache_read_races, &sysfs_reclaim, &sysfs_flush_write, - &sysfs_retry_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed,
mainline inclusion from mainline-5.3-rc1 commit 91be66e1318f67ed5888ca10e10cc8ffdc24f661 category: backport
This patch improves the performance of btree_flush_write() in the following ways:
- Use another spinlock, journal.flush_write_lock, to replace the very hot journal.lock. We don't have to use journal.lock here; selecting candidate btree nodes takes a lot of time, and holding journal.lock would block other journaling threads and drop overall I/O performance.
- Only select btree nodes to flush from the c->btree_cache list. When the machine has a large system memory, the mca cache may hold a huge number of cached btree nodes. Iterating all of them takes a lot of CPU time, and most of the nodes on the c->btree_cache_freeable and c->btree_cache_freed lists are cleared and have nothing to flush. So traversing only the mca list c->btree_cache should be enough for most cases.
- Don't iterate the whole c->btree_cache list; only select the first BTREE_FLUSH_NR btree nodes in reverse order. Iterating all btree nodes in c->btree_cache and selecting those with the oldest journal pins consumes a huge number of CPU cycles if the list is huge (pushing and popping a node into/out of a heap is expensive). The last several dirty btree nodes on the tail of the c->btree_cache list are the earliest allocated and cached btree nodes, so they are likely to hold the oldest journal pins. Therefore flushing only BTREE_FLUSH_NR btree nodes from the tail of c->btree_cache will probably include the btree nodes with the oldest journal pins.
In my testing, the above changes decrease CPU consumption by 50%+ when journal space is full. Sometimes IOPS drops to 0 for 5-8 seconds; compared with the previous code blocking I/O for 120+ seconds, this is much better. Maybe there is room to improve in the future, but at this moment the fix looks fine and performs well in my testing.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/journal.c | 85 +++++++++++++++++++++++++++++++++------------ drivers/md/bcache/journal.h | 4 +++ 2 files changed, 67 insertions(+), 22 deletions(-)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index a1e3e1fcea6e..8bcd8f1bf8cb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -419,47 +419,87 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
static void btree_flush_write(struct cache_set *c) { - /* - * Try to find the btree node with that references the oldest journal - * entry, best is our current candidate and is locked if non NULL: - */ - struct btree *b, *best; - unsigned int i; + struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; + unsigned int i, n; + + if (c->journal.btree_flushing) + return; + + spin_lock(&c->journal.flush_write_lock); + if (c->journal.btree_flushing) { + spin_unlock(&c->journal.flush_write_lock); + return; + } + c->journal.btree_flushing = true; + spin_unlock(&c->journal.flush_write_lock);
atomic_long_inc(&c->flush_write); -retry: - best = NULL; + memset(btree_nodes, 0, sizeof(btree_nodes)); + n = 0;
mutex_lock(&c->bucket_lock); - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best)->journal, - btree_current_write(b)->journal)) { - best = b; - } + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (btree_node_journal_flush(b)) + pr_err("BUG: flush_write bit should not be set here!"); + + mutex_lock(&b->write_lock); + + if (!btree_node_dirty(b)) { + mutex_unlock(&b->write_lock); + continue; + } + + if (!btree_current_write(b)->journal) { + mutex_unlock(&b->write_lock); + continue; }
- b = best; - if (b) set_btree_node_journal_flush(b); + + mutex_unlock(&b->write_lock); + + btree_nodes[n++] = b; + if (n == BTREE_FLUSH_NR) + break; + } mutex_unlock(&c->bucket_lock);
- if (b) { + for (i = 0; i < n; i++) { + b = btree_nodes[i]; + if (!b) { + pr_err("BUG: btree_nodes[%d] is NULL", i); + continue; + } + + /* safe to check without holding b->write_lock */ + if (!btree_node_journal_flush(b)) { + pr_err("BUG: bnode %p: journal_flush bit cleaned", b); + continue; + } + mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); - /* We raced */ - goto retry; + pr_debug("bnode %p: written by others", b); + continue; + } + + if (!btree_node_dirty(b)) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + pr_debug("bnode %p: dirty bit cleaned by others", b); + continue; }
__bch_btree_node_write(b, NULL); clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); } + + spin_lock(&c->journal.flush_write_lock); + c->journal.btree_flushing = false; + spin_unlock(&c->journal.flush_write_lock); }
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) @@ -881,6 +921,7 @@ int bch_journal_alloc(struct cache_set *c) struct journal *j = &c->journal;
spin_lock_init(&j->lock); + spin_lock_init(&j->flush_write_lock); INIT_DELAYED_WORK(&j->work, journal_write_work);
c->journal_delay_ms = 100; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 66f0facff84b..f2ea34d5f431 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -103,6 +103,8 @@ struct journal_write { /* Embedded in struct cache_set */ struct journal { spinlock_t lock; + spinlock_t flush_write_lock; + bool btree_flushing; /* used when waiting because the journal was full */ struct closure_waitlist wait; struct closure io; @@ -154,6 +156,8 @@ struct journal_device { struct bio_vec bv[8]; };
+#define BTREE_FLUSH_NR 8 + #define journal_pin_cmp(c, l, r) \ (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
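The structural change above is: collect a bounded batch of candidates under the lock, then do the expensive writes after dropping it. A condensed stand-alone sketch of that pattern (build with -pthread), with BATCH_NR and an array standing in for BTREE_FLUSH_NR and c->btree_cache:

    #include <pthread.h>
    #include <stdio.h>

    #define BATCH_NR 8   /* stand-in for BTREE_FLUSH_NR */
    #define NODES    32

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static int dirty[NODES]; /* stand-in for the cached btree nodes */

    static void flush_batch(void)
    {
        int batch[BATCH_NR], n = 0, i;

        /* Select at most BATCH_NR candidates from the tail (the oldest
         * entries), holding the lock only while scanning. */
        pthread_mutex_lock(&list_lock);
        for (i = NODES - 1; i >= 0 && n < BATCH_NR; i--)
            if (dirty[i])
                batch[n++] = i;
        pthread_mutex_unlock(&list_lock);

        /* Do the expensive writes without blocking other lock users. */
        for (i = 0; i < n; i++) {
            dirty[batch[i]] = 0;
            printf("flushed node %d\n", batch[i]);
        }
    }

    int main(void)
    {
        int i;

        for (i = 0; i < NODES; i++)
            dirty[i] = (i % 3 == 0);
        flush_batch();
        return 0;
    }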
mainline inclusion from mainline-5.3-rc1 commit dff90d58a1c815b87b2603295382c97e78064349 category: backport
Now we have counters for how many times the journal is reclaimed and how many times cached dirty btree nodes are flushed, but we don't know how many journal buckets are really reclaimed.
This patch adds reclaimed_journal_buckets to struct cache_set. It is an increase-only counter that tells how many journal buckets have been reclaimed since the cache set started running. From these three counters (reclaim, reclaimed_journal_buckets, flush_write), we can get an idea of how well the current journal space reclaim code works.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/journal.c | 1 + drivers/md/bcache/sysfs.c | 5 +++++ 3 files changed, 7 insertions(+)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 35396248a7d5..013e35a9e317 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -705,6 +705,7 @@ struct cache_set { atomic_long_t writeback_keys_failed;
atomic_long_t reclaim; + atomic_long_t reclaimed_journal_buckets; atomic_long_t flush_write;
enum { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8bcd8f1bf8cb..be2a2a201603 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -614,6 +614,7 @@ static void journal_reclaim(struct cache_set *c) k->ptr[n++] = MAKE_PTR(0, bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ca->sb.nr_this_dev); + atomic_long_inc(&c->reclaimed_journal_buckets); }
if (n) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 47c846b4e72c..e2059af90791 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -86,6 +86,7 @@ read_attribute(bset_tree_stats); read_attribute(state); read_attribute(cache_read_races); read_attribute(reclaim); +read_attribute(reclaimed_journal_buckets); read_attribute(flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); @@ -709,6 +710,9 @@ SHOW(__bch_cache_set) sysfs_print(reclaim, atomic_long_read(&c->reclaim));
+ sysfs_print(reclaimed_journal_buckets, + atomic_long_read(&c->reclaimed_journal_buckets)); + sysfs_print(flush_write, atomic_long_read(&c->flush_write));
@@ -935,6 +939,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_bset_tree_stats, &sysfs_cache_read_races, &sysfs_reclaim, + &sysfs_reclaimed_journal_buckets, &sysfs_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed,
From: Wei Yongjun weiyongjun1@huawei.com
mainline inclusion from mainline-5.3-rc2 commit 5d9e06d60eee95e021ffccf0d2c7ed800ae9dc14 category: backport
Memory allocated in bch_cached_dev_run() should be freed before leaving from the error handling cases, otherwise it will cause a memory leak.
Fixes: 0b13efecf5f2 ("bcache: add return value check to bch_cached_dev_run()") Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 26e374fbf57c..20ed838e9413 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -931,6 +931,9 @@ int bch_cached_dev_run(struct cached_dev *dc) if (dc->io_disable) { pr_err("I/O disabled on cached dev %s", dc->backing_dev_name); + kfree(env[1]); + kfree(env[2]); + kfree(buf); return -EIO; }
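The fix above is the usual "free everything allocated so far before an early return". A stand-alone sketch of the idiom, assuming a couple of heap-allocated env strings (run_device() and its strings are hypothetical):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical helper: allocates env strings, frees them on every path. */
    static int run_device(int io_disable)
    {
        char *env1 = strdup("SYMLINK=bcache0");
        char *env2 = strdup("DRIVER=bcache");
        int ret = 0;

        if (!env1 || !env2) {
            ret = -ENOMEM;
            goto out;
        }

        if (io_disable) {
            fprintf(stderr, "I/O disabled on cached dev\n");
            ret = -EIO;    /* early return must not leak env1/env2 */
            goto out;
        }

        /* ... use env1/env2 for the uevent here ... */
    out:
        free(env1);        /* free(NULL) is a no-op, so this is always safe */
        free(env2);
        return ret;
    }

    int main(void)
    {
        return run_device(1) == -EIO ? 0 : 1;
    }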
From: Shile Zhang shile.zhang@linux.alibaba.com
mainline inclusion from mainline-5.4-rc1 commit d55a4ae9e1af5fb1657e38284ef46c56e668efdb category: backport
Reading /sys/fs/bcache/<uuid>/cacheN/priority_stats can take a very long time with a huge cache after a long run.
Signed-off-by: Shile Zhang shile.zhang@linux.alibaba.com Tested-by: Heitor Alves de Siqueira halves@canonical.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/sysfs.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index e2059af90791..627dcea0f5b6 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -964,6 +964,7 @@ KTYPE(bch_cache_set_internal);
static int __bch_cache_cmp(const void *l, const void *r) { + cond_resched(); return *((uint16_t *)r) - *((uint16_t *)l); }
From: Dan Carpenter dan.carpenter@oracle.com
mainline inclusion from mainline-5.4-rc1 commit d66c9920c0cf984cf99cab5036fd5f3a1b7fba46 category: backport
The copy_to_user() function returns the number of bytes remaining to be copied, but the intention here was to return -EFAULT if the copy fails.
Fixes: cafe56359144 ("bcache: A block layer cache") Signed-off-by: Dan Carpenter dan.carpenter@oracle.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/debug.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 8b123be05254..336f43910383 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -178,10 +178,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, while (size) { struct keybuf_key *w; unsigned int bytes = min(i->bytes, size); - int err = copy_to_user(buf, i->buf, bytes);
- if (err) - return err; + if (copy_to_user(buf, i->buf, bytes)) + return -EFAULT;
ret += bytes; buf += bytes;
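For readers less familiar with the convention: copy_to_user() returns the number of bytes it failed to copy, not an errno. A user-space mock of that calling convention and the corrected error handling (mock_copy_to_user() is an illustration, not the real primitive):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /* Mimics copy_to_user(): returns the number of bytes NOT copied. */
    static unsigned long mock_copy_to_user(char *dst, const char *src,
                                           unsigned long n, unsigned long limit)
    {
        unsigned long ok = n < limit ? n : limit;

        memcpy(dst, src, ok);
        return n - ok; /* 0 means full success */
    }

    static long dump_read(char *buf, const char *data, unsigned long n,
                          unsigned long limit)
    {
        /* Returning the raw result would leak a positive byte count to the
         * caller; converting any shortfall to -EFAULT is the fix above. */
        if (mock_copy_to_user(buf, data, n, limit))
            return -EFAULT;
        return (long)n;
    }

    int main(void)
    {
        char buf[16];

        printf("full copy  -> %ld\n", dump_read(buf, "hello", 5, 16)); /* 5 */
        printf("short copy -> %ld\n", dump_read(buf, "hello", 5, 2));  /* -14 */
        return 0;
    }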
From: Guoju Fang fangguoju@gmail.com
mainline inclusion from mainline-5.5-rc1 commit 34cf78bf34d48dddddfeeadb44f9841d7864997a category: backport
This patch fixes a lost wake-up problem caused by the race between mca_cannibalize_lock and bch_cannibalize_unlock.
Consider two processes, A and B. Process A is executing mca_cannibalize_lock, while process B takes c->btree_cache_alloc_lock and is executing bch_cannibalize_unlock. The problem happens after process A executes cmpxchg but before it executes prepare_to_wait: in this timeslice process B executes wake_up, and only afterwards does process A execute prepare_to_wait and set its state to TASK_INTERRUPTIBLE. Then process A goes to sleep and no one will ever wake it up. This problem may cause the bcache device to hang.
Signed-off-by: Guoju Fang fangguoju@gmail.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 12 ++++++++---- drivers/md/bcache/super.c | 1 + 3 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 013e35a9e317..3653faf3bf48 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -582,6 +582,7 @@ struct cache_set { */ wait_queue_head_t btree_cache_wait; struct task_struct *btree_cache_alloc_lock; + spinlock_t btree_cannibalize_lock;
/* * When we free a btree node, we increment the gen of the bucket the diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 1f8b8532e179..c39a67c65620 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -883,15 +883,17 @@ static struct btree *mca_find(struct cache_set *c, struct bkey *k)
static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) { - struct task_struct *old; - - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); - if (old && old != current) { + spin_lock(&c->btree_cannibalize_lock); + if (likely(c->btree_cache_alloc_lock == NULL)) { + c->btree_cache_alloc_lock = current; + } else if (c->btree_cache_alloc_lock != current) { if (op) prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&c->btree_cannibalize_lock); return -EINTR; } + spin_unlock(&c->btree_cannibalize_lock);
return 0; } @@ -926,10 +928,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, */ static void bch_cannibalize_unlock(struct cache_set *c) { + spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { c->btree_cache_alloc_lock = NULL; wake_up(&c->btree_cache_wait); } + spin_unlock(&c->btree_cannibalize_lock); }
static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 20ed838e9413..ebb854ed05a4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1769,6 +1769,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); + spin_lock_init(&c->btree_cannibalize_lock); init_waitqueue_head(&c->bucket_wait); init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1);
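The fix above closes the race window by putting both the waiter registration and the wake-up under one lock. A user-space sketch of the corrected discipline (build with -pthread), with a pthread mutex and condvar standing in for btree_cannibalize_lock and btree_cache_wait (all names hypothetical):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t cannibalize_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cache_wait = PTHREAD_COND_INITIALIZER;
    static int owner; /* 0 == free; stand-in for btree_cache_alloc_lock */

    static void cannibalize_take(int who)
    {
        pthread_mutex_lock(&cannibalize_lock);
        while (owner != 0 && owner != who)
            /* Registering as a waiter and re-checking the condition happen
             * under the same lock the waker takes, so a wake-up between
             * the check and the sleep can no longer be lost. */
            pthread_cond_wait(&cache_wait, &cannibalize_lock);
        owner = who;
        pthread_mutex_unlock(&cannibalize_lock);
    }

    static void cannibalize_release(int who)
    {
        pthread_mutex_lock(&cannibalize_lock);
        if (owner == who) {
            owner = 0;
            pthread_cond_broadcast(&cache_wait);
        }
        pthread_mutex_unlock(&cannibalize_lock);
    }

    int main(void)
    {
        cannibalize_take(1);
        cannibalize_release(1);
        puts("take/release paired under one lock");
        return 0;
    }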
mainline inclusion from mainline-5.5-rc1 commit 2d8869518a525c9bce5f5268419df9dfbe3dfdeb category: backport
Commit cafe56359144 ("bcache: A block layer cache") leads to the following static checker warning:
./drivers/md/bcache/super.c:770 bcache_device_free() warn: variable dereferenced before check 'd->disk' (see line 766)
drivers/md/bcache/super.c 762 static void bcache_device_free(struct bcache_device *d) 763 { 764 lockdep_assert_held(&bch_register_lock); 765 766 pr_info("%s stopped", d->disk->disk_name); ^^^^^^^^^ Unchecked dereference.
767 768 if (d->c) 769 bcache_device_detach(d); 770 if (d->disk && d->disk->flags & GENHD_FL_UP) ^^^^^^^ Check too late.
771 del_gendisk(d->disk); 772 if (d->disk && d->disk->queue) 773 blk_cleanup_queue(d->disk->queue); 774 if (d->disk) { 775 ida_simple_remove(&bcache_device_idx, 776 first_minor_to_idx(d->disk->first_minor)); 777 put_disk(d->disk); 778 } 779
It is not 100% certain that the gendisk struct of a bcache device will always be there; the warning makes sense when there is a problem in the block core.
This patch removes the static checker warning by checking d->disk before dereferencing it, to avoid NULL pointer dereferences.
Reported-by: Dan Carpenter dan.carpenter@oracle.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ebb854ed05a4..7beccede5360 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -761,20 +761,28 @@ static inline int idx_to_first_minor(int idx)
static void bcache_device_free(struct bcache_device *d) { + struct gendisk *disk = d->disk; + lockdep_assert_held(&bch_register_lock);
- pr_info("%s stopped", d->disk->disk_name); + if (disk) + pr_info("%s stopped", disk->disk_name); + else + pr_err("bcache device (NULL gendisk) stopped");
if (d->c) bcache_device_detach(d); - if (d->disk && d->disk->flags & GENHD_FL_UP) - del_gendisk(d->disk); - if (d->disk && d->disk->queue) - blk_cleanup_queue(d->disk->queue); - if (d->disk) { + + if (disk) { + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + + if (disk->queue) + blk_cleanup_queue(disk->queue); + ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(d->disk->first_minor)); - put_disk(d->disk); + first_minor_to_idx(disk->first_minor)); + put_disk(disk); }
bioset_exit(&d->bio_split);
mainline inclusion from mainline-5.5-rc1 commit aaf8dbeab5865720c66db60ae8329309e81a0c9c category: backport
The previous code returns "Not a bcache superblock" for both a bad super block offset and a bad magic number. This patch adds more accurate error messages:
- for an unmatched super block offset: "Not a bcache superblock (bad offset)"
- for an unmatched super block magic number: "Not a bcache superblock (bad magic)"
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7beccede5360..623fdaf10c4c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", sb->version, sb->flags, sb->seq, sb->keys);
- err = "Not a bcache superblock"; + err = "Not a bcache superblock (bad offset)"; if (sb->offset != SB_SECTOR) goto err;
+ err = "Not a bcache superblock (bad magic)"; if (memcmp(sb->magic, bcache_magic, 16)) goto err;
mainline inclusion from mainline-5.5-rc1 commit 41fa4deef90ba1cd048b740317f50b9decae9fc8 category: backport
In request.c:bch_data_insert_keys(), there is a code comment for a piece of dead code. This patch deletes the dead code and its comment, since they are useless in practice.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/request.c | 12 ------------ 1 file changed, 12 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 41adcd1546f1..73478a91a342 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl) struct bkey *replace_key = op->replace ? &op->replace_key : NULL; int ret;
- /* - * If we're looping, might already be waiting on - * another journal write - can't wait on more than one journal write at - * a time - * - * XXX: this looks wrong - */ -#if 0 - while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) - closure_sync(&s->cl); -#endif - if (!op->replace) journal_ref = bch_journal(op->c, &op->insert_keys, op->flush_journal ? cl : NULL);
mainline inclusion from mainline-5.5-rc1 commit 06c1526da97dd0022973de3fc41b79b2d431b435 category: backport
This patch adds simple code comments for bch_keylist_pop() and bch_keylist_pop_front() in bset.c, to make the code easier to understand.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bset.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 08768796b543..f37a429f093d 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s) return 0; }
+/* Pop the top key of keylist by pointing l->top to its previous key */ struct bkey *bch_keylist_pop(struct keylist *l) { struct bkey *k = l->keys; @@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) return l->top = k; }
+/* Pop the bottom key of keylist and update l->top_p */ void bch_keylist_pop_front(struct keylist *l) { l->top_p -= bkey_u64s(l->keys);
From: Andrea Righi andrea.righi@canonical.com
mainline inclusion from mainline-5.5-rc1 commit 84c529aea182939e68f618ed9813740c9165c7eb category: backport
bcache_allocator can call the following:
bch_allocator_thread() -> bch_prio_write() -> bch_bucket_alloc() -> wait on &ca->set->bucket_wait
But the wake up event on bucket_wait is supposed to come from bch_allocator_thread() itself => deadlock:
[ 1158.490744] INFO: task bcache_allocato:15861 blocked for more than 10 seconds. [ 1158.495929] Not tainted 5.3.0-050300rc3-generic #201908042232 [ 1158.500653] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1158.504413] bcache_allocato D 0 15861 2 0x80004000 [ 1158.504419] Call Trace: [ 1158.504429] __schedule+0x2a8/0x670 [ 1158.504432] schedule+0x2d/0x90 [ 1158.504448] bch_bucket_alloc+0xe5/0x370 [bcache] [ 1158.504453] ? wait_woken+0x80/0x80 [ 1158.504466] bch_prio_write+0x1dc/0x390 [bcache] [ 1158.504476] bch_allocator_thread+0x233/0x490 [bcache] [ 1158.504491] kthread+0x121/0x140 [ 1158.504503] ? invalidate_buckets+0x890/0x890 [bcache] [ 1158.504506] ? kthread_park+0xb0/0xb0 [ 1158.504510] ret_from_fork+0x35/0x40
Fix by making the call to bch_prio_write() non-blocking, so that bch_allocator_thread() never waits on itself.
Moreover, make sure to wake up the garbage collector thread when bch_prio_write() is failing to allocate buckets.
BugLink: https://bugs.launchpad.net/bugs/1784665 BugLink: https://bugs.launchpad.net/bugs/1796292 Signed-off-by: Andrea Righi andrea.righi@canonical.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/alloc.c | 5 ++++- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 27 +++++++++++++++++++++------ 3 files changed, 26 insertions(+), 8 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6f776823b9ba..a1df0d95151c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -377,7 +377,10 @@ static int bch_allocator_thread(void *arg) if (!fifo_full(&ca->free_inc)) goto retry_invalidate;
- bch_prio_write(ca); + if (bch_prio_write(ca, false) < 0) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + } } } out: diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 3653faf3bf48..50241e045c70 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -978,7 +978,7 @@ bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
-void bch_prio_write(struct cache *ca); +int bch_prio_write(struct cache *ca, bool wait); void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
extern struct workqueue_struct *bcache_wq; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 623fdaf10c4c..d1352fcc6ff2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -530,12 +530,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_sync(cl); }
-void bch_prio_write(struct cache *ca) +int bch_prio_write(struct cache *ca, bool wait) { int i; struct bucket *b; struct closure cl;
+ pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", + fifo_used(&ca->free[RESERVE_PRIO]), + fifo_used(&ca->free[RESERVE_NONE]), + fifo_used(&ca->free_inc)); + + /* + * Pre-check if there are enough free buckets. In the non-blocking + * scenario it's better to fail early rather than starting to allocate + * buckets and do a cleanup later in case of failure. + */ + if (!wait) { + size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + + fifo_used(&ca->free[RESERVE_NONE]); + if (prio_buckets(ca) > avail) + return -ENOMEM; + } + closure_init_stack(&cl);
lockdep_assert_held(&ca->set->bucket_lock); @@ -545,9 +562,6 @@ void bch_prio_write(struct cache *ca) atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written);
- //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; struct prio_set *p = ca->disk_buckets; @@ -565,7 +579,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
- bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1);
mutex_unlock(&ca->set->bucket_lock); @@ -594,6 +608,7 @@ void bch_prio_write(struct cache *ca)
ca->prio_last_buckets[i] = ca->prio_buckets[i]; } + return 0; }
static void prio_read(struct cache *ca, uint64_t bucket) @@ -1964,7 +1979,7 @@ static int run_cache_set(struct cache_set *c)
mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) - bch_prio_write(ca); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock);
err = "cannot allocate new UUID bucket";
mainline inclusion from mainline-5.5-rc1 commit 5dccefd3ea0b33cf3e5a45cbccc7e0bf22791655 category: backport
This patch adds code comments in bch_btree_leaf_dirty() to explain why w->journal should always reference the oldest journal pin of all the bkeys being written in the btree node, to make the bcache journal code easier to understand.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/btree.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index c39a67c65620..15e71177d8e4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -542,6 +542,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
set_btree_node_dirty(b);
+ /* + * w->journal is always the oldest journal pin of all bkeys + * in the leaf node, to make sure the oldest jset seq won't + * be increased before this btree node is flushed. + */ if (journal_ref) { if (w->journal && journal_pin_cmp(b->c, w->journal, journal_ref)) {
mainline inclusion from mainline-5.5-rc1 commit c5fcdedcee4e6ae15c0eb5e0fbe25467e57d2963 category: backport
For writeback mode, if there is no regular I/O request for a while, the writeback rate will be set to the maximum value (1TB/s for now). This is good for most storage workloads, but there are still people who don't want the maximum writeback rate during I/O idle time.
This patch adds a sysfs interface file, idle_max_writeback_rate, to permit people to disable the maximum writeback rate. The minimum writeback rate can then be advised via writeback_rate_minimum in the bcache device's sysfs interface.
Reported-by: Christian Balzer chibi@gol.com Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 1 + drivers/md/bcache/sysfs.c | 7 +++++++ drivers/md/bcache/writeback.c | 4 ++++ 4 files changed, 13 insertions(+)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 50241e045c70..9198c1b480d9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -724,6 +724,7 @@ struct cache_set { unsigned int gc_always_rewrite:1; unsigned int shrinker_disabled:1; unsigned int copy_gc_enabled:1; + unsigned int idle_max_writeback_rate_enabled:1;
#define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d1352fcc6ff2..77e9869345e7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1834,6 +1834,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = DEFAULT_IO_ERROR_LIMIT; + c->idle_max_writeback_rate_enabled = 1; WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
return c; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 627dcea0f5b6..733e2ddf3c78 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -134,6 +134,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size);
@@ -747,6 +748,8 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(idle_max_writeback_rate, "%i", + c->idle_max_writeback_rate_enabled); sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -864,6 +867,9 @@ STORE(__bch_cache_set) sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul_bool(idle_max_writeback_rate, + c->idle_max_writeback_rate_enabled); + /* * write gc_after_writeback here may overwrite an already set * BCH_DO_AUTO_GC, it doesn't matter because this flag will be @@ -954,6 +960,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_idle_max_writeback_rate, &sysfs_gc_after_writeback, &sysfs_io_disable, &sysfs_cutoff_writeback, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d60268fe49e1..4a40f9eadeaf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't set max writeback rate if it is disabled */ + if (!c->idle_max_writeback_rate_enabled) + return false; + /* Don't set max writeback rate if gc is running */ if (!c->gc_mark_valid) return false;
mainline inclusion from mainline-5.5-rc1 commit 9fcc34b1a6dd4b8e5337e2b6ef45e428897eca6b category: backport
In bch_mca_scan(), the number of btree nodes to shrink is calculated by code like this:

    unsigned long nr = sc->nr_to_scan;

    nr /= c->btree_pages;
    nr = min_t(unsigned long, nr, mca_can_free(c));

The variable sc->nr_to_scan is the number of objects to shrink (here, the number of bcache B+tree nodes), and the pointer sc is passed in from the memory management code as the parameter of a callback.
If sc->nr_to_scan is smaller than c->btree_pages, after the above calculation 'nr' will be 0 and nothing will be shrunk. It is frequently observed that only 1 or 2 is set to sc->nr_to_scan, which makes nr zero. Then bch_mca_scan() does nothing more than acquiring and releasing the mutex c->bucket_lock.
This patch checks whether nr is 0 after the above calculation; if it is, nr is set to 1, so that bch_mca_scan() will at least try to shrink a single B+tree node.
Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/btree.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 15e71177d8e4..74c52677cd7b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -727,6 +727,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, * IO can always make forward progress: */ nr /= c->btree_pages; + if (nr == 0) + nr = 1; nr = min_t(unsigned long, nr, mca_can_free(c));
i = 0;
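A quick worked example of the integer-division problem and the clamp, using values typical of what the commit message describes:

    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_to_scan = 2;  /* what the MM code often passes in */
        unsigned long btree_pages = 4; /* pages per btree node */
        unsigned long nr = nr_to_scan / btree_pages; /* 2 / 4 == 0 */

        if (nr == 0)
            nr = 1;                    /* the patch: shrink at least one node */

        printf("nr = %lu\n", nr);      /* prints 1 instead of 0 */
        return 0;
    }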
From: Christoph Hellwig hch@lst.de
mainline inclusion from mainline-5.5-rc1 commit 651bbba57ada682a8651768df6979598e28e3b8d category: backport
There is no block directory this file needs includes from.
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/Makefile | 2 -- 1 file changed, 2 deletions(-)
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index d26b35195825..fd714628da6a 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -5,5 +5,3 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o - -CFLAGS_request.o += -Iblock
From: Christoph Hellwig hch@lst.de
mainline inclusion from mainline-5.5-rc1 commit 15fbb2312f32cf99bd8e0247ac0240c9bce0ba47 category: backport
None of the exported bcache symbols are actually used anywhere.
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Coly Li colyli@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk --- drivers/md/bcache/bset.c | 15 --------------- drivers/md/bcache/closure.c | 7 ------- 2 files changed, 22 deletions(-)
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f37a429f093d..cffcdc9feefb 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -311,7 +311,6 @@ void bch_btree_keys_free(struct btree_keys *b) t->tree = NULL; t->data = NULL; } -EXPORT_SYMBOL(bch_btree_keys_free);
int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, @@ -344,7 +343,6 @@ int bch_btree_keys_alloc(struct btree_keys *b, bch_btree_keys_free(b); return -ENOMEM; } -EXPORT_SYMBOL(bch_btree_keys_alloc);
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) @@ -363,7 +361,6 @@ void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, * any more. */ } -EXPORT_SYMBOL(bch_btree_keys_init);
/* Binary tree stuff for auxiliary search trees */
@@ -680,7 +677,6 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
bch_bset_build_unwritten_tree(b); } -EXPORT_SYMBOL(bch_bset_init_next);
/* * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to @@ -734,7 +730,6 @@ void bch_bset_build_written_tree(struct btree_keys *b) j = inorder_next(j, t->size)) make_bfloat(t, j); } -EXPORT_SYMBOL(bch_bset_build_written_tree);
/* Insert */
@@ -782,7 +777,6 @@ fix_right: do { j = j * 2 + 1; } while (j < t->size); } -EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
static void bch_bset_fix_lookup_table(struct btree_keys *b, struct bset_tree *t, @@ -857,7 +851,6 @@ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
return b->ops->key_merge(b, l, r); } -EXPORT_SYMBOL(bch_bkey_try_merge);
void bch_bset_insert(struct btree_keys *b, struct bkey *where, struct bkey *insert) @@ -877,7 +870,6 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where, bkey_copy(where, insert); bch_bset_fix_lookup_table(b, t, where); } -EXPORT_SYMBOL(bch_bset_insert);
unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, struct bkey *replace_key) @@ -933,7 +925,6 @@ copy: bkey_copy(m, k); merged: return status; } -EXPORT_SYMBOL(bch_btree_insert_key);
/* Lookup */
@@ -1079,7 +1070,6 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
return i.l; } -EXPORT_SYMBOL(__bch_bset_search);
/* Btree iterator */
@@ -1134,7 +1124,6 @@ struct bkey *bch_btree_iter_init(struct btree_keys *b, { return __bch_btree_iter_init(b, iter, search, b->set); } -EXPORT_SYMBOL(bch_btree_iter_init);
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, btree_iter_cmp_fn *cmp) @@ -1167,7 +1156,6 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) return __bch_btree_iter_next(iter, btree_iter_cmp);
} -EXPORT_SYMBOL(bch_btree_iter_next);
struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, struct btree_keys *b, ptr_filter_fn fn) @@ -1198,7 +1186,6 @@ int bch_bset_sort_state_init(struct bset_sort_state *state,
return mempool_init_page_pool(&state->pool, 1, page_order); } -EXPORT_SYMBOL(bch_bset_sort_state_init);
static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, @@ -1315,7 +1302,6 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } -EXPORT_SYMBOL(bch_btree_sort_partial);
void bch_btree_sort_and_fix_extents(struct btree_keys *b, struct btree_iter *iter, @@ -1368,7 +1354,6 @@ void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) out: bch_bset_build_written_tree(b); } -EXPORT_SYMBOL(bch_btree_sort_lazy);
void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index c12cd809ab19..0164a1fe94a9 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -45,7 +45,6 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } -EXPORT_SYMBOL(closure_sub);
/* * closure_put - decrement a closure's refcount @@ -54,7 +53,6 @@ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } -EXPORT_SYMBOL(closure_put);
/* * closure_wake_up - wake up all closures on a wait list, without memory barrier @@ -76,7 +74,6 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } -EXPORT_SYMBOL(__closure_wake_up);
/** * closure_wait - add a closure to a waitlist @@ -96,7 +93,6 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
return true; } -EXPORT_SYMBOL(closure_wait);
struct closure_syncer { struct task_struct *task; @@ -131,7 +127,6 @@ void __sched __closure_sync(struct closure *cl)
__set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL(__closure_sync);
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -149,7 +144,6 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_create);
void closure_debug_destroy(struct closure *cl) { @@ -162,7 +156,6 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_destroy);
static struct dentry *closure_debug;