hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9QJ1S
CVE: NA

----------------------------------------

After a disk is created, its debugfs inodes and dentries are created along
with it, and the memory used for debugfs can't be freed until the disk is
removed.

The number of debugfs inodes and dentries depends on the number of CPUs and
hctxs. For example, on a 128-core environment with default module
parameters, each loop device costs 1679KB of memory, of which debugfs
accounts for 336KB (20%).

The debugfs cost for a single disk looks small, but if a big machine
contains thousands of disks, the total cost reaches the GB range. So far
this overhead can only be avoided by disabling CONFIG_BLK_DEBUG_FS
entirely.
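
A rough worked example from the numbers above: at 336KB of debugfs memory
per disk, 3000 disks pin about 1GB and 10000 disks about 3.3GB, all of it
unreclaimable until the disks are removed.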

This patch adds a disk level switch that can enable/disable debugfs
dynamically, so that users who care about the memory overhead can disable
debugfs, while still being able to enable it again on demand.
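
For example (a usage sketch only: "sda" is a placeholder disk name, and the
module parameter is assumed to land under the "blk_mq." prefix since
blk-mq.c is built into the kernel):

  # drop the debugfs entries of one disk at runtime
  echo 0 > /sys/block/sda/queue/debugfs
  # recreate them later when debugging is needed
  echo 1 > /sys/block/sda/queue/debugfs
  # choose the default for newly created queues on the kernel command line
  blk_mq.enable_debugfs=0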

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig   |  1 +
 arch/powerpc/configs/openeuler_defconfig |  1 +
 arch/x86/configs/openeuler_defconfig     |  1 +
 block/Kconfig                            | 15 +++++
 block/blk-mq-debugfs.c                   | 74 +++++++++++++++++++++---
 block/blk-mq-debugfs.h                   |  5 ++
 block/blk-mq.c                           |  9 +++
 block/blk-sysfs.c                        | 54 +++++++++++++++++
 include/linux/blkdev.h                   |  3 +
 9 files changed, 156 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 0cfc294d0b7f..3c45090368f7 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -963,6 +963,7 @@ CONFIG_BLK_CGROUP_IOCOST=y
 CONFIG_BLK_WBT_MQ=y
 CONFIG_BLK_DEBUG_FS=y
 CONFIG_BLK_DEBUG_FS_ZONED=y
+CONFIG_BLK_DEBUG_FS_SWITCH=y
 # CONFIG_BLK_SED_OPAL is not set
 # CONFIG_BLK_INLINE_ENCRYPTION is not set
 # CONFIG_BLK_IO_HUNG_TASK_CHECK is not set
diff --git a/arch/powerpc/configs/openeuler_defconfig b/arch/powerpc/configs/openeuler_defconfig
index fc0ca355dcaa..c285107c6d97 100644
--- a/arch/powerpc/configs/openeuler_defconfig
+++ b/arch/powerpc/configs/openeuler_defconfig
@@ -684,6 +684,7 @@ CONFIG_BLK_CGROUP_IOCOST=y
 CONFIG_BLK_WBT_MQ=y
 CONFIG_BLK_DEBUG_FS=y
 CONFIG_BLK_DEBUG_FS_ZONED=y
+CONFIG_BLK_DEBUG_FS_SWITCH=y
 CONFIG_BLK_SED_OPAL=y
 CONFIG_BLK_INLINE_ENCRYPTION=y
 # CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK is not set
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 39b9252f156a..e657e4cfdbf9 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -941,6 +941,7 @@ CONFIG_BLK_CGROUP_IOCOST=y
 CONFIG_BLK_WBT_MQ=y
 CONFIG_BLK_DEBUG_FS=y
 CONFIG_BLK_DEBUG_FS_ZONED=y
+CONFIG_BLK_DEBUG_FS_SWITCH=y
 # CONFIG_BLK_SED_OPAL is not set
 # CONFIG_BLK_INLINE_ENCRYPTION is not set
 # CONFIG_BLK_IO_HUNG_TASK_CHECK is not set
diff --git a/block/Kconfig b/block/Kconfig
index a1026e1b8f4d..d01e418a55d9 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -189,6 +189,21 @@ config BLK_DEBUG_FS_ZONED
        bool
        default BLK_DEBUG_FS && BLK_DEV_ZONED
 
+config BLK_DEBUG_FS_SWITCH
+       bool "Disk level switch to enable/disable debugfs dynamically"
+       depends on BLK_DEBUG_FS
+       depends on 64BIT
+       default y
+       help
+         After a disk is created, debugfs inodes and dentries are created
+         along with it, and the memory used for debugfs can't be freed
+         until the disk is removed.
+
+         Enabling this adds a disk level switch that can enable/disable
+         debugfs dynamically, so that users can disable debugfs if they
+         care about the memory overhead, while still being able to enable
+         it again on demand.
+
 config BLK_SED_OPAL
        bool "Logic for interfacing with Opal enabled SEDs"
        help
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 9f55fe0d15a3..51d5bfdee655 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -132,6 +132,9 @@ static const char *const blk_queue_flag_name[] = {
        QUEUE_FLAG_NAME(HCTX_ACTIVE),
        QUEUE_FLAG_NAME(NOWAIT),
        QUEUE_FLAG_NAME(DISPATCH_ASYNC),
+#ifdef CONFIG_BLK_DEBUG_FS_SWITCH
+       QUEUE_FLAG_NAME(DEBUGFS),
+#endif
 };
 #undef QUEUE_FLAG_NAME
 
@@ -860,11 +863,27 @@ static void debugfs_create_files(struct dentry *parent, void *data,
                            (void *)attr, &blk_mq_debugfs_fops);
 }
 
+static bool blk_mq_debugfs_enabled(struct request_queue *q)
+{
+       if (IS_ERR_OR_NULL(q->debugfs_dir))
+               return false;
+
+#ifdef CONFIG_BLK_DEBUG_FS_SWITCH
+       if (!test_bit(QUEUE_FLAG_DEBUGFS, &q->queue_flags))
+               return false;
+#endif
+
+       return true;
+}
+
 void blk_mq_debugfs_register(struct request_queue *q)
 {
        struct blk_mq_hw_ctx *hctx;
        int i;
 
+       if (!blk_mq_debugfs_enabled(q))
+               return;
+
        debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
 
        /*
@@ -893,6 +912,46 @@ void blk_mq_debugfs_register(struct request_queue *q)
        }
 }
 
+static void debugfs_remove_files(struct dentry *parent,
+                                const struct blk_mq_debugfs_attr *attr)
+{
+       if (IS_ERR_OR_NULL(parent))
+               return;
+
+       for (; attr->name; attr++)
+               debugfs_lookup_and_remove(attr->name, parent);
+}
+
+void blk_mq_debugfs_unregister(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       unsigned long i;
+
+       spin_lock(&q->queue_lock);
+       if (q->rq_qos) {
+               struct rq_qos *rqos = q->rq_qos;
+
+               while (rqos) {
+                       rqos->debugfs_dir = NULL;
+                       rqos = rqos->next;
+               }
+       }
+       spin_unlock(&q->queue_lock);
+
+       debugfs_remove_recursive(q->rqos_debugfs_dir);
+       q->rqos_debugfs_dir = NULL;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (hctx->debugfs_dir)
+                       blk_mq_debugfs_unregister_hctx(hctx);
+       }
+
+       if (q->sched_debugfs_dir)
+               blk_mq_debugfs_unregister_sched(q);
+
+       debugfs_remove_files(q->debugfs_dir, blk_mq_debugfs_queue_attrs);
+}
+
 static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_ctx *ctx)
 {
@@ -914,7 +973,7 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
 
        lockdep_assert_held(&q->debugfs_mutex);
 
-       if (!q->debugfs_dir)
+       if (!blk_mq_debugfs_enabled(q))
                return;
 
        snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
@@ -930,7 +989,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
 {
        lockdep_assert_held(&hctx->queue->debugfs_mutex);
 
-       if (!hctx->queue->debugfs_dir)
+       if (!blk_mq_debugfs_enabled(hctx->queue))
                return;
        debugfs_remove_recursive(hctx->debugfs_dir);
        hctx->sched_debugfs_dir = NULL;
@@ -969,7 +1028,7 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
         * If the parent directory has not been created yet, return, we will be
         * called again later on and the directory/files will be created then.
         */
-       if (!q->debugfs_dir)
+       if (!blk_mq_debugfs_enabled(q))
                return;
 
        if (!e->queue_debugfs_attrs)
@@ -992,7 +1051,7 @@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 {
        lockdep_assert_held(&rqos->q->debugfs_mutex);
 
-       if (!rqos->q->debugfs_dir)
+       if (!blk_mq_debugfs_enabled(rqos->q))
                return;
        debugfs_remove_recursive(rqos->debugfs_dir);
        rqos->debugfs_dir = NULL;
@@ -1005,7 +1064,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 
        lockdep_assert_held(&q->debugfs_mutex);
 
-       if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
+       if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs ||
+           !blk_mq_debugfs_enabled(q))
                return;
 
        if (!q->rqos_debugfs_dir)
@@ -1025,7 +1085,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 
        lockdep_assert_held(&q->debugfs_mutex);
 
-       if (!e->hctx_debugfs_attrs)
+       if (!e->hctx_debugfs_attrs || !blk_mq_debugfs_enabled(q))
                return;
 
        hctx->sched_debugfs_dir = debugfs_create_dir("sched",
@@ -1038,7 +1098,7 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 {
        lockdep_assert_held(&hctx->queue->debugfs_mutex);
 
-       if (!hctx->queue->debugfs_dir)
+       if (!blk_mq_debugfs_enabled(hctx->queue))
                return;
        debugfs_remove_recursive(hctx->sched_debugfs_dir);
        hctx->sched_debugfs_dir = NULL;
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index f6898560b1f3..3a2c43a9a0ae 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -19,6 +19,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
 int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
 
 void blk_mq_debugfs_register(struct request_queue *q);
+void blk_mq_debugfs_unregister(struct request_queue *q);
 void blk_mq_debugfs_register_hctx(struct request_queue *q,
                                  struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx);
@@ -38,6 +39,10 @@ static inline void blk_mq_debugfs_register(struct request_queue *q)
 {
 }
 
+static inline void blk_mq_debugfs_unregister(struct request_queue *q)
+{
+}
+
 static inline void blk_mq_debugfs_register_hctx(struct request_queue *q,
                                                 struct blk_mq_hw_ctx *hctx)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5670dfeac85a..407098e8f210 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -47,6 +47,11 @@
 bool mq_unfair_dtag = true;
 module_param_named(unfair_dtag, mq_unfair_dtag, bool, 0444);
 
+#ifdef CONFIG_BLK_DEBUG_FS_SWITCH
+bool enable_debugfs = true;
+module_param_named(enable_debugfs, enable_debugfs, bool, 0444);
+#endif
+
 static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
 
 static void blk_mq_poll_stats_start(struct request_queue *q);
@@ -3540,6 +3545,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        q->tag_set = set;
 
        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
+#ifdef CONFIG_BLK_DEBUG_FS_SWITCH
+       if (enable_debugfs)
+               blk_queue_flag_set(QUEUE_FLAG_DEBUGFS, q);
+#endif
        if (set->nr_maps > HCTX_TYPE_POLL &&
            set->map[HCTX_TYPE_POLL].nr_queues)
                blk_queue_flag_set(QUEUE_FLAG_POLL, q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f4e33203a8ee..18dfd2fe3b3e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -673,6 +673,57 @@ static struct queue_sysfs_entry queue_dispatch_async_cpus_entry = {
 QUEUE_RW_ENTRY(queue_dispatch_async, "dispatch_async");
 #endif
 
+#ifdef CONFIG_BLK_DEBUG_FS_SWITCH
+static ssize_t queue_debugfs_show(struct request_queue *q, char *page)
+{
+       return queue_var_show(test_bit(QUEUE_FLAG_DEBUGFS, &q->queue_flags),
+                             page);
+}
+
+static ssize_t queue_debugfs_store(struct request_queue *q, const char *page,
+                                  size_t count)
+{
+       unsigned long val;
+       ssize_t ret;
+       bool enabled;
+       int err;
+
+       if (!queue_is_mq(q))
+               return count;
+
+       if (!blk_queue_registered(q))
+               return -ENODEV;
+
+       ret = queue_var_store(&val, page, count);
+       if (ret < 0)
+               return ret;
+
+       err = blk_queue_enter(q, 0);
+       if (err)
+               return err;
+
+       mutex_lock(&q->debugfs_mutex);
+       enabled = test_bit(QUEUE_FLAG_DEBUGFS, &q->queue_flags);
+       if (!!val == enabled)
+               goto unlock;
+
+       if (val) {
+               blk_queue_flag_set(QUEUE_FLAG_DEBUGFS, q);
+               blk_mq_debugfs_register(q);
+       } else {
+               blk_mq_debugfs_unregister(q);
+               blk_queue_flag_clear(QUEUE_FLAG_DEBUGFS, q);
+       }
+
+unlock:
+       mutex_unlock(&q->debugfs_mutex);
+       blk_queue_exit(q);
+       return ret;
+}
+
+QUEUE_RW_ENTRY(queue_debugfs, "debugfs");
+#endif
+
 static struct attribute *queue_attrs[] = {
        &queue_requests_entry.attr,
        &queue_ra_entry.attr,
@@ -719,6 +770,9 @@ static struct attribute *queue_attrs[] = {
 #endif
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
        &blk_throtl_sample_time_entry.attr,
+#endif
+#ifdef CONFIG_BLK_DEBUG_FS_SWITCH
+       &queue_debugfs_entry.attr,
 #endif
        NULL,
 };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fbe7146b63fe..eea753a46419 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -652,6 +652,9 @@ struct request_queue {
 #define QUEUE_FLAG_HCTX_WAIT   30
 /* support to dispatch bio asynchronously */
 #define QUEUE_FLAG_DISPATCH_ASYNC      31
+#ifdef CONFIG_BLK_DEBUG_FS_SWITCH
+#define QUEUE_FLAG_DEBUGFS     32      /* supports debugfs */
+#endif
 
 #define QUEUE_FLAG_MQ_DEFAULT  ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_SAME_COMP) |          \