kernel@openeuler.org
[PATCH OLK-6.6] ubi: block: fix memleak in ubiblock_create()
by Li Nan 04 Jan '24
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8P68G

-------------------------------

If idr_alloc() fails, ubiblock_create() jumps to out_cleanup_disk and
puts dev->gd. But dev->gd has not been assigned yet at that point, so
gd is never put and the disk leaks. Fix it by putting gd directly.

Signed-off-by: Li Nan <linan122(a)huawei.com>
---
 drivers/mtd/ubi/block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index 309a42aeaa4c..654bd7372cd8 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -434,7 +434,7 @@ int ubiblock_create(struct ubi_volume_info *vi)
 	list_del(&dev->list);
 	idr_remove(&ubiblock_minor_idr, gd->first_minor);
 out_cleanup_disk:
-	put_disk(dev->gd);
+	put_disk(gd);
 out_free_tags:
 	blk_mq_free_tag_set(&dev->tag_set);
 out_free_dev:
--
2.39.2
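The bug class here is easy to reproduce in plain C: on an early error path,
the cleanup label released a resource through a struct field that had not
been assigned yet, so the resource held only by the local pointer leaked. A
minimal userspace sketch (all names are illustrative, not the actual driver
code):

#include <stdio.h>
#include <stdlib.h>

struct disk { int minor; };
struct dev  { struct disk *gd; };

static struct dev *create_dev(int fail_early)
{
	struct dev *dev = calloc(1, sizeof(*dev));
	struct disk *gd;

	if (!dev)
		return NULL;

	gd = malloc(sizeof(*gd));	/* resource acquired into a local */
	if (!gd)
		goto out_free_dev;

	if (fail_early)			/* models idr_alloc() failing */
		goto out_cleanup_disk;

	dev->gd = gd;			/* ownership transferred only here */
	return dev;

out_cleanup_disk:
	free(gd);	/* correct: free the local; free(dev->gd) would free NULL and leak gd */
out_free_dev:
	free(dev);
	return NULL;
}

int main(void)
{
	if (!create_dev(1))
		puts("early failure cleaned up without leaking");
	return 0;
}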
[PATCH V2 OLK-6.6] fs/dirty_pages: dump the number of dirty pages for each inode
by Zizhi Wo 04 Jan '24
From: yu kuai <yukuai3(a)huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8OHM4?from=project-issue
CVE: NA

---------------------------

In order to analyse the IO performance when using buffer IO, it's useful
to obtain the number of dirty pages for each inode in the filesystem.

This feature depends on 'CONFIG_DIRTY_PAGES'. It creates 3 interfaces by
using procfs: /proc/dirty/buffer_size for buffer allocation and release,
/proc/dirty/page_threshold to filter the result, and /proc/dirty/dirty_list
to get the dirty pages.

Signed-off-by: yu kuai <yukuai3(a)huawei.com>
Signed-off-by: Zizhi Wo <wozizhi(a)huawei.com>
---
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 fs/Kconfig                             |  13 +
 fs/Makefile                            |   1 +
 fs/dirty_pages.c                       | 380 +++++++++++++++++++++++++
 5 files changed, 396 insertions(+)
 create mode 100644 fs/dirty_pages.c

diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 8e29cb161800..984ad448b5db 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -6909,6 +6909,7 @@ CONFIG_PROC_PAGE_MONITOR=y
 CONFIG_PROC_CHILDREN=y
 CONFIG_KERNFS=y
 CONFIG_SYSFS=y
+CONFIG_DIRTY_PAGES=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_TMPFS_XATTR=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index b3e32559ab62..233b95fa4a84 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -8111,6 +8111,7 @@ CONFIG_PROC_PID_ARCH_STATUS=y
 CONFIG_PROC_CPU_RESCTRL=y
 CONFIG_KERNFS=y
 CONFIG_SYSFS=y
+CONFIG_DIRTY_PAGES=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_TMPFS_XATTR=y
diff --git a/fs/Kconfig b/fs/Kconfig
index aa7e03cc1941..a2280cf98729 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -170,6 +170,19 @@ source "fs/proc/Kconfig"
 source "fs/kernfs/Kconfig"
 source "fs/sysfs/Kconfig"
 
+config DIRTY_PAGES
+	bool "Dumps the number of dirty pages of each file"
+	depends on PROC_FS
+	default y
+	help
+	  This config supports the rendering of dirty page data to the user,
+	  which may be useful to analyze the IO performance when using buffer
+	  IO.
+
+	  It creates 3 interfaces by using procfs. /proc/dirty/buffer_size for
+	  buffer allocation and release; /proc/dirty/page_threshold to filter
+	  the result; /proc/dirty/dirty_list to get dirty pages.
+
 config TMPFS
 	bool "Tmpfs virtual memory file system support (former shm fs)"
 	depends on SHMEM
diff --git a/fs/Makefile b/fs/Makefile
index bb6164200a39..cc4735c23c32 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o sysctls.o
+obj-$(CONFIG_DIRTY_PAGES)	+= dirty_pages.o
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
 obj-$(CONFIG_CGROUP_FILES)	+= filescontrol.o
diff --git a/fs/dirty_pages.c b/fs/dirty_pages.c
new file mode 100644
index 000000000000..19d9b0464176
--- /dev/null
+++ b/fs/dirty_pages.c
@@ -0,0 +1,380 @@
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/kdev_t.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include "internal.h"
+
+static char *buf_dirty;		/* buffer to store number of dirty pages */
+static unsigned long buf_size;	/* size of buffer in bytes */
+static unsigned long buff_num;	/* size of buffer in number of pages */
+static unsigned long buff_limit; /* filter threshold of dirty pages */
+
+static struct proc_dir_entry *dirty_dir;
+
+unsigned long dirty_page_open;
+
+/* proc root directory */
+#define DIRTY_ROOT "dirty"
+/* proc file for buffer allocation and release */
+#define DIRTY_SWITCH "buffer_size"
+/* proc file to obtain dirty pages of each inode */
+#define DIRTY_PAGES "dirty_list"
+/* proc file to filter result */
+#define DIRTY_LIMIT "page_threshold"
+
+#define MAX_BUFF_SIZE 102400
+
+static unsigned long dump_dirtypages_inode(struct inode *inode)
+{
+	XA_STATE(xas, &inode->i_mapping->i_pages, 0);
+	unsigned long nr_dirtys = 0;
+	void *page;
+
+	xas_lock_irq(&xas);
+	xas_for_each_marked(&xas, page, (pgoff_t)-1, PAGECACHE_TAG_DIRTY) {
+		if (++nr_dirtys % XA_CHECK_SCHED)
+			continue;
+
+		xas_pause(&xas);
+		xas_unlock_irq(&xas);
+		cond_resched();
+		xas_lock_irq(&xas);
+	}
+	xas_unlock_irq(&xas);
+
+	return nr_dirtys;
+}
+
+static char *inode_filename(struct inode *inode, char *tmpname)
+{
+	struct dentry *dentry;
+	char *filename;
+
+	dentry = d_find_alias(inode);
+	if (!dentry)
+		return ERR_PTR(-ENOENT);
+
+	tmpname[PATH_MAX-1] = '\0';
+	filename = dentry_path_raw(dentry, tmpname, PATH_MAX);
+
+	dput(dentry);
+
+	return filename;
+}
+
+static inline bool is_sb_writable(struct super_block *sb)
+{
+	if (sb_rdonly(sb))
+		return false;
+
+	if (sb->s_writers.frozen == SB_FREEZE_COMPLETE)
+		return false;
+
+	return true;
+}
+
+/*
+ * dump_dirtypages_sb - dump the dirty pages of each inode in the sb
+ * @sb the super block
+ * @m the seq_file which is initialized in proc_dpages_open
+ *
+ * For each inode in the sb, call dump_dirtypages_pages to get the number
+ * of dirty pages. And use seq_printf to store the result in the buffer
+ * if it's not less than the threshold. Inodes in an unusual state will
+ * be skipped.
+ */
+static void dump_dirtypages_sb(struct super_block *sb, struct seq_file *m)
+{
+	struct inode *inode, *toput_inode = NULL;
+	unsigned long nr_dirtys;
+	const char *fstype;
+	char *filename;
+	char *tmpname;
+	unsigned long limit = READ_ONCE(buff_limit);
+
+	if (!is_sb_writable(sb) || (sb->s_iflags & SB_I_NODEV))
+		return;
+
+	tmpname = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!tmpname)
+		return;
+
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		spin_lock(&inode->i_lock);
+
+		/*
+		 * We must skip inodes in unusual state. We may also skip
+		 * inodes without pages but we deliberately won't in case
+		 * we need to reschedule to avoid softlockups.
+		 */
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (inode->i_mapping->nrpages == 0 && !need_resched()) ||
+		    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&sb->s_inode_list_lock);
+
+		cond_resched();
+
+		nr_dirtys = dump_dirtypages_inode(inode);
+		if (!nr_dirtys || nr_dirtys < limit)
+			goto skip;
+
+		filename = inode_filename(inode, tmpname);
+		if (IS_ERR_OR_NULL(filename))
+			filename = "unknown";
+
+		if (sb->s_type && sb->s_type->name)
+			fstype = sb->s_type->name;
+		else
+			fstype = "unknown";
+
+		seq_printf(m, "FSType: %s, Dev ID: %u(%u:%u) ino %lu, dirty pages %lu, path %s\n",
+			   fstype, sb->s_dev, MAJOR(sb->s_dev),
+			   MINOR(sb->s_dev), inode->i_ino,
+			   nr_dirtys, filename);
+
+		if (seq_has_overflowed(m)) {
+			m->size += 13; /* keep size > count to avoid overflow in seq_read_iter() */
+			seq_printf(m, "terminated\n");
+			iput(inode);
+			goto done;
+		}
+skip:
+		iput(toput_inode);
+		toput_inode = inode;
+		spin_lock(&sb->s_inode_list_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+done:
+	iput(toput_inode);
+	kfree(tmpname);
+}
+
+static int proc_dpages_show(struct seq_file *m, void *v)
+{
+	iterate_supers((void *)dump_dirtypages_sb, (void *)m);
+	return 0;
+}
+
+static void free_buf_dirty(void)
+{
+	if (buf_dirty != NULL) {
+		vfree(buf_dirty);
+		buf_dirty = NULL;
+		buf_size = 0;
+	}
+}
+
+static ssize_t write_proc(
+	struct file *filp,
+	const char *buf,
+	size_t count,
+	loff_t *offp)
+{
+	int ret = 0;
+	unsigned long old_buff_num;
+
+	if (count > PAGE_SIZE) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	old_buff_num = buff_num;
+	ret = kstrtoul_from_user(buf, count, 10, &buff_num);
+	if (ret != 0 || buff_num > MAX_BUFF_SIZE) {
+		buff_num = old_buff_num;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = count;
+	if (buff_num == 0) {
+		free_buf_dirty();
+		goto out;
+	}
+	if (buff_num == old_buff_num)
+		goto out;
+
+	free_buf_dirty();
+	buf_size = PAGE_SIZE * buff_num;
+	buf_dirty = vzalloc(buf_size);
+	if (!buf_dirty)
+		ret = -ENOMEM;
+
+out:
+	return ret;
+}
+
+static int proc_dpages_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	struct seq_file *m;
+
+	if (xchg(&dirty_page_open, 1) == 1)
+		return -EBUSY;
+
+	if (buf_dirty == NULL || buf_size == 0) {
+		pr_warn("please allocate buffer before getting dirty pages\n");
+		dirty_page_open = 0;
+		return -ENOMEM;
+	}
+
+	ret = single_open(filp, proc_dpages_show, NULL);
+	if (ret) {
+		dirty_page_open = 0;
+		return ret;
+	}
+
+	m = (struct seq_file *)filp->private_data;
+	memset(buf_dirty, 0, buf_size);
+	/* if seq_has_overflowed() returns true, it needs to contain "terminated\n\0" */
+	m->size = buf_size - 13;
+	m->buf = buf_dirty;
+
+	return ret;
+}
+
+static int seq_release_dirty(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	/* we don't want to free the buf */
+	m->buf = NULL;
+	single_release(inode, file);
+	dirty_page_open = 0;
+	return 0;
+}
+
+static const struct proc_ops proc_dpages_operations = {
+	.proc_open = proc_dpages_open,
+	.proc_read = seq_read,
+	.proc_release = seq_release_dirty,
+};
+
+static int proc_switch_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", buff_num);
+	return 0;
+}
+
+static int proc_limit_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", READ_ONCE(buff_limit));
+	return 0;
+}
+
+static int proc_switch_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	if (xchg(&dirty_page_open, 1) == 1)
+		return -EBUSY;
+
+	ret = single_open(filp, proc_switch_show, NULL);
+	if (!ret)
+		dirty_page_open = 0;
+
+	return ret;
+}
+
+static int proc_limit_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, proc_limit_show, NULL);
+}
+
+static int proc_switch_release(struct inode *inode, struct file *filp)
+{
+	dirty_page_open = 0;
+	return single_release(inode, filp);
+}
+
+static ssize_t write_limit_proc(
+	struct file *filp,
+	const char *buf,
+	size_t count,
+	loff_t *offp)
+{
+	int ret = 0;
+	unsigned long tmp;
+
+	if (count > PAGE_SIZE)
+		return -EINVAL;
+
+	ret = kstrtoul_from_user(buf, count, 10, &tmp);
+	if (ret != 0)
+		return -EINVAL;
+
+	WRITE_ONCE(buff_limit, tmp);
+
+	return count;
+}
+
+static const struct proc_ops proc_switch_operations = {
+	.proc_open = proc_switch_open,
+	.proc_read = seq_read,
+	.proc_write = write_proc,
+	.proc_lseek = seq_lseek,
+	.proc_release = proc_switch_release,
+};
+
+static const struct proc_ops proc_limit_operations = {
+	.proc_open = proc_limit_open,
+	.proc_read = seq_read,
+	.proc_write = write_limit_proc,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+static int __init dpages_proc_init(void)
+{
+	static struct proc_dir_entry *proc_file;
+
+	dirty_dir = proc_mkdir(DIRTY_ROOT, NULL);
+	if (!dirty_dir)
+		goto fail_dir;
+
+	proc_file = proc_create(DIRTY_PAGES, 0440,
+				dirty_dir, &proc_dpages_operations);
+	if (!proc_file)
+		goto fail_pages;
+
+	proc_file = proc_create(DIRTY_SWITCH, 0640,
+				dirty_dir, &proc_switch_operations);
+	if (!proc_file)
+		goto fail_switch;
+
+	proc_file = proc_create(DIRTY_LIMIT, 0640,
+				dirty_dir, &proc_limit_operations);
+	if (!proc_file)
+		goto fail_limit;
+
+	return 0;
+
fail_limit:
+	remove_proc_entry(DIRTY_SWITCH, dirty_dir);
fail_switch:
+	remove_proc_entry(DIRTY_PAGES, dirty_dir);
fail_pages:
+	remove_proc_entry(DIRTY_ROOT, NULL);
fail_dir:
+	return -ENOMEM;
+}
+
+subsys_initcall(dpages_proc_init);
--
2.39.2
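The three procfs files are meant to be used in sequence: write a page count
to buffer_size, optionally set page_threshold, then read dirty_list. A
minimal userspace sketch of that sequence (paths follow the commit message;
assumes a kernel built with CONFIG_DIRTY_PAGES):

#include <stdio.h>
#include <stdlib.h>

static void write_value(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	char line[512];
	FILE *f;

	write_value("/proc/dirty/buffer_size", "16");	  /* allocate a 16-page result buffer */
	write_value("/proc/dirty/page_threshold", "100"); /* report inodes with >= 100 dirty pages */

	f = fopen("/proc/dirty/dirty_list", "r");	  /* one-shot dump across all superblocks */
	if (!f) {
		perror("/proc/dirty/dirty_list");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	write_value("/proc/dirty/buffer_size", "0");	  /* release the buffer */
	return 0;
}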
[PATCH OLK-5.10] Revert "sched: clear credit count in error branch"
by Yipeng Zou 04 Jan '24
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8TNSV
CVE: NA

--------------------------------

There is no need to fix this error branch in OLK-5.10.

This reverts commit 2f31dbded7394d42a0e6c7b26ab3613b4b78d238.

Signed-off-by: Yipeng Zou <zouyipeng(a)huawei.com>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 7e31806a7452..0268eb744973 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2068,7 +2068,7 @@ static __latent_entropy struct task_struct *copy_process(
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
 	if (retval)
-		goto bad_fork_cleanup_count;
+		goto bad_fork_free;
 #endif
 
 	lockdep_assert_irqs_enabled();
--
2.34.1
[PATCH OLK-6.6] fs/dirty_pages: dump the number of dirty pages for each inode
by Zizhi Wo 04 Jan '24
From: yu kuai <yukuai3(a)huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8OHM4?from=project-issue
CVE: NA

---------------------------

In order to analyse the IO performance when using buffer IO, it's useful
to obtain the number of dirty pages for each inode in the filesystem.

This feature depends on 'CONFIG_DIRTY_PAGES'. It creates 3 interfaces by
using procfs: /proc/dirty/buffer_size for buffer allocation and release,
/proc/dirty/page_threshold to filter the result, and /proc/dirty/dirty_list
to get the dirty pages.

Signed-off-by: yu kuai <yukuai3(a)huawei.com>
Signed-off-by: Zizhi Wo <wozizhi(a)huawei.com>
---
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 fs/Kconfig                             |  13 +
 fs/Makefile                            |   1 +
 fs/dirty_pages.c                       | 381 +++++++++++++++++++++++++
 5 files changed, 397 insertions(+)
 create mode 100644 fs/dirty_pages.c

diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 8e29cb161800..984ad448b5db 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -6909,6 +6909,7 @@ CONFIG_PROC_PAGE_MONITOR=y
 CONFIG_PROC_CHILDREN=y
 CONFIG_KERNFS=y
 CONFIG_SYSFS=y
+CONFIG_DIRTY_PAGES=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_TMPFS_XATTR=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index b3e32559ab62..233b95fa4a84 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -8111,6 +8111,7 @@ CONFIG_PROC_PID_ARCH_STATUS=y
 CONFIG_PROC_CPU_RESCTRL=y
 CONFIG_KERNFS=y
 CONFIG_SYSFS=y
+CONFIG_DIRTY_PAGES=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_TMPFS_XATTR=y
diff --git a/fs/Kconfig b/fs/Kconfig
index aa7e03cc1941..a2280cf98729 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -170,6 +170,19 @@ source "fs/proc/Kconfig"
 source "fs/kernfs/Kconfig"
 source "fs/sysfs/Kconfig"
 
+config DIRTY_PAGES
+	bool "Dumps the number of dirty pages of each file"
+	depends on PROC_FS
+	default y
+	help
+	  This config supports the rendering of dirty page data to the user,
+	  which may be useful to analyze the IO performance when using buffer
+	  IO.
+
+	  It creates 3 interfaces by using procfs. /proc/dirty/buffer_size for
+	  buffer allocation and release; /proc/dirty/page_threshold to filter
+	  the result; /proc/dirty/dirty_list to get dirty pages.
+
 config TMPFS
 	bool "Tmpfs virtual memory file system support (former shm fs)"
 	depends on SHMEM
diff --git a/fs/Makefile b/fs/Makefile
index bb6164200a39..cc4735c23c32 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o sysctls.o
+obj-$(CONFIG_DIRTY_PAGES)	+= dirty_pages.o
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
 obj-$(CONFIG_CGROUP_FILES)	+= filescontrol.o
diff --git a/fs/dirty_pages.c b/fs/dirty_pages.c
new file mode 100644
index 000000000000..49e42df6638d
--- /dev/null
+++ b/fs/dirty_pages.c
@@ -0,0 +1,381 @@
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/kdev_t.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include "internal.h"
+
+static char *buf_dirty;		/* buffer to store number of dirty pages */
+static unsigned long buf_size;	/* size of buffer in bytes */
+static unsigned long buff_num;	/* size of buffer in number of pages */
+static unsigned long buff_limit; /* filter threshold of dirty pages */
+
+static struct proc_dir_entry *dirty_dir;
+
+unsigned long dirty_page_open;
+
+/* proc root directory */
+#define DIRTY_ROOT "dirty"
+/* proc file for buffer allocation and release */
+#define DIRTY_SWITCH "buffer_size"
+/* proc file to obtain dirty pages of each inode */
+#define DIRTY_PAGES "dirty_list"
+/* proc file to filter result */
+#define DIRTY_LIMIT "page_threshold"
+
+#define MAX_BUFF_SIZE 102400
+
+static unsigned long dump_dirtypages_inode(struct inode *inode)
+{
+	XA_STATE(xas, &inode->i_mapping->i_pages, 0);
+	unsigned long nr_dirtys = 0;
+	void *page;
+
+	xas_lock_irq(&xas);
+	xas_for_each_marked(&xas, page, (pgoff_t)-1, PAGECACHE_TAG_DIRTY) {
+		if (++nr_dirtys % XA_CHECK_SCHED)
+			continue;
+
+		xas_pause(&xas);
+		xas_unlock_irq(&xas);
+		cond_resched();
+		xas_lock_irq(&xas);
+	}
+	xas_unlock_irq(&xas);
+
+	return nr_dirtys;
+}
+
+static char *inode_filename(struct inode *inode, char *tmpname)
+{
+	struct dentry *dentry;
+	char *filename;
+
+	dentry = d_find_alias(inode);
+	if (!dentry)
+		return ERR_PTR(-ENOENT);
+
+	tmpname[PATH_MAX-1] = '\0';
+	filename = dentry_path_raw(dentry, tmpname, PATH_MAX);
+
+	dput(dentry);
+
+	return filename;
+}
+
+static inline bool is_sb_writable(struct super_block *sb)
+{
+	if (sb_rdonly(sb))
+		return false;
+
+	if (sb->s_writers.frozen == SB_FREEZE_COMPLETE)
+		return false;
+
+	return true;
+}
+
+/*
+ * dump_dirtypages_sb - dump the dirty pages of each inode in the sb
+ * @sb the super block
+ * @m the seq_file which is initialized in proc_dpages_open
+ *
+ * For each inode in the sb, call dump_dirtypages_pages to get the number
+ * of dirty pages. And use seq_printf to store the result in the buffer
+ * if it's not less than the threshold. Inodes in an unusual state will
+ * be skipped.
+ */
+static void dump_dirtypages_sb(struct super_block *sb, struct seq_file *m)
+{
+	struct inode *inode, *toput_inode = NULL;
+	unsigned long nr_dirtys;
+	const char *fstype;
+	char *filename;
+	char *tmpname;
+	unsigned long limit = READ_ONCE(buff_limit);
+
+	if (!is_sb_writable(sb) || (sb->s_iflags & SB_I_NODEV))
+		return;
+
+	tmpname = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!tmpname)
+		return;
+
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		spin_lock(&inode->i_lock);
+
+		/*
+		 * We must skip inodes in unusual state. We may also skip
+		 * inodes without pages but we deliberately won't in case
+		 * we need to reschedule to avoid softlockups.
+		 */
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (inode->i_mapping->nrpages == 0 && !need_resched()) ||
+		    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&sb->s_inode_list_lock);
+
+		cond_resched();
+
+		nr_dirtys = dump_dirtypages_inode(inode);
+		if (!nr_dirtys || nr_dirtys < limit)
+			goto skip;
+
+		filename = inode_filename(inode, tmpname);
+		if (IS_ERR_OR_NULL(filename))
+			filename = "unknown";
+
+		if (sb->s_type && sb->s_type->name)
+			fstype = sb->s_type->name;
+		else
+			fstype = "unknown";
+
+		seq_printf(m, "FSType: %s, Dev ID: %u(%u:%u) ino %lu, dirty pages %lu, path %s\n",
+			   fstype, sb->s_dev, MAJOR(sb->s_dev),
+			   MINOR(sb->s_dev), inode->i_ino,
+			   nr_dirtys, filename);
+
+		if (seq_has_overflowed(m)) {
+			m->size += 13; /* keep size > count to avoid overflow in seq_read_iter() */
+			strncpy(m->buf + m->count - 12, "terminated\n\0", 12);
+			m->count += 12;
+			iput(inode);
+			goto done;
+		}
+skip:
+		iput(toput_inode);
+		toput_inode = inode;
+		spin_lock(&sb->s_inode_list_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
done:
+	iput(toput_inode);
+	kfree(tmpname);
+}
+
+static int proc_dpages_show(struct seq_file *m, void *v)
+{
+	iterate_supers((void *)dump_dirtypages_sb, (void *)m);
+	return 0;
+}
+
+static void free_buf_dirty(void)
+{
+	if (buf_dirty != NULL) {
+		vfree(buf_dirty);
+		buf_dirty = NULL;
+		buf_size = 0;
+	}
+}
+
+static ssize_t write_proc(
+	struct file *filp,
+	const char *buf,
+	size_t count,
+	loff_t *offp)
+{
+	int ret = 0;
+	unsigned long old_buff_num;
+
+	if (count > PAGE_SIZE) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	old_buff_num = buff_num;
+	ret = kstrtoul_from_user(buf, count, 10, &buff_num);
+	if (ret != 0 || buff_num > MAX_BUFF_SIZE) {
+		buff_num = old_buff_num;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = count;
+	if (buff_num == 0) {
+		free_buf_dirty();
+		goto out;
+	}
+	if (buff_num == old_buff_num)
+		goto out;
+
+	free_buf_dirty();
+	buf_size = PAGE_SIZE * buff_num;
+	buf_dirty = vzalloc(buf_size);
+	if (!buf_dirty)
+		ret = -ENOMEM;
+
out:
+	return ret;
+}
+
+static int proc_dpages_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	struct seq_file *m;
+
+	if (xchg(&dirty_page_open, 1) == 1)
+		return -EBUSY;
+
+	if (buf_dirty == NULL || buf_size == 0) {
+		pr_warn("please allocate buffer before getting dirty pages\n");
+		dirty_page_open = 0;
+		return -ENOMEM;
+	}
+
+	ret = single_open(filp, proc_dpages_show, NULL);
+	if (ret) {
+		dirty_page_open = 0;
+		return ret;
+	}
+
+	m = (struct seq_file *)filp->private_data;
+	memset(buf_dirty, 0, buf_size);
+	/* if seq_has_overflowed() returns true, it needs to contain "terminated\n\0" */
+	m->size = buf_size - 13;
+	m->buf = buf_dirty;
+
+	return ret;
+}
+
+static int seq_release_dirty(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	/* we don't want to free the buf */
+	m->buf = NULL;
+	single_release(inode, file);
+	dirty_page_open = 0;
+	return 0;
+}
+
+static const struct proc_ops proc_dpages_operations = {
+	.proc_open = proc_dpages_open,
+	.proc_read = seq_read,
+	.proc_release = seq_release_dirty,
+};
+
+static int proc_switch_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", buff_num);
+	return 0;
+}
+
+static int proc_limit_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", READ_ONCE(buff_limit));
+	return 0;
+}
+
+static int proc_switch_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	if (xchg(&dirty_page_open, 1) == 1)
+		return -EBUSY;
+
+	ret = single_open(filp, proc_switch_show, NULL);
+	if (!ret)
+		dirty_page_open = 0;
+
+	return ret;
+}
+
+static int proc_limit_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, proc_limit_show, NULL);
+}
+
+static int proc_switch_release(struct inode *inode, struct file *filp)
+{
+	dirty_page_open = 0;
+	return single_release(inode, filp);
+}
+
+static ssize_t write_limit_proc(
+	struct file *filp,
+	const char *buf,
+	size_t count,
+	loff_t *offp)
+{
+	int ret = 0;
+	unsigned long tmp;
+
+	if (count > PAGE_SIZE)
+		return -EINVAL;
+
+	ret = kstrtoul_from_user(buf, count, 10, &tmp);
+	if (ret != 0)
+		return -EINVAL;
+
+	WRITE_ONCE(buff_limit, tmp);
+
+	return count;
+}
+
+static const struct proc_ops proc_switch_operations = {
+	.proc_open = proc_switch_open,
+	.proc_read = seq_read,
+	.proc_write = write_proc,
+	.proc_lseek = seq_lseek,
+	.proc_release = proc_switch_release,
+};
+
+static const struct proc_ops proc_limit_operations = {
+	.proc_open = proc_limit_open,
+	.proc_read = seq_read,
+	.proc_write = write_limit_proc,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+static int __init dpages_proc_init(void)
+{
+	static struct proc_dir_entry *proc_file;
+
+	dirty_dir = proc_mkdir(DIRTY_ROOT, NULL);
+	if (!dirty_dir)
+		goto fail_dir;
+
+	proc_file = proc_create(DIRTY_PAGES, 0440,
+				dirty_dir, &proc_dpages_operations);
+	if (!proc_file)
+		goto fail_pages;
+
+	proc_file = proc_create(DIRTY_SWITCH, 0640,
+				dirty_dir, &proc_switch_operations);
+	if (!proc_file)
+		goto fail_switch;
+
+	proc_file = proc_create(DIRTY_LIMIT, 0640,
+				dirty_dir, &proc_limit_operations);
+	if (!proc_file)
+		goto fail_limit;
+
+	return 0;
+
fail_limit:
+	remove_proc_entry(DIRTY_SWITCH, dirty_dir);
fail_switch:
+	remove_proc_entry(DIRTY_PAGES, dirty_dir);
fail_pages:
+	remove_proc_entry(DIRTY_ROOT, NULL);
fail_dir:
+	return -ENOMEM;
+}
+
+subsys_initcall(dpages_proc_init);
--
2.39.2
[PATCH OLK-6.6] fs/dirty_pages: dump the number of dirty pages for each inode
by Zizhi Wo 04 Jan '24
From: yu kuai <yukuai3(a)huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8OHM4?from=project-issue
CVE: NA

---------------------------

In order to analyse the IO performance when using buffer IO, it's useful
to obtain the number of dirty pages for each inode in the filesystem.

This feature depends on 'CONFIG_DIRTY_PAGES'. It creates 3 interfaces by
using procfs: /proc/dirty/buffer_size for buffer allocation and release,
/proc/dirty/page_threshold to filter the result, and /proc/dirty/dirty_list
to get the dirty pages.

Signed-off-by: yu kuai <yukuai3(a)huawei.com>
Signed-off-by: Zizhi Wo <wozizhi(a)huawei.com>
---
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 fs/Kconfig                             |  13 +
 fs/Makefile                            |   1 +
 fs/dirty_pages.c                       | 380 +++++++++++++++++++++++++
 5 files changed, 396 insertions(+)
 create mode 100644 fs/dirty_pages.c

diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 039f3496af78..66b4812050df 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -6883,6 +6883,7 @@ CONFIG_PROC_PAGE_MONITOR=y
 CONFIG_PROC_CHILDREN=y
 CONFIG_KERNFS=y
 CONFIG_SYSFS=y
+CONFIG_DIRTY_PAGES=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_TMPFS_XATTR=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index f481c1c8b4d1..6133f66fde61 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -8090,6 +8090,7 @@ CONFIG_PROC_PID_ARCH_STATUS=y
 CONFIG_PROC_CPU_RESCTRL=y
 CONFIG_KERNFS=y
 CONFIG_SYSFS=y
+CONFIG_DIRTY_PAGES=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_TMPFS_XATTR=y
diff --git a/fs/Kconfig b/fs/Kconfig
index aa7e03cc1941..a2280cf98729 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -170,6 +170,19 @@ source "fs/proc/Kconfig"
 source "fs/kernfs/Kconfig"
 source "fs/sysfs/Kconfig"
 
+config DIRTY_PAGES
+	bool "Dumps the number of dirty pages of each file"
+	depends on PROC_FS
+	default y
+	help
+	  This config supports the rendering of dirty page data to the user,
+	  which may be useful to analyze the IO performance when using buffer
+	  IO.
+
+	  It creates 3 interfaces by using procfs. /proc/dirty/buffer_size for
+	  buffer allocation and release; /proc/dirty/page_threshold to filter
+	  the result; /proc/dirty/dirty_list to get dirty pages.
+
 config TMPFS
 	bool "Tmpfs virtual memory file system support (former shm fs)"
 	depends on SHMEM
diff --git a/fs/Makefile b/fs/Makefile
index f9541f40be4e..6246e173e1e4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o sysctls.o
+obj-$(CONFIG_DIRTY_PAGES)	+= dirty_pages.o
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
 obj-y				+= iomap/
diff --git a/fs/dirty_pages.c b/fs/dirty_pages.c
new file mode 100644
index 000000000000..0e29bb50358d
--- /dev/null
+++ b/fs/dirty_pages.c
@@ -0,0 +1,380 @@
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/kdev_t.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include "internal.h"
+
+static char *buf_dirty;		/* buffer to store number of dirty pages */
+static unsigned long buf_size;	/* size of buffer in bytes */
+static unsigned long buff_num;	/* size of buffer in number of pages */
+static unsigned long buff_limit; /* filter threshold of dirty pages */
+
+static struct proc_dir_entry *dirty_dir;
+
+unsigned long dirty_page_open;
+
+/* proc root directory */
+#define DIRTY_ROOT "dirty"
+/* proc file for buffer allocation and release */
+#define DIRTY_SWITCH "buffer_size"
+/* proc file to obtain dirty pages of each inode */
+#define DIRTY_PAGES "dirty_list"
+/* proc file to filter result */
+#define DIRTY_LIMIT "page_threshold"
+
+#define MAX_BUFF_SIZE 102400
+
+static unsigned long dump_dirtypages_inode(struct inode *inode)
+{
+	XA_STATE(xas, &inode->i_mapping->i_pages, 0);
+	unsigned long nr_dirtys = 0;
+	void *page;
+
+	xas_lock_irq(&xas);
+	xas_for_each_marked(&xas, page, (pgoff_t)-1, PAGECACHE_TAG_DIRTY) {
+		if (++nr_dirtys % XA_CHECK_SCHED)
+			continue;
+
+		xas_pause(&xas);
+		xas_unlock_irq(&xas);
+		cond_resched();
+		xas_lock_irq(&xas);
+	}
+	xas_unlock_irq(&xas);
+
+	return nr_dirtys;
+}
+
+static char *inode_filename(struct inode *inode, char *tmpname)
+{
+	struct dentry *dentry;
+	char *filename;
+
+	dentry = d_find_alias(inode);
+	if (!dentry)
+		return ERR_PTR(-ENOENT);
+
+	tmpname[PATH_MAX-1] = '\0';
+	filename = dentry_path_raw(dentry, tmpname, PATH_MAX);
+
+	dput(dentry);
+
+	return filename;
+}
+
+static inline bool is_sb_writable(struct super_block *sb)
+{
+	if (sb_rdonly(sb))
+		return false;
+
+	if (sb->s_writers.frozen == SB_FREEZE_COMPLETE)
+		return false;
+
+	return true;
+}
+
+/*
+ * dump_dirtypages_sb - dump the dirty pages of each inode in the sb
+ * @sb the super block
+ * @m the seq_file which is initialized in proc_dpages_open
+ *
+ * For each inode in the sb, call dump_dirtypages_pages to get the number
+ * of dirty pages. And use seq_printf to store the result in the buffer
+ * if it's not less than the threshold. Inodes in an unusual state will
+ * be skipped.
+ */
+static void dump_dirtypages_sb(struct super_block *sb, struct seq_file *m)
+{
+	struct inode *inode, *toput_inode = NULL;
+	unsigned long nr_dirtys;
+	const char *fstype;
+	char *filename;
+	char *tmpname;
+	unsigned long limit = READ_ONCE(buff_limit);
+
+	if (!is_sb_writable(sb) || (sb->s_iflags & SB_I_NODEV))
+		return;
+
+	tmpname = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!tmpname)
+		return;
+
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		spin_lock(&inode->i_lock);
+
+		/*
+		 * We must skip inodes in unusual state. We may also skip
+		 * inodes without pages but we deliberately won't in case
+		 * we need to reschedule to avoid softlockups.
+		 */
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (inode->i_mapping->nrpages == 0 && !need_resched()) ||
+		    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&sb->s_inode_list_lock);
+
+		cond_resched();
+
+		nr_dirtys = dump_dirtypages_inode(inode);
+		if (!nr_dirtys || nr_dirtys < limit)
+			goto skip;
+
+		filename = inode_filename(inode, tmpname);
+		if (IS_ERR_OR_NULL(filename))
+			filename = "unknown";
+
+		if (sb->s_type && sb->s_type->name)
+			fstype = sb->s_type->name;
+		else
+			fstype = "unknown";
+
+		seq_printf(m, "FSType: %s, Dev ID: %u(%u:%u) ino %lu, dirty pages %lu, path %s\n",
+			   fstype, sb->s_dev, MAJOR(sb->s_dev),
+			   MINOR(sb->s_dev), inode->i_ino,
+			   nr_dirtys, filename);
+
+		if (seq_has_overflowed(m)) {
+			m->size += 13; /* keep size > count to avoid overflow in seq_read_iter() */
+			seq_printf(m, "terminated\n\0");
+			iput(inode);
+			goto done;
+		}
+skip:
+		iput(toput_inode);
+		toput_inode = inode;
+		spin_lock(&sb->s_inode_list_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
done:
+	iput(toput_inode);
+	kfree(tmpname);
+}
+
+static int proc_dpages_show(struct seq_file *m, void *v)
+{
+	iterate_supers((void *)dump_dirtypages_sb, (void *)m);
+	return 0;
+}
+
+static void free_buf_dirty(void)
+{
+	if (buf_dirty != NULL) {
+		vfree(buf_dirty);
+		buf_dirty = NULL;
+		buf_size = 0;
+	}
+}
+
+static ssize_t write_proc(
+	struct file *filp,
+	const char *buf,
+	size_t count,
+	loff_t *offp)
+{
+	int ret = 0;
+	unsigned long old_buff_num;
+
+	if (count > PAGE_SIZE) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	old_buff_num = buff_num;
+	ret = kstrtoul_from_user(buf, count, 10, &buff_num);
+	if (ret != 0 || buff_num > MAX_BUFF_SIZE) {
+		buff_num = old_buff_num;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = count;
+	if (buff_num == 0) {
+		free_buf_dirty();
+		goto out;
+	}
+	if (buff_num == old_buff_num)
+		goto out;
+
+	free_buf_dirty();
+	buf_size = PAGE_SIZE * buff_num;
+	buf_dirty = vzalloc(buf_size);
+	if (!buf_dirty)
+		ret = -ENOMEM;
+
out:
+	return ret;
+}
+
+static int proc_dpages_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	struct seq_file *m;
+
+	if (xchg(&dirty_page_open, 1) == 1)
+		return -EBUSY;
+
+	if (buf_dirty == NULL || buf_size == 0) {
+		pr_warn("please allocate buffer before getting dirty pages\n");
+		dirty_page_open = 0;
+		return -ENOMEM;
+	}
+
+	ret = single_open(filp, proc_dpages_show, NULL);
+	if (ret) {
+		dirty_page_open = 0;
+		return ret;
+	}
+
+	m = (struct seq_file *)filp->private_data;
+	memset(buf_dirty, 0, buf_size);
+	/* if seq_has_overflowed() returns true, it needs to contain "terminated\n\0" */
+	m->size = buf_size - 13;
+	m->buf = buf_dirty;
+
+	return ret;
+}
+
+static int seq_release_dirty(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	/* we don't want to free the buf */
+	m->buf = NULL;
+	single_release(inode, file);
+	dirty_page_open = 0;
+	return 0;
+}
+
+static const struct proc_ops proc_dpages_operations = {
+	.proc_open = proc_dpages_open,
+	.proc_read = seq_read,
+	.proc_release = seq_release_dirty,
+};
+
+static int proc_switch_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", buff_num);
+	return 0;
+}
+
+static int proc_limit_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", READ_ONCE(buff_limit));
+	return 0;
+}
+
+static int proc_switch_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	if (xchg(&dirty_page_open, 1) == 1)
+		return -EBUSY;
+
+	ret = single_open(filp, proc_switch_show, NULL);
+	if (!ret)
+		dirty_page_open = 0;
+
+	return ret;
+}
+
+static int proc_limit_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, proc_limit_show, NULL);
+}
+
+static int proc_switch_release(struct inode *inode, struct file *filp)
+{
+	dirty_page_open = 0;
+	return single_release(inode, filp);
+}
+
+static ssize_t write_limit_proc(
+	struct file *filp,
+	const char *buf,
+	size_t count,
+	loff_t *offp)
+{
+	int ret = 0;
+	unsigned long tmp;
+
+	if (count > PAGE_SIZE)
+		return -EINVAL;
+
+	ret = kstrtoul_from_user(buf, count, 10, &tmp);
+	if (ret != 0)
+		return -EINVAL;
+
+	WRITE_ONCE(buff_limit, tmp);
+
+	return count;
+}
+
+static const struct proc_ops proc_switch_operations = {
+	.proc_open = proc_switch_open,
+	.proc_read = seq_read,
+	.proc_write = write_proc,
+	.proc_lseek = seq_lseek,
+	.proc_release = proc_switch_release,
+};
+
+static const struct proc_ops proc_limit_operations = {
+	.proc_open = proc_limit_open,
+	.proc_read = seq_read,
+	.proc_write = write_limit_proc,
+	.proc_lseek = seq_lseek,
+	.proc_release = single_release,
+};
+
+static int __init dpages_proc_init(void)
+{
+	static struct proc_dir_entry *proc_file;
+
+	dirty_dir = proc_mkdir(DIRTY_ROOT, NULL);
+	if (!dirty_dir)
+		goto fail_dir;
+
+	proc_file = proc_create(DIRTY_PAGES, 0440,
+				dirty_dir, &proc_dpages_operations);
+	if (!proc_file)
+		goto fail_pages;
+
+	proc_file = proc_create(DIRTY_SWITCH, 0640,
+				dirty_dir, &proc_switch_operations);
+	if (!proc_file)
+		goto fail_switch;
+
+	proc_file = proc_create(DIRTY_LIMIT, 0640,
+				dirty_dir, &proc_limit_operations);
+	if (!proc_file)
+		goto fail_limit;
+
+	return 0;
+
fail_limit:
+	remove_proc_entry(DIRTY_SWITCH, dirty_dir);
fail_switch:
+	remove_proc_entry(DIRTY_PAGES, dirty_dir);
fail_pages:
+	remove_proc_entry(DIRTY_ROOT, NULL);
fail_dir:
+	return -ENOMEM;
+}
+
+subsys_initcall(dpages_proc_init);
--
2.39.2
[PATCH OLK-5.10] ext4: fix uninitialized ratelimit_state->lock access in __ext4_fill_super()
by Baokun Li 04 Jan '24
maillist inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8TM21
CVE: NA

Reference: https://www.spinics.net/lists/kernel/msg5053632.html

----------------------------------------

In the following concurrency we will access the uninitialized rs->lock:

ext4_fill_super
  ext4_register_sysfs
  // sysfs registered
			msg_ratelimit_interval_ms
			// Other processes modify rs->interval to
			// non-zero via msg_ratelimit_interval_ms
  ext4_orphan_cleanup
    ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
      __ext4_msg
        ___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state)
          if (!rs->interval)  // do nothing if interval is 0
            return 1;
          raw_spin_trylock_irqsave(&rs->lock, flags)
            raw_spin_trylock(lock)
              _raw_spin_trylock
                __raw_spin_trylock
                  spin_acquire(&lock->dep_map, 0, 1, _RET_IP_)
                    lock_acquire
                      __lock_acquire
                        register_lock_class
                          assign_lock_key
                            dump_stack();
  ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
    raw_spin_lock_init(&rs->lock);
    // init rs->lock here

and get the following dump_stack:

=========================================================
INFO: trying to register non-static key.
The code is fine but needs lockdep annotation, or maybe
you didn't initialize this object before use?
turning off the locking correctness validator.
CPU: 12 PID: 753 Comm: mount Tainted: G E 6.7.0-rc6-next-20231222 #504
[...]
Call Trace:
 dump_stack_lvl+0xc5/0x170
 dump_stack+0x18/0x30
 register_lock_class+0x740/0x7c0
 __lock_acquire+0x69/0x13a0
 lock_acquire+0x120/0x450
 _raw_spin_trylock+0x98/0xd0
 ___ratelimit+0xf6/0x220
 __ext4_msg+0x7f/0x160 [ext4]
 ext4_orphan_cleanup+0x665/0x740 [ext4]
 __ext4_fill_super+0x21ea/0x2b10 [ext4]
 ext4_fill_super+0x14d/0x360 [ext4]
[...]
=========================================================

Normally interval is 0 until s_msg_ratelimit_state is initialized, so
___ratelimit() does nothing. But registering sysfs precedes initializing
rs->lock, so it is possible to change rs->interval to a non-zero value
via the msg_ratelimit_interval_ms interface of sysfs while rs->lock is
uninitialized, and then a call to ext4_msg triggers the problem by
accessing an uninitialized rs->lock. Therefore register sysfs after all
initializations are complete to avoid such problems.

Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>

Conflicts:
	fs/ext4/super.c

Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
 fs/ext4/super.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0d0221c73427..90f886184fc6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5145,10 +5145,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto failed_mount6;
 
-	err = ext4_register_sysfs(sb);
-	if (err)
-		goto failed_mount7;
-
 #ifdef CONFIG_QUOTA
 	/* Enable quota usage during mount. */
 	if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
@@ -5181,7 +5177,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_msg(sb, KERN_INFO, "recovery complete");
 		err = ext4_mark_recovery_complete(sb, es);
 		if (err)
-			goto failed_mount10;
+			goto failed_mount9;
 	}
 	if (EXT4_SB(sb)->s_journal) {
 		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -5218,6 +5214,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	atomic_set(&sbi->s_warning_count, 0);
 	atomic_set(&sbi->s_msg_count, 0);
 
+	/* Register sysfs after all initializations are complete. */
+	err = ext4_register_sysfs(sb);
+	if (err)
+		goto failed_mount9;
+
 	kfree(orig_data);
 	return 0;
 
@@ -5226,12 +5227,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
 	goto failed_mount;
 
-failed_mount10:
+failed_mount9:
 	ext4_quota_off_umount(sb);
 failed_mount8: __maybe_unused
-	ext4_unregister_sysfs(sb);
-	kobject_put(&sbi->s_kobj);
-failed_mount7:
 	ext4_unregister_li_request(sb);
 failed_mount6:
 	ext4_mb_release(sb);
--
2.31.1
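The underlying bug class is publish-before-init: an interface becomes
reachable before the lock behind it is initialized. A condensed userspace
sketch of the ordering the patch enforces (names loosely follow the commit
message; this is not the actual ext4 code):

#include <pthread.h>
#include <stdatomic.h>

struct ratelimit {
	pthread_mutex_t lock;
	int interval;
};

/* What "registering sysfs" amounts to: making the object reachable. */
static _Atomic(struct ratelimit *) published;

static void fill_super(struct ratelimit *rs)
{
	/*
	 * Wrong order (the bug): storing rs to 'published' here would let
	 * another thread take rs->lock before pthread_mutex_init() has run.
	 */
	pthread_mutex_init(&rs->lock, NULL);	/* 1. initialize all state */
	rs->interval = 0;
	atomic_store(&published, rs);		/* 2. only then publish */
}

int main(void)
{
	static struct ratelimit rs;

	fill_super(&rs);
	return 0;
}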
[PATCH openEuler-1.0-LTS] ext4: fix uninitialized ratelimit_state->lock access in __ext4_fill_super()
by Baokun Li 04 Jan '24
maillist inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8TM21
CVE: NA

Reference: https://www.spinics.net/lists/kernel/msg5053632.html

----------------------------------------

In the following concurrency we will access the uninitialized rs->lock:

ext4_fill_super
  ext4_register_sysfs
  // sysfs registered
			msg_ratelimit_interval_ms
			// Other processes modify rs->interval to
			// non-zero via msg_ratelimit_interval_ms
  ext4_orphan_cleanup
    ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
      __ext4_msg
        ___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state)
          if (!rs->interval)  // do nothing if interval is 0
            return 1;
          raw_spin_trylock_irqsave(&rs->lock, flags)
            raw_spin_trylock(lock)
              _raw_spin_trylock
                __raw_spin_trylock
                  spin_acquire(&lock->dep_map, 0, 1, _RET_IP_)
                    lock_acquire
                      __lock_acquire
                        register_lock_class
                          assign_lock_key
                            dump_stack();
  ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
    raw_spin_lock_init(&rs->lock);
    // init rs->lock here

and get the following dump_stack:

=========================================================
INFO: trying to register non-static key.
The code is fine but needs lockdep annotation, or maybe
you didn't initialize this object before use?
turning off the locking correctness validator.
CPU: 12 PID: 753 Comm: mount Tainted: G E 6.7.0-rc6-next-20231222 #504
[...]
Call Trace:
 dump_stack_lvl+0xc5/0x170
 dump_stack+0x18/0x30
 register_lock_class+0x740/0x7c0
 __lock_acquire+0x69/0x13a0
 lock_acquire+0x120/0x450
 _raw_spin_trylock+0x98/0xd0
 ___ratelimit+0xf6/0x220
 __ext4_msg+0x7f/0x160 [ext4]
 ext4_orphan_cleanup+0x665/0x740 [ext4]
 __ext4_fill_super+0x21ea/0x2b10 [ext4]
 ext4_fill_super+0x14d/0x360 [ext4]
[...]
=========================================================

Normally interval is 0 until s_msg_ratelimit_state is initialized, so
___ratelimit() does nothing. But registering sysfs precedes initializing
rs->lock, so it is possible to change rs->interval to a non-zero value
via the msg_ratelimit_interval_ms interface of sysfs while rs->lock is
uninitialized, and then a call to ext4_msg triggers the problem by
accessing an uninitialized rs->lock. Therefore register sysfs after all
initializations are complete to avoid such problems.

Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>

Conflicts:
	fs/ext4/super.c

Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
 fs/ext4/super.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 571eaf1ed971..1cfec3ac7c9e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4750,10 +4750,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto failed_mount6;
 
-	err = ext4_register_sysfs(sb);
-	if (err)
-		goto failed_mount7;
-
 #ifdef CONFIG_QUOTA
 	/* Enable quota usage during mount. */
 	if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
@@ -4786,7 +4782,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_msg(sb, KERN_INFO, "recovery complete");
 		err = ext4_mark_recovery_complete(sb, es);
 		if (err)
-			goto failed_mount10;
+			goto failed_mount9;
 	}
 	if (EXT4_SB(sb)->s_journal) {
 		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -4821,6 +4817,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
 	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
 
+	/* Register sysfs after all initializations are complete. */
+	err = ext4_register_sysfs(sb);
+	if (err)
+		goto failed_mount9;
+
 	kfree(orig_data);
 	return 0;
 
@@ -4829,12 +4830,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
 	goto failed_mount;
 
-failed_mount10:
+failed_mount9:
 	ext4_quota_off_umount(sb);
 failed_mount8: __maybe_unused
-	ext4_unregister_sysfs(sb);
-	kobject_put(&sbi->s_kobj);
-failed_mount7:
 	ext4_unregister_li_request(sb);
 failed_mount6:
 	ext4_mb_release(sb);
--
2.31.1
[PATCH OLK-5.10] tls: suppress wakeups unless we have a full record
by Ziyang Xuan 04 Jan '24
From: Jakub Kicinski <kuba(a)kernel.org>

mainline inclusion
from mainline-v6.5-rc1
commit 121dca784fc0f6c022493a5d23d86b3cc20380f4
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QSMO
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…

--------------------------------

TLS does not override .poll(), so a TLS-enabled socket will generate an
event whenever data arrives at the TCP socket. This leads to unnecessary
wakeups on slow connections.

Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: David S. Miller <davem(a)davemloft.net>

Conflicts:
	net/tls/tls_main.c

Signed-off-by: Ziyang Xuan <william.xuanziyang(a)huawei.com>
---
 net/tls/tls_main.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 5790f992b7f8..b82c54cc3daf 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -333,6 +333,39 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 	tls_ctx_free(sk, ctx);
 }
 
+static __poll_t tls_sk_poll(struct file *file, struct socket *sock,
+			    struct poll_table_struct *wait)
+{
+	struct tls_sw_context_rx *ctx;
+	struct tls_context *tls_ctx;
+	struct sock *sk = sock->sk;
+	struct sk_psock *psock;
+	__poll_t mask = 0;
+	u8 shutdown;
+	int state;
+
+	mask = tcp_poll(file, sock, wait);
+
+	state = inet_sk_state_load(sk);
+	shutdown = READ_ONCE(sk->sk_shutdown);
+	if (unlikely(state != TCP_ESTABLISHED || shutdown & RCV_SHUTDOWN))
+		return mask;
+
+	tls_ctx = tls_get_ctx(sk);
+	ctx = tls_sw_ctx_rx(tls_ctx);
+	psock = sk_psock_get(sk);
+
+	if (skb_queue_empty_lockless(&ctx->rx_list) &&
+	    (ctx->recv_pkt == NULL) &&
+	    sk_psock_queue_empty(psock))
+		mask &= ~(EPOLLIN | EPOLLRDNORM);
+
+	if (psock)
+		sk_psock_put(sk, psock);
+
+	return mask;
+}
+
 static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval,
 				  int __user *optlen, int tx)
 {
@@ -691,9 +724,11 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG]
 	ops[TLS_BASE][TLS_SW  ] = ops[TLS_BASE][TLS_BASE];
 	ops[TLS_BASE][TLS_SW  ].splice_read	= tls_sw_splice_read;
+	ops[TLS_BASE][TLS_SW  ].poll		= tls_sk_poll;
 
 	ops[TLS_SW  ][TLS_SW  ] = ops[TLS_SW  ][TLS_BASE];
 	ops[TLS_SW  ][TLS_SW  ].splice_read	= tls_sw_splice_read;
+	ops[TLS_SW  ][TLS_SW  ].poll		= tls_sk_poll;
 
 #ifdef CONFIG_TLS_DEVICE
 	ops[TLS_HW  ][TLS_BASE] = ops[TLS_BASE][TLS_BASE];
--
2.25.1
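The idea behind tls_sk_poll() can be modelled outside the kernel: take the
transport's readiness mask, then clear the readable bits unless the framing
layer has a complete record to hand over. A userspace sketch (the
have_full_record flag stands in for the patch's ctx->rx_list/ctx->recv_pkt
checks):

#include <poll.h>
#include <stdbool.h>

struct record_ctx {
	bool have_full_record;	/* stand-in for ctx->rx_list / ctx->recv_pkt */
};

static short tls_like_poll(int fd, const struct record_ctx *ctx, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	/* 1. Ask the transport, as tls_sk_poll() asks tcp_poll(). */
	if (poll(&pfd, 1, timeout_ms) <= 0)
		return 0;

	/*
	 * 2. Bytes arrived on the TCP socket, but without a complete record
	 * the application would only block in read(): suppress the wakeup
	 * until a full record is decryptable.
	 */
	if ((pfd.revents & POLLIN) && !ctx->have_full_record)
		pfd.revents &= ~POLLIN;

	return pfd.revents;
}

The kernel version additionally leaves the mask untouched for sockets that
are not established or are shut down, since there the raw readiness signal
is meaningful as-is.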
[PATCH openEuler-1.0-LTS] iomap: add support to track dirty state of sub pages
by Yu Kuai 04 Jan '24
hulk inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I8TKTW
CVE: NA

------------------------------------------

commit 9dc55f1389f9 ("iomap: add support for sub-pagesize buffered I/O
without buffer heads") replaced the per-block structure buffer_head with
the per-page structure iomap_page. However, iomap_page can't track the
dirty state of sub pages, which causes a performance issue: sub pages are
written back even if they are not dirty.

For example, if the block size is 4k and the page size is 64k:

dd if=/dev/zero of=testfile bs=4k count=16 oflag=sync

With the buffer_head implementation, the above dd cmd will write back 4k
in each round. However, with the iomap_page implementation, the range of
writeback in each round is from the start of the page to the end offset
we just wrote.

Thus add support to track the dirty state in iomap_page.

test environment:
platform: arm64
pagesize: 64k
blocksize: 4k

test case:
dd if=/dev/zero of=/mnt/testfile bs=1M count=128
fio --ioengine=sync --rw=randwrite --iodepth=64 --name=test \
    --filename=/mnt/testfile --bs=4k --fsync=1

The test result is:

xfs with patch:
   WRITE: bw=4609KiB/s (4720kB/s)
xfs without patch:
   WRITE: bw=2714KiB/s (2780kB/s)
ext4:
   WRITE: bw=3840KiB/s (3932kB/s)

Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
---
 fs/iomap.c            | 70 ++++++++++++++++++++++++++++++++++++-------
 fs/xfs/xfs_aops.c     |  3 +-
 include/linux/iomap.h | 15 ++++++++--
 3 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 7078add7bbf9..d7234c7ac95e 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -121,8 +121,8 @@ iomap_page_create(struct inode *inode, struct page *page)
 	iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
 	atomic_set(&iop->read_count, 0);
 	atomic_set(&iop->write_count, 0);
-	spin_lock_init(&iop->uptodate_lock);
-	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+	spin_lock_init(&iop->state_lock);
+	bitmap_zero(iop->state, IOMAP_STATE_ARRAY_SIZE);
 
 	/*
 	 * migrate_page_move_mapping() assumes that pages with private data have
@@ -175,7 +175,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 
 		/* move forward for each leading block marked uptodate */
 		for (i = first; i <= last; i++) {
-			if (!test_bit(i, iop->uptodate))
+			if (!test_bit(i, iop->state))
 				break;
 			*pos += block_size;
 			poff += block_size;
@@ -185,7 +185,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 
 		/* truncate len if we find any trailing uptodate block(s) */
 		for ( ; i <= last; i++) {
-			if (test_bit(i, iop->uptodate)) {
+			if (test_bit(i, iop->state)) {
 				plen -= (last - i + 1) * block_size;
 				last = i - 1;
 				break;
@@ -209,6 +209,54 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 	*lenp = plen;
 }
 
+static void
+iomap_set_range_dirty(struct page *page, unsigned int off,
+		unsigned int len)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned int first = DIRTY_BITS(off >> inode->i_blkbits);
+	unsigned int last = DIRTY_BITS((off + len - 1) >> inode->i_blkbits);
+	unsigned long flags;
+	struct iomap_page *iop;
+
+	if (PageError(page))
+		return;
+
+	if (len)
+		iomap_set_page_dirty(page);
+
+	if (!page_has_private(page))
+		return;
+
+	iop = to_iomap_page(page);
+	spin_lock_irqsave(&iop->state_lock, flags);
+	bitmap_set(iop->state, first, last - first + 1);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+void
+iomap_clear_range_dirty(struct page *page, unsigned int off,
+		unsigned int len)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned int first = DIRTY_BITS(off >> inode->i_blkbits);
+	unsigned int last = DIRTY_BITS((off + len - 1) >> inode->i_blkbits);
+	unsigned long flags;
+	struct iomap_page *iop;
+
+	if (PageError(page))
+		return;
+
+	if (!page_has_private(page))
+		return;
+
+	iop = to_iomap_page(page);
+	spin_lock_irqsave(&iop->state_lock, flags);
+	bitmap_clear(iop->state, first, last - first + 1);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+EXPORT_SYMBOL_GPL(iomap_clear_range_dirty);
+
 static void
 iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
 {
@@ -220,17 +268,17 @@ iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
 	unsigned long flags;
 	unsigned int i;
 
-	spin_lock_irqsave(&iop->uptodate_lock, flags);
+	spin_lock_irqsave(&iop->state_lock, flags);
 	for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
 		if (i >= first && i <= last)
-			set_bit(i, iop->uptodate);
-		else if (!test_bit(i, iop->uptodate))
+			set_bit(i, iop->state);
+		else if (!test_bit(i, iop->state))
 			uptodate = false;
 	}
 
 	if (uptodate)
 		SetPageUptodate(page);
-	spin_unlock_irqrestore(&iop->uptodate_lock, flags);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
 }
 
 static void
@@ -544,7 +592,7 @@ iomap_is_partially_uptodate(struct page *page, unsigned long from,
 
 	if (iop) {
 		for (i = first; i <= last; i++)
-			if (!test_bit(i, iop->uptodate))
+			if (!test_bit(i, iop->state))
 				return 0;
 		return 1;
 	}
@@ -760,7 +808,7 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 	if (unlikely(copied < len && !PageUptodate(page)))
 		return 0;
 	iomap_set_range_uptodate(page, offset_in_page(pos), len);
-	iomap_set_page_dirty(page);
+	iomap_set_range_dirty(page, offset_in_page(pos), len);
 	return copied;
 }
 
@@ -1096,7 +1144,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 	} else {
 		WARN_ON_ONCE(!PageUptodate(page));
 		iomap_page_create(inode, page);
-		set_page_dirty(page);
+		iomap_set_range_dirty(page, offset_in_page(pos), length);
 	}
 
 	return length;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7a6396bd8114..554c74726552 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -751,7 +751,7 @@ xfs_writepage_map(
 	for (i = 0, file_offset = page_offset(page);
 	     i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
 	     i++, file_offset += len) {
-		if (iop && !test_bit(i, iop->uptodate))
+		if (iop && !test_bit(DIRTY_BITS(i), iop->state))
 			continue;
 
 		error = xfs_map_blocks(wpc, inode, file_offset);
@@ -800,6 +800,7 @@ xfs_writepage_map(
 		 */
 		set_page_writeback_keepwrite(page);
 	} else {
+		iomap_clear_range_dirty(page, 0, PAGE_SIZE);
 		clear_page_dirty_for_io(page);
 		set_page_writeback(page);
 	}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index eed29ae891b1..4ce1290ac0c3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -7,6 +7,7 @@
 #include <linux/bitmap.h>
 #include <linux/mm.h>
 #include <linux/types.h>
+#include <linux/blkdev.h>
 
 struct address_space;
 struct fiemap_extent_info;
@@ -53,6 +54,9 @@ struct vm_fault;
  */
 #define IOMAP_NULL_ADDR -1ULL	/* addr is not valid */
 
+#define DIRTY_BITS(x) ((x) + PAGE_SIZE / SECTOR_SIZE)
+#define IOMAP_STATE_ARRAY_SIZE (PAGE_SIZE * 2 / SECTOR_SIZE)
+
 struct iomap {
 	u64 addr; /* disk offset of mapping, bytes */
 	loff_t offset; /* file offset of mapping, bytes */
@@ -114,8 +118,12 @@ struct iomap_ops {
 struct iomap_page {
 	atomic_t read_count;
 	atomic_t write_count;
-	spinlock_t uptodate_lock;
-	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+	spinlock_t state_lock;
+	/*
+	 * The first half bits are used to track sub-page uptodate status,
+	 * the second half bits are for dirty status.
+	 */
+	DECLARE_BITMAP(state, IOMAP_STATE_ARRAY_SIZE);
 };
 
 static inline struct iomap_page *to_iomap_page(struct page *page)
@@ -136,6 +144,9 @@ int iomap_is_partially_uptodate(struct page *page, unsigned long from,
 int iomap_releasepage(struct page *page, gfp_t gfp_mask);
 void iomap_invalidatepage(struct page *page, unsigned int offset,
 		unsigned int len);
+void iomap_clear_range_dirty(struct page *page, unsigned int off,
+		unsigned int len);
+
 #ifdef CONFIG_MIGRATION
 int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
 		struct page *page, enum migrate_mode mode);
--
2.39.2
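The patch doubles the per-page bitmap: the first half tracks uptodate, the
second half tracks dirty, with DIRTY_BITS() offsetting into the second
half. A condensed userspace model of that layout (sizes illustrative, not
the kernel's bitmap helpers):

#include <limits.h>
#include <stdbool.h>

#define PAGE_SIZE	65536u			/* illustrative: 64k pages */
#define SECTOR_SIZE	512u
#define BLOCKS		(PAGE_SIZE / SECTOR_SIZE)
#define BITS_PER_LONG	(CHAR_BIT * sizeof(unsigned long))
#define DIRTY_BIT(b)	((b) + BLOCKS)		/* second half of the map */

struct iomap_page_model {
	unsigned long state[2 * BLOCKS / BITS_PER_LONG]; /* uptodate + dirty */
};

static void set_bit_(unsigned long *map, unsigned int b)
{
	map[b / BITS_PER_LONG] |= 1UL << (b % BITS_PER_LONG);
}

static bool test_bit_(const unsigned long *map, unsigned int b)
{
	return map[b / BITS_PER_LONG] >> (b % BITS_PER_LONG) & 1;
}

/* Writeback can now skip blocks that are uptodate but not dirty. */
static bool block_needs_writeback(const struct iomap_page_model *iop,
				  unsigned int block)
{
	return test_bit_(iop->state, DIRTY_BIT(block));
}

int main(void)
{
	struct iomap_page_model iop = { {0} };

	set_bit_(iop.state, 3);			/* block 3 read in: uptodate */
	set_bit_(iop.state, DIRTY_BIT(3));	/* block 3 written: dirty */
	return block_needs_writeback(&iop, 3) ? 0 : 1;
}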
[PATCH OLK-6.6 v3 0/3] block: show info about opening a mounted device for write
by Li Lingfeng 04 Jan '24
Both writing to a mounted device and writing to part0 while one of its
partitions is mounted now produce a prompt message.

v1->v2:
 1) Replace BLK_DEV_DUMPINFO with BLK_DEV_WRITE_MOUNTED_QUIET, which is
    logically opposite;
 2) Make BLK_DEV_WRITE_MOUNTED_QUIET depend on BLK_DEV_WRITE_MOUNTED, and
    decide whether to set bd_writers by BLK_DEV_WRITE_MOUNTED_QUIET;
 3) Move bdev_dump_info() into bdev_may_open();
 4) Add a cmdline interface to control whether to show the info;
 5) Pass the "writes blocked" state to part0 when mounting a partition.

v2->v3:
 Add a restriction on modifying bd_writers.

Li Lingfeng (3):
  block: Add config to show info about opening a mounted device for
    write
  block: detect conflict of write and mount between partitions and
    part0
  block: show info of conflict of write and mount between partitions
    and part0

 block/Kconfig |   9 ++++
 block/bdev.c  | 120 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 122 insertions(+), 7 deletions(-)

--
2.31.1
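A toy model of the conflict the series detects: partitions and the
whole-disk node (part0) share the same media, so each side checks the
other's state before writing or mounting. Everything below is
illustrative; the real series tracks this via bd_writers in block/bdev.c:

#include <stdio.h>

struct disk {
	int part0_writers;	/* raw opens of the whole-disk node for write */
	int mounted_parts;	/* partitions currently mounted */
};

static void open_part0_for_write(struct disk *d)
{
	if (d->mounted_parts)
		fprintf(stderr, "info: writing whole disk while a partition is mounted\n");
	d->part0_writers++;
}

static void mount_partition(struct disk *d)
{
	if (d->part0_writers)
		fprintf(stderr, "info: mounting a partition while the whole disk is open for write\n");
	d->mounted_parts++;
}

int main(void)
{
	struct disk d = {0};

	mount_partition(&d);
	open_part0_for_write(&d);	/* triggers the first message */
	return 0;
}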