From: Zhengchao Shao shaozhengchao@huawei.com
mainline inclusion from mainline-v5.19-rc6 commit fd1894224407c484f652ad456e1ce423e89bb3eb category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5HWKR CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=...
--------------------------------
Syzbot found an issue [1]: fq_codel_drop() tries to drop a flow without any skbs, that is, flow->head is NULL. The root cause, as [2] explains, is that bpf_prog_test_run_skb() runs a bpf prog which redirects empty skbs. So we should check that the length of the packet modified by the bpf prog, or by other callers such as bpf_prog_test, is valid before forwarding it directly.
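As an illustration only, here is a minimal sketch (not taken from the syzbot report; the section name and interface index are assumptions) of the kind of BPF program that, when exercised through bpf_prog_test_run_skb() with a zero-length skb, redirects the empty packet towards __dev_queue_xmit():

  /* Hypothetical TC-style program that blindly redirects every packet.
   * If BPF_PROG_TEST_RUN hands it a zero-length skb, the empty packet
   * reaches __dev_queue_xmit() and can later hit fq_codel_drop() with
   * flow->head == NULL.
   */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("classifier")
  int redirect_all(struct __sk_buff *skb)
  {
          return bpf_redirect(1 /* example ifindex */, 0);
  }

  char _license[] SEC("license") = "GPL";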
LINK: [1] https://syzkaller.appspot.com/bug?id=0b84da80c2917757915afa89f7738a9d16ec96c... LINK: [2] https://www.spinics.net/lists/netdev/msg777503.html
Reported-by: syzbot+7a12909485b94426aceb@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Reviewed-by: Stanislav Fomichev sdf@google.com Link: https://lore.kernel.org/r/20220715115559.139691-1-shaozhengchao@huawei.com Signed-off-by: Alexei Starovoitov ast@kernel.org Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/skbuff.h | 8 ++++++++ net/bpf/test_run.c | 3 +++ net/core/dev.c | 1 + 3 files changed, 12 insertions(+)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 68efccc15a87..72cfb047fd2c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2251,6 +2251,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
#endif /* NET_SKBUFF_DATA_USES_OFFSET */
+static inline void skb_assert_len(struct sk_buff *skb) +{ +#ifdef CONFIG_DEBUG_NET + if (WARN_ONCE(!skb->len, "%s\n", __func__)) + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); +#endif /* CONFIG_DEBUG_NET */ +} + /* * Add data to an sk_buff */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 99712d35e535..f266a9453c8e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -398,6 +398,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
+ if (!skb->len) + return -EINVAL; + if (!__skb) return 0;
diff --git a/net/core/dev.c b/net/core/dev.c index 12089c484b30..8e4de36eede8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4094,6 +4094,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) bool again = false;
skb_reset_mac_header(skb); + skb_assert_len(skb);
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
From: Sven Schnelle svens@linux.ibm.com
mainline inclusion from mainline-v5.14-rc6 commit f153c2246783ba210493054d99c66353f56423c9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5IDIC CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
commit f9c82a4ea89c3 ("Increase size of ucounts to atomic_long_t") changed the data type of ucounts/ucounts_max to long, but missed adjusting a few other places. This is noticeable on big endian platforms from user space because the /proc/sys/user/max_*_names files all contain 0.
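A minimal user-space sketch (purely illustrative, not part of the fix) of why reading a long through an int-sized handler yields 0 on big endian:

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          long max = 128;         /* e.g. one ucount_max[] entry */
          int truncated;

          /* what a sizeof(int) sysctl handler effectively reads */
          memcpy(&truncated, &max, sizeof(truncated));

          /* prints 128 on little endian, but 0 on big endian */
          printf("%d\n", truncated);
          return 0;
  }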
v4 - Made the min and max constants long so the sysctl values are actually settable on little endian machines. -- EWB
Fixes: f9c82a4ea89c ("Increase size of ucounts to atomic_long_t") Signed-off-by: Sven Schnelle svens@linux.ibm.com Tested-by: Nathan Chancellor nathan@kernel.org Tested-by: Linux Kernel Functional Testing lkft@linaro.org Acked-by: Alexey Gladkov legion@kernel.org v1: https://lkml.kernel.org/r/20210721115800.910778-1-svens@linux.ibm.com v2: https://lkml.kernel.org/r/20210721125233.1041429-1-svens@linux.ibm.com v3: https://lkml.kernel.org/r/20210730062854.3601635-1-svens@linux.ibm.com Link: https://lkml.kernel.org/r/8735rijqlv.fsf_-_@disp2133 Signed-off-by: Eric W. Biederman ebiederm@xmission.com
Conflict: fs/notify/fanotify/fanotify_user.c
Signed-off-by: Li Nan linan122@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/notify/inotify/inotify_user.c | 17 +++++++++++------ kernel/ucount.c | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5f6c6bf65909..f13b64729d08 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -46,22 +46,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
#include <linux/sysctl.h>
+static long it_zero = 0; +static long it_int_max = INT_MAX; + struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_queued_events", diff --git a/kernel/ucount.c b/kernel/ucount.c index dff1d9b739d2..1f5825b674d8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -52,14 +52,17 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, };
-#define UCOUNT_ENTRY(name) \ - { \ - .procname = name, \ - .maxlen = sizeof(int), \ - .mode = 0644, \ - .proc_handler = proc_dointvec_minmax, \ - .extra1 = SYSCTL_ZERO, \ - .extra2 = SYSCTL_INT_MAX, \ +static long ue_zero = 0; +static long ue_int_max = INT_MAX; + +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(long), \ + .mode = 0644, \ + .proc_handler = proc_doulongvec_minmax, \ + .extra1 = &ue_zero, \ + .extra2 = &ue_int_max, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"),
From: Luo Meng luomeng12@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA
--------------------------------
Warning reports as follows:
WARNING: CPU: 3 PID: 674 at fs/block_dev.c:1272 bd_link_disk_holder+0xcd/0x270 Modules linked in: null_blk(+) CPU: 3 PID: 674 Comm: dmsetup Not tainted 5.10.0-16691-gf6076432827d-dirty #158 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4 RIP: 0010:bd_link_disk_holder+0xcd/0x270 Code: 69 73 ee 00 44 89 e8 5b 48 83 05 c5 bf 6d 0c 01 5d 41 5c 41 5d 41 5e 41 8 RSP: 0018:ffffc9000049bbb8 EFLAGS: 00010202 RAX: ffff888104e39038 RBX: ffff888104185000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffaa085692 RDI: 0000000000000000 RBP: ffff88810cc2ae00 R08: ffffffffa853659b R09: 0000000000000000 R10: ffffc9000049bbb0 R11: 720030626c6c756e R12: ffff88810e800000 R13: ffff88810e800090 R14: ffff888103570c98 R15: ffff888103570c80 FS: 00007fb49dc13dc0(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff994ebde70 CR3: 000000010d54a000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: dm_get_table_device+0x175/0x300 dm_get_device+0x238/0x360 linear_ctr+0xee/0x170 dm_table_add_target+0x199/0x4b0 table_load+0x18c/0x480 ? table_clear+0x190/0x190 ctl_ioctl+0x21d/0x640 ? check_preemption_disabled+0x140/0x150 dm_ctl_ioctl+0x12/0x20 __se_sys_ioctl+0xb1/0x100 __x64_sys_ioctl+0x1e/0x30 do_syscall_64+0x45/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xa9
This can be reproduced by concurrent operations:

1. modprobe null_blk
2. echo -e "0 10000 linear /dev/nullb0 0" > table
   dmsetup create xxx table
t1: create disk a            |  t2: dm setup
device_add_disk              |
  dev->devt = devt           |
                             |  dm_get_table_device
                             |    open_table_device
                             |      blkdev_get_by_dev -> succeed
                             |      bd_link_disk_holder
                             |        -> holder_dir is still NULL
  register_disk              |
    -> create holder_dir     |
       kobject_create_and_add
device_add_disk() will set devt before creating holder_dir, which leaves a window in which dm_get_table_device() can find the disk by devt while its holder_dir is still NULL.
So move the setting of GENHD_FL_UP into blk_register_queue() to avoid this warning and to fix a NULL pointer dereference in __blk_mq_sched_bio_merge().
Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-sysfs.c | 6 ++++++ block/genhd.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0a4fcbda8ab4..aff53c3ae836 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -910,6 +910,12 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock);
+ + /* + * Set the flag at last, so that block devcie can't be opened + * before it's registration is done. + */ + disk->flags |= GENHD_FL_UP; ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/genhd.c b/block/genhd.c index 8b37fcfa10d1..9d91f880ea95 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -799,8 +799,6 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, WARN_ON(!disk->minors && !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));
- disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(&disk->part0, &devt); if (retval) { WARN_ON(1);
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA
--------------------------------
Patch ("block: Fix warning in bd_link_disk_holder()") moves the setting of flag 'GENHD_FL_UP' behind blkdev_get, which will disabled part scan:
device_add_disk
  register_disk
    blkdev_get
      __blkdev_get
        bdev_get_gendisk
          get_gendisk -> failed because 'GENHD_FL_UP' is not set
This causes tests block/017, block/018 and scsi/004 to fail.
Fix the problem by moving part scan as well.
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-sysfs.c | 33 +++++++++++++++++++++++++++++++++ block/genhd.c | 27 --------------------------- 2 files changed, 33 insertions(+), 27 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index aff53c3ae836..f7cd16cec0ed 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -821,6 +821,38 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, };
+static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + +static void disk_init_partition(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct disk_part_iter piter; + struct hd_struct *part; + + disk_scan_partitions(disk); + + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -916,6 +948,7 @@ int blk_register_queue(struct gendisk *disk) * before it's registration is done. */ disk->flags |= GENHD_FL_UP; + disk_init_partition(disk); ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/genhd.c b/block/genhd.c index 9d91f880ea95..021c9c2d7231 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -687,25 +687,10 @@ static int exact_lock(dev_t devt, void *data) return 0; }
-static void disk_scan_partitions(struct gendisk *disk) -{ - struct block_device *bdev; - - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; - - set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); -} - static void register_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct hd_struct *part; int err;
ddev->parent = parent; @@ -743,18 +728,6 @@ static void register_disk(struct device *parent, struct gendisk *disk, if (disk->flags & GENHD_FL_HIDDEN) return;
- disk_scan_partitions(disk); - - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); - if (disk->queue->backing_dev_info->dev) { err = sysfs_create_link(&ddev->kobj, &disk->queue->backing_dev_info->dev->kobj,
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5ETAB CVE: NA
--------------------------------
Patch ("block: fix that part scan is disabled in device_add_disk()") confuse lockdep to produce following warning:
===================================================== WARNING: possible circular locking dependency detected 4.18.0+ #2 Tainted: G ---------r- - ------------------------------------------------------ syz-executor.0/4652 is trying to acquire lock: 00000000ad5f5a19 (&mddev->open_mutex){+.+.}, at: md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626
but task is already holding lock: 000000005c3a3fea (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&bdev->bd_mutex){+.+.}: __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 __blkdev_get+0x156/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1583 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 disk_init_partition home/install/linux-rh-3-10/block/blk-sysfs.c:972 [inline] blk_register_queue+0x5ed/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1055 __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729 sd_probe_async+0x447/0x852 home/install/linux-rh-3-10/drivers/scsi/sd.c:3249 async_run_entry_fn+0xe1/0x700 home/install/linux-rh-3-10/kernel/async.c:127 process_one_work+0x9cf/0x1940 home/install/linux-rh-3-10/kernel/workqueue.c:2175 worker_thread+0x91/0xc50 home/install/linux-rh-3-10/kernel/workqueue.c:2321 kthread+0x33a/0x400 home/install/linux-rh-3-10/kernel/kthread.c:257 ret_from_fork+0x3a/0x50 home/install/linux-rh-3-10/arch/x86/entry/entry_64.S:355
-> #1 (&q->sysfs_dir_lock){+.+.}: __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 blk_register_queue+0x143/0x6c0 home/install/linux-rh-3-10/block/blk-sysfs.c:1010 __device_add_disk+0xab5/0xd70 home/install/linux-rh-3-10/block/genhd.c:729 add_disk home/install/linux-rh-3-10/./include/linux/genhd.h:447 [inline] md_alloc+0xb06/0x10d0 home/install/linux-rh-3-10/drivers/md/md.c:5525 md_probe+0x32/0x60 home/install/linux-rh-3-10/drivers/md/md.c:5554 kobj_lookup+0x2d2/0x450 home/install/linux-rh-3-10/drivers/base/map.c:152 get_gendisk+0x3b/0x360 home/install/linux-rh-3-10/block/genhd.c:860 bdev_get_gendisk home/install/linux-rh-3-10/fs/block_dev.c:1181 [inline] __blkdev_get+0x3b6/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1578 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923 do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777 do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline] path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578 do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613 do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf
-> #0 (&mddev->open_mutex){+.+.}: lock_acquire+0x10b/0x3a0 home/install/linux-rh-3-10/kernel/locking/lockdep.c:3868 __mutex_lock_common home/install/linux-rh-3-10/kernel/locking/mutex.c:925 [inline] __mutex_lock+0x105/0x1270 home/install/linux-rh-3-10/kernel/locking/mutex.c:1072 md_open+0x13a/0x260 home/install/linux-rh-3-10/drivers/md/md.c:7626 __blkdev_get+0x2dc/0x1490 home/install/linux-rh-3-10/fs/block_dev.c:1599 blkdev_get+0x33c/0xac0 home/install/linux-rh-3-10/fs/block_dev.c:1735 blkdev_open+0x1c2/0x250 home/install/linux-rh-3-10/fs/block_dev.c:1923 do_dentry_open+0x686/0xf50 home/install/linux-rh-3-10/fs/open.c:777 do_last home/install/linux-rh-3-10/fs/namei.c:3449 [inline] path_openat+0x92f/0x28c0 home/install/linux-rh-3-10/fs/namei.c:3578 do_filp_open+0x1aa/0x2b0 home/install/linux-rh-3-10/fs/namei.c:3613 do_sys_open+0x307/0x490 home/install/linux-rh-3-10/fs/open.c:1075 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x6a/0xdf
other info that might help us debug this:
Chain exists of: &mddev->open_mutex --> &q->sysfs_dir_lock --> &bdev->bd_mutex
Possible unsafe locking scenario:
       CPU0                    CPU1
       ----                    ----
  lock(&bdev->bd_mutex);
                               lock(&q->sysfs_dir_lock);
                               lock(&bdev->bd_mutex);
  lock(&mddev->open_mutex);
*** DEADLOCK ***
Since 'bd_mutex' and 'sysfs_dir_lock' are different for each device, a deadlock between md_open() and sd_probe_async() is impossible. However, lockdep treats 'bd_mutex' and 'sysfs_dir_lock' from different devices as the same lock, and patch "block: fix that part scan is disabled in device_add_disk()" holds 'bd_mutex' inside 'sysfs_dir_lock', which causes the false positive warning.
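A hedged sketch (the struct and function names are invented) of why lockdep collapses per-device locks into one class: every mutex initialised from the same mutex_init() call site shares one static lock_class_key, so ordering is validated per class rather than per instance:

  struct my_bdev {
          struct mutex bd_mutex;
  };

  static void my_bdev_init(struct my_bdev *b)
  {
          /*
           * mutex_init() defines one static lock_class_key per call
           * site, so every my_bdev initialised here lands in the same
           * lockdep class even though the mutex instances belong to
           * different devices.
           */
          mutex_init(&b->bd_mutex);
  }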
Fix the false positive warning by not grabbing 'bd_mutex' inside 'sysfs_dir_lock'.
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-sysfs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f7cd16cec0ed..548d758365c6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -948,10 +948,17 @@ int blk_register_queue(struct gendisk *disk) * before it's registration is done. */ disk->flags |= GENHD_FL_UP; - disk_init_partition(disk); ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); + /* + * Init partitions after releasing 'sysfs_dir_lock', otherwise lockdep + * will be confused because it will treat 'bd_mutex' from different + * devices as the same lock. + */ + if (!ret) + disk_init_partition(disk); + return ret; } EXPORT_SYMBOL_GPL(blk_register_queue);
From: Amir Goldstein amir73il@gmail.com
mainline inclusion from mainline-v5.19-rc1 commit a32e697cda27679a0327ae2cafdad8c7170f548f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5IHD1 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
The inotify mask flags IN_ONESHOT and IN_EXCL_UNLINK are not "internal to kernel" and should be exposed in procfs fdinfo so CRIU can restore them.
Fixes: 6933599697c9 ("inotify: hide internal kernel bits from fdinfo") Link: https://lore.kernel.org/r/20220422120327.3459282-2-amir73il@gmail.com Signed-off-by: Amir Goldstein amir73il@gmail.com Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Li Nan linan122@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/notify/fdinfo.c | 11 ++--------- fs/notify/inotify/inotify.h | 12 ++++++++++++ fs/notify/inotify/inotify_user.c | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index f0d6b54be412..765b50aeadd2 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -83,16 +83,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { - /* - * IN_ALL_EVENTS represents all of the mask bits - * that we expose to userspace. There is at - * least one bit (FS_EVENT_ON_CHILD) which is - * used only internally to the kernel. - */ - u32 mask = mark->mask & IN_ALL_EVENTS; - seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", + seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mask, mark->ignored_mask); + inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 2007e3711916..8f00151eb731 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -22,6 +22,18 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct inotify_event_info, fse); }
+/* + * INOTIFY_USER_FLAGS represents all of the mask bits that we expose to + * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ +#define INOTIFY_USER_MASK (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK) + +static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark) +{ + return fsn_mark->mask & INOTIFY_USER_MASK; +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f13b64729d08..3986f1877457 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -93,7 +93,7 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) mask |= FS_EVENT_ON_CHILD;
/* mask off the flags used to open the fd */ - mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)); + mask |= (arg & INOTIFY_USER_MASK);
return mask; }
From: Helge Deller deller@gmx.de
stable inclusion from stable-v5.10.130 commit b727561ddc9360de9631af2d970d8ffed676a750 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 65a01e601dbba8b7a51a2677811f70f783766682 upstream.
Prevent users from setting a font size which is bigger than the physical screen. It's unlikely this will happen (because screens are usually much larger than the fonts and each font char is limited to 32x32 pixels), but it may happen on smaller screens/LCD displays.
Signed-off-by: Helge Deller deller@gmx.de Reviewed-by: Daniel Vetter daniel.vetter@ffwll.ch Reviewed-by: Geert Uytterhoeven geert@linux-m68k.org Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/video/fbdev/core/fbcon.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index f102519ccefb..8d81e9321cf7 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2510,6 +2510,11 @@ static int fbcon_set_font(struct vc_data *vc, struct console_font *font, if (charcount != 256 && charcount != 512) return -EINVAL;
+ /* font bigger than screen resolution ? */ + if (w > FBCON_SWAP(info->var.rotate, info->var.xres, info->var.yres) || + h > FBCON_SWAP(info->var.rotate, info->var.yres, info->var.xres)) + return -EINVAL; + /* Make sure drawing engine can handle the font */ if (!(info->pixmap.blit_x & (1 << (font->width - 1))) || !(info->pixmap.blit_y & (1 << (font->height - 1))))
From: Helge Deller deller@gmx.de
stable inclusion from stable-v5.10.130 commit cecb806c766c78e1be62b6b7b1483ef59bbaeabe category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit e64242caef18b4a5840b0e7a9bff37abd4f4f933 upstream.
We need to prevent users from configuring a screen size which is smaller than the currently selected font size. Otherwise, rendering characters on the screen will access memory outside the graphics memory region.
This patch adds a new function fbcon_modechange_possible() which implements this check and which later may be extended with other checks if necessary. The new function is called from the FBIOPUT_VSCREENINFO ioctl handler in fbmem.c, which will return -EINVAL if userspace asked for a too small screen size.
Signed-off-by: Helge Deller deller@gmx.de Reviewed-by: Geert Uytterhoeven geert@linux-m68k.org Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/video/fbdev/core/fbcon.c | 28 ++++++++++++++++++++++++++++ drivers/video/fbdev/core/fbmem.c | 4 +++- include/linux/fbcon.h | 4 ++++ 3 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 8d81e9321cf7..b4260a830e78 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2776,6 +2776,34 @@ void fbcon_update_vcs(struct fb_info *info, bool all) } EXPORT_SYMBOL(fbcon_update_vcs);
+/* let fbcon check if it supports a new screen resolution */ +int fbcon_modechange_possible(struct fb_info *info, struct fb_var_screeninfo *var) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + unsigned int i; + + WARN_CONSOLE_UNLOCKED(); + + if (!ops) + return 0; + + /* prevent setting a screen size which is smaller than font size */ + for (i = first_fb_vc; i <= last_fb_vc; i++) { + vc = vc_cons[i].d; + if (!vc || vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[i]] != info) + continue; + + if (vc->vc_font.width > FBCON_SWAP(var->rotate, var->xres, var->yres) || + vc->vc_font.height > FBCON_SWAP(var->rotate, var->yres, var->xres)) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fbcon_modechange_possible); + int fbcon_mode_deleted(struct fb_info *info, struct fb_videomode *mode) { diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 00939ca2065a..d2cffaf3fda8 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1109,7 +1109,9 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, return -EFAULT; console_lock(); lock_fb_info(info); - ret = fb_set_var(info, &var); + ret = fbcon_modechange_possible(info, &var); + if (!ret) + ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); unlock_fb_info(info); diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h index ff5596dd30f8..2382dec6d6ab 100644 --- a/include/linux/fbcon.h +++ b/include/linux/fbcon.h @@ -15,6 +15,8 @@ void fbcon_new_modelist(struct fb_info *info); void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps); void fbcon_fb_blanked(struct fb_info *info, int blank); +int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var); void fbcon_update_vcs(struct fb_info *info, bool all); void fbcon_remap_all(struct fb_info *info); int fbcon_set_con2fb_map_ioctl(void __user *argp); @@ -33,6 +35,8 @@ static inline void fbcon_new_modelist(struct fb_info *info) {} static inline void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps) {} static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {} +static inline int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var) { return 0; } static inline void fbcon_update_vcs(struct fb_info *info, bool all) {} static inline void fbcon_remap_all(struct fb_info *info) {} static inline int fbcon_set_con2fb_map_ioctl(void __user *argp) { return 0; }
From: Helge Deller deller@gmx.de
stable inclusion from stable-v5.10.130 commit b81212828ad19ab3eccf00626cd04099215060bf category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5IQ4M CVE: CVE-2021-33655
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 6c11df58fd1ac0aefcb3b227f72769272b939e56 upstream.
Verify that the fbdev or drm driver correctly adjusted the virtual screen sizes. On failure report the failing driver and reject the screen size change.
Signed-off-by: Helge Deller deller@gmx.de Reviewed-by: Geert Uytterhoeven geert@linux-m68k.org Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Chen Jun chenjun102@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/video/fbdev/core/fbmem.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index d2cffaf3fda8..3b3ccb235522 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1019,6 +1019,16 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) if (ret) return ret;
+ /* verify that virtual resolution >= physical resolution */ + if (var->xres_virtual < var->xres || + var->yres_virtual < var->yres) { + pr_warn("WARNING: fbcon: Driver '%s' missed to adjust virtual screen size (%ux%u vs. %ux%u)\n", + info->fix.id, + var->xres_virtual, var->yres_virtual, + var->xres, var->yres); + return -EINVAL; + } + if ((var->activate & FB_ACTIVATE_MASK) != FB_ACTIVATE_NOW) return 0;
From: Eric Snowberg eric.snowberg@oracle.com
mainline inclusion from mainline-v5.19-rc8 commit 543ce63b664e2c2f9533d089a4664b559c3e6b5b category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5I0FP CVE: CVE-2022-21505
Reference: https://seclists.org/oss-sec/2022/q3/57 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The lockdown LSM is primarily used in conjunction with UEFI Secure Boot. This LSM may also be used on machines without UEFI. It can also be enabled when UEFI Secure Boot is disabled. One of lockdown's features is to prevent kexec from loading untrusted kernels. Lockdown can be enabled through a bootparam or after the kernel has booted through securityfs.
If IMA appraisal is used with the "ima_appraise=log" boot param, lockdown can be defeated with kexec on any machine when Secure Boot is disabled or unavailable. IMA prevents setting "ima_appraise=log" from the boot param when Secure Boot is enabled, but this does not cover cases where lockdown is used without Secure Boot.
To defeat lockdown, boot without Secure Boot and add ima_appraise=log to the kernel command line; then:
$ echo "integrity" > /sys/kernel/security/lockdown $ echo "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig" > \ /sys/kernel/security/ima/policy $ kexec -ls unsigned-kernel
Add a call to verify ima appraisal is set to "enforce" whenever lockdown is enabled. This fixes CVE-2022-21505.
Cc: stable@vger.kernel.org Fixes: 29d3c1c8dfe7 ("kexec: Allow kexec_file() with appropriate IMA policy when locked down") Signed-off-by: Eric Snowberg eric.snowberg@oracle.com Acked-by: Mimi Zohar zohar@linux.ibm.com Reviewed-by: John Haxby john.haxby@oracle.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: GUO Zihua guozihua@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- security/integrity/ima/ima_policy.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 5a06050729a7..b1ab4b3d99fb 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1900,6 +1900,10 @@ bool ima_appraise_signature(enum kernel_read_file_id id) if (id >= READING_MAX_ID) return false;
+ if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) + && security_locked_down(LOCKDOWN_KEXEC)) + return false; + func = read_idmap[id] ?: FILE_CHECK;
rcu_read_lock();
From: ZhaoLong Wang wangzhaolong1@huawei.com
hulk inclusion category: bugfix bugzilla: 187163, https://gitee.com/openeuler/kernel/issues/I5GBC4 CVE: NA
--------------------------------
The ubifs_compress() function does not compress the data when the data length is shorter than 128 bytes or when the compressed length is not ideal. As a result, the compressed length of the truncated data in the truncate_data_node() function may be greater than the length of the raw data read from the flash.
These two lengths are passed to the ubifs_encrypt() function as parameters. This may lead to assertion failures, after which the file system becomes read-only.
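As a purely hypothetical illustration of the mismatch: suppose a 4096-byte block was compressed down to 50 bytes on flash, so the data node read back carries 50 bytes of data. After truncation, the remaining 120 bytes are re-compressed, but since ubifs_compress() leaves data shorter than 128 bytes uncompressed, the new data length becomes 120 bytes, which is larger than the 50 bytes read from the flash that the old code passed to ubifs_encrypt() as the output length.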
This patch uses the actual length of the data in memory as the input parameter for the assertion comparison, which avoids the problem.
Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com Reviewed-by: zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ubifs/crypto.c | 11 +++++++++++ fs/ubifs/journal.c | 29 +++++++++++++++++------------ 2 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4..1dc22cf29c65 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; }
+/** + * ubifs_encrypt - Encrypt data. + * @inode: inode which refers to the data node + * @dn: data node to encrypt + * @in_len: length of data to be compressed + * @out_len: allocated memory size for the data area of @dn + * @block: logical block number of the block + * + * This function encrypt a possibly-compressed data in the data node. + * The encrypted data length will store in @out_len. + */ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 72586512e51f..ee9888087983 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1472,23 +1472,25 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, * @block: data block number * @dn: data node to re-compress * @new_len: new length + * @dn_size: size of the data node @dn in memory * * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed/encrypted and re-written. */ static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, unsigned int block, struct ubifs_data_node *dn, - int *new_len) + int *new_len, int dn_size) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, dlen, compr_type, out_len, data_size;
out_len = le32_to_cpu(dn->size); buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM;
- dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + data_size = dn_size - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type);
if (IS_ENCRYPTED(inode)) { @@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in }
if (IS_ENCRYPTED(inode)) { - err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + err = ubifs_encrypt(inode, dn, out_len, &data_size, block); if (err) goto out;
- out_len = old_dlen; + out_len = data_size; } else { dn->compr_size = 0; } @@ -1549,7 +1551,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, struct ubifs_ino_node *ino; struct ubifs_trun_node *trun; struct ubifs_data_node *dn; - int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + int err, dlen, len, lnum, offs, bit, sz, dn_size, sync = IS_SYNC(inode); struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; @@ -1562,10 +1564,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_assert(c, S_ISREG(inode->i_mode)); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
- sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + - UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + dn_size = COMPRESSED_DATA_NODE_BUF_SZ; + + if (IS_ENCRYPTED(inode)) + dn_size += UBIFS_CIPHER_BLOCK_SIZE;
- sz += ubifs_auth_node_sz(c); + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + dn_size + ubifs_auth_node_sz(c);
ino = kmalloc(sz, GFP_NOFS); if (!ino) @@ -1596,15 +1601,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - - UBIFS_TRUN_NODE_SZ); + ubifs_dump_node(c, dn, dn_size); goto out_free; }
if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { - err = truncate_data_node(c, inode, blk, dn, &dlen); + err = truncate_data_node(c, inode, blk, dn, + &dlen, dn_size); if (err) goto out_free; }
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 187250, https://gitee.com/openeuler/kernel/issues/I5HSMS CVE: NA
-------------------------------------------------
The following process:

vfs_setxattr(host)
  ubifs_xattr_set
    down_write(host_ui->xattr_sem)          <- lock first time
    create_xattr
      ubifs_new_inode(host)
        fscrypt_prepare_new_inode(host)
          fscrypt_policy_to_inherit(host)
            if (IS_ENCRYPTED(inode))
              fscrypt_require_key(host)
                fscrypt_get_encryption_info(host)
                  ubifs_xattr_get(host)
                    down_read(host_ui->xattr_sem)  <- AA deadlock

may trigger an AA deadlock problem:
[ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40
Fetch a reproducer in [Link].
Just like ext4, which skips encryption for inodes with the EXT4_EA_INODE_FL flag, stop encrypting xattr inodes for ubifs.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ubifs/dir.c | 25 ++++++++++++++----------- fs/ubifs/ubifs.h | 2 +- fs/ubifs/xattr.c | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-)
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f5777f59a101..acd7e83a35e4 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @c: UBIFS file-system description object * @dir: parent directory inode * @mode: inode mode flags + * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and * initializes it. Returns new inode in case of success and an error code in * case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode) + umode_t mode, bool is_xattr) { int err; struct inode *inode; @@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0;
- err = fscrypt_prepare_new_inode(dir, inode, &encrypted); - if (err) { - ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); - goto out_iput; + if (!is_xattr) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } }
switch (mode & S_IFMT) { @@ -308,7 +311,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -369,7 +372,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) if (err) return ERR_PTR(err);
- inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_free; @@ -461,7 +464,7 @@ static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, return err; }
- inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; @@ -1002,7 +1005,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -1089,7 +1092,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); @@ -1171,7 +1174,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
sz_change = CALC_DENT_SIZE(fname_len(&nm));
- inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 9a4a3191ed07..15cfee7c1125 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1996,7 +1996,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags);
/* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode); + umode_t mode, bool is_xattr); int ubifs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 258b97939ba2..81efe638acd5 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (err) return err;
- inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg;
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: bugfix bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
--------------------------------
A later patch will disable hugetlb_vmemmap when dynamic hugetlb is enabled.
This reverts commit c7ae7c0dd37aa112f9cb3e23878f4c5fe67f86bc.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/dynamic_hugetlb.c | 11 ----------- 1 file changed, 11 deletions(-)
diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index eb9b528b73de..f8ebc8ab7d60 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1133,17 +1133,6 @@ void __init dynamic_hugetlb_init(void) if (!enable_dhugetlb) return;
- /* - * The dynamic_hugetlb feature need to split and merge pages frequently. - * hugetlb_vmemmap will affects the perforemance of page split and merge. - * If want to use dynamic hugetlb, please disable hugetlb_vmemmap. - */ - if (hugetlb_free_vmemmap_enabled) { - enable_dhugetlb = false; - pr_info("Please set hugetlb_free_vmemmap=off if want to enable dynamic hugetlb\n"); - return; - } - count = max(hugepage_index(max_pfn), (unsigned long)DEFAULT_PAGELIST_COUNT); size = sizeof(struct dhugetlb_pagelist) + count * sizeof(struct dhugetlb_pool *); dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL);
From: Kees Cook keescook@chromium.org
mainline inclusion from mainline-v5.13-rc1 commit 0d66ccc1627013c95f1e7ef10b95b8451cd7834e category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
As shown in the comment in jump_label.h, choosing the initial state of static branches changes the assembly layout. If the condition is expected to be likely it's inline, and if unlikely it is out of line via a jump.
A few places in the kernel use (or could be using) a CONFIG to choose the default state, which would give a small performance benefit to their compile-time declared default. Provide the infrastructure to do this.
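A hedged usage sketch of the new macros (the config symbol and key name below are invented for illustration):

  #include <linux/jump_label.h>

  /* Default state of the key follows CONFIG_MYFEATURE_DEFAULT_ON. */
  DEFINE_STATIC_KEY_MAYBE(CONFIG_MYFEATURE_DEFAULT_ON, myfeature_key);

  static bool myfeature_active(void)
  {
          /*
           * Expands to static_branch_likely() when the config option is
           * enabled and static_branch_unlikely() otherwise, keeping the
           * compile-time default on the inline fast path.
           */
          return static_branch_maybe(CONFIG_MYFEATURE_DEFAULT_ON,
                                     &myfeature_key);
  }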
Signed-off-by: Kees Cook keescook@chromium.org Signed-off-by: Thomas Gleixner tglx@linutronix.de Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lore.kernel.org/r/20210401232347.2791257-2-keescook@chromium.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/jump_label.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+)
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7e1dce5670fc..d01c23025af0 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -388,6 +388,21 @@ struct static_key_false { [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT, \ }
+#define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) +#define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) +#define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) + +#define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) +#define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) +#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ + __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) + +#define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) +#define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) +#define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ + __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) + extern bool ____wrong_branch_error(void);
#define static_key_enabled(x) \ @@ -488,6 +503,10 @@ extern bool ____wrong_branch_error(void);
#endif /* CONFIG_JUMP_LABEL */
+#define static_branch_maybe(config, x) \ + (IS_ENABLED(config) ? static_branch_likely(x) \ + : static_branch_unlikely(x)) + /* * Advanced usage; refcount, branch is enabled when: count != 0 */
From: "Matthew Wilcox (Oracle)" willy@infradead.org
mainline inclusion from mainline-v5.14-rc1 commit 0f2317e34e2c7b97efd4600122115410795ebeea category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If you pass a const pointer to compound_head(), you get a const pointer back; if you pass a mutable pointer, you get a mutable pointer back. Also remove an unnecessary forward definition of struct page; we're about to dereference page->compound_head, so it must already have been defined.
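A small sketch of the const-preserving behaviour (the helper name is made up):

  /* With the macro form, typeof(page) keeps the caller's const
   * qualifier, so a read-only helper needs no cast.
   */
  static inline bool my_page_is_head(const struct page *page)
  {
          return compound_head(page) == page;
  }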
Link: https://lkml.kernel.org/r/20210416231531.2521383-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) willy@infradead.org Reviewed-by: Vlastimil Babka vbabka@suse.cz Reviewed-by: Anshuman Khandual anshuman.khandual@arm.com Reviewed-by: William Kucharski william.kucharski@oracle.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page-flags.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 65e1cbe1d1ce..b40b4e0ada31 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,17 +194,17 @@ enum pageflags {
#ifndef __GENERATING_BOUNDS_H
-struct page; /* forward declaration */ - -static inline struct page *compound_head(struct page *page) +static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head);
if (unlikely(head & 1)) - return (struct page *) (head - 1); - return page; + return head - 1; + return (unsigned long)page; }
+#define compound_head(page) ((typeof(page))_compound_head(page)) + static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1;
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.18-rc1 commit e7d324850bfcb30df563d144c0363cc44595277d category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Patch series "Free the 2nd vmemmap page associated with each HugeTLB page", v7.
This series can minimize the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB. It is a nice gain. Comments and reviews are welcome. Thanks.
The main implementation and details can refer to the commit log of patch 1. In this series, I have changed the following four helpers, the following table shows the impact of the overhead of those helpers.
+------------------+-----------------------+
|       APIs       | head page | tail page |
+------------------+-----------+-----------+
|    PageHead()    |     Y     |     N     |
+------------------+-----------+-----------+
|    PageTail()    |     Y     |     N     |
+------------------+-----------+-----------+
|  PageCompound()  |     N     |     N     |
+------------------+-----------+-----------+
| compound_head()  |     Y     |     N     |
+------------------+-----------+-----------+
Y: Overhead is increased. N: Overhead is _NOT_ increased.
It shows that the overhead of those helpers on a tail page doesn't change between "hugetlb_free_vmemmap=on" and "hugetlb_free_vmemmap=off". But the overhead on a head page will be increased when "hugetlb_free_vmemmap=on" (except PageCompound()). So I believe that Matthew Wilcox's folio series will help with this.
The users of PageHead() and PageTail() are far fewer than those of compound_head(), and most users of PageTail() are VM_BUG_ON(), so I have done some tests on the overhead of compound_head() on head pages.
I have tested the overhead of calling compound_head() on a head page, which is 2.11ns (measured by timing 10 million compound_head() calls and averaging).
For a head page whose address is not aligned with PAGE_SIZE or a non-compound page, the overhead of compound_head() is 2.54ns which is increased by 20%. For a head page whose address is aligned with PAGE_SIZE, the overhead of compound_head() is 2.97ns which is increased by 40%. Most pages are the former. I do not think the overhead is significant since the overhead of compound_head() itself is low.
This patch (of 5):
This patch minimizes the overhead of struct page for 2MB HugeTLB pages significantly. It further reduces the overhead of struct page by 12.5% for a 2MB HugeTLB compared to the previous approach, which means 2GB per 1TB HugeTLB (2MB type).
After the feature of "Free some vmemmap pages of HugeTLB page" is enabled, the mapping of the vmemmap addresses associated with a 2MB HugeTLB page becomes the figure below.
   HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+---> PG_head
|           |                     |     0     | -------------> |     0     |
|           |                     +-----------+                +-----------+
|           |                     |     1     | -------------> |     1     |
|           |                     +-----------+                +-----------+
|           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
|           |                     +-----------+                   | | | | |
|           |                     |     3     | ------------------+ | | | |
|           |                     +-----------+                     | | | |
|           |                     |     4     | --------------------+ | | |
|    2MB    |                     +-----------+                       | | |
|           |                     |     5     | ----------------------+ | |
|           |                     +-----------+                         | |
|           |                     |     6     | ------------------------+ |
|           |                     +-----------+                           |
|           |                     |     7     | --------------------------+
|           |                     +-----------+
|           |
|           |
|           |
+-----------+
As we can see, the 2nd vmemmap page frame (indexed by 1) is reused and remapped. However, the 2nd vmemmap page frame can also be freed to the buddy allocator, so we can change the mapping from the figure above to the figure below.
   HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+---> PG_head
|           |                     |     0     | -------------> |     0     |
|           |                     +-----------+                +-----------+
|           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
|           |                     +-----------+                  | | | | | |
|           |                     |     2     | -----------------+ | | | | |
|           |                     +-----------+                    | | | | |
|           |                     |     3     | -------------------+ | | | |
|           |                     +-----------+                      | | | |
|           |                     |     4     | ---------------------+ | | |
|    2MB    |                     +-----------+                        | | |
|           |                     |     5     | -----------------------+ | |
|           |                     +-----------+                          | |
|           |                     |     6     | -------------------------+ |
|           |                     +-----------+                            |
|           |                     |     7     | ---------------------------+
|           |                     +-----------+
|           |
|           |
|           |
+-----------+
After we do this, all tail vmemmap pages (1-7) are mapped to the head vmemmap page frame (0). In other words, there are more than one page struct with PG_head associated with each HugeTLB page. We __know__ that there is only one head page struct, the tail page structs with PG_head are fake head page structs. We need an approach to distinguish between those two different types of page structs so that compound_head(), PageHead() and PageTail() can work properly if the parameter is the tail page struct but with PG_head.
The following code snippet describes how to distinguish between real and fake head page struct.
if (test_bit(PG_head, &page->flags)) {
        unsigned long head = READ_ONCE(page[1].compound_head);

        if (head & 1) {
                if (head == (unsigned long)page + 1)
                        ==> head page struct
                else
                        ==> tail page struct
        } else
                ==> head page struct
}
We can safely access the field of the @page[1] with PG_head because the @page is a compound page composed with at least two contiguous pages.
[songmuchun@bytedance.com: restore lost comment changes]
Link: https://lkml.kernel.org/r/20211101031651.75851-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20211101031651.75851-2-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Barry Song song.bao.hua@hisilicon.com Cc: Mike Kravetz mike.kravetz@oracle.com Cc: Oscar Salvador osalvador@suse.de Cc: Michal Hocko mhocko@suse.com Cc: David Hildenbrand david@redhat.com Cc: Chen Huang chenhuang5@huawei.com Cc: Bodeddula Balasubramaniam bodeddub@amazon.com Cc: Jonathan Corbet corbet@lwn.net Cc: Matthew Wilcox willy@infradead.org Cc: Xiongchun Duan duanxiongchun@bytedance.com Cc: Fam Zheng fam.zheng@bytedance.com Cc: Qi Zheng zhengqi.arch@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: include/linux/page-flags.h Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/kernel-parameters.txt | 2 +- include/linux/page-flags.h | 73 ++++++++++++++++++- mm/hugetlb_vmemmap.c | 62 +++++++++------- mm/sparse-vmemmap.c | 21 ++++++ 4 files changed, 125 insertions(+), 33 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b04cf8fbab4..e0957b73f63d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1596,7 +1596,7 @@ [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more - memory (6 * PAGE_SIZE for each 2MB hugetlb page). + memory (7 * PAGE_SIZE for each 2MB hugetlb page). Format: { on | off (default) }
on: enable the feature diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b40b4e0ada31..7b31a81be7e1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,25 +194,82 @@ enum pageflags {
#ifndef __GENERATING_BOUNDS_H
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +extern bool hugetlb_free_vmemmap_enabled; + +/* + * If the feature of freeing some vmemmap pages associated with each HugeTLB + * page is enabled, the head vmemmap page frame is reused and all of the tail + * vmemmap addresses map to the head vmemmap page frame (furture details can + * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other + * words, there are more than one page struct with PG_head associated with each + * HugeTLB page. We __know__ that there is only one head page struct, the tail + * page structs with PG_head are fake head page structs. We need an approach + * to distinguish between those two different types of page structs so that + * compound_head() can return the real head page struct when the parameter is + * the tail page struct but with PG_head. + * + * The page_fixed_fake_head() returns the real head page struct if the @page is + * fake page head, otherwise, returns @page which can either be a true page + * head or tail. + */ +static __always_inline const struct page *page_fixed_fake_head(const struct page *page) +{ + if (!hugetlb_free_vmemmap_enabled) + return page; + + /* + * Only addresses aligned with PAGE_SIZE of struct page may be fake head + * struct page. The alignment check aims to avoid access the fields ( + * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * cold cacheline in some cases. + */ + if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && + test_bit(PG_head, &page->flags)) { + /* + * We can safely access the field of the @page[1] with PG_head + * because the @page is a compound page composed with at least + * two contiguous pages. + */ + unsigned long head = READ_ONCE(page[1].compound_head); + + if (likely(head & 1)) + return (const struct page *)(head - 1); + } + return page; +} +#else +static inline const struct page *page_fixed_fake_head(const struct page *page) +{ + return page; +} +#endif + +static __always_inline int page_is_fake_head(struct page *page) +{ + return page_fixed_fake_head(page) != page; +} + static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head);
if (unlikely(head & 1)) return head - 1; - return (unsigned long)page; + return (unsigned long)page_fixed_fake_head(page); }
#define compound_head(page) ((typeof(page))_compound_head(page))
static __always_inline int PageTail(struct page *page) { - return READ_ONCE(page->compound_head) & 1; + return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); }
static __always_inline int PageCompound(struct page *page) { - return test_bit(PG_head, &page->flags) || PageTail(page); + return test_bit(PG_head, &page->flags) || + READ_ONCE(page->compound_head) & 1; }
#define PAGE_POISON_PATTERN -1l @@ -600,7 +657,15 @@ static inline void set_page_writeback_keepwrite(struct page *page) test_set_page_writeback_keepwrite(page); }
-__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +static __always_inline int PageHead(struct page *page) +{ + PF_POISONED_CHECK(page); + return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); +} + +__SETPAGEFLAG(Head, head, PF_ANY) +__CLEARPAGEFLAG(Head, head, PF_ANY) +CLEARPAGEFLAG(Head, head, PF_ANY)
static __always_inline void set_compound_head(struct page *page, struct page *head) { diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c540c21e26f5..4977f5a520c2 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -124,9 +124,9 @@ * page of page structs (page 0) associated with the HugeTLB page contains the 4 * page structs necessary to describe the HugeTLB. The only use of the remaining * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs * will be used for each HugeTLB page. This will allow us to free the remaining - * 6 pages to the buddy allocator. + * 7 pages to the buddy allocator. * * Here is how things look after remapping. * @@ -134,30 +134,30 @@ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ * | | | 0 | -------------> | 0 | * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | ----------------^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | - * | | | 3 | ------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | --------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | ----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | ------------------------+ | - * | | +-----------+ | - * | | | 7 | --------------------------+ + * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | | + * | | | 2 | -----------------+ | | | | | + * | | +-----------+ | | | | | + * | | | 3 | -------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | ---------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | -----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | -------------------------+ | + * | | +-----------+ | + * | | | 7 | ---------------------------+ * | | +-----------+ * | | * | | * | | * +-----------+ * - * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for + * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for * vmemmap pages and restore the previous mapping relationship. * * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. * * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures * (e.g. aarch64) provides a contiguous bit in the translation table entries @@ -166,7 +166,13 @@ * * The contiguous bit is used to increase the mapping size at the pmd and pte * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 2 pages. + * size of the struct page structs is greater than 1 page. + * + * Notice: The head vmemmap page is not freed to the buddy allocator and all + * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see + * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) + * associated with each HugeTLB page. The compound_head() can handle this + * correctly (more details refer to the comment above compound_head()). */ #define pr_fmt(fmt) "HugeTLB: " fmt
@@ -175,19 +181,21 @@ /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first - * page of tail page structures. We map the virtual addresses of the remaining - * pages of tail page structures to the first tail page struct, and then free - * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + * page of head page structures. We map the virtual addresses of all the pages + * of tail page structures to the head page struct, and then free these page + * frames. Therefore, we need to reserve one pages as vmemmap areas. */ -#define RESERVE_VMEMMAP_NR 2U +#define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +bool hugetlb_free_vmemmap_enabled __read_mostly = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled);
static int __init early_hugetlb_free_vmemmap_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ - if ((!is_power_of_2(sizeof(struct page)))) { + if (!is_power_of_2(sizeof(struct page))) { pr_warn("cannot free vmemmap pages because "struct page" crosses page boundaries\n"); return 0; } @@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) ClearHPageVmemmapOptimized(head);
@@ -282,9 +289,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* - * The head page and the first tail page are not to be freed to buddy - * allocator, the other pages will map to the first tail page, so they - * can be freed. + * The head page is not to be freed to buddy allocator, the other tail + * pages will map to the head page, so they can be freed. * * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true * on some architectures (e.g. aarch64). See Documentation/arm64/ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 396a49462894..39bdbf0b28dd 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -245,6 +245,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, entry); }
+/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. + */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { @@ -258,6 +278,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to);
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); }
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.18-rc1 commit a6b40850c442bf996e729e1d441d3dbc37cea171 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
page_fixed_fake_head() is used throughout memory management, and its conditional check requires reading a global variable. Although the overhead of this check may be small, it increases when the memory cache comes under pressure. Also, the global variable is not modified after system boot, so it is very appropriate to use the static key mechanism.
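For reference, a minimal sketch of the static key pattern being adopted (the names below are illustrative, not the ones this patch adds; DEFINE_STATIC_KEY_FALSE(), static_branch_unlikely(), static_branch_enable() and early_param() are the standard jump-label/init interfaces):

	#include <linux/jump_label.h>
	#include <linux/string.h>
	#include <linux/init.h>

	/* Illustrative key; the patch itself uses DEFINE_STATIC_KEY_MAYBE(). */
	DEFINE_STATIC_KEY_FALSE(example_feature_key);

	static __always_inline bool example_feature_enabled(void)
	{
		/* Fast path is a patchable jump/no-op, not a load of a global. */
		return static_branch_unlikely(&example_feature_key);
	}

	static int __init example_feature_param(char *buf)
	{
		if (buf && !strcmp(buf, "on"))
			static_branch_enable(&example_feature_key);
		return 0;
	}
	early_param("example_feature", example_feature_param);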
Link: https://lkml.kernel.org/r/20211101031651.75851-3-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Barry Song song.bao.hua@hisilicon.com Cc: Bodeddula Balasubramaniam bodeddub@amazon.com Cc: Chen Huang chenhuang5@huawei.com Cc: David Hildenbrand david@redhat.com Cc: Fam Zheng fam.zheng@bytedance.com Cc: Jonathan Corbet corbet@lwn.net Cc: Matthew Wilcox willy@infradead.org Cc: Michal Hocko mhocko@suse.com Cc: Mike Kravetz mike.kravetz@oracle.com Cc: Oscar Salvador osalvador@suse.de Cc: Qi Zheng zhengqi.arch@bytedance.com Cc: Xiongchun Duan duanxiongchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: mm/memory_hotplug.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/hugetlb.h | 6 ------ include/linux/page-flags.h | 16 ++++++++++++++-- mm/hugetlb_vmemmap.c | 12 ++++++------ 3 files changed, 20 insertions(+), 14 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 634630ebc8a7..fd1dc8d29436 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1103,12 +1103,6 @@ static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, } #endif /* CONFIG_HUGETLB_PAGE */
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; -#else -#define hugetlb_free_vmemmap_enabled false -#endif - static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7b31a81be7e1..81061122af68 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -195,7 +195,14 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H
#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); + +static __always_inline bool hugetlb_free_vmemmap_enabled(void) +{ + return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + &hugetlb_free_vmemmap_enabled_key); +}
/* * If the feature of freeing some vmemmap pages associated with each HugeTLB @@ -215,7 +222,7 @@ extern bool hugetlb_free_vmemmap_enabled; */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return page;
/* @@ -243,6 +250,11 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool hugetlb_free_vmemmap_enabled(void) +{ + return false; +} #endif
static __always_inline int page_is_fake_head(struct page *page) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4977f5a520c2..791626983c2e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,9 +188,9 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-bool hugetlb_free_vmemmap_enabled __read_mostly = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled); +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key);
static int __init early_hugetlb_free_vmemmap_param(char *buf) { @@ -204,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return -EINVAL;
if (!strcmp(buf, "on")) - hugetlb_free_vmemmap_enabled = true; + static_branch_enable(&hugetlb_free_vmemmap_enabled_key); else if (!strcmp(buf, "off")) - hugetlb_free_vmemmap_enabled = false; + static_branch_disable(&hugetlb_free_vmemmap_enabled_key); else return -EINVAL;
@@ -284,7 +284,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page));
- if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return;
vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.18-rc1 commit d8d55f5616cf3b900a23a72dd24e7b07211e7859 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The init_mm.page_table_lock is used to protect kernel page tables, so we can use it, instead of the mmap write lock, to serialize splitting vmemmap PMD mappings, which increases the concurrency of vmemmap_remap_free().
Actually, it increases the concurrency between allocations of HugeTLB pages, but that is not the only benefit. There are a lot of users of the mmap read lock of init_mm, and the mmap write lock used to be held throughout vmemmap_remap_free(); removing the mmap write lock usage means the function no longer affects those users of the mmap read lock. It does not make anything worse and is always a win.
Now the kernel page table walker does not hold the page_table_lock when walking pmd entries, so there may be a consistency issue for a pmd entry, because a pmd entry might change from a huge pmd entry to a PTE page table. There is only one user of the kernel page table walker, namely ptdump, and ptdump already accounts for this by caching the value of the pmd entry in a local variable. But we also need to update ->action to ACTION_CONTINUE to make sure the walker does not walk every pte entry again when a concurrent thread has split the huge pmd.
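Condensed from the hunks below, the serialization this patch relies on looks roughly like this (a sketch; error handling and the surrounding walk are elided):

	/*
	 * Split path: re-check the PMD under init_mm.page_table_lock so
	 * only one thread installs the new PTE table; a losing thread
	 * frees its pre-allocated table instead.
	 */
	spin_lock(&init_mm.page_table_lock);
	if (pmd_leaf(*pmd)) {
		smp_wmb();	/* make ptes visible before the pmd */
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	/*
	 * ptdump side: note the leaf entry from the locally cached value
	 * and set ACTION_CONTINUE so the walker does not descend into a
	 * PTE table that a concurrent split may have just installed.
	 */
	if (pmd_leaf(val)) {
		st->note_page(st, addr, 3, pmd_val(val));
		walk->action = ACTION_CONTINUE;
	}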
Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Cc: Barry Song song.bao.hua@hisilicon.com Cc: Bodeddula Balasubramaniam bodeddub@amazon.com Cc: Chen Huang chenhuang5@huawei.com Cc: David Hildenbrand david@redhat.com Cc: Fam Zheng fam.zheng@bytedance.com Cc: Jonathan Corbet corbet@lwn.net Cc: Matthew Wilcox willy@infradead.org Cc: Michal Hocko mhocko@suse.com Cc: Mike Kravetz mike.kravetz@oracle.com Cc: Oscar Salvador osalvador@suse.de Cc: Qi Zheng zhengqi.arch@bytedance.com Cc: Xiongchun Duan duanxiongchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: mm/sparse-vmemmap.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/ptdump.c | 16 +++++++++++---- mm/sparse-vmemmap.c | 47 ++++++++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 20 deletions(-)
diff --git a/mm/ptdump.c b/mm/ptdump.c index 93f2f63dc52d..43661863096b 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -39,8 +39,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val));
- if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + }
return 0; } @@ -59,8 +61,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val));
- if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + }
return 0; } @@ -79,8 +83,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val));
- if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + }
return 0; } @@ -98,8 +104,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + }
return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 39bdbf0b28dd..eef9053f948f 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,8 +53,7 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; };
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); }
- /* Make pte visible before pmd. See comment in __pte_alloc(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in __pte_alloc(). */ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock);
return 0; }
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) @@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
pmd = pmd_offset(pud, addr); do { - if (pmd_leaf(*pmd)) { - int ret; + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret;
- ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); - if (ret) - return ret; - } next = pmd_addr_end(addr, end); vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE);
- mmap_write_lock(&init_mm); + mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); - mmap_write_downgrade(&init_mm); - if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /*
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.18-rc1 commit b147c89cd429321a59147368378c8aba17c8480f category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Since the head vmemmap page frame associated with each HugeTLB page is reused, we should hide the PG_head flag of tail struct pages from the user. Add a test case to check whether this works properly. The test steps are as follows.
1) alloc 2MB hugeTLB
2) get each page frame
3) apply those APIs in each page frame
4) Those APIs work completely the same as before.
Reading the flags of a page via /proc/kpageflags is done in stable_page_flags(), which invokes PageHead(), PageTail(), PageCompound() and compound_head().
If those APIs work properly, the head page must have bits 15 and 17 set, and tail pages must have bits 16 and 17 set but bit 15 unset (bits 15, 16 and 17 are KPF_COMPOUND_HEAD, KPF_COMPOUND_TAIL and KPF_HUGE in include/uapi/linux/kernel-page-flags.h). Those flags are checked in check_page_flags().
Link: https://lkml.kernel.org/r/20211101031651.75851-5-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Barry Song song.bao.hua@hisilicon.com Cc: Bodeddula Balasubramaniam bodeddub@amazon.com Cc: Chen Huang chenhuang5@huawei.com Cc: David Hildenbrand david@redhat.com Cc: Fam Zheng fam.zheng@bytedance.com Cc: Jonathan Corbet corbet@lwn.net Cc: Matthew Wilcox willy@infradead.org Cc: Michal Hocko mhocko@suse.com Cc: Mike Kravetz mike.kravetz@oracle.com Cc: Oscar Salvador osalvador@suse.de Cc: Qi Zheng zhengqi.arch@bytedance.com Cc: Xiongchun Duan duanxiongchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: tools/testing/selftests/vm/Makefile tools/testing/selftests/vm/run_vmtests.sh Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/hugepage-vmemmap.c | 144 ++++++++++++++++++ tools/testing/selftests/vm/run_vmtests | 11 ++ 4 files changed, 157 insertions(+) create mode 100644 tools/testing/selftests/vm/hugepage-vmemmap.c
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index b3a183c36cb5..905dc4efa879 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only hugepage-mmap hugepage-shm +hugepage-vmemmap khugepaged map_hugetlb map_populate diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index b150cc837177..549761bc7193 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -27,6 +27,7 @@ TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c new file mode 100644 index 000000000000..557bdbd4f87e --- /dev/null +++ b/tools/testing/selftests/vm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <fcntl.h> + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. + */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. 
+ */ + for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. */ + write_bytes(addr, MAP_LENGTH); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, MAP_LENGTH); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, MAP_LENGTH); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, MAP_LENGTH)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index d578ad831813..949c71fe5951 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -108,6 +108,17 @@ else echo "[PASS]" fi
+echo "------------------------" +echo "running hugepage-vmemmap" +echo "------------------------" +./hugepage-vmemmap +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing."
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.18-rc1 commit e54084173487804f5e2f23facf107fd9336e637e category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The vmemmap_remap_free/alloc functions are relevant only to HugeTLB, so move them into the scope of CONFIG_HUGETLB_PAGE_FREE_VMEMMAP.
Link: https://lkml.kernel.org/r/20211101031651.75851-6-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Barry Song song.bao.hua@hisilicon.com Cc: Bodeddula Balasubramaniam bodeddub@amazon.com Cc: Chen Huang chenhuang5@huawei.com Cc: David Hildenbrand david@redhat.com Cc: Fam Zheng fam.zheng@bytedance.com Cc: Jonathan Corbet corbet@lwn.net Cc: Matthew Wilcox willy@infradead.org Cc: Michal Hocko mhocko@suse.com Cc: Mike Kravetz mike.kravetz@oracle.com Cc: Oscar Salvador osalvador@suse.de Cc: Qi Zheng zhengqi.arch@bytedance.com Cc: Xiongchun Duan duanxiongchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm.h | 2 ++ mm/sparse-vmemmap.c | 2 ++ 2 files changed, 4 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h index 25891b581bf4..efa3972a23bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3104,10 +3104,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, gfp_t gfp_mask); +#endif
void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index eef9053f948f..e5ed2680ec57 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,6 +34,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h>
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -419,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
return 0; } +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
/* * Allocate a block of memory to be used to back the virtual memory map
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO
--------------------------------
There is now a formal solution for supporting the hugetlb vmemmap feature on arm64.
This reverts commit 5838d235a395e221968e1f024bfaf8650278a522.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/Kconfig b/fs/Kconfig index 6e723c90a506..37a8895a3254 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -239,7 +239,7 @@ config HUGETLB_PAGE
config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 || ARM64 + depends on X86_64 depends on SPARSEMEM_VMEMMAP
config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit 2e4ec02bbcc05b8905d65c763ebde6bc85508e90 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The feature of minimizing the overhead of struct pages associated with each HugeTLB page is currently implemented only on x86_64; however, the infrastructure of this feature is already there, so we could easily enable it for other architectures. Introduce ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP so that other architectures can enable the feature easily: an architecture just selects this config option if it wants the feature.
Link: https://lkml.kernel.org/r/20220331065640.5777-1-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Suggested-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Barry Song baohua@kernel.org Tested-by: Barry Song baohua@kernel.org Reviewed-by: Anshuman Khandual anshuman.khandual@arm.com Cc: Bodeddula Balasubramaniam bodeddub@amazon.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: Fam Zheng fam.zheng@bytedance.com Cc: James Morse james.morse@arm.com Cc: Mark Rutland mark.rutland@arm.com Cc: Mike Kravetz mike.kravetz@oracle.com Cc: Oscar Salvador osalvador@suse.de Cc: Will Deacon will@kernel.org Cc: Xiongchun Duan duanxiongchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/Kconfig | 1 + fs/Kconfig | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d17396ef4323..661e05e1f762 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,6 +103,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT diff --git a/fs/Kconfig b/fs/Kconfig index 37a8895a3254..b60a7614cb16 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -237,9 +237,17 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS
+# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. +# +config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + bool + config HUGETLB_PAGE_FREE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 + depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP depends on SPARSEMEM_VMEMMAP
config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit 1e63ac088f20f7a4425c430c31ecd3cf167fb3f2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The feature of minimizing the overhead of struct pages associated with each HugeTLB page aims to free its vmemmap pages (used as struct pages) to save memory, which amounts to ~14GB/16GB per 1TB of HugeTLB pages (2MB/1GB type). In short, when a HugeTLB page is allocated or freed, the vmemmap array representing the range associated with the page will need to be remapped. When a page is allocated, vmemmap pages are freed after remapping. When a page is freed, previously discarded vmemmap pages must be allocated before remapping. More implementation details can be found in [1].
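As a rough sanity check of those numbers (assuming 4 KB base pages and a 64-byte struct page):

	2 MB HugeTLB: 512 struct pages * 64 B = 32 KB = 8 vmemmap pages,
	              7 of which are freed -> 28 KB saved per 2 MB,
	              i.e. 524288 * 28 KB ~= 14 GB per 1 TB of HugeTLB.
	1 GB HugeTLB: 262144 struct pages * 64 B = 16 MB = 4096 vmemmap pages,
	              4095 of which are freed -> ~16 MB per 1 GB,
	              i.e. ~16 GB per 1 TB of HugeTLB.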
The infrastructure for freeing vmemmap pages associated with each HugeTLB page is already there, so we can easily enable HUGETLB_PAGE_FREE_VMEMMAP for arm64; the only thing that needs to be fixed is flush_dcache_page().
flush_dcache_page() needs to be adapted to operate on the head page's flags, since the tail vmemmap pages are mapped read-only after the feature is enabled (the clear operation is not permitted on them).
There were some discussions about this in the thread [2], but no conclusion was reached in the end. I have copied the concerns raised by Anshuman here and explain why they do not apply: it is safe to enable the feature for arm64 as well as x86_64.
1st concern: ''' But what happens when a hot remove section's vmemmap area (which is being teared down) is nearby another vmemmap area which is either created or being destroyed for HugeTLB alloc/free purpose. As you mentioned HugeTLB pages inside the hot remove section might be safe. But what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed ? Massive HugeTLB alloc /use/free test cycle using memory just adjacent to a memory hotplug area, which is always added and removed periodically, should be able to expose this problem. '''
Answer: At the time memory is removed, all HugeTLB pages either have been migrated away or dissolved, so there is no race between memory hot remove and free_huge_page_vmemmap(); therefore, HugeTLB pages inside the hot removed section are safe. As for the question "what about other HugeTLB areas whose vmemmap area shares page table entries with vmemmap entries for a section being hot removed?", the premise does not hold. The minimum granularity of hotplug memory is 128MB (on arm64 with 4k base pages), so any HugeTLB page smaller than 128MB lies within a single section; it shares no PTE page tables with HugeTLB pages in other sections, and a HugeTLB page cannot cross two sections. In this case, the section cannot be freed. For any HugeTLB bigger than 128MB (the section size), its vmemmap pages are an integer multiple of 2MB (PMD-mapped). As long as:
1) HugeTLBs are naturally aligned, power-of-two sizes
2) The HugeTLB size >= the section size
3) The HugeTLB size >= the vmemmap leaf mapping size
Then a HugeTLB will not share any leaf page table entries with *anything else*, but will share intermediate entries. In this case, at the time memory is removed, all HugeTLB pages either have been migrated away or dissolved. So there is also no race between memory hot remove and free_huge_page_vmemmap().
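To make the size relationships concrete (again assuming arm64 with 4 KB base pages and a 64-byte struct page):

	128 MB section : vmemmap = (128 MB / 4 KB) * 64 B = 2 MB
	                 = exactly one PMD-mapped vmemmap leaf
	1 GB HugeTLB   : vmemmap = (1 GB / 4 KB) * 64 B = 16 MB
	                 = 8 whole PMD leaves, shared with nothing else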
2nd concern: ''' differently, not sure if ptdump would require any synchronization.
Dumping an wrong value is probably okay but crashing because a page table entry is being freed after ptdump acquired the pointer is bad. On arm64, ptdump() is protected against hotremove via [get|put]_online_mems(). '''
Answer: The ptdump should be fine since vmemmap_remap_free() only exchanges PTEs or splits the PMD entry (which means allocating a PTE page table). Neither operation frees any page tables (PTE), so ptdump cannot run into a use-after-free on any page tables. The worst case is just dumping a wrong value.
[1] https://lore.kernel.org/all/20210510030027.56044-1-songmuchun@bytedance.com/ [2] https://lore.kernel.org/all/20210518091826.36937-1-songmuchun@bytedance.com/
[songmuchun@bytedance.com: restructure the code comment inside flush_dcache_page()] Link: https://lkml.kernel.org/r/20220414072646.21910-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220331065640.5777-2-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Barry Song baohua@kernel.org Tested-by: Barry Song baohua@kernel.org Cc: Will Deacon will@kernel.org Cc: David Hildenbrand david@redhat.com Cc: Bodeddula Balasubramaniam bodeddub@amazon.com Cc: Oscar Salvador osalvador@suse.de Cc: Mike Kravetz mike.kravetz@oracle.com Cc: David Rientjes rientjes@google.com Cc: Mark Rutland mark.rutland@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: James Morse james.morse@arm.com Cc: Xiongchun Duan duanxiongchun@bytedance.com Cc: Fam Zheng fam.zheng@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/Kconfig | 1 + arch/arm64/mm/flush.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e253fdba1249..d9da5c4f91e0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,6 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) + select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 145fe60c5e68..2bb6defad92f 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_page(struct page *page) { + /* + * Only the head page's flags of HugeTLB can be cleared since the tail + * vmemmap pages associated with each HugeTLB page are mapped with + * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * details can refer to vmemmap_remap_pte()). Although + * __sync_icache_dcache() only set PG_dcache_clean flag on the head + * page struct, there is more than one page struct with PG_dcache_clean + * associated with the HugeTLB page since the head vmemmap page frame + * is reused (more details can refer to the comments above + * page_fixed_fake_head()). + */ + if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + page = compound_head(page); + if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); }
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit 5981611d0a006472d367d7a8e6ead8afaecf17c7 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Patch series "cleanup hugetlb_vmemmap".
The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize" is more clear. In this series, cheanup related codes to make it more clear and expressive. This is suggested by David.
This patch (of 3):
The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". And some function names are prefixed with "huge_page" instead of "hugetlb", it is easily to be confused with THP. In this patch, cheanup related functions to make code more clear and expressive.
Link: https://lkml.kernel.org/r/20220404074652.68024-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220404074652.68024-2-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Cc: David Hildenbrand david@redhat.com Cc: Mike Kravetz mike.kravetz@oracle.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Conflicts: mm/hugetlb.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 8 ++++---- mm/hugetlb_vmemmap.c | 42 ++++++++++++++++++++--------------------- mm/hugetlb_vmemmap.h | 20 ++++++++++---------- 4 files changed, 35 insertions(+), 37 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fd1dc8d29436..218fc150d7f8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -593,7 +593,7 @@ struct hstate { unsigned int surplus_huge_pages_node[MAX_NUMNODES]; unsigned int resv_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP - unsigned int nr_free_vmemmap_pages; + unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c815386ecc..c5168c7f282a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1448,7 +1448,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return;
- if (alloc_huge_page_vmemmap(h, page)) { + if (hugetlb_vmemmap_alloc(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1519,7 +1519,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
static inline void flush_free_hpage_work(struct hstate *h) { - if (free_vmemmap_pages_per_hpage(h)) + if (hugetlb_optimize_vmemmap_pages(h)) flush_work(&free_hpage_work); }
@@ -1642,7 +1642,7 @@ void free_huge_page(struct page *page)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { - free_huge_page_vmemmap(h, page); + hugetlb_vmemmap_free(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ -2066,7 +2066,7 @@ int dissolve_free_huge_page(struct page *page) * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = alloc_huge_page_vmemmap(h, head); + rc = hugetlb_vmemmap_alloc(h, head); if (!rc) { /* * Move PageHWPoison flag from head page to the raw diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 791626983c2e..91b79b9d9e25 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -192,7 +192,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, hugetlb_free_vmemmap_enabled_key); EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key);
-static int __init early_hugetlb_free_vmemmap_param(char *buf) +static int __init hugetlb_vmemmap_early_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ if (!is_power_of_2(sizeof(struct page))) { @@ -212,29 +212,26 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf)
return 0; } -early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param); - -static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) -{ - return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; -} +early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
/* * Previously discarded vmemmap pages will be allocated and remapping * after this function returns zero. */ -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { int ret; unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
if (!HPageVmemmapOptimized(head)) return 0;
- vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + /* * The pages which the vmemmap virtual address range [@vmemmap_addr, * @vmemmap_end) are mapped to are freed to the buddy allocator, and @@ -250,17 +247,18 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) return ret; }
-void free_huge_page_vmemmap(struct hstate *h, struct page *head) +void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse; + unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
- if (!free_vmemmap_pages_per_hpage(h)) + vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + if (!vmemmap_pages) return;
- vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_addr += RESERVE_VMEMMAP_SIZE; + vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); + vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
/* * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) @@ -297,8 +295,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) * hugetlbpage.rst for more details. */ if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; + h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
- pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages, - h->name); + pr_info("can optimize %d vmemmap pages for %s\n", + h->optimize_vmemmap_pages, h->name); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index cb2bef8f9e73..9760537849b5 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Free some vmemmap pages of HugeTLB + * Optimize vmemmap pages associated with HugeTLB * * Copyright (c) 2020, Bytedance. All rights reserved. * @@ -11,25 +11,25 @@ #include <linux/hugetlb.h>
#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -int alloc_huge_page_vmemmap(struct hstate *h, struct page *head); -void free_huge_page_vmemmap(struct hstate *h, struct page *head); +int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); +void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h);
/* - * How many vmemmap pages associated with a HugeTLB page that can be freed - * to the buddy allocator. + * How many vmemmap pages associated with a HugeTLB page that can be + * optimized and freed to the buddy allocator. */ -static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { - return h->nr_free_vmemmap_pages; + return h->optimize_vmemmap_pages; } #else -static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) { return 0; }
-static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { }
@@ -37,7 +37,7 @@ static inline void hugetlb_vmemmap_init(struct hstate *h) { }
-static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) +static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; }
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit f10f1442c309ccef7a80ba3dc4abde0978e86fb4 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup the static key and hugetlb_free_vmemmap_enabled() to make code more expressive.
Link: https://lkml.kernel.org/r/20220404074652.68024-3-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Cc: David Hildenbrand david@redhat.com Cc: Mike Kravetz mike.kravetz@oracle.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Conflicts: mm/memory_hotplug.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/mm/flush.c | 2 +- include/linux/page-flags.h | 12 ++++++------ mm/hugetlb_vmemmap.c | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 2bb6defad92f..892e53e9c788 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -80,7 +80,7 @@ void flush_dcache_page(struct page *page) * is reused (more details can refer to the comments above * page_fixed_fake_head()). */ - if (hugetlb_free_vmemmap_enabled() && PageHuge(page)) + if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page)) page = compound_head(page);
if (test_bit(PG_dcache_clean, &page->flags)) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 81061122af68..d1cf17a71a12 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -196,16 +196,16 @@ enum pageflags {
#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key);
-static __always_inline bool hugetlb_free_vmemmap_enabled(void) +static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - &hugetlb_free_vmemmap_enabled_key); + &hugetlb_optimize_vmemmap_key); }
/* - * If the feature of freeing some vmemmap pages associated with each HugeTLB + * If the feature of optimizing vmemmap pages associated with each HugeTLB * page is enabled, the head vmemmap page frame is reused and all of the tail * vmemmap addresses map to the head vmemmap page frame (furture details can * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other @@ -222,7 +222,7 @@ static __always_inline bool hugetlb_free_vmemmap_enabled(void) */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return page;
/* @@ -251,7 +251,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) return page; }
-static inline bool hugetlb_free_vmemmap_enabled(void) +static inline bool hugetlb_optimize_vmemmap_enabled(void) { return false; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 91b79b9d9e25..f25294973398 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -189,8 +189,8 @@ #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); -EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
static int __init hugetlb_vmemmap_early_param(char *buf) { @@ -204,9 +204,9 @@ static int __init hugetlb_vmemmap_early_param(char *buf) return -EINVAL;
if (!strcmp(buf, "on")) - static_branch_enable(&hugetlb_free_vmemmap_enabled_key); + static_branch_enable(&hugetlb_optimize_vmemmap_key); else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_free_vmemmap_enabled_key); + static_branch_disable(&hugetlb_optimize_vmemmap_key); else return -EINVAL;
@@ -282,7 +282,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page));
- if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return;
vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit 47010c040dec8af6347ec6259104fc13f7e7e30a category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup configs to make code more expressive.
Link: https://lkml.kernel.org/r/20220404074652.68024-4-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Cc: Mike Kravetz mike.kravetz@oracle.com Cc: David Hildenbrand david@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Conflicts: arch/arm64/configs/openeuler_defconfig arch/x86/configs/openeuler_defconfig Documentation/admin-guide/kernel-parameters.txt include/linux/hugetlb.h mm/Makefile Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 5 ++++- Documentation/admin-guide/mm/hugetlbpage.rst | 2 +- arch/arm64/Kconfig | 2 +- arch/arm64/configs/openeuler_defconfig | 4 ++-- arch/arm64/mm/flush.c | 2 +- arch/x86/Kconfig | 2 +- arch/x86/configs/openeuler_defconfig | 4 ++-- arch/x86/mm/init_64.c | 2 +- fs/Kconfig | 16 ++++++++-------- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 2 +- include/linux/page-flags.h | 6 +++--- mm/Makefile | 2 +- mm/hugetlb_vmemmap.c | 4 ++-- mm/hugetlb_vmemmap.h | 4 ++-- mm/sparse-vmemmap.c | 4 ++-- 16 files changed, 33 insertions(+), 30 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e0957b73f63d..86a7ea4a9964 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1593,7 +1593,7 @@ Format: size[KMG]
hugetlb_free_vmemmap= - [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP + [KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). @@ -1602,6 +1602,9 @@ on: enable the feature off: disable the feature
+ Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, + the default is on. + hugevmalloc [KNL,PPC,ARM64,X86] Requires CONFIG_HAVE_ARCH_HUGE_VMALLOC Format: { on | off } Default set by CONFIG_HUGE_VMALLOC_DEFAULT_ENABLED. diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index d70828c07658..0f8acc4a6cf0 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -164,7 +164,7 @@ default_hugepagesz will all result in 256 2M huge pages being allocated. Valid default huge page size is architecture dependent. hugetlb_free_vmemmap - When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing + When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing unused vmemmap pages associated with each HugeTLB page.
When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d9da5c4f91e0..c4f6c80ea976 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,7 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANT_RESERVE_CRASH_KERNEL if KEXEC_CORE select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d246fd508ef6..593ac67497e3 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6281,8 +6281,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 892e53e9c788..c7678e7df53a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -72,7 +72,7 @@ void flush_dcache_page(struct page *page) /* * Only the head page's flags of HugeTLB can be cleared since the tail * vmemmap pages associated with each HugeTLB page are mapped with - * read-only when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is enabled (more + * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more * details can refer to vmemmap_remap_pte()). Although * __sync_icache_dcache() only set PG_dcache_clean flag on the head * page struct, there is more than one page struct with PG_dcache_clean diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 661e05e1f762..f39cfb5a6535 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,7 +103,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE - select ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP if X86_64 + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 3eac70518e6f..f013c7b95881 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7370,8 +7370,8 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_HUGETLB_PAGE_FREE_VMEMMAP=y -# CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON is not set +CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y +# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set CONFIG_DYNAMIC_HUGETLB=y CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1eed5eaee41f..b1d5c05aeca8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1225,7 +1225,7 @@ static struct kcore_list kcore_vsyscall;
static void __init register_page_bootmem_info(void) { -#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) int i;
for_each_online_node(i) diff --git a/fs/Kconfig b/fs/Kconfig index b60a7614cb16..aa097ca64ef6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -242,22 +242,22 @@ config HUGETLB_PAGE # to enable the feature of minimizing overhead of struct page associated with # each HugeTLB page. # -config ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP bool
-config HUGETLB_PAGE_FREE_VMEMMAP +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_HUGETLB_PAGE_FREE_VMEMMAP + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP
-config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off.
config DYNAMIC_HUGETLB diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 218fc150d7f8..0dfe08439095 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -592,7 +592,7 @@ struct hstate { unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; unsigned int resv_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB diff --git a/include/linux/mm.h b/include/linux/mm.h index efa3972a23bf..1ae73cc4b806 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3104,7 +3104,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d1cf17a71a12..26b36cac9307 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -194,13 +194,13 @@ enum pageflags {
#ifndef __GENERATING_BOUNDS_H
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key);
static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { - return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, &hugetlb_optimize_vmemmap_key); }
diff --git a/mm/Makefile b/mm/Makefile index d2a6a786f915..e83233177c7a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o -obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o +obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_DYNAMIC_HUGETLB) += dynamic_hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f25294973398..2655434a946b 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,7 +188,7 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
@@ -276,7 +276,7 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
/* * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, + * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. */ BUILD_BUG_ON(__NR_USED_SUBPAGE >= diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 9760537849b5..109b0a53b6fe 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -10,7 +10,7 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include <linux/hugetlb.h>
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); void hugetlb_vmemmap_free(struct hstate *h, struct page *head); void hugetlb_vmemmap_init(struct hstate *h); @@ -41,5 +41,5 @@ static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) { return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index e5ed2680ec57..5b40a7473dc8 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,7 +34,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h>
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -420,7 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
return 0; } -#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
/* * Allocate a block of memory to be used to back the virtual memory map
From: Xiaoming Ni nixiaoming@huawei.com
mainline inclusion from mainline-v5.17-rc1 commit 3ddd9a808cee7284931312f2f3e854c9617f44b2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Patch series "sysctl: first set of kernel/sysctl cleanups", v2.
Finally had time to respin the series of the work we had started last year on cleaning up the kernel/sysctl.c kitchen sink. People keep stuffing their sysctls in that file, and this creates a maintenance burden. So this effort is aimed at placing sysctls where they actually belong.
I'm going to split patches up into series as there is quite a bit of work.
This first set adds register_sysctl_init() for registering a sysctl on the init path, adds const to a few places where it was missing, generalizes common values so they are easier to share, and starts moving a few sysctls out of kernel/sysctl.c to where they belong.
The majority of rework on v2 in this first patch set is 0-day fixes. Eric Biederman's feedback is later addressed in subsequent patch sets.
I'll only post the first two patch sets for now. We can address the rest once the first two patch sets get completely reviewed / Acked.
This patch (of 9):
kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes; this makes it very difficult to maintain.
To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code; we just care about the core logic.
Today though, folks heavily rely on the tables in kernel/sysctl.c so they can easily just extend them with their needed sysctls. In order to help users move their sysctls out, we need to provide a helper which can be used during code initialization.
We special-case the initialization use of register_sysctl() since it *is* safe to fail, given that all sysctls do is provide a dynamic interface to query or modify an existing variable at runtime. So an init-time use of register_sysctl() should *not* stop boot if the sysctls don't end up getting registered. It would be counterproductive to stop boot because a simple sysctl registration failed.
Provide a helper for init then, and document the recommended init levels to use for callers of this routine. We will later use this in subsequent patches to start slimming down kernel/sysctl.c tables and moving sysctl registration to the code which actually needs these sysctls.
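For illustration, a minimal sketch of how a subsystem could use the new helper from its own init code is shown below. The knob name "example_value", the table and the initcall are hypothetical and not part of this patch; only register_sysctl_init() itself is introduced here.

static int example_value;

static struct ctl_table example_sysctls[] = {
	{
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static int __init example_sysctl_init(void)
{
	/* Best effort: if registration fails, the knob is simply absent. */
	register_sysctl_init("kernel", example_sysctls);
	return 0;
}
late_initcall(example_sysctl_init);

Note the initcall level: as documented above, the base path ("kernel" here) must already have been registered, which is why an init level no earlier than proc_sys_init()/sysctl_init() is the safe choice for callers.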
[mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ]
Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni nixiaoming@huawei.com Signed-off-by: Luis Chamberlain mcgrof@kernel.org Reviewed-by: Kees Cook keescook@chromium.org Cc: Iurii Zaikin yzaikin@google.com Cc: "Eric W. Biederman" ebiederm@xmission.com Cc: Peter Zijlstra peterz@infradead.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Paul Turner pjt@google.com Cc: Andy Shevchenko andriy.shevchenko@linux.intel.com Cc: Sebastian Reichel sre@kernel.org Cc: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Cc: Petr Mladek pmladek@suse.com Cc: Sergey Senozhatsky senozhatsky@chromium.org Cc: Qing Wang wangqing@vivo.com Cc: Benjamin LaHaise bcrl@kvack.org Cc: Al Viro viro@zeniv.linux.org.uk Cc: Jan Kara jack@suse.cz Cc: Amir Goldstein amir73il@gmail.com Cc: Stephen Kitt steve@sk2.org Cc: Antti Palosaari crope@iki.fi Cc: Arnd Bergmann arnd@arndb.de Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Clemens Ladisch clemens@ladisch.de Cc: David Airlie airlied@linux.ie Cc: Jani Nikula jani.nikula@linux.intel.com Cc: Joel Becker jlbec@evilplan.org Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: Joseph Qi joseph.qi@linux.alibaba.com Cc: Julia Lawall julia.lawall@inria.fr Cc: Lukas Middendorf kernel@tuxforce.de Cc: Mark Fasheh mark@fasheh.com Cc: Phillip Potter phil@philpotter.co.uk Cc: Rodrigo Vivi rodrigo.vivi@intel.com Cc: Douglas Gilbert dgilbert@interlog.com Cc: James E.J. Bottomley jejb@linux.ibm.com Cc: Jani Nikula jani.nikula@intel.com Cc: John Ogness john.ogness@linutronix.de Cc: Martin K. Petersen martin.petersen@oracle.com Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Suren Baghdasaryan surenb@google.com Cc: "Theodore Ts'o" tytso@mit.edu Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/proc_sysctl.c | 33 +++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 3 +++ 2 files changed, 36 insertions(+)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ffed75f833b7..df435cd91a5b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/bpf-cgroup.h> #include <linux/mount.h> +#include <linux/kmemleak.h> #include "internal.h"
static const struct dentry_operations proc_sys_dentry_operations; @@ -1380,6 +1381,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab } EXPORT_SYMBOL(register_sysctl);
+/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base + * @table: This is the sysctl table that needs to be registered to the path + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just loose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: Can only be called after your respective sysctl base path has been + * registered. So for instance, most base directories are registered early on + * init before init levels are processed through proc_sys_init() and + * sysctl_init(). + */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + static char *append_path(const char *path, char *pos, const char *name) { int namelen; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 51298a4f4623..161eba9fd912 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -195,6 +195,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table);
extern int sysctl_init(void); +extern void __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name); +#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) void do_sysctl_args(void);
extern int pwrsw_enabled;
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit 0effdf461c5789be02d40c1868c70cc02ea24627 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Patch series "add hugetlb_optimize_vmemmap sysctl", v11.
This series aims to add hugetlb_optimize_vmemmap sysctl to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages.
This patch (of 4):
If the size of "struct page" is not the power of two but with the feature of minimizing overhead of struct page associated with each HugeTLB is enabled, then the vmemmap pages of HugeTLB will be corrupted after remapping (panic is about to happen in theory). But this only exists when !CONFIG_MEMCG && !CONFIG_SLUB on x86_64. However, it is not a conventional configuration nowadays. So it is not a real word issue, just the result of a code review.
But we cannot prevent anyone from building with that combination of configs, so hugetlb_optimize_vmemmap should be disabled in that case to fix the issue.
Link: https://lkml.kernel.org/r/20220512041142.39501-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220512041142.39501-2-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Cc: David Hildenbrand david@redhat.com Cc: Iurii Zaikin yzaikin@google.com Cc: Jonathan Corbet corbet@lwn.net Cc: Kees Cook keescook@chromium.org Cc: Luis Chamberlain mcgrof@kernel.org Cc: Masahiro Yamada masahiroy@kernel.org Cc: Oscar Salvador osalvador@suse.de Cc: Xiongchun Duan duanxiongchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/hugetlb_vmemmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 2655434a946b..4b8e59eeb971 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,12 +194,6 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
static int __init hugetlb_vmemmap_early_param(char *buf) { - /* We cannot optimize if a "struct page" crosses page boundaries. */ - if (!is_power_of_2(sizeof(struct page))) { - pr_warn("cannot free vmemmap pages because "struct page" crosses page boundaries\n"); - return 0; - } - if (!buf) return -EINVAL;
@@ -285,6 +279,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) if (!hugetlb_optimize_vmemmap_enabled()) return;
+ if (!is_power_of_2(sizeof(struct page))) { + pr_warn_once("cannot optimize vmemmap pages because "struct page" crosses page boundaries\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* * The head page is not to be freed to buddy allocator, the other tail
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit 9c54c522bb76cbef480722bd44059e2ba8304bd2 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Use kstrtobool rather than open coding "on" and "off" parsing in mm/hugetlb_vmemmap.c; it is more robust and handles all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off".
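As a quick, illustrative aside (not part of this patch), the accepted strings can be summarized by a small hypothetical helper; kstrtobool() is assumed to behave as in lib/kstrtox.c:

#include <linux/kernel.h>	/* kstrtobool(); <linux/kstrtox.h> on newer trees */

static bool __init parse_bool_or_default(const char *buf, bool dflt)
{
	bool v;

	/*
	 * kstrtobool() maps "Y"/"y"/"1" and "on" (any case) to true,
	 * "N"/"n"/"0" and "off" (any case) to false, and returns
	 * -EINVAL for anything else, leaving 'v' untouched.
	 */
	if (kstrtobool(buf, &v))
		return dflt;
	return v;
}

This is exactly what lets hugetlb_vmemmap_early_param() below drop its open-coded strcmp("on")/strcmp("off") checks.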
Link: https://lkml.kernel.org/r/20220512041142.39501-4-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Cc: David Hildenbrand david@redhat.com Cc: Iurii Zaikin yzaikin@google.com Cc: Jonathan Corbet corbet@lwn.net Cc: Kees Cook keescook@chromium.org Cc: Luis Chamberlain mcgrof@kernel.org Cc: Masahiro Yamada masahiroy@kernel.org Cc: Oscar Salvador osalvador@suse.de Cc: Xiongchun Duan duanxiongchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 6 +++--- mm/hugetlb_vmemmap.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 86a7ea4a9964..247acf7fc837 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1597,10 +1597,10 @@ enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). - Format: { on | off (default) } + Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) }
- on: enable the feature - off: disable the feature + [oO][Nn]/Y/y/1: enable the feature + [oO][Ff]/N/n/0: disable the feature
Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4b8e59eeb971..112e74504905 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -194,15 +194,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
static int __init hugetlb_vmemmap_early_param(char *buf) { - if (!buf) + bool enable; + + if (kstrtobool(buf, &enable)) return -EINVAL;
- if (!strcmp(buf, "on")) + if (enable) static_branch_enable(&hugetlb_optimize_vmemmap_key); - else if (!strcmp(buf, "off")) - static_branch_disable(&hugetlb_optimize_vmemmap_key); else - return -EINVAL; + static_branch_disable(&hugetlb_optimize_vmemmap_key);
return 0; }
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit 78f39084b41d287aedb2ea55f2c1895cfa11d61a category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We must add hugetlb_free_vmemmap=on (or "off") to the boot cmdline and reboot the server to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. However, rebooting usually takes a long time. So add a sysctl to enable or disable the feature at runtime without rebooting. Why do we need this? There are 3 use cases.
1) The feature of minimizing overhead of struct page associated with each HugeTLB page is disabled by default unless "hugetlb_free_vmemmap=on" is passed to the boot cmdline. When we (ByteDance) deliver servers to users who want to enable this feature, they have to configure grub (change the boot cmdline) and reboot the servers, and rebooting usually takes a long time (we have thousands of servers). It's a very bad experience for the users. So we need an approach to enable this feature without rebooting. This is a use case from our production environment.
2) Some use cases allocate HugeTLB pages 'on the fly' instead of pulling them from the HugeTLB pool, and those workloads would be affected with this feature enabled. Such workloads can be identified by the fact that they never explicitly allocate huge pages with 'nr_hugepages' but only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. We can confirm this is a real use case from commit 099730d67417. For those workloads, the page fault time could be ~2x slower than before. We suspect those users want to disable this feature if the system has enabled it before and they don't think the memory savings benefit is enough to make up for the performance drop.
3) A workload which wants vmemmap pages to be optimized and a workload which wants to set 'nr_overcommit_hugepages' but does not want the extra overhead at fault time (when the overcommitted pages are allocated from the buddy allocator) may be deployed on the same server. The user could enable this feature, set 'nr_hugepages' and 'nr_overcommit_hugepages', and then disable the feature. In this case, the overcommitted HugeTLB pages will not encounter the extra overhead at fault time.
Link: https://lkml.kernel.org/r/20220512041142.39501-5-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Cc: Jonathan Corbet corbet@lwn.net Cc: Luis Chamberlain mcgrof@kernel.org Cc: Kees Cook keescook@chromium.org Cc: Iurii Zaikin yzaikin@google.com Cc: Oscar Salvador osalvador@suse.de Cc: David Hildenbrand david@redhat.com Cc: Masahiro Yamada masahiroy@kernel.org Cc: Xiongchun Duan duanxiongchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Conflicts: include/linux/memory_hotplug.h mm/hugetlb_vmemmap.c mm/memory_hotplug.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/sysctl/vm.rst | 39 +++++++++++ mm/hugetlb_vmemmap.c | 92 ++++++++++++++++++++++--- 2 files changed, 122 insertions(+), 9 deletions(-)
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index a5fbef4740c2..5de629b932ae 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -560,6 +560,45 @@ Change the minimum size of the hugepage pool. See Documentation/admin-guide/mm/hugetlbpage.rst
+hugetlb_optimize_vmemmap +======================== + +This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter) +is configured or the size of 'struct page' (a structure defined in +include/linux/mm_types.h) is not power of two (an unusual system config could +result in this). + +Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages +associated with each HugeTLB page. + +Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages +per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not be +optimized. When those optimized HugeTLB pages are freed from the HugeTLB pool +to the buddy allocator, the vmemmap pages representing that range needs to be +remapped again and the vmemmap pages discarded earlier need to be rellocated +again. If your use case is that HugeTLB pages are allocated 'on the fly' (e.g. +never explicitly allocating HugeTLB pages with 'nr_hugepages' but only set +'nr_overcommit_hugepages', those overcommitted HugeTLB pages are allocated 'on +the fly') instead of being pulled from the HugeTLB pool, you should weigh the +benefits of memory savings against the more overhead (~2x slower than before) +of allocation or freeing HugeTLB pages between the HugeTLB pool and the buddy +allocator. Another behavior to note is that if the system is under heavy memory +pressure, it could prevent the user from freeing HugeTLB pages from the HugeTLB +pool to the buddy allocator since the allocation of vmemmap pages could be +failed, you have to retry later if your system encounter this situation. + +Once disabled, the vmemmap pages of subsequent allocation of HugeTLB pages from +buddy allocator will not be optimized meaning the extra overhead at allocation +time from buddy allocator disappears, whereas already optimized HugeTLB pages +will not be affected. If you want to make sure there are no optimized HugeTLB +pages, you can set "nr_hugepages" to 0 first and then disable this. Note that +writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus +pages. So, those surplus pages are still optimized until they are no longer +in use. You would need to wait for those surplus pages to be released before +there are no optimized pages in the system. + + nr_hugepages_mempolicy ======================
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 112e74504905..4340bf6c5551 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -188,21 +188,40 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
+enum vmemmap_optimize_mode { + VMEMMAP_OPTIMIZE_OFF, + VMEMMAP_OPTIMIZE_ON, +}; + DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
+static enum vmemmap_optimize_mode vmemmap_optimize_mode = + IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + +static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +{ + if (vmemmap_optimize_mode == to) + return; + + if (to == VMEMMAP_OPTIMIZE_OFF) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else + static_branch_inc(&hugetlb_optimize_vmemmap_key); + WRITE_ONCE(vmemmap_optimize_mode, to); +} + static int __init hugetlb_vmemmap_early_param(char *buf) { bool enable; + enum vmemmap_optimize_mode mode;
if (kstrtobool(buf, &enable)) return -EINVAL;
- if (enable) - static_branch_enable(&hugetlb_optimize_vmemmap_key); - else - static_branch_disable(&hugetlb_optimize_vmemmap_key); + mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; + vmemmap_optimize_mode_switch(mode);
return 0; } @@ -235,8 +254,10 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) + if (!ret) { ClearHPageVmemmapOptimized(head); + static_branch_dec(&hugetlb_optimize_vmemmap_key); + }
return ret; } @@ -250,6 +271,11 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) if (!vmemmap_pages) return;
+ if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + return; + + static_branch_inc(&hugetlb_optimize_vmemmap_key); + vmemmap_addr += RESERVE_VMEMMAP_SIZE; vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); vmemmap_reuse = vmemmap_addr - PAGE_SIZE; @@ -259,7 +285,9 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head) * to the page which @vmemmap_reuse is mapped to, then free the pages * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. */ - if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else SetHPageVmemmapOptimized(head); }
@@ -276,9 +304,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page));
- if (!hugetlb_optimize_vmemmap_enabled()) - return; - if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because "struct page" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -300,3 +325,52 @@ void __init hugetlb_vmemmap_init(struct hstate *h) pr_info("can optimize %d vmemmap pages for %s\n", h->optimize_vmemmap_pages, h->name); } + +#ifdef CONFIG_PROC_SYSCTL +static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + enum vmemmap_optimize_mode mode; + static DEFINE_MUTEX(sysctl_mutex); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sysctl_mutex); + mode = vmemmap_optimize_mode; + table->data = &mode; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (write && !ret) + vmemmap_optimize_mode_switch(mode); + mutex_unlock(&sysctl_mutex); + + return ret; +} + +static struct ctl_table hugetlb_vmemmap_sysctls[] = { + { + .procname = "hugetlb_optimize_vmemmap", + .maxlen = sizeof(enum vmemmap_optimize_mode), + .mode = 0644, + .proc_handler = hugetlb_optimize_vmemmap_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int hugetlb_vmemmap_sysctls_init(void) +{ + /* + * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" + * crosses page boundaries, the vmemmap pages cannot be optimized. + */ + if (is_power_of_2(sizeof(struct page))) + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + + return 0; +} +late_initcall(hugetlb_vmemmap_sysctls_init); +#endif /* CONFIG_PROC_SYSCTL */
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc1 commit 0111def915b280c64c05f73f01b59ca404255aa3 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The following:
commit 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*")
forgot to update CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON used in vmemmap_optimize_mode to CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. The result is we cannot enable hugetlb_optimize_vmemmap at boot time when we configure CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON. Fix it.
Link: https://lkml.kernel.org/r/20220527081948.68832-1-songmuchun@bytedance.com Fixes: 47010c040dec ("mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*") Signed-off-by: Muchun Song songmuchun@bytedance.com Reported-by: Vlastimil Babka vbabka@suse.cz Acked-by: Vlastimil Babka vbabka@suse.cz Cc: Mike Kravetz mike.kravetz@oracle.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4340bf6c5551..e9f63cb9e3d4 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -198,7 +198,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
static enum vmemmap_optimize_mode vmemmap_optimize_mode = - IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); + IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) {
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
--------------------------------
Disable hugetlb_vmemmap when dynamic hugetlb is enabled. While at it, fix a related spelling error.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/dynamic_hugetlb.c | 4 ++-- mm/huge_memory.c | 2 +- mm/hugetlb_vmemmap.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index f8ebc8ab7d60..c1b968a7e668 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1150,6 +1150,6 @@ static int __init dynamic_hugetlb_setup(char *s) { if (!strcmp(s, "on")) enable_dhugetlb = true; - return 1; + return 0; } -__setup("dynamic_hugetlb=", dynamic_hugetlb_setup); +early_param("dynamic_hugetlb", dynamic_hugetlb_setup); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bfe079e294cb..79c855b5adad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -401,7 +401,7 @@ static int __init hugepage_init(void) */ if (enable_dhugetlb) { transparent_hugepage_flags = 0; - pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n"); + pr_info("transparent hugepage is disabled due to conflict with dynamic hugetlb\n"); return -EINVAL; }
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index e9f63cb9e3d4..7ec8560d267d 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -176,6 +176,7 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt
+#include <linux/dynamic_hugetlb.h> #include "hugetlb_vmemmap.h"
/* @@ -304,6 +305,12 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page));
+ if (enable_dhugetlb) { + pr_warn_once("cannot optimize vmemmap pages due to conflict with dynamic hugetlb\n"); + static_branch_disable(&hugetlb_optimize_vmemmap_key); + return; + } + if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because "struct page" crosses page boundaries\n"); static_branch_disable(&hugetlb_optimize_vmemmap_key); @@ -366,8 +373,10 @@ static __init int hugetlb_vmemmap_sysctls_init(void) /* * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" * crosses page boundaries, the vmemmap pages cannot be optimized. + * If "dynamic hugetlb" is enabled, the vmemmap pages cannot be + * optimized. */ - if (is_power_of_2(sizeof(struct page))) + if (is_power_of_2(sizeof(struct page)) && !enable_dhugetlb) register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
return 0;