From: Li Lingfeng lilingfeng3@huawei.com
hulk inclusion category: bugfix bugzilla: 187387, https://gitee.com/openeuler/kernel/issues/I5KTEM CVE: NA
--------------------------------
When we need to create a private copy of io_identity, we take references to the current task's private structs, which means we should increase their reference counts. If we have already grabbed some other structs before, we should drop them and clear the related flags. Otherwise, the old structs may leak and a use-after-free of the new structs may occur.
Signed-off-by: Li Lingfeng lilingfeng3@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/io_uring.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c index faa81d48c812..257e4af176e6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1364,6 +1364,47 @@ static bool io_identity_cow(struct io_kiocb *req) return true; }
+static void io_drop_identity(struct io_kiocb *req) +{ + struct io_identity *id = req->work.identity; + + if (req->work.flags & IO_WQ_WORK_MM) { + mmdrop(id->mm); + req->work.flags &= ~IO_WQ_WORK_MM; + } +#ifdef CONFIG_BLK_CGROUP + if (req->work.flags & IO_WQ_WORK_BLKCG) { + css_put(id->blkcg_css); + req->work.flags &= ~IO_WQ_WORK_BLKCG; + } +#endif + if (req->work.flags & IO_WQ_WORK_CREDS) { + put_cred(id->creds); + req->work.flags &= ~IO_WQ_WORK_CREDS; + } + if (req->work.flags & IO_WQ_WORK_FILES) { + put_files_struct(req->work.identity->files); + put_nsproxy(req->work.identity->nsproxy); + req->work.flags &= ~IO_WQ_WORK_FILES; + } + if (req->work.flags & IO_WQ_WORK_CANCEL) + req->work.flags &= ~IO_WQ_WORK_CANCEL; + if (req->work.flags & IO_WQ_WORK_FS) { + struct fs_struct *fs = id->fs; + + spin_lock(&id->fs->lock); + if (--fs->users) + fs = NULL; + spin_unlock(&id->fs->lock); + + if (fs) + free_fs_struct(fs); + req->work.flags &= ~IO_WQ_WORK_FS; + } + if (req->work.flags & IO_WQ_WORK_FSIZE) + req->work.flags &= ~IO_WQ_WORK_FSIZE; +} + static bool io_grab_identity(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1469,6 +1510,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (io_grab_identity(req)) return;
+ io_drop_identity(req); if (!io_identity_cow(req)) return;
From: Gaosheng Cui cuigaosheng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5J0X7 CVE: NA
--------------------------------
Since arm64 does not use a builtin decompressor, the EFI stub is built into the kernel proper. So instead, separate the contents of libstub and its dependencies by putting them into their own namespace, prefixing all of its symbols with __efistub. This way, we have tight control over what parts of the kernel proper are referenced by the stub.
Following those rules, add support for the strchr function in the EFI stub; it will be used to parse cmdline args.
Signed-off-by: Gaosheng Cui cuigaosheng1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/image-vars.h | 1 + arch/arm64/lib/strchr.S | 4 ++-- drivers/firmware/efi/libstub/string.c | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index c615b285ff5b..7ea4b84f1518 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -35,6 +35,7 @@ __efistub_strnlen = __pi_strnlen; __efistub_strcmp = __pi_strcmp; __efistub_strncmp = __pi_strncmp; __efistub_strrchr = __pi_strrchr; +__efistub_strchr = __pi_strchr; __efistub___clean_dcache_area_poc = __pi___clean_dcache_area_poc;
#ifdef CONFIG_KASAN diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S index 1f47eae3b0d6..5893ad8d4484 100644 --- a/arch/arm64/lib/strchr.S +++ b/arch/arm64/lib/strchr.S @@ -18,7 +18,7 @@ * Returns: * x0 - address of first occurrence of 'c' or 0 */ -SYM_FUNC_START_WEAK(strchr) +SYM_FUNC_START_WEAK_PI(strchr) and w1, w1, #0xff 1: ldrb w2, [x0], #1 cmp w2, w1 @@ -28,5 +28,5 @@ SYM_FUNC_START_WEAK(strchr) cmp w2, w1 csel x0, x0, xzr, eq ret -SYM_FUNC_END(strchr) +SYM_FUNC_END_PI(strchr) EXPORT_SYMBOL_NOKASAN(strchr) diff --git a/drivers/firmware/efi/libstub/string.c b/drivers/firmware/efi/libstub/string.c index 5d13e43869ee..006c9f0a8e0c 100644 --- a/drivers/firmware/efi/libstub/string.c +++ b/drivers/firmware/efi/libstub/string.c @@ -113,3 +113,21 @@ long simple_strtol(const char *cp, char **endp, unsigned int base)
return simple_strtoull(cp, endp, base); } + +#ifndef __HAVE_ARCH_STRCHR +/** + * strchr - Find the first occurrence of a character in a string + * @s: The string to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. + */ +char *strchr(const char *s, int c) +{ + for (; *s != (char)c; ++s) + if (*s == '\0') + return NULL; + return (char *)s; +} +#endif
From: Cui GaoSheng cuigaosheng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5J0X7 CVE: NA
--------------------------------
CONFIG_RANDOMIZE_BASE=y relocates the kernel to a random base address.
However, on arm64, it does not take into account the memmap= parameter passed in from the kernel command line. This results in the kernel sometimes being put in the middle of memmap.
Add support for parsing the memmap kernel parameter on arm64. Only the following mode is supported:
memmap=nn[KMG]$ss[KMG]
The region of memory to be reserved spans from ss to ss+nn. The region must lie within existing memory, otherwise it will be ignored.
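For illustration, a string in this format can be decoded with a minimal user-space sketch; parse_size() here is a hypothetical stand-in for the kernel's memparse() (which the stub itself uses, as the diff further down shows), and the sample string is made up for the example:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for memparse(): a number with an optional K/M/G suffix. */
static unsigned long long parse_size(const char *s, char **endp)
{
    unsigned long long v = strtoull(s, endp, 0);

    switch (**endp) {
    case 'G': case 'g': v <<= 10;   /* fall through */
    case 'M': case 'm': v <<= 10;   /* fall through */
    case 'K': case 'k': v <<= 10; (*endp)++; break;
    }
    return v;
}

int main(void)
{
    const char *arg = "16M$0x90000000";   /* reserve 16 MiB at 0x90000000 */
    unsigned long long size, start;
    char *p;

    size = parse_size(arg, &p);           /* nn[KMG] */
    if (*p != '$')                        /* only the '$' mode is supported */
        return 1;
    start = parse_size(p + 1, &p);        /* ss[KMG] */

    printf("avoid [0x%llx, 0x%llx)\n", start, start + size);
    return 0;
}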
Teach KASLR not to place the kernel in memmap-defined regions. We support up to 32 memmap regions; any additional regions will cause KASLR to be disabled.
Signed-off-by: Cui GaoSheng cuigaosheng1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/firmware/efi/libstub/arm64-stub.c | 89 +++++++++++++++++++ .../firmware/efi/libstub/efi-stub-helper.c | 2 + drivers/firmware/efi/libstub/efi-stub.c | 3 + drivers/firmware/efi/libstub/efistub.h | 10 +++ 4 files changed, 104 insertions(+)
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 4ee5ced0c6a4..143e3c13e742 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -15,6 +15,95 @@
#include "efistub.h"
+#define MAX_MEMMAP_REGIONS 32 + +struct mem_vector { + unsigned long long start; + unsigned long long size; +}; + +static struct mem_vector mem_avoid[MAX_MEMMAP_REGIONS]; + +static int +efi_parse_memmap(char *p, unsigned long long *start, unsigned long long *size) +{ + char *oldp; + u64 mem_size; + + if (!p) + return -EINVAL; + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + if (!mem_size) + return -EINVAL; + if (*p != '$') + return -EINVAL; + + *start = memparse(p + 1, &p); + *size = mem_size; + + return 0; +} + +void efi_parse_option_memmap(const char *str) +{ + int rc; + static int idx; + char *k, *p = (char *)str; + + while (p && (idx < MAX_MEMMAP_REGIONS)) { + k = strchr(p, ','); + if (k) + *k++ = 0; + + rc = efi_parse_memmap(p, &mem_avoid[idx].start, &mem_avoid[idx].size); + if (rc < 0) + efi_err("Failed to parse memmap cmdlines, index: %d, str: %s\n", idx, p); + + p = k; + idx++; + } +} + +void mem_avoid_memmap(void) +{ + int i; + efi_status_t status; + unsigned long nr_pages; + unsigned long long start, end; + + for (i = 0; i < MAX_MEMMAP_REGIONS; i++) { + if (!mem_avoid[i].size) + continue; + start = round_down(mem_avoid[i].start, EFI_ALLOC_ALIGN); + end = round_up(mem_avoid[i].start + mem_avoid[i].size, EFI_ALLOC_ALIGN); + nr_pages = (end - start) / EFI_PAGE_SIZE; + + mem_avoid[i].start = start; + mem_avoid[i].size = end - start; + status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, nr_pages, &mem_avoid[i].start); + if (status != EFI_SUCCESS) { + efi_err("Failed to reserve memmap, index: %d, status: %lu\n", i, status); + mem_avoid[i].size = 0; + } + } +} + +void free_avoid_memmap(void) +{ + int i; + + for (i = 0; i < MAX_MEMMAP_REGIONS; i++) { + if (!mem_avoid[i].size) + continue; + efi_free(mem_avoid[i].size, mem_avoid[i].start); + } +} + efi_status_t check_platform_features(void) { u64 tg; diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index aa8da0a49829..0e0033fa7d51 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -232,6 +232,8 @@ efi_status_t efi_parse_options(char const *cmdline) } else if (!strcmp(param, "video") && val && strstarts(val, "efifb:")) { efi_parse_option_graphics(val + strlen("efifb:")); + } else if (!strcmp(param, "memmap") && val) { + efi_parse_option_memmap(val); } } efi_bs_call(free_pool, buf); diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 0ab439c53eee..6840a57b8f3b 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -194,6 +194,8 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
si = setup_graphics();
+ mem_avoid_memmap(); + status = handle_kernel_image(&image_addr, &image_size, &reserve_addr, &reserve_size, @@ -311,6 +313,7 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, efi_free(image_size, image_addr); efi_free(reserve_size, reserve_addr); fail_free_screeninfo: + free_avoid_memmap(); free_screen_info(si); fail_free_cmdline: efi_bs_call(free_pool, cmdline_ptr); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 2d7abcd99de9..cf59df863fa7 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -805,6 +805,16 @@ efi_status_t efi_parse_options(char const *cmdline);
void efi_parse_option_graphics(char *option);
+#ifdef CONFIG_ARM64 +void efi_parse_option_memmap(const char *str); +void mem_avoid_memmap(void); +void free_avoid_memmap(void); +#else +static inline void efi_parse_option_memmap(const char *str) { } +static inline void mem_avoid_memmap(void) { } +static inline void free_avoid_memmap(void) { } +#endif + efi_status_t efi_setup_gop(struct screen_info *si, efi_guid_t *proto, unsigned long size);
From: George Kennedy george.kennedy@oracle.com
mainline inclusion from mainline-v5.16-rc7 commit 158b515f703e75e7d68289bf4d98c664e1d632df category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5KIF5
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------
Avoid double free in tun_free_netdev() by moving the dev->tstats and tun->security allocs to a new ndo_init routine (tun_net_init()) that will be called by register_netdevice(). ndo_init is paired with the destructor (tun_free_netdev()), so if there's an error in register_netdevice() the destructor will handle the frees.
BUG: KASAN: double-free or invalid-free in selinux_tun_dev_free_security+0x1a/0x20 security/selinux/hooks.c:5605
CPU: 0 PID: 25750 Comm: syz-executor416 Not tainted 5.16.0-rc2-syzk #1
Hardware name: Red Hat KVM, BIOS
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0x89/0xb5 lib/dump_stack.c:106
 print_address_description.constprop.9+0x28/0x160 mm/kasan/report.c:247
 kasan_report_invalid_free+0x55/0x80 mm/kasan/report.c:372
 ____kasan_slab_free mm/kasan/common.c:346 [inline]
 __kasan_slab_free+0x107/0x120 mm/kasan/common.c:374
 kasan_slab_free include/linux/kasan.h:235 [inline]
 slab_free_hook mm/slub.c:1723 [inline]
 slab_free_freelist_hook mm/slub.c:1749 [inline]
 slab_free mm/slub.c:3513 [inline]
 kfree+0xac/0x2d0 mm/slub.c:4561
 selinux_tun_dev_free_security+0x1a/0x20 security/selinux/hooks.c:5605
 security_tun_dev_free_security+0x4f/0x90 security/security.c:2342
 tun_free_netdev+0xe6/0x150 drivers/net/tun.c:2215
 netdev_run_todo+0x4df/0x840 net/core/dev.c:10627
 rtnl_unlock+0x13/0x20 net/core/rtnetlink.c:112
 __tun_chr_ioctl+0x80c/0x2870 drivers/net/tun.c:3302
 tun_chr_ioctl+0x2f/0x40 drivers/net/tun.c:3311
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:874 [inline]
 __se_sys_ioctl fs/ioctl.c:860 [inline]
 __x64_sys_ioctl+0x19d/0x220 fs/ioctl.c:860
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x3a/0x80 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
Reported-by: syzkaller syzkaller@googlegroups.com Signed-off-by: George Kennedy george.kennedy@oracle.com Suggested-by: Jakub Kicinski kuba@kernel.org Link: https://lore.kernel.org/r/1639679132-19884-1-git-send-email-george.kennedy@o... Signed-off-by: Jakub Kicinski kuba@kernel.org Conflict: driver/net/tun.c Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/tun.c | 115 ++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 56 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index aef966a9dae2..993236198242 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -220,6 +220,9 @@ struct tun_struct { struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; + /* init args */ + struct file *file; + struct ifreq *ifr; };
struct veth { @@ -227,6 +230,9 @@ struct veth { __be16 h_vlan_TCI; };
+static void tun_flow_init(struct tun_struct *tun); +static void tun_flow_uninit(struct tun_struct *tun); + static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); @@ -964,6 +970,49 @@ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
static const struct ethtool_ops tun_ethtool_ops;
+static int tun_net_init(struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + struct ifreq *ifr = tun->ifr; + int err; + + tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); + if (!tun->pcpu_stats) + return -ENOMEM; + + spin_lock_init(&tun->lock); + + err = security_tun_dev_alloc_security(&tun->security); + if (err < 0) { + free_percpu(tun->pcpu_stats); + return err; + } + + tun_flow_init(tun); + + dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | + TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + dev->features = dev->hw_features | NETIF_F_LLTX; + dev->vlan_features = dev->features & + ~(NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); + + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); + + INIT_LIST_HEAD(&tun->disabled); + err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, + ifr->ifr_flags & IFF_NAPI_FRAGS, false); + if (err < 0) { + tun_flow_uninit(tun); + security_tun_dev_free_security(tun->security); + free_percpu(tun->pcpu_stats); + return err; + } + return 0; +} + /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { @@ -1205,6 +1254,7 @@ static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) }
static const struct net_device_ops tun_netdev_ops = { + .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, @@ -1285,6 +1335,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) }
static const struct net_device_ops tap_netdev_ops = { + .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, @@ -1325,7 +1376,7 @@ static void tun_flow_uninit(struct tun_struct *tun) #define MAX_MTU 65535
/* Initialize net device. */ -static void tun_net_init(struct net_device *dev) +static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev);
@@ -2257,11 +2308,6 @@ static void tun_free_netdev(struct net_device *dev) BUG_ON(!(list_empty(&tun->disabled)));
free_percpu(tun->pcpu_stats); - /* We clear pcpu_stats so that tun_set_iff() can tell if - * tun_free_netdev() has been called from register_netdevice(). - */ - tun->pcpu_stats = NULL; - tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); @@ -2773,41 +2819,16 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL);
- tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); - if (!tun->pcpu_stats) { - err = -ENOMEM; - goto err_free_dev; - } - - spin_lock_init(&tun->lock); - - err = security_tun_dev_alloc_security(&tun->security); - if (err < 0) - goto err_free_stat; - - tun_net_init(dev); - tun_flow_init(tun); + tun->ifr = ifr; + tun->file = file;
- dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | - TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; - dev->features = dev->hw_features | NETIF_F_LLTX; - dev->vlan_features = dev->features & - ~(NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); - - tun->flags = (tun->flags & ~TUN_FEATURES) | - (ifr->ifr_flags & TUN_FEATURES); - - INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI, - ifr->ifr_flags & IFF_NAPI_FRAGS, false); - if (err < 0) - goto err_free_flow; + tun_net_initialize(dev);
err = register_netdevice(tun->dev); - if (err < 0) - goto err_detach; + if (err < 0) { + free_netdev(dev); + return err; + } /* free_netdev() won't check refcnt, to aovid race * with dev_put() we need publish tun after registration. */ @@ -2824,24 +2845,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
strcpy(ifr->ifr_name, tun->dev->name); return 0; - -err_detach: - tun_detach_all(dev); - /* We are here because register_netdevice() has failed. - * If register_netdevice() already called tun_free_netdev() - * while dealing with the error, tun->pcpu_stats has been cleared. - */ - if (!tun->pcpu_stats) - goto err_free_dev; - -err_free_flow: - tun_flow_uninit(tun); - security_tun_dev_free_security(tun->security); -err_free_stat: - free_percpu(tun->pcpu_stats); -err_free_dev: - free_netdev(dev); - return err; }
static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
From: Luo Meng luomeng12@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5KK52 CVE: NA
--------------------------------
If dm_get_device() creates dd in multipath_message(), and table_deps() is then called after dm_put_table_device(), it will lead to concurrent UAF bugs.
One of the concurrency UAF can be shown as below:
(USE)                             | (FREE)
                                  | target_message
                                  |   multipath_message
                                  |     dm_put_device
                                  |       dm_put_table_device #
                                  |         kfree(td) # table_device *td
ioctl # DM_TABLE_DEPS_CMD         | ...
  table_deps                      | ...
    dm_get_live_or_inactive_table | ...
      retrieve_dep                | ...
        list_for_each_entry       | ...
          deps->dev[count++] =    | ...
            huge_encode_dev       | ...
            (dd->dm_dev->bdev->bd_dev)
                                  | list_del(&dd->list)
                                  | kfree(dd) # dm_dev_internal
The root cause of the UAF bugs is that when find_device() fails in dm_get_device(), a new dd is created with its refcount set to 1, and the kfree() in dm_put_table_device() is not protected. When td, to which there are still pointers, is released, the concurrent UAF bug happens.
This patch adds a flag to determine whether to create a new dd.
Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-table.c | 44 +++++++++++++++++++++-------------- include/linux/device-mapper.h | 2 ++ 3 files changed, 30 insertions(+), 18 deletions(-)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index bced42f082b0..e0bfa16aab37 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1962,7 +1962,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, goto out; }
- r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); + r = __dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev, false); if (r) { DMWARN("message: error getting device %s", argv[1]); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5c590895c14c..78627402b5fb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -361,12 +361,8 @@ dev_t dm_get_dev_t(const char *path) } EXPORT_SYMBOL_GPL(dm_get_dev_t);
-/* - * Add a device to the list, or just increment the usage count if - * it's already present. - */ -int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, - struct dm_dev **result) +int __dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result, bool create_dd) { int r; dev_t dev; @@ -390,19 +386,22 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
dd = find_device(&t->devices, dev); if (!dd) { - dd = kmalloc(sizeof(*dd), GFP_KERNEL); - if (!dd) - return -ENOMEM; - - if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) { - kfree(dd); - return r; - } + if (create_dd) { + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM;
- refcount_set(&dd->count, 1); - list_add(&dd->list, &t->devices); - goto out; + r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev); + if (r) { + kfree(dd); + return r; + }
+ refcount_set(&dd->count, 1); + list_add(&dd->list, &t->devices); + goto out; + } else + return -ENODEV; } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { r = upgrade_mode(dd, mode, t->md); if (r) @@ -413,6 +412,17 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, *result = dd->dm_dev; return 0; } +EXPORT_SYMBOL(__dm_get_device); + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result) +{ + return __dm_get_device(ti, path, mode, result, true); +} EXPORT_SYMBOL(dm_get_device);
static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 50cc070cb1f7..47db4a14c925 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -162,6 +162,8 @@ dev_t dm_get_dev_t(const char *path); int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, struct dm_dev **result); void dm_put_device(struct dm_target *ti, struct dm_dev *d); +int __dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result, bool create_dd);
/* * Information about a target type
From: Pablo Neira Ayuso pablo@netfilter.org
mainline inclusion from mainline-v5.14-rc1 commit e0241ae6ac59ffa318255640c047f7c90457fbe5 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I58CKN CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Replace netlink_unicast() calls by nfnetlink_unicast() which already deals with translating EAGAIN to ENOBUFS as the nfnetlink core expects.
nfnetlink_unicast() calls nlmsg_unicast() which returns zero in case of success, otherwise the netlink core function netlink_rcv_skb() turns err > 0 into an acknowledgment.
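For reference, the helper being adopted here centralizes the error translation that each caller below used to do by hand after netlink_unicast(). Roughly (a from-memory sketch, not a verbatim copy of net/netfilter/nfnetlink.c):

int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid)
{
    int err;

    err = nlmsg_unicast(net->nfnl, skb, portid);
    if (err == -EAGAIN)
        err = -ENOBUFS; /* EAGAIN is reserved for missing module deps */

    return err;
}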
Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/netfilter/ipset/ip_set_core.c | 44 +++++--------------- net/netfilter/nf_conntrack_netlink.c | 62 ++++++++-------------------- net/netfilter/nfnetlink_acct.c | 9 +--- net/netfilter/nfnetlink_cthelper.c | 10 ++--- net/netfilter/nfnetlink_cttimeout.c | 34 ++++----------- 5 files changed, 42 insertions(+), 117 deletions(-)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 2b19189a930f..1db4fa0580ff 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1706,8 +1706,8 @@ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { };
static int -call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, - struct nlattr *tb[], enum ipset_adt adt, +call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, + struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 flags, bool use_lineno) { int ret; @@ -1759,8 +1759,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
*errline = lineno;
- netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1804,7 +1803,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, flags, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; @@ -1815,7 +1814,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1888,7 +1887,6 @@ static int ip_set_header(struct net *net, struct sock *ctnl, const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0;
if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -1914,11 +1912,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -1975,11 +1969,8 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, nlmsg_end(skb2, nlh2);
pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret;
- return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2002,7 +1993,6 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, { struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0;
if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; @@ -2021,11 +2011,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2046,7 +2032,6 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0;
if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -2070,11 +2055,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2098,7 +2079,6 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0;
if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_INDEX])) @@ -2124,11 +2104,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index eeeaa34b3e7b..a753762fb078 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1634,9 +1634,8 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
ct = nf_ct_tuplehash_to_ctrack(h);
- err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_put(ct); return -ENOMEM; } @@ -1644,20 +1643,12 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + }
-free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); }
static int ctnetlink_done_list(struct netlink_callback *cb) @@ -2613,20 +2604,12 @@ static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), sock_net(skb->sk)); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + }
-free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); }
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { @@ -3368,11 +3351,10 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, } }
- err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_expect_put(exp); - goto out; + return -ENOMEM; }
rcu_read_lock(); @@ -3380,20 +3362,12 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); rcu_read_unlock(); nf_ct_expect_put(exp); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + }
-free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); }
static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5bfec829c12f..a9091b721181 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -308,13 +308,8 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, kfree_skb(skb2); break; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + break; } return ret; } diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 91afbf8ac8cf..7e5820e19de3 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -676,14 +676,10 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, break; }
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + break; } + return ret; }
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 89a381f7f945..49d7499cd4ac 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -279,13 +279,8 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, kfree_skb(skb2); break; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + break; } return ret; } @@ -429,9 +424,9 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; - int ret, err; __u16 l3num; __u8 l4num; + int ret;
if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; @@ -440,9 +435,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num);
- err = -EOPNOTSUPP; if (l4proto->l4proto != l4num) - goto err; + return -EOPNOTSUPP;
switch (l4proto->l4proto) { case IPPROTO_ICMP: @@ -482,13 +476,11 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, }
if (!timeouts) - goto err; + return -EOPNOTSUPP;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { - err = -ENOMEM; - goto err; - } + if (!skb2) + return -ENOMEM;
ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, @@ -497,17 +489,9 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); - err = -ENOMEM; - goto err; + return -ENOMEM; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; -err: - return err; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); }
static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net,
From: Pablo Neira Ayuso pablo@netfilter.org
mainline inclusion from mainline-v5.15-rc1 commit 241d1af4c11a75d4c17ecc0193a6ab60553efbfc category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I58CKN CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Use nfnetlink_unicast() which already translates EAGAIN to ENOBUFS, since EAGAIN is reserved to report missing module dependencies to the nfnetlink core.
e0241ae6ac59 ("netfilter: use nfnetlink_unicast() forgot to update this spot.
Reported-by: Yajun Deng yajun.deng@linux.dev Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/netfilter/nft_compat.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 8e56f353ff35..6538f8968429 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -687,14 +687,12 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, goto out_put; }
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; + ret = nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); out_put: rcu_read_lock(); module_put(THIS_MODULE); - return ret == -EAGAIN ? -ENOBUFS : ret; + + return ret; }
static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
From: "Eric W. Biederman" ebiederm@xmission.com
mainline inclusion from mainline-v5.15-rc1 commit d21918e5a94a862ccb297b9f2be38574c865fda0 category: bugfix bugzilla: 187336, https://gitee.com/openeuler/kernel/issues/I5LLC6
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Replace get_nr_threads with atomic_read(&current->signal->live) as that is a more accurate number that is decremented sooner.
Acked-by: Kees Cook keescook@chromium.org Link: https://lkml.kernel.org/r/87lf6z6qbd.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" ebiederm@xmission.com
Conflicts: kernel/seccomp.c
Signed-off-by: GONG, Ruiqi gongruiqi1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b2dd045d6afe..1e0b33dd681b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1260,7 +1260,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || - get_nr_threads(current) == 1) { + (atomic_read(&current->signal->live) == 1)) { kernel_siginfo_t info;
/* Show the original registers in the dump. */
From: Jann Horn jannh@google.com
stable inclusion from stable-v5.10.130 commit 6c32496964da0dc230cea763a0e934b2e02dabd5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5LJFH CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit eeaa345e128515135ccb864c04482180c08e3259 upstream.
The fastpath in slab_alloc_node() assumes that c->slab is stable as long as the TID stays the same. However, two places in __slab_alloc() currently don't update the TID when deactivating the CPU slab.
If multiple operations race the right way, this could lead to an object getting lost; or, in an even more unlikely situation, it could even lead to an object being freed onto the wrong slab's freelist, messing up the `inuse` counter and eventually causing a page to be freed to the page allocator while it still contains slab objects.
(I haven't actually tested these cases though, this is just based on looking at the code. Writing testcases for this stuff seems like it'd be a pain...)
The race leading to state inconsistency is (all operations on the same CPU and kmem_cache):
- task A: begin do_slab_free():
    - read TID
    - read pcpu freelist (==NULL)
    - check `slab == c->slab` (true)
- [PREEMPT A->B]
- task B: begin slab_alloc_node():
    - fastpath fails (`c->freelist` is NULL)
    - enter __slab_alloc()
    - slub_get_cpu_ptr() (disables preemption)
    - enter ___slab_alloc()
    - take local_lock_irqsave()
    - read c->freelist as NULL
    - get_freelist() returns NULL
    - write `c->slab = NULL`
    - drop local_unlock_irqrestore()
    - goto new_slab
    - slub_percpu_partial() is NULL
    - get_partial() returns NULL
    - slub_put_cpu_ptr() (enables preemption)
- [PREEMPT B->A]
- task A: finish do_slab_free():
    - this_cpu_cmpxchg_double() succeeds()
    - [CORRUPT STATE: c->slab==NULL, c->freelist!=NULL]
From there, the object on c->freelist will get lost if task B is allowed to continue from here: It will proceed to the retry_load_slab label, set c->slab, then jump to load_freelist, which clobbers c->freelist.
But if we instead continue as follows, we get worse corruption:
- task A: run __slab_free() on object from other struct slab:
    - CPU_PARTIAL_FREE case (slab was on no list, is now on pcpu partial)
- task A: run slab_alloc_node() with NUMA node constraint:
    - fastpath fails (c->slab is NULL)
    - call __slab_alloc()
    - slub_get_cpu_ptr() (disables preemption)
    - enter ___slab_alloc()
    - c->slab is NULL: goto new_slab
    - slub_percpu_partial() is non-NULL
    - set c->slab to slub_percpu_partial(c)
    - [CORRUPT STATE: c->slab points to slab-1, c->freelist has objects from slab-2]
    - goto redo
    - node_match() fails
    - goto deactivate_slab
    - existing c->freelist is passed into deactivate_slab()
    - inuse count of slab-1 is decremented to account for object from slab-2
At this point, the inuse count of slab-1 is 1 lower than it should be. This means that if we free all allocated objects in slab-1 except for one, SLUB will think that slab-1 is completely unused, and may free its page, leading to use-after-free.
Fixes: c17dda40a6a4e ("slub: Separate out kmem_cache_cpu processing from deactivate_slab") Fixes: 03e404af26dc2 ("slub: fast release on full slab") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn jannh@google.com Acked-by: Christoph Lameter cl@linux.com Acked-by: David Rientjes rientjes@google.com Reviewed-by: Muchun Song songmuchun@bytedance.com Tested-by: Hyeonggon Yoo 42.hyeyoo@gmail.com Signed-off-by: Vlastimil Babka vbabka@suse.cz Link: https://lore.kernel.org/r/20220608182205.2945720-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c index 98452815a066..ad44734dbf72 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2318,6 +2318,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
c->page = NULL; c->freelist = NULL; + c->tid = next_tid(c->tid); }
/* @@ -2451,8 +2452,6 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); deactivate_slab(s, c->page, c->freelist, c); - - c->tid = next_tid(c->tid); }
/* @@ -2738,6 +2737,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
if (!freelist) { c->page = NULL; + c->tid = next_tid(c->tid); stat(s, DEACTIVATE_BYPASS); goto new_slab; }
From: Lee Jones lee.jones@linaro.org
mainline inclusion from mainline-v5.11-rc1 commit a609c58086e381c13bdad1ba97e6510a13d465e7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5L672 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Fixes the following W=1 kernel build warning(s):
drivers/tty/serial/8250/8250_port.c:349:14: warning: no previous prototype for ‘au_serial_in’ [-Wmissing-prototypes] drivers/tty/serial/8250/8250_port.c:359:6: warning: no previous prototype for ‘au_serial_out’ [-Wmissing-prototypes]
Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Jiri Slaby jirislaby@kernel.org Cc: Mike Hudson Exoray@isys.ca Cc: linux-serial@vger.kernel.org Signed-off-by: Lee Jones lee.jones@linaro.org Link: https://lore.kernel.org/r/20201112105857.2078977-3-lee.jones@linaro.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yi Yang yiyang13@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/tty/serial/8250/8250_early.c | 3 --- include/linux/serial_8250.h | 5 +++++ 2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c index 70d7826788f5..c171ce6db691 100644 --- a/drivers/tty/serial/8250/8250_early.c +++ b/drivers/tty/serial/8250/8250_early.c @@ -204,9 +204,6 @@ OF_EARLYCON_DECLARE(omap8250, "ti,omap4-uart", early_omap8250_setup);
#ifdef CONFIG_SERIAL_8250_RT288X
-unsigned int au_serial_in(struct uart_port *p, int offset); -void au_serial_out(struct uart_port *p, int offset, int value); - static int __init early_au_setup(struct earlycon_device *dev, const char *opt) { dev->port.serial_in = au_serial_in; diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 2b70f736b091..9e655055112d 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -187,4 +187,9 @@ extern void serial8250_set_isa_configurator(void (*v) (int port, struct uart_port *up, u32 *capabilities));
+#ifdef CONFIG_SERIAL_8250_RT288X +unsigned int au_serial_in(struct uart_port *p, int offset); +void au_serial_out(struct uart_port *p, int offset, int value); +#endif + #endif
From: Thadeu Lima de Souza Cascardo cascardo@canonical.com
mainline inclusion from mainline-v6.0-rc1 commit 9ad36309e2719a884f946678e0296be10f0bb4c1 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5LJLR CVE: CVE-2022-2588
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When a route filter is replaced and the old filter has a 0 handle, the old one won't be removed from the hashtable, while it will still be freed.
The test was there since before commit 1109c00547fc ("net: sched: RCU cls_route"), when a new filter was not allocated when there was an old one. The old filter was reused and the reinserting would only be necessary if an old filter was replaced. That was still wrong for the same case where the old handle was 0.
Remove the old filter from the list independently from its handle value.
This fixes CVE-2022-2588, also reported as ZDI-CAN-17440.
Reported-by: Zhenpeng Lin zplin@u.northwestern.edu Signed-off-by: Thadeu Lima de Souza Cascardo cascardo@canonical.com Reviewed-by: Kamal Mostafa kamal@canonical.com Cc: stable@vger.kernel.org Acked-by: Jamal Hadi Salim jhs@mojatatu.com Link: https://lore.kernel.org/r/20220809170518.164662-1-cascardo@canonical.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/sched/cls_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 5efa3e7ace15..315ca2b7e2ed 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -526,7 +526,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f);
- if (fold && fold->handle && f->handle != fold->handle) { + if (fold) { th = to_hash(fold->handle); h = from_hash(fold->handle >> 16); b = rtnl_dereference(head->table[th]);
From: Thadeu Lima de Souza Cascardo cascardo@canonical.com
mainline inclusion from mainline-v6.0-rc1 commit e362359ace6f87c201531872486ff295df306d13 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5MGOO CVE: CVE-2022-2585
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Commit 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a task") started looking up tasks by PID when deleting a CPU timer.
When a non-leader thread calls execve, it will switch PIDs with the leader process. Then, as it calls exit_itimers, posix_cpu_timer_del cannot find the task because the timer still points to the old PID.
That means that armed timers won't be disarmed, that is, they won't be removed from the timerqueue_list. exit_itimers will still release their memory, and when that list is later processed, it leads to a use-after-free.
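The shape of the trigger can be sketched from user space roughly as follows (an untested illustration, not the actual reproducer; /bin/true is just a placeholder): arm a process CPU-time timer, then call execve() from a non-leader thread.

#include <pthread.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>

static void *thread_fn(void *arg)
{
    char *argv[] = { "/bin/true", NULL };
    char *envp[] = { NULL };

    (void)arg;
    /* execve() from a non-leader thread: this thread swaps PIDs with
     * the group leader in de_thread() while the CPU timer is armed. */
    execve(argv[0], argv, envp);
    return NULL;
}

int main(void)
{
    struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGALRM };
    struct itimerspec its = { .it_value = { .tv_sec = 1 }, .it_interval = { .tv_sec = 1 } };
    timer_t id;
    pthread_t t;

    /* Arm a process CPU-time timer; it is looked up by PID on deletion. */
    timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &id);
    timer_settime(id, 0, &its, NULL);

    pthread_create(&t, NULL, thread_fn, NULL);
    pthread_join(t, NULL);
    return 0;
}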
Clean up the timers from the de-threaded task before freeing them. This prevents a reported use-after-free.
Fixes: 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a task") Signed-off-by: Thadeu Lima de Souza Cascardo cascardo@canonical.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Reviewed-by: Thomas Gleixner tglx@linutronix.de Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220809170751.164716-1-cascardo@canonical.com
Conflicts: fs/exec.c
Signed-off-by: Yu Liao liaoyu15@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/exec.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/exec.c b/fs/exec.c index cf2077bffc0a..fd7c0320a9bf 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1278,6 +1278,9 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS + spin_lock_irq(&me->sighand->siglock); + posix_cpu_timers_exit(me); + spin_unlock_irq(&me->sighand->siglock); exit_itimers(me->signal); flush_itimer_signals(); #endif
From: GUO Zihua guozihua@huawei.com
stable inclusion from stable-v5.10.136 commit 3c77292d52b341831cb09c24ca4112a1e4f9e91f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5MF5J CVE: NA
Reference: https://lore.kernel.org/linux-arm-kernel/20220712075031.29061-1-guozihua@hua...
--------------------------------
commit 7ae19d422c7da84b5f13bc08b98bd737a08d3a53 upstream.
A kasan error was reported during fuzzing:
BUG: KASAN: slab-out-of-bounds in neon_poly1305_blocks.constprop.0+0x1b4/0x250 [poly1305_neon]
Read of size 4 at addr ffff0010e293f010 by task syz-executor.5/1646715
CPU: 4 PID: 1646715 Comm: syz-executor.5 Kdump: loaded Not tainted 5.10.0.aarch64 #1
Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.59 01/31/2019
Call trace:
 dump_backtrace+0x0/0x394
 show_stack+0x34/0x4c arch/arm64/kernel/stacktrace.c:196
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x158/0x1e4 lib/dump_stack.c:118
 print_address_description.constprop.0+0x68/0x204 mm/kasan/report.c:387
 __kasan_report+0xe0/0x140 mm/kasan/report.c:547
 kasan_report+0x44/0xe0 mm/kasan/report.c:564
 check_memory_region_inline mm/kasan/generic.c:187 [inline]
 __asan_load4+0x94/0xd0 mm/kasan/generic.c:252
 neon_poly1305_blocks.constprop.0+0x1b4/0x250 [poly1305_neon]
 neon_poly1305_do_update+0x6c/0x15c [poly1305_neon]
 neon_poly1305_update+0x9c/0x1c4 [poly1305_neon]
 crypto_shash_update crypto/shash.c:131 [inline]
 shash_finup_unaligned+0x84/0x15c crypto/shash.c:179
 crypto_shash_finup+0x8c/0x140 crypto/shash.c:193
 shash_digest_unaligned+0xb8/0xe4 crypto/shash.c:201
 crypto_shash_digest+0xa4/0xfc crypto/shash.c:217
 crypto_shash_tfm_digest+0xb4/0x150 crypto/shash.c:229
 essiv_skcipher_setkey+0x164/0x200 [essiv]
 crypto_skcipher_setkey+0xb0/0x160 crypto/skcipher.c:612
 skcipher_setkey+0x3c/0x50 crypto/algif_skcipher.c:305
 alg_setkey+0x114/0x2a0 crypto/af_alg.c:220
 alg_setsockopt+0x19c/0x210 crypto/af_alg.c:253
 __sys_setsockopt+0x190/0x2e0 net/socket.c:2123
 __do_sys_setsockopt net/socket.c:2134 [inline]
 __se_sys_setsockopt net/socket.c:2131 [inline]
 __arm64_sys_setsockopt+0x78/0x94 net/socket.c:2131
 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
 invoke_syscall+0x64/0x100 arch/arm64/kernel/syscall.c:48
 el0_svc_common.constprop.0+0x220/0x230 arch/arm64/kernel/syscall.c:155
 do_el0_svc+0xb4/0xd4 arch/arm64/kernel/syscall.c:217
 el0_svc+0x24/0x3c arch/arm64/kernel/entry-common.c:353
 el0_sync_handler+0x160/0x164 arch/arm64/kernel/entry-common.c:369
 el0_sync+0x160/0x180 arch/arm64/kernel/entry.S:683
This error can be reproduced by the following code compiled as ko on a system with kasan enabled:
char test_data[] = "\x00\x01\x02\x03\x04\x05\x06\x07"
                   "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
                   "\x10\x11\x12\x13\x14\x15\x16\x17"
                   "\x18\x19\x1a\x1b\x1c\x1d\x1e";

int init(void)
{
    struct crypto_shash *tfm = NULL;
    char *data = NULL, *out = NULL;

    tfm = crypto_alloc_shash("poly1305", 0, 0);
    data = kmalloc(POLY1305_KEY_SIZE - 1, GFP_KERNEL);
    out = kmalloc(POLY1305_DIGEST_SIZE, GFP_KERNEL);
    memcpy(data, test_data, POLY1305_KEY_SIZE - 1);
    crypto_shash_tfm_digest(tfm, data, POLY1305_KEY_SIZE - 1, out);

    kfree(data);
    kfree(out);
    return 0;
}

void deinit(void) { }

module_init(init)
module_exit(deinit)
MODULE_LICENSE("GPL");
The root cause of the bug sits in neon_poly1305_blocks(). The logic it performs is that, if it is called with both s[] and r[] uninitialized, it first tries to initialize them with the data from the first "block", which it believes to be 32 bytes in length. The first 16 bytes are used as the key and the next 16 bytes for s[]. This leads to the aforementioned out-of-bounds read. However, after calling poly1305_init_arch(), only 16 bytes are deducted from the input and s[] is initialized yet again with the following 16 bytes. The second initialization of s[] is certainly redundant, which indicates that the first initialization should be for r[] only.
This patch fixes the issue by calling poly1305_init_arm64() instead of poly1305_init_arch(). This is also the implementation for the same algorithm on arm platform.
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation") Cc: stable@vger.kernel.org Signed-off-by: GUO Zihua guozihua@huawei.com Reviewed-by: Eric Biggers ebiggers@google.com Acked-by: Will Deacon will@kernel.org Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: GUO Zihua guozihua@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/crypto/poly1305-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 01e22fe40823..9f4599014854 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -52,7 +52,7 @@ static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, { if (unlikely(!dctx->sset)) { if (!dctx->rset) { - poly1305_init_arch(dctx, src); + poly1305_init_arm64(&dctx->h, src); src += POLY1305_BLOCK_SIZE; len -= POLY1305_BLOCK_SIZE; dctx->rset = 1;
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.19-rc7 commit 39d35edee4537487e5178f258e23518272a66413 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Higher order allocations for vmemmap pages from the buddy allocator must be able to be treated as independent small pages as they can be freed individually by the caller. There is no problem for higher order vmemmap pages allocated at boot time since each individual small page will be initialized at boot time. However, it will be an issue for the memory hotplug case since those higher order vmemmap pages are allocated from the buddy allocator without initializing each individual small page's refcount. The system will panic in put_page_testzero() when CONFIG_DEBUG_VM is enabled if the vmemmap page is freed.
Link: https://lkml.kernel.org/r/20220620023019.94257-1-songmuchun@bytedance.com Fixes: d8d55f5616cf ("mm: sparsemem: use page table lock to protect kernel pmd operations") Signed-off-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Cc: Xiongchun Duan duanxiongchun@bytedance.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Conflicts: mm/sparse-vmemmap.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/sparse-vmemmap.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 5b40a7473dc8..269b464c448d 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -78,6 +78,14 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { + /* + * Higher order allocations from buddy allocator must be able to + * be treated as indepdenent small pages (as they can be freed + * individually). + */ + if (!PageReserved(page)) + split_page(page, get_order(PMD_SIZE)); + /* Make pte visible before pmd. See comment in __pte_alloc(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable);
From: Aili Yao yaoaili@kingsoft.com
mainline inclusion from mainline-v5.12-rc1 commit 30c9cf49270423f8cb0d2c152486e248f375cccb category: feature bugzilla: 187341, https://gitee.com/openeuler/kernel/issues/I5JGLK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When an uncorrected memory error is triggered by a process that accessed the address containing the error, it is an Action Required case only for the current process which triggered it; this Action Required case is merely Action Optional for other processes sharing the same page. Usually killing the current process is sufficient; other processes sharing the same page will be signaled when they actually touch the poisoned page.
But there is another scenario: other processes sharing the same page may want to be signaled early by setting PF_MCE_EARLY. In this case, we should add them to the kill list and signal BUS_MCEERR_AO to them.
So in this patch, task_early_kill() checks for the current process when force_early is set; if the task is not current, the code falls back to find_early_kill_thread() to check whether there is a PF_MCE_EARLY process that cares about the error.
In kill_proc(), BUS_MCEERR_AR is only sent to current; other processes in the kill list will be signaled with BUS_MCEERR_AO.
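For context, a process opts in to this early signaling from user space via prctl(). A minimal sketch, assuming the standard Linux PR_MCE_KILL interface (this program is illustrative and not part of the patch):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

/* Distinguish the two SIGBUS flavours discussed above. */
static void sigbus_handler(int sig, siginfo_t *info, void *ctx)
{
    const char *kind =
        info->si_code == BUS_MCEERR_AR ? "action-required (we touched it)" :
        info->si_code == BUS_MCEERR_AO ? "action-optional (early notice)" :
                                         "other";
    (void)sig;
    (void)ctx;
    /* dprintf() is used only for the sake of the illustration */
    dprintf(STDERR_FILENO, "SIGBUS at %p: %s\n", info->si_addr, kind);
}

int main(void)
{
    struct sigaction sa = {
        .sa_sigaction = sigbus_handler,
        .sa_flags = SA_SIGINFO,
    };

    sigaction(SIGBUS, &sa, NULL);

    /* PR_MCE_KILL_EARLY sets PF_MCE_EARLY: get BUS_MCEERR_AO as soon as a
     * shared page we map is poisoned, instead of waiting until we touch it. */
    if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0))
        perror("prctl(PR_MCE_KILL)");

    pause();    /* wait for a (hypothetical) poison event */
    return 0;
}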
Link: https://lkml.kernel.org/r/20210122132424.313c8f5f.yaoaili@kingsoft.com Signed-off-by: Aili Yao yaoaili@kingsoft.com Reviewed-by: Oscar Salvador osalvador@suse.de Acked-by: Naoya Horiguchi naoya.horiguchi@nec.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/memory-failure.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 97a00a8e6f79..fa8504987698 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -244,9 +244,13 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) pfn, t->comm, t->pid);
if (flags & MF_ACTION_REQUIRED) { - WARN_ON_ONCE(t != current); - ret = force_sig_mceerr(BUS_MCEERR_AR, + if (t == current) + ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr, addr_lsb); + else + /* Signal other processes sharing the page if they have PF_MCE_EARLY set. */ + ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, + addr_lsb, t); } else { /* * Don't use force here, it's convenient if the signal @@ -444,26 +448,26 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk) * Determine whether a given process is "early kill" process which expects * to be signaled when some page under the process is hwpoisoned. * Return task_struct of the dedicated thread (main thread unless explicitly - * specified) if the process is "early kill," and otherwise returns NULL. + * specified) if the process is "early kill" and otherwise returns NULL. * - * Note that the above is true for Action Optional case, but not for Action - * Required case where SIGBUS should sent only to the current thread. + * Note that the above is true for Action Optional case. For Action Required + * case, it's only meaningful to the current thread which need to be signaled + * with SIGBUS, this error is Action Optional for other non current + * processes sharing the same error page,if the process is "early kill", the + * task_struct of the dedicated thread will also be returned. */ static struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) { if (!tsk->mm) return NULL; - if (force_early) { - /* - * Comparing ->mm here because current task might represent - * a subthread, while tsk always points to the main thread. - */ - if (tsk->mm == current->mm) - return current; - else - return NULL; - } + /* + * Comparing ->mm here because current task might represent + * a subthread, while tsk always points to the main thread. + */ + if (force_early && tsk->mm == current->mm) + return current; + return find_early_kill_thread(tsk); }
From: Guo Mengqi guomengqi3@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5J0YW CVE: NA --------------------------------
Share pool has a statistics system which allows the user to check memory usage easily. The statistics code is fairly independent from the core functions, but its implementation closely mirrors them, which doubles the lock usage and causes lock nesting problems.
Thus we remove the statistics system and embed all the statistics into the raw data structures as built-in statistics. The user API does not change. This greatly reduces lock complexity and removes hundreds of lines of redundant code.
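The pattern, in a simplified sketch (the types and field names here are illustrative; the real layout is in the diff below): the counters are embedded in the object they describe, so accounting only needs a reference to that object, with no separate idr lookup and no extra statistics lock.

#include <linux/atomic.h>
#include <linux/types.h>

/* Illustrative only: built-in (embedded) statistics. */
struct demo_stat {
	atomic64_t alloc_size;
	atomic64_t k2u_size;
};

struct demo_group {
	int id;
	struct demo_stat instat;	/* lives and dies with the group itself */
};

static void demo_account_alloc(struct demo_group *grp, long size, bool inc)
{
	if (inc)
		atomic64_add(size, &grp->instat.alloc_size);
	else
		atomic64_sub(size, &grp->instat.alloc_size);
}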
Signed-off-by: Guo Mengqi guomengqi3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/share_pool.h | 7 +- mm/share_pool.c | 843 ++++++++++++++++--------------------- 2 files changed, 375 insertions(+), 475 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 1911cd35843b..c2ef26661a4f 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -65,6 +65,7 @@ extern int sysctl_sp_perf_alloc;
extern int sysctl_sp_perf_k2u;
+#ifdef __GENKSYMS__ /* we estimate an sp-group ususally contains at most 64 sp-group */ #define SP_SPG_HASH_BITS 6
@@ -206,6 +207,7 @@ struct sp_group_node { struct sp_group *spg; unsigned long prot; }; +#endif
struct sp_walk_data { struct page **pages; @@ -508,11 +510,6 @@ static inline bool mg_is_sharepool_addr(unsigned long addr) return false; }
-static inline struct sp_proc_stat *sp_get_proc_stat_ref(struct mm_struct *mm) -{ - return NULL; -} - static inline void spa_overview_show(struct seq_file *seq) { } diff --git a/mm/share_pool.c b/mm/share_pool.c index 750524f1afc2..862595d82c91 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -117,19 +117,175 @@ static DEFINE_IDA(sp_group_id_ida);
/*** Statistical and maintenance tools ***/
-/* idr of all sp_proc_stats */ -static DEFINE_IDR(sp_proc_stat_idr); -/* rw semaphore for sp_proc_stat_idr */ -static DECLARE_RWSEM(sp_proc_stat_sem); - -/* idr of all sp_spg_stats */ -static DEFINE_IDR(sp_spg_stat_idr); -/* rw semaphore for sp_spg_stat_idr */ -static DECLARE_RWSEM(sp_spg_stat_sem); +/* list of all sp_group_masters */ +static LIST_HEAD(master_list); +/* mutex to protect insert/delete ops from master_list */ +static DEFINE_MUTEX(master_list_lock);
/* for kthread buff_module_guard_work */ static struct sp_proc_stat kthread_stat;
+#ifndef __GENKSYMS__ +struct sp_spg_stat { + int spg_id; + /* record the number of hugepage allocation failures */ + atomic_t hugepage_failures; + /* number of sp_area */ + atomic_t spa_num; + /* total size of all sp_area from sp_alloc and k2u */ + atomic64_t size; + /* total size of all sp_area from sp_alloc 0-order page */ + atomic64_t alloc_nsize; + /* total size of all sp_area from sp_alloc hugepage */ + atomic64_t alloc_hsize; + /* total size of all sp_area from ap_alloc */ + atomic64_t alloc_size; + /* total size of all sp_area from sp_k2u */ + atomic64_t k2u_size; +}; + +/* per process memory usage statistics indexed by tgid */ +struct sp_proc_stat { + int tgid; + struct mm_struct *mm; + char comm[TASK_COMM_LEN]; + /* + * alloc amount minus free amount, may be negative when freed by + * another task in the same sp group. + */ + atomic64_t alloc_size; + atomic64_t alloc_nsize; + atomic64_t alloc_hsize; + atomic64_t k2u_size; +}; + +/* per process/sp-group memory usage statistics */ +struct spg_proc_stat { + int tgid; + int spg_id; /* 0 for non-group data, such as k2u_task */ + /* + * alloc amount minus free amount, may be negative when freed by + * another task in the same sp group. + */ + atomic64_t alloc_size; + atomic64_t alloc_nsize; + atomic64_t alloc_hsize; + atomic64_t k2u_size; +}; + +/* + * address space management + */ +struct sp_mapping { + unsigned long flag; + atomic_t user; + unsigned long start[MAX_DEVID]; + unsigned long end[MAX_DEVID]; + struct rb_root area_root; + + struct rb_node *free_area_cache; + unsigned long cached_hole_size; + unsigned long cached_vstart; + + /* list head for all groups attached to this mapping, dvpp mapping only */ + struct list_head group_head; +}; + +/* Processes in the same sp_group can share memory. + * Memory layout for share pool: + * + * |-------------------- 8T -------------------|---|------ 8T ------------| + * | Device 0 | Device 1 |...| | + * |----------------------------------------------------------------------| + * |------------- 16G -------------| 16G | | | + * | DVPP GROUP0 | DVPP GROUP1 | ... | ... |...| sp normal memory | + * | sp | sp | | | | | + * |----------------------------------------------------------------------| + * + * The host SVM feature reserves 8T virtual memory by mmap, and due to the + * restriction of DVPP, while SVM and share pool will both allocate memory + * for DVPP, the memory have to be in the same 32G range. + * + * Share pool reserves 16T memory, with 8T for normal uses and 8T for DVPP. + * Within this 8T DVPP memory, SVM will call sp_config_dvpp_range() to + * tell us which 16G memory range is reserved for share pool . + * + * In some scenarios where there is no host SVM feature, share pool uses + * the default 8G memory setting for DVPP. + */ +struct sp_group { + int id; + unsigned long flag; + struct file *file; + struct file *file_hugetlb; + /* number of process in this group */ + int proc_num; + /* list head of processes (sp_group_node, each represents a process) */ + struct list_head procs; + /* list head of sp_area. 
it is protected by spin_lock sp_area_lock */ + struct list_head spa_list; + /* group statistics */ + struct sp_spg_stat instat; + /* we define the creator process of a sp_group as owner */ + struct task_struct *owner; + /* is_alive == false means it's being destroyed */ + bool is_alive; + atomic_t use_count; + /* protect the group internal elements, except spa_list */ + struct rw_semaphore rw_lock; + /* list node for dvpp mapping */ + struct list_head mnode; + struct sp_mapping *dvpp; + struct sp_mapping *normal; +}; + +/* a per-process(per mm) struct which manages a sp_group_node list */ +struct sp_group_master { + /* + * number of sp groups the process belongs to, + * a.k.a the number of sp_node in node_list + */ + unsigned int count; + /* list head of sp_node */ + struct list_head node_list; + struct mm_struct *mm; + /* + * Used to apply for the shared pool memory of the current process. + * For example, sp_alloc non-share memory or k2task. + */ + struct sp_group *local; + struct sp_proc_stat instat; + struct list_head list_node; +}; + +/* + * each instance represents an sp group the process belongs to + * sp_group_master : sp_group_node = 1 : N + * sp_group_node->spg : sp_group = 1 : 1 + * sp_group_node : sp_group->procs = N : 1 + */ +struct sp_group_node { + /* list node in sp_group->procs */ + struct list_head proc_node; + /* list node in sp_group_maseter->node_list */ + struct list_head group_node; + struct sp_group_master *master; + struct sp_group *spg; + unsigned long prot; + struct spg_proc_stat instat; +}; +#endif + +/* The caller should hold mmap_sem to protect master (TBD) */ +static void sp_init_group_master_stat(struct mm_struct *mm, struct sp_proc_stat *stat) +{ + atomic64_set(&stat->alloc_nsize, 0); + atomic64_set(&stat->alloc_hsize, 0); + atomic64_set(&stat->k2u_size, 0); + stat->mm = mm; + get_task_comm(stat->comm, current); +} + #define SP_MAPPING_DVPP 0x1 #define SP_MAPPING_NORMAL 0x2 static struct sp_mapping *sp_mapping_normal; @@ -326,8 +482,6 @@ static int init_local_group(struct mm_struct *mm) return ret; }
-static void sp_proc_stat_drop(struct sp_proc_stat *stat); -static int sp_init_proc_stat(struct mm_struct *mm, struct task_struct *tsk); /* The caller must hold sp_group_sem */ static int sp_init_group_master_locked(struct task_struct *tsk, struct mm_struct *mm) { @@ -344,20 +498,19 @@ static int sp_init_group_master_locked(struct task_struct *tsk, struct mm_struct INIT_LIST_HEAD(&master->node_list); master->count = 0; master->mm = mm; + sp_init_group_master_stat(mm, &master->instat); mm->sp_group_master = master;
- ret = sp_init_proc_stat(mm, tsk); - if (ret) - goto free_master; + mutex_lock(&master_list_lock); + list_add_tail(&master->list_node, &master_list); + mutex_unlock(&master_list_lock);
ret = init_local_group(mm); if (ret) - goto put_stat; + goto free_master;
return 0;
-put_stat: - sp_proc_stat_drop(master->stat); free_master: mm->sp_group_master = NULL; kfree(master); @@ -397,67 +550,6 @@ static struct sp_group *sp_get_local_group(struct task_struct *tsk, struct mm_st return master->local; }
-static struct sp_proc_stat *sp_get_proc_stat(struct mm_struct *mm) -{ - struct sp_proc_stat *stat; - - if (!mm->sp_group_master) - return NULL; - - down_read(&sp_proc_stat_sem); - stat = mm->sp_group_master->stat; - up_read(&sp_proc_stat_sem); - - /* maybe NULL or not, we always return it */ - return stat; -} - -static struct sp_proc_stat *create_proc_stat(struct mm_struct *mm, - struct task_struct *tsk) -{ - struct sp_proc_stat *stat; - - stat = kmalloc(sizeof(*stat), GFP_KERNEL); - if (stat == NULL) - return ERR_PTR(-ENOMEM); - - atomic_set(&stat->use_count, 1); - atomic64_set(&stat->alloc_size, 0); - atomic64_set(&stat->k2u_size, 0); - stat->tgid = tsk->tgid; - stat->mm = mm; - mutex_init(&stat->lock); - hash_init(stat->hash); - get_task_comm(stat->comm, tsk); - - return stat; -} - -static int sp_init_proc_stat(struct mm_struct *mm, struct task_struct *tsk) -{ - struct sp_proc_stat *stat; - int alloc_id, tgid = tsk->tgid; - struct sp_group_master *master = mm->sp_group_master; - - stat = create_proc_stat(mm, tsk); - if (IS_ERR(stat)) - return PTR_ERR(stat); - - down_write(&sp_proc_stat_sem); - alloc_id = idr_alloc(&sp_proc_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); - if (alloc_id < 0) { - up_write(&sp_proc_stat_sem); - pr_err_ratelimited("proc stat idr alloc failed %d\n", alloc_id); - kfree(stat); - return alloc_id; - } - - master->stat = stat; - up_write(&sp_proc_stat_sem); - - return 0; -} - static void update_spg_stat_alloc(unsigned long size, bool inc, bool huge, struct sp_spg_stat *stat) { @@ -494,158 +586,64 @@ static void update_spg_stat_k2u(unsigned long size, bool inc, } }
-/* per process/sp-group memory usage statistics */ -struct spg_proc_stat { - int tgid; - int spg_id; /* 0 for non-group data, such as k2u_task */ - struct hlist_node pnode; /* hlist node in sp_proc_stat->hash */ - struct hlist_node gnode; /* hlist node in sp_spg_stat->hash */ - struct sp_proc_stat *proc_stat; - struct sp_spg_stat *spg_stat; - /* - * alloc amount minus free amount, may be negative when freed by - * another task in the same sp group. - */ - atomic64_t alloc_size; - atomic64_t k2u_size; -}; - -static void update_spg_proc_stat_alloc(unsigned long size, bool inc, - struct spg_proc_stat *stat) +static void update_mem_usage_alloc(unsigned long size, bool inc, + bool is_hugepage, struct sp_group_node *spg_node) { - struct sp_proc_stat *proc_stat = stat->proc_stat; + struct sp_proc_stat *proc_stat = &spg_node->master->instat;
if (inc) { - atomic64_add(size, &stat->alloc_size); - atomic64_add(size, &proc_stat->alloc_size); - } else { - atomic64_sub(size, &stat->alloc_size); - atomic64_sub(size, &proc_stat->alloc_size); + if (is_hugepage) { + atomic64_add(size, &spg_node->instat.alloc_hsize); + atomic64_add(size, &proc_stat->alloc_hsize); + return; + } + atomic64_add(size, &spg_node->instat.alloc_nsize); + atomic64_add(size, &proc_stat->alloc_nsize); + return; } + + if (is_hugepage) { + atomic64_sub(size, &spg_node->instat.alloc_hsize); + atomic64_sub(size, &proc_stat->alloc_hsize); + return; + } + atomic64_sub(size, &spg_node->instat.alloc_nsize); + atomic64_sub(size, &proc_stat->alloc_nsize); + return; }
-static void update_spg_proc_stat_k2u(unsigned long size, bool inc, - struct spg_proc_stat *stat) +static void update_mem_usage_k2u(unsigned long size, bool inc, + struct sp_group_node *spg_node) { - struct sp_proc_stat *proc_stat = stat->proc_stat; + struct sp_proc_stat *proc_stat = &spg_node->master->instat;
if (inc) { - atomic64_add(size, &stat->k2u_size); + atomic64_add(size, &spg_node->instat.k2u_size); atomic64_add(size, &proc_stat->k2u_size); } else { - atomic64_sub(size, &stat->k2u_size); + atomic64_sub(size, &spg_node->instat.k2u_size); atomic64_sub(size, &proc_stat->k2u_size); } }
-static struct spg_proc_stat *find_spg_proc_stat( - struct sp_proc_stat *proc_stat, int tgid, int spg_id) -{ - struct spg_proc_stat *stat = NULL; - - mutex_lock(&proc_stat->lock); - hash_for_each_possible(proc_stat->hash, stat, pnode, spg_id) { - if (stat->spg_id == spg_id) - break; - } - mutex_unlock(&proc_stat->lock); - - return stat; -} - -static struct spg_proc_stat *create_spg_proc_stat(int tgid, int spg_id) +static void sp_init_spg_proc_stat(struct spg_proc_stat *stat, int spg_id) { - struct spg_proc_stat *stat; - - stat = kmalloc(sizeof(struct spg_proc_stat), GFP_KERNEL); - if (stat == NULL) - return ERR_PTR(-ENOMEM); - - stat->tgid = tgid; + stat->tgid = current->tgid; stat->spg_id = spg_id; - atomic64_set(&stat->alloc_size, 0); + atomic64_set(&stat->alloc_nsize, 0); + atomic64_set(&stat->alloc_hsize, 0); atomic64_set(&stat->k2u_size, 0); - - return stat; }
-static struct spg_proc_stat *sp_init_spg_proc_stat(struct sp_proc_stat *proc_stat, - struct sp_group *spg) +static void sp_init_group_stat(struct sp_spg_stat *stat) { - struct spg_proc_stat *stat; - int spg_id = spg->id; /* visit spg id locklessly */ - struct sp_spg_stat *spg_stat = spg->stat; - - stat = create_spg_proc_stat(proc_stat->tgid, spg_id); - if (IS_ERR(stat)) - return stat; - - stat->proc_stat = proc_stat; - stat->spg_stat = spg_stat; - - mutex_lock(&proc_stat->lock); - hash_add(proc_stat->hash, &stat->pnode, stat->spg_id); - mutex_unlock(&proc_stat->lock); - - mutex_lock(&spg_stat->lock); - hash_add(spg_stat->hash, &stat->gnode, stat->tgid); - mutex_unlock(&spg_stat->lock); - return stat; -} - -static struct sp_spg_stat *create_spg_stat(int spg_id) -{ - struct sp_spg_stat *stat; - - stat = kmalloc(sizeof(*stat), GFP_KERNEL); - if (stat == NULL) - return ERR_PTR(-ENOMEM); - - stat->spg_id = spg_id; atomic_set(&stat->hugepage_failures, 0); atomic_set(&stat->spa_num, 0); atomic64_set(&stat->size, 0); atomic64_set(&stat->alloc_nsize, 0); atomic64_set(&stat->alloc_hsize, 0); atomic64_set(&stat->alloc_size, 0); - mutex_init(&stat->lock); - hash_init(stat->hash); - - return stat; -} - -static int sp_init_spg_stat(struct sp_group *spg) -{ - struct sp_spg_stat *stat; - int ret, spg_id = spg->id; - - stat = create_spg_stat(spg_id); - if (IS_ERR(stat)) - return PTR_ERR(stat); - - down_write(&sp_spg_stat_sem); - ret = idr_alloc(&sp_spg_stat_idr, stat, spg_id, spg_id + 1, - GFP_KERNEL); - up_write(&sp_spg_stat_sem); - if (ret < 0) { - pr_err_ratelimited("group %d idr alloc failed, ret %d\n", - spg_id, ret); - kfree(stat); - } - - spg->stat = stat; - return ret; -} - -static void free_spg_stat(int spg_id) -{ - struct sp_spg_stat *stat; - - down_write(&sp_spg_stat_sem); - stat = idr_remove(&sp_spg_stat_idr, spg_id); - up_write(&sp_spg_stat_sem); - WARN_ON(!stat); - kfree(stat); + atomic64_set(&stat->k2u_size, 0); }
/* statistics of all sp area, protected by sp_area_lock */ @@ -733,17 +731,17 @@ static void spa_inc_usage(struct sp_area *spa) case SPA_TYPE_ALLOC: spa_stat.alloc_num += 1; spa_stat.alloc_size += size; - update_spg_stat_alloc(size, true, is_huge, spa->spg->stat); + update_spg_stat_alloc(size, true, is_huge, &spa->spg->instat); break; case SPA_TYPE_K2TASK: spa_stat.k2u_task_num += 1; spa_stat.k2u_task_size += size; - update_spg_stat_k2u(size, true, spa->spg->stat); + update_spg_stat_k2u(size, true, &spa->spg->instat); break; case SPA_TYPE_K2SPG: spa_stat.k2u_spg_num += 1; spa_stat.k2u_spg_size += size; - update_spg_stat_k2u(size, true, spa->spg->stat); + update_spg_stat_k2u(size, true, &spa->spg->instat); break; default: WARN(1, "invalid spa type"); @@ -779,17 +777,17 @@ static void spa_dec_usage(struct sp_area *spa) case SPA_TYPE_ALLOC: spa_stat.alloc_num -= 1; spa_stat.alloc_size -= size; - update_spg_stat_alloc(size, false, is_huge, spa->spg->stat); + update_spg_stat_alloc(size, false, is_huge, &spa->spg->instat); break; case SPA_TYPE_K2TASK: spa_stat.k2u_task_num -= 1; spa_stat.k2u_task_size -= size; - update_spg_stat_k2u(size, false, spa->spg->stat); + update_spg_stat_k2u(size, false, &spa->spg->instat); break; case SPA_TYPE_K2SPG: spa_stat.k2u_spg_num -= 1; spa_stat.k2u_spg_size -= size; - update_spg_stat_k2u(size, false, spa->spg->stat); + update_spg_stat_k2u(size, false, &spa->spg->instat); break; default: WARN(1, "invalid spa type"); @@ -809,42 +807,52 @@ static void spa_dec_usage(struct sp_area *spa) } }
-static void update_spg_proc_stat(unsigned long size, bool inc, - struct spg_proc_stat *stat, enum spa_type type) +static void update_mem_usage(unsigned long size, bool inc, bool is_hugepage, + struct sp_group_node *spg_node, enum spa_type type) { - if (unlikely(!stat)) { + if (unlikely(!spg_node)) { sp_dump_stack(); - WARN(1, "null process stat\n"); + WARN(1, "null sp group node\n"); return; }
switch (type) { case SPA_TYPE_ALLOC: - update_spg_proc_stat_alloc(size, inc, stat); + update_mem_usage_alloc(size, inc, is_hugepage, spg_node); break; case SPA_TYPE_K2TASK: case SPA_TYPE_K2SPG: - update_spg_proc_stat_k2u(size, inc, stat); + update_mem_usage_k2u(size, inc, spg_node); break; default: WARN(1, "invalid stat type\n"); } }
+struct sp_group_node *find_spg_node_by_spg(struct mm_struct *mm, + struct sp_group *spg) +{ + struct sp_group_node *spg_node; + + list_for_each_entry(spg_node, &mm->sp_group_master->node_list, group_node) { + if (spg_node->spg == spg) + return spg_node; + } + return NULL; +} + static void sp_update_process_stat(struct task_struct *tsk, bool inc, struct sp_area *spa) { - struct spg_proc_stat *stat; + struct sp_group_node *spg_node; unsigned long size = spa->real_size; enum spa_type type = spa->type;
- down_write(&sp_group_sem); - stat = find_spg_proc_stat(tsk->mm->sp_group_master->stat, tsk->tgid, spa->spg->id); - up_write(&sp_group_sem); - if (!stat) - return; - - update_spg_proc_stat(size, inc, stat, type); + spg_node = find_spg_node_by_spg(tsk->mm, spa->spg); + if (!spg_node) + pr_err("share pool: spg node not found!\n"); + else + update_mem_usage(size, inc, spa->is_hugepage, spg_node, type); }
static inline void check_interrupt_context(void) @@ -903,7 +911,6 @@ static void free_sp_group_locked(struct sp_group *spg) { fput(spg->file); fput(spg->file_hugetlb); - free_spg_stat(spg->id); idr_remove(&sp_group_idr, spg->id); free_sp_group_id((unsigned int)spg->id); sp_mapping_detach(spg, spg->dvpp); @@ -1163,6 +1170,7 @@ static struct sp_group *create_spg(int spg_id, unsigned long flag) INIT_LIST_HEAD(&spg->spa_list); INIT_LIST_HEAD(&spg->mnode); init_rwsem(&spg->rw_lock); + sp_init_group_stat(&spg->instat);
sprintf(name, "sp_group_%d", spg_id); spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, @@ -1182,16 +1190,10 @@ static struct sp_group *create_spg(int spg_id, unsigned long flag) goto out_fput; }
- ret = sp_init_spg_stat(spg); - if (ret < 0) - goto out_fput_all; - if (!is_local_group(spg_id)) system_group_count++; return spg;
-out_fput_all: - fput(spg->file_hugetlb); out_fput: fput(spg->file); out_idr: @@ -1301,6 +1303,7 @@ static struct sp_group_node *create_spg_node(struct mm_struct *mm, spg_node->spg = spg; spg_node->master = master; spg_node->prot = prot; + sp_init_spg_proc_stat(&spg_node->instat, spg->id);
list_add_tail(&spg_node->group_node, &master->node_list); master->count++; @@ -1319,12 +1322,6 @@ static int insert_spg_node(struct sp_group *spg, struct sp_group_node *node) spg->proc_num++; list_add_tail(&node->proc_node, &spg->procs);
- /* - * The only way where sp_init_spg_proc_stat got failed is that there is no - * memory for sp_spg_stat. We will avoid this failure when we put sp_spg_stat - * into sp_group_node later. - */ - sp_init_spg_proc_stat(node->master->stat, spg); return 0; }
@@ -1498,6 +1495,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) up_write(&spg->rw_lock); goto out_drop_group; } + mm->sp_group_master->instat.tgid = tsk->tgid;
ret = sp_mapping_group_setup(mm, spg); if (ret) { @@ -1625,27 +1623,6 @@ int sp_group_add_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_add_task);
-static void free_spg_proc_stat(struct mm_struct *mm, int spg_id) -{ - int i; - struct sp_proc_stat *proc_stat = sp_get_proc_stat(mm); - struct spg_proc_stat *stat; - struct sp_spg_stat *spg_stat; - struct hlist_node *tmp; - - hash_for_each_safe(proc_stat->hash, i, tmp, stat, pnode) { - if (stat->spg_id == spg_id) { - spg_stat = stat->spg_stat; - mutex_lock(&spg_stat->lock); - hash_del(&stat->gnode); - mutex_unlock(&spg_stat->lock); - hash_del(&stat->pnode); - kfree(stat); - break; - } - } -} - /** * mg_sp_group_del_task() - delete a process from a sp group. * @pid: the pid of the task to be deleted @@ -1737,7 +1714,6 @@ int mg_sp_group_del_task(int pid, int spg_id) return -EINVAL; }
- free_spg_proc_stat(mm, spg_id); up_write(&sp_group_sem);
out_put_mm: @@ -2605,14 +2581,12 @@ static int sp_alloc_mmap(struct mm_struct *mm, struct sp_area *spa,
static void sp_alloc_fallback(struct sp_area *spa, struct sp_alloc_context *ac) { - struct sp_spg_stat *stat = ac->spg->stat; - if (ac->file == ac->spg->file) { ac->state = ALLOC_NOMEM; return; }
- atomic_inc(&stat->hugepage_failures); + atomic_inc(&ac->spg->instat.hugepage_failures); if (!(ac->sp_flags & SP_HUGEPAGE_ONLY)) { ac->file = ac->spg->file; ac->size_aligned = ALIGN(ac->size, PAGE_SIZE); @@ -2941,7 +2915,7 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un int ret; void *uva; struct sp_area *spa; - struct spg_proc_stat *stat; + struct sp_group_node *spg_node; unsigned long prot = PROT_READ | PROT_WRITE; struct sp_k2u_context kc; struct sp_group *spg; @@ -2955,7 +2929,6 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un }
spg = current->mm->sp_group_master->local; - stat = find_spg_proc_stat(current->mm->sp_group_master->stat, current->tgid, spg->id); up_write(&sp_group_sem);
spa = sp_alloc_area(size, sp_flags, spg, SPA_TYPE_K2TASK, current->tgid); @@ -2972,7 +2945,11 @@ static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, un if (IS_ERR(uva)) pr_err("remap k2u to task failed %ld\n", PTR_ERR(uva)); else { - update_spg_proc_stat(size, true, stat, SPA_TYPE_K2TASK); + spg_node = find_spg_node_by_spg(current->mm, spa->spg); + if (!spg_node) + pr_err("spg_node is null\n"); + else + update_mem_usage(size, true, spa->is_hugepage, spg_node, SPA_TYPE_K2TASK); spa->mm = current->mm; }
@@ -3998,43 +3975,6 @@ __setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group);
/*** Statistical and maintenance functions ***/
-static void free_process_spg_proc_stat(struct sp_proc_stat *proc_stat) -{ - int i; - struct spg_proc_stat *stat; - struct hlist_node *tmp; - struct sp_spg_stat *spg_stat; - - /* traverse proc_stat->hash locklessly as process is exiting */ - hash_for_each_safe(proc_stat->hash, i, tmp, stat, pnode) { - spg_stat = stat->spg_stat; - mutex_lock(&spg_stat->lock); - hash_del(&stat->gnode); - mutex_unlock(&spg_stat->lock); - - hash_del(&stat->pnode); - kfree(stat); - } -} - -static void free_sp_proc_stat(struct sp_proc_stat *stat) -{ - free_process_spg_proc_stat(stat); - - down_write(&sp_proc_stat_sem); - stat->mm->sp_group_master->stat = NULL; - idr_remove(&sp_proc_stat_idr, stat->tgid); - up_write(&sp_proc_stat_sem); - kfree(stat); -} - -/* the caller make sure stat is not NULL */ -static void sp_proc_stat_drop(struct sp_proc_stat *stat) -{ - if (atomic_dec_and_test(&stat->use_count)) - free_sp_proc_stat(stat); -} - static void get_mm_rss_info(struct mm_struct *mm, unsigned long *anon, unsigned long *file, unsigned long *shmem, unsigned long *total_rss) { @@ -4044,54 +3984,29 @@ static void get_mm_rss_info(struct mm_struct *mm, unsigned long *anon, *total_rss = *anon + *file + *shmem; }
-static long get_proc_alloc(struct sp_proc_stat *stat) -{ - return byte2kb(atomic64_read(&stat->alloc_size)); -} - static long get_proc_k2u(struct sp_proc_stat *stat) { return byte2kb(atomic64_read(&stat->k2u_size)); }
-static long get_spg_alloc(struct sp_spg_stat *stat) -{ - return byte2kb(atomic64_read(&stat->alloc_size)); -} - -static long get_spg_alloc_nsize(struct sp_spg_stat *stat) +static long get_proc_alloc(struct sp_proc_stat *stat) { - return byte2kb(atomic64_read(&stat->alloc_nsize)); + return byte2kb(atomic64_read(&stat->alloc_nsize) + + atomic64_read(&stat->alloc_hsize)); }
-static long get_spg_proc_alloc(struct spg_proc_stat *stat) +static void get_process_sp_res(struct sp_proc_stat *stat, + long *sp_res_out, long *sp_res_nsize_out) { - return byte2kb(atomic64_read(&stat->alloc_size)); + *sp_res_out = byte2kb(atomic64_read(&stat->alloc_nsize) + + atomic64_read(&stat->alloc_hsize)); + *sp_res_nsize_out = byte2kb(atomic64_read(&stat->alloc_nsize)); }
-static long get_spg_proc_k2u(struct spg_proc_stat *stat) +static long get_sp_res_by_spg_proc(struct sp_group_node *spg_node) { - return byte2kb(atomic64_read(&stat->k2u_size)); -} - -static void get_process_sp_res(struct sp_proc_stat *stat, - long *sp_res_out, long *sp_res_nsize_out) -{ - int i; - struct spg_proc_stat *spg_proc_stat; - struct sp_spg_stat *spg_stat; - long sp_res = 0, sp_res_nsize = 0; - - mutex_lock(&stat->lock); - hash_for_each(stat->hash, i, spg_proc_stat, pnode) { - spg_stat = spg_proc_stat->spg_stat; - sp_res += get_spg_alloc(spg_stat); - sp_res_nsize += get_spg_alloc_nsize(spg_stat); - } - mutex_unlock(&stat->lock); - - *sp_res_out = sp_res; - *sp_res_nsize_out = sp_res_nsize; + return byte2kb(atomic64_read(&spg_node->instat.alloc_nsize) + + atomic64_read(&spg_node->instat.alloc_hsize)); }
/* @@ -4112,24 +4027,15 @@ static void get_process_non_sp_res(unsigned long total_rss, unsigned long shmem, *non_sp_shm_out = non_sp_shm; }
-static long get_sp_res_by_spg_proc(struct spg_proc_stat *stat) +static long get_spg_proc_alloc(struct sp_group_node *spg_node) { - return byte2kb(atomic64_read(&stat->spg_stat->alloc_size)); + return byte2kb(atomic64_read(&spg_node->instat.alloc_nsize) + + atomic64_read(&spg_node->instat.alloc_hsize)); }
-static unsigned long get_process_prot_locked(int spg_id, struct mm_struct *mm) +static long get_spg_proc_k2u(struct sp_group_node *spg_node) { - unsigned long prot = 0; - struct sp_group_node *spg_node; - struct sp_group_master *master = mm->sp_group_master; - - list_for_each_entry(spg_node, &master->node_list, group_node) { - if (spg_node->spg->id == spg_id) { - prot = spg_node->prot; - break; - } - } - return prot; + return byte2kb(atomic64_read(&spg_node->instat.k2u_size)); }
static void print_process_prot(struct seq_file *seq, unsigned long prot) @@ -4148,9 +4054,8 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct mm_struct *mm = task->mm; struct sp_group_master *master; struct sp_proc_stat *proc_stat; - struct spg_proc_stat *spg_proc_stat; - int i; - unsigned long anon, file, shmem, total_rss, prot; + struct sp_group_node *spg_node; + unsigned long anon, file, shmem, total_rss; long sp_res, sp_res_nsize, non_sp_res, non_sp_shm;
if (!sp_is_enabled()) @@ -4159,12 +4064,13 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, if (!mm) return 0;
+ down_read(&mm->mmap_lock); master = mm->sp_group_master; if (!master) return 0;
get_mm_rss_info(mm, &anon, &file, &shmem, &total_rss); - proc_stat = master->stat; + proc_stat = &master->instat; get_process_sp_res(proc_stat, &sp_res, &sp_res_nsize); get_process_non_sp_res(total_rss, shmem, sp_res_nsize, &non_sp_res, &non_sp_shm); @@ -4182,24 +4088,18 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns,
seq_puts(m, "\n\nProcess in Each SP Group\n\n"); seq_printf(m, "%-8s %-9s %-9s %-9s %-4s\n", - "Group_ID", "SP_ALLOC", "SP_K2U", "SP_RES", "PROT"); + "Group_ID", "SP_ALLOC", "SP_K2U", "SP_RES", "PROT");
- /* to prevent ABBA deadlock, first hold sp_group_sem */ - down_read(&sp_group_sem); - mutex_lock(&proc_stat->lock); - hash_for_each(proc_stat->hash, i, spg_proc_stat, pnode) { - prot = get_process_prot_locked(spg_proc_stat->spg_id, mm); + list_for_each_entry(spg_node, &master->node_list, proc_node) { seq_printf(m, "%-8d %-9ld %-9ld %-9ld ", - spg_proc_stat->spg_id, - get_spg_proc_alloc(spg_proc_stat), - get_spg_proc_k2u(spg_proc_stat), - get_sp_res_by_spg_proc(spg_proc_stat)); - print_process_prot(m, prot); + spg_node->spg->id, + get_spg_proc_alloc(spg_node), + get_spg_proc_k2u(spg_node), + get_sp_res_by_spg_proc(spg_node)); + print_process_prot(m, spg_node->prot); seq_putc(m, '\n'); } - mutex_unlock(&proc_stat->lock); - up_read(&sp_group_sem); - + up_read(&mm->mmap_lock); return 0; }
@@ -4329,31 +4229,42 @@ void spa_overview_show(struct seq_file *seq) } }
-/* the caller must hold sp_group_sem */ -static int idr_spg_stat_cb(int id, void *p, void *data) +static int spg_info_show(int id, void *p, void *data) { - struct sp_spg_stat *s = p; + struct sp_group *spg = p; struct seq_file *seq = data;
- if (is_local_group(id) && atomic64_read(&s->size) == 0) + if (id >= SPG_ID_LOCAL_MIN && id <= SPG_ID_LOCAL_MAX) return 0;
if (seq != NULL) { - seq_printf(seq, "Group %6d ", id); + if (id == 0) + seq_puts(seq, "Non Group "); + else + seq_printf(seq, "Group %6d ", id); + + down_read(&spg->rw_lock); seq_printf(seq, "size: %lld KB, spa num: %d, total alloc: %lld KB, normal alloc: %lld KB, huge alloc: %lld KB\n", - byte2kb(atomic64_read(&s->size)), - atomic_read(&s->spa_num), - byte2kb(atomic64_read(&s->alloc_size)), - byte2kb(atomic64_read(&s->alloc_nsize)), - byte2kb(atomic64_read(&s->alloc_hsize))); + byte2kb(atomic64_read(&spg->instat.size)), + atomic_read(&spg->instat.spa_num), + byte2kb(atomic64_read(&spg->instat.alloc_size)), + byte2kb(atomic64_read(&spg->instat.alloc_nsize)), + byte2kb(atomic64_read(&spg->instat.alloc_hsize))); + up_read(&spg->rw_lock); } else { - pr_info("Group %6d ", id); + if (id == 0) + pr_info("Non Group "); + else + pr_info("Group %6d ", id); + + down_read(&spg->rw_lock); pr_info("size: %lld KB, spa num: %d, total alloc: %lld KB, normal alloc: %lld KB, huge alloc: %lld KB\n", - byte2kb(atomic64_read(&s->size)), - atomic_read(&s->spa_num), - byte2kb(atomic64_read(&s->alloc_size)), - byte2kb(atomic64_read(&s->alloc_nsize)), - byte2kb(atomic64_read(&s->alloc_hsize))); + byte2kb(atomic64_read(&spg->instat.size)), + atomic_read(&spg->instat.spa_num), + byte2kb(atomic64_read(&spg->instat.alloc_size)), + byte2kb(atomic64_read(&spg->instat.alloc_nsize)), + byte2kb(atomic64_read(&spg->instat.alloc_hsize))); + up_read(&spg->rw_lock); }
return 0; @@ -4366,17 +4277,17 @@ void spg_overview_show(struct seq_file *seq)
if (seq != NULL) { seq_printf(seq, "Share pool total size: %lld KB, spa total num: %d.\n", - byte2kb(atomic64_read(&sp_overall_stat.spa_total_size)), - atomic_read(&sp_overall_stat.spa_total_num)); + byte2kb(atomic64_read(&sp_overall_stat.spa_total_size)), + atomic_read(&sp_overall_stat.spa_total_num)); } else { pr_info("Share pool total size: %lld KB, spa total num: %d.\n", - byte2kb(atomic64_read(&sp_overall_stat.spa_total_size)), - atomic_read(&sp_overall_stat.spa_total_num)); + byte2kb(atomic64_read(&sp_overall_stat.spa_total_size)), + atomic_read(&sp_overall_stat.spa_total_num)); }
- down_read(&sp_spg_stat_sem); - idr_for_each(&sp_spg_stat_idr, idr_spg_stat_cb, seq); - up_read(&sp_spg_stat_sem); + down_read(&sp_group_sem); + idr_for_each(&sp_group_idr, spg_info_show, seq); + up_read(&sp_group_sem);
if (seq != NULL) seq_puts(seq, "\n"); @@ -4390,118 +4301,109 @@ static int spa_stat_show(struct seq_file *seq, void *offset) spa_overview_show(seq); /* print the file header */ seq_printf(seq, "%-10s %-16s %-16s %-10s %-7s %-5s %-8s %-8s\n", - "Group ID", "va_start", "va_end", "Size(KB)", "Type", "Huge", "PID", "Ref"); + "Group ID", "va_start", "va_end", "Size(KB)", "Type", "Huge", "PID", "Ref"); spa_normal_stat_show(seq); spa_dvpp_stat_show(seq); return 0; }
-static int idr_proc_stat_cb(int id, void *p, void *data) +static int proc_usage_by_group(int id, void *p, void *data) { - struct sp_spg_stat *spg_stat = p; + struct sp_group *spg = p; struct seq_file *seq = data; - int i, tgid; - struct sp_proc_stat *proc_stat; - struct spg_proc_stat *spg_proc_stat; - + struct sp_group_node *spg_node; struct mm_struct *mm; - unsigned long anon, file, shmem, total_rss, prot; - /* - * non_sp_res: resident memory size excluding share pool memory - * sp_res: resident memory size of share pool, including normal - * page and hugepage memory - * non_sp_shm: resident shared memory size excluding share pool - * memory - */ + struct sp_group_master *master; + int tgid; + unsigned long anon, file, shmem, total_rss; long sp_res, sp_res_nsize, non_sp_res, non_sp_shm;
- /* to prevent ABBA deadlock, first hold sp_group_sem */ - mutex_lock(&spg_stat->lock); - hash_for_each(spg_stat->hash, i, spg_proc_stat, gnode) { - proc_stat = spg_proc_stat->proc_stat; - tgid = proc_stat->tgid; - mm = proc_stat->mm; + down_read(&spg->rw_lock); + list_for_each_entry(spg_node, &spg->procs, proc_node) { + + master = spg_node->master; + if (!master) { + pr_info("master is NULL! process %d, group %d\n", + spg_node->instat.tgid, id); + continue; + } + mm = master->mm; + tgid = master->instat.tgid;
get_mm_rss_info(mm, &anon, &file, &shmem, &total_rss); - get_process_sp_res(proc_stat, &sp_res, &sp_res_nsize); + get_process_sp_res(&master->instat, &sp_res, &sp_res_nsize); get_process_non_sp_res(total_rss, shmem, sp_res_nsize, - &non_sp_res, &non_sp_shm); - prot = get_process_prot_locked(id, mm); + &non_sp_res, &non_sp_shm);
seq_printf(seq, "%-8d ", tgid); - seq_printf(seq, "%-8d ", id); + if (id == 0) + seq_printf(seq, "%-8c ", '-'); + else + seq_printf(seq, "%-8d ", id); seq_printf(seq, "%-9ld %-9ld %-9ld %-10ld %-10ld %-8ld %-7ld %-7ld %-10ld ", - get_spg_proc_alloc(spg_proc_stat), - get_spg_proc_k2u(spg_proc_stat), - get_sp_res_by_spg_proc(spg_proc_stat), - sp_res, non_sp_res, - page2kb(mm->total_vm), page2kb(total_rss), - page2kb(shmem), non_sp_shm); - print_process_prot(seq, prot); + get_spg_proc_alloc(spg_node), + get_spg_proc_k2u(spg_node), + get_sp_res_by_spg_proc(spg_node), + sp_res, non_sp_res, + page2kb(mm->total_vm), page2kb(total_rss), + page2kb(shmem), non_sp_shm); + print_process_prot(seq, spg_node->prot); seq_putc(seq, '\n'); } - mutex_unlock(&spg_stat->lock); + up_read(&spg->rw_lock); + return 0; }
-static int proc_stat_show(struct seq_file *seq, void *offset) +static int proc_group_usage_show(struct seq_file *seq, void *offset) { spg_overview_show(seq); spa_overview_show(seq); + /* print the file header */ seq_printf(seq, "%-8s %-8s %-9s %-9s %-9s %-10s %-10s %-8s %-7s %-7s %-10s %-4s\n", - "PID", "Group_ID", "SP_ALLOC", "SP_K2U", "SP_RES", "SP_RES_T", - "Non-SP_RES", "VIRT", "RES", "Shm", "Non-SP_Shm", "PROT"); + "PID", "Group_ID", "SP_ALLOC", "SP_K2U", "SP_RES", "SP_RES_T", + "Non-SP_RES", "VIRT", "RES", "Shm", "Non-SP_Shm", "PROT"); /* print kthread buff_module_guard_work */ seq_printf(seq, "%-8s %-8s %-9lld %-9lld\n", - "guard", "-", - byte2kb(atomic64_read(&kthread_stat.alloc_size)), - byte2kb(atomic64_read(&kthread_stat.k2u_size))); + "guard", "-", + byte2kb(atomic64_read(&kthread_stat.alloc_size)), + byte2kb(atomic64_read(&kthread_stat.k2u_size)));
- /* - * This ugly code is just for fixing the ABBA deadlock against - * sp_group_add_task. - */ down_read(&sp_group_sem); - down_read(&sp_spg_stat_sem); - idr_for_each(&sp_spg_stat_idr, idr_proc_stat_cb, seq); - up_read(&sp_spg_stat_sem); + idr_for_each(&sp_group_idr, proc_usage_by_group, seq); up_read(&sp_group_sem);
return 0; }
-static int idr_proc_overview_cb(int id, void *p, void *data) +static int proc_usage_show(struct seq_file *seq, void *offset) { - struct sp_proc_stat *proc_stat = p; - struct seq_file *seq = data; - struct mm_struct *mm = proc_stat->mm; + struct sp_group_master *master = NULL; unsigned long anon, file, shmem, total_rss; long sp_res, sp_res_nsize, non_sp_res, non_sp_shm; + struct sp_proc_stat *proc_stat;
- get_mm_rss_info(mm, &anon, &file, &shmem, &total_rss); - get_process_sp_res(proc_stat, &sp_res, &sp_res_nsize); - get_process_non_sp_res(total_rss, shmem, sp_res_nsize, - &non_sp_res, &non_sp_shm); - - seq_printf(seq, "%-8d %-16s %-9ld %-9ld %-9ld %-10ld %-10ld %-8ld\n", - id, proc_stat->comm, - get_proc_alloc(proc_stat), - get_proc_k2u(proc_stat), - sp_res, non_sp_res, non_sp_shm, - page2kb(mm->total_vm)); - return 0; -} - -static int proc_overview_show(struct seq_file *seq, void *offset) -{ seq_printf(seq, "%-8s %-16s %-9s %-9s %-9s %-10s %-10s %-8s\n", - "PID", "COMM", "SP_ALLOC", "SP_K2U", "SP_RES", "Non-SP_RES", - "Non-SP_Shm", "VIRT"); + "PID", "COMM", "SP_ALLOC", "SP_K2U", "SP_RES", "Non-SP_RES", + "Non-SP_Shm", "VIRT"); + + mutex_lock(&master_list_lock); + list_for_each_entry(master, &master_list, list_node) { + proc_stat = &master->instat; + get_mm_rss_info(master->mm, &anon, &file, &shmem, &total_rss); + get_process_sp_res(&master->instat, &sp_res, &sp_res_nsize); + get_process_non_sp_res(total_rss, shmem, sp_res_nsize, + &non_sp_res, &non_sp_shm); + seq_printf(seq, "%-8d %-16s %-9ld %-9ld %-9ld %-10ld %-10ld %-8ld\n", + proc_stat->tgid, proc_stat->comm, + get_proc_alloc(proc_stat), + get_proc_k2u(proc_stat), + sp_res, non_sp_res, non_sp_shm, + page2kb(master->mm->total_vm)); + } + mutex_unlock(&master_list_lock);
- down_read(&sp_proc_stat_sem); - idr_for_each(&sp_proc_stat_idr, idr_proc_overview_cb, seq); - up_read(&sp_proc_stat_sem); return 0; }
@@ -4510,9 +4412,9 @@ static void __init proc_sharepool_init(void) if (!proc_mkdir("sharepool", NULL)) return;
- proc_create_single_data("sharepool/proc_stat", 0400, NULL, proc_stat_show, NULL); proc_create_single_data("sharepool/spa_stat", 0400, NULL, spa_stat_show, NULL); - proc_create_single_data("sharepool/proc_overview", 0400, NULL, proc_overview_show, NULL); + proc_create_single_data("sharepool/proc_stat", 0400, NULL, proc_group_usage_show, NULL); + proc_create_single_data("sharepool/proc_overview", 0400, NULL, proc_usage_show, NULL); }
/*** End of tatistical and maintenance functions ***/ @@ -4736,18 +4638,15 @@ void sp_group_post_exit(struct mm_struct *mm) * A process not in an sp group doesn't need to print because there * wont't be any memory which is not freed. */ - stat = sp_get_proc_stat(mm); + stat = &master->instat; if (stat) { - alloc_size = atomic64_read(&stat->alloc_size); + alloc_size = atomic64_read(&stat->alloc_nsize) + atomic64_read(&stat->alloc_hsize); k2u_size = atomic64_read(&stat->k2u_size);
if (alloc_size != 0 || k2u_size != 0) pr_info("process %s(%d) exits. It applied %ld aligned KB, k2u shared %ld aligned KB\n", stat->comm, stat->tgid, byte2kb(alloc_size), byte2kb(k2u_size)); - - /* match with sp_init_proc_stat, we expect stat is released after this call */ - sp_proc_stat_drop(stat); }
down_write(&sp_group_sem); @@ -4760,6 +4659,10 @@ void sp_group_post_exit(struct mm_struct *mm) } up_write(&sp_group_sem);
+ mutex_lock(&master_list_lock); + list_del(&master->list_node); + mutex_unlock(&master_list_lock); + kfree(master); }
From: Guo Mengqi guomengqi3@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5J0Z9 CVE: NA
--------------------------------
When there is only one mm in a group allocating memory and the process is killed, the error path in sp_alloc_mmap_populate() tries to access the next spg_node->master->mm in the group's proc list. However, in this case the next spg_node in the proc list is the list head, so spg_node->master is NULL, which leads to the log below:
[file:test_sp_alloc.c, func:alloc_large_repeat, line:437] start to alloc... [ 264.699086][ T1772] share pool: gonna sp_alloc_unmap... [ 264.699939][ T1772] share pool: list_next_entry(spg_node, proc_node) is ffff0004c4907028 [ 264.700380][ T1772] share pool: master is 0 [ 264.701240][ T1772] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000018 ... [ 264.704764][ T1772] Internal error: Oops: 96000006 [#1] SMP [ 264.705166][ T1772] Modules linked in: sharepool_dev(OE) [ 264.705823][ T1772] CPU: 3 PID: 1772 Comm: test_sp_alloc Tainted: G OE 5.10.0+ #23 ... [ 264.712513][ T1772] Call trace: [ 264.713057][ T1772] sp_alloc+0x528/0xa88 [ 264.713740][ T1772] dev_ioctl+0x6ec/0x1d00 [sharepool_dev] [ 264.714035][ T1772] __arm64_sys_ioctl+0xb0/0xe8 ... [ 264.716891][ T1772] ---[ end trace 1587677032f666c6 ]--- [ 264.717457][ T1772] Kernel panic - not syncing: Oops: Fatal exception [ 264.717961][ T1772] SMP: stopping secondary CPUs [ 264.718787][ T1772] Kernel Offset: disabled [ 264.719718][ T1772] Memory Limit: none [ 264.720333][ T1772] ---[ end Kernel panic - not syncing: Oops: Fatal exception ]---
Add a list_is_last() check to avoid this NULL pointer access.
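The shape of the guard, shown on a generic list as a hedged sketch (the demo_node type and helper are invented for illustration, not taken from the patch): the "entry" after the last real node is the container of the list head itself, so its fields must not be dereferenced; check list_is_last() before list_next_entry().

#include <linux/list.h>

struct demo_node {
	struct list_head link;
	int value;
};

/* Illustrative only: peek at the next entry's value, or return -1 if the
 * cursor is already the last real node on the list. */
static int demo_peek_next(struct list_head *head, struct demo_node *cur)
{
	if (list_is_last(&cur->link, head))
		return -1;			/* next "entry" would be the list head */
	return list_next_entry(cur, link)->value;
}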
Signed-off-by: Guo Mengqi guomengqi3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/share_pool.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 862595d82c91..ab77d0d7648c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2631,6 +2631,7 @@ static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa, if (ret) sp_add_work_compact(); } + return ret; }
@@ -2651,14 +2652,8 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, int ret;
ret = sp_alloc_mmap(mm, spa, spg_node, ac); - if (ret < 0) { - if (ac->need_fallocate) { - /* e.g. second sp_mmap fail */ - sp_fallocate(spa); - ac->need_fallocate = false; - } + if (ret < 0) return ret; - }
if (!ac->have_mbind) { ret = sp_mbind(mm, spa->va_start, spa->real_size, spa->node_id); @@ -2673,18 +2668,13 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, ret = sp_alloc_populate(mm, spa, ac); if (ret) { err: - sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node); - if (unlikely(fatal_signal_pending(current))) pr_warn_ratelimited("allocation failed, current thread is killed\n"); else pr_warn_ratelimited("allocation failed due to mm populate failed(potential no enough memory when -12): %d\n", - ret); - sp_fallocate(spa); /* need this, otherwise memleak */ - sp_alloc_fallback(spa, ac); + ret); } else ac->need_fallocate = true; - return ret; }
@@ -2693,7 +2683,7 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, { int ret = -EINVAL; int mmap_ret = 0; - struct mm_struct *mm; + struct mm_struct *mm, *end_mm = NULL; struct sp_group_node *spg_node;
/* create mapping for each process in the group */ @@ -2702,7 +2692,7 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac); if (mmap_ret) { if (ac->state != ALLOC_COREDUMP) - return mmap_ret; + goto unmap; ac->state = ALLOC_NORMAL; continue; } @@ -2710,6 +2700,25 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, }
return ret; + +unmap: + /* use the next mm in proc list as end mark */ + if (!list_is_last(&spg_node->proc_node, &spa->spg->procs)) + end_mm = list_next_entry(spg_node, proc_node)->master->mm; + sp_alloc_unmap(end_mm, spa, spg_node); + + /* only fallocate spa if physical memory had been allocated */ + if (ac->need_fallocate) { + sp_fallocate(spa); + ac->need_fallocate = false; + } + + /* if hugepage allocation fails, this will transfer to normal page + * and try again. (only if SP_HUGEPAGE_ONLY is not flagged + */ + sp_alloc_fallback(spa, ac); + + return mmap_ret; }
/* spa maybe an error pointer, so introduce variable spg */
From: Mike Rapoport rppt@linux.ibm.com
mainline inclusion from mainline-v5.15-rc1 commit c3ab6baf6a004eab7344a1d8880a971f2414e1b6 category: bugfix bugzilla: 186414, https://gitee.com/openeuler/kernel/issues/I5K7IY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Patch series "mm: ensure consistency of memory map poisoning".
Currently, memory map allocation for the FLATMEM case does not poison the struct pages, regardless of the CONFIG_PAGE_POISON setting.
This happens because allocation of the memory map for FLATMEM and SPARSEMEM uses different memblock functions, and those used for the SPARSEMEM case (namely memblock_alloc_try_nid_raw() and memblock_alloc_exact_nid_raw()) implicitly poison the allocated memory.
Another side effect of this implicit poisoning is that early setup code that uses the same functions to allocate memory burns cycles for the memory poisoning even if it was not intended.
These patches introduce a memmap_alloc() wrapper that ensures the memory map allocation is consistent across different memory models.
This patch (of 4):
Currently memory map for the holes is initialized only when SPARSEMEM memory model is used. Yet, even with FLATMEM there could be holes in the physical memory layout that have memory map entries.
For instance, the memory reserved using e820 API on i386 or "reserved-memory" nodes in device tree would not appear in memblock.memory and hence the struct pages for such holes will be skipped during memory map initialization.
These struct pages will be zeroed because the memory map for FLATMEM systems is allocated with memblock_alloc_node() that clears the allocated memory. While zeroed struct pages do not cause immediate problems, the correct behaviour is to initialize every page using __init_single_page(). Besides, enabling page poison for FLATMEM case will trigger PF_POISONED_CHECK() unless the memory map is properly initialized.
Make sure init_unavailable_range() is called for both SPARSEMEM and FLATMEM so that struct pages representing memory holes would appear as PG_Reserved with any memory layout.
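For reference, the behaviour being made unconditional is roughly the following (a simplified paraphrase of init_unavailable_range(), not code added by this patch; the real function also skips pfns whose pageblock has no valid memmap and counts the initialized pages):

/* Simplified paraphrase: give every struct page in a hole a proper,
 * reserved initialization instead of leaving it zeroed. */
static void __init demo_init_unavailable_range(unsigned long spfn, unsigned long epfn,
					       int zone, int node)
{
	unsigned long pfn;

	for (pfn = spfn; pfn < epfn; pfn++) {
		__init_single_page(pfn_to_page(pfn), pfn, zone, node);
		__SetPageReserved(pfn_to_page(pfn));	/* holes show up as PG_reserved */
	}
}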
[rppt@kernel.org: fix microblaze] Link: https://lkml.kernel.org/r/YQWW3RCE4eWBuMu/@kernel.org
Link: https://lkml.kernel.org/r/20210714123739.16493-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210714123739.16493-2-rppt@kernel.org Signed-off-by: Mike Rapoport rppt@linux.ibm.com Acked-by: David Hildenbrand david@redhat.com Tested-by: Guenter Roeck linux@roeck-us.net Cc: Michal Simek monstr@monstr.eu Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: arch/microblaze/include/asm/page.h mm/page_alloc.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/microblaze/include/asm/page.h | 2 +- mm/page_alloc.c | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-)
diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index b13463d39b38..8cd320a9ddb4 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -162,7 +162,7 @@ extern int page_is_ram(unsigned long pfn); # define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) # else /* CONFIG_MMU */ # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) < (max_mapnr + ARCH_PFN_OFFSET)) +# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* CONFIG_MMU */
# endif /* __ASSEMBLY__ */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf9c69d631f3..d6085d48cb63 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6451,7 +6451,6 @@ static void __meminit zone_init_free_lists(struct zone *zone) } }
-#if !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during @@ -6496,13 +6495,6 @@ static void __init init_unavailable_range(unsigned long spfn, pr_info("On node %d, zone %s: %lld pages in unavailable ranges", node, zone_names[zone], pgcnt); } -#else -static inline void init_unavailable_range(unsigned long spfn, - unsigned long epfn, - int zone, int node) -{ -} -#endif
static void __init memmap_init_zone_range(struct zone *zone, unsigned long start_pfn,
From: Mike Rapoport rppt@linux.ibm.com
mainline inclusion from mainline-v5.15-rc1 commit 22e7878102f94a50e9a4c2c19f909a9a0898c4ce category: bugfix bugzilla: 186414, https://gitee.com/openeuler/kernel/issues/I5K7IY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The microblaze implementation of pte_alloc_one_kernel() used memblock_alloc_try_nid_raw() along with clear_page() to allocate a zeroed page during early setup.
Replace calls to these functions with a call to memblock_alloc_try_nid(), which already returns a zeroed page and respects the same allocation limits as memblock_alloc_try_nid_raw().
While at it, drop the early_get_page() wrapper that was only used in pte_alloc_one_kernel().
Link: https://lkml.kernel.org/r/20210714123739.16493-3-rppt@kernel.org Signed-off-by: Mike Rapoport rppt@linux.ibm.com Reviewed-by: David Hildenbrand david@redhat.com Cc: Michal Simek monstr@monstr.eu Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: arch/microblaze/mm/init.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/microblaze/include/asm/pgtable.h | 2 -- arch/microblaze/mm/init.c | 12 ------------ arch/microblaze/mm/pgtable.c | 17 ++++++++--------- 3 files changed, 8 insertions(+), 23 deletions(-)
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 3fa1df90925e..b193ee496fff 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -486,8 +486,6 @@ extern int mem_init_done;
asmlinkage void __init mmu_init(void);
-void __init *early_get_page(void); - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 45da639bd22c..4a0c30ced72b 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -318,18 +318,6 @@ asmlinkage void __init mmu_init(void) dma_contiguous_reserve(memory_start + lowmem_size - 1); }
-/* This is only called until mem_init is done. */ -void __init *early_get_page(void) -{ - /* - * Mem start + kernel_tlb -> here is limit - * because of mem mapping from head.S - */ - return memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE, - MEMBLOCK_LOW_LIMIT, memory_start + kernel_tlb, - NUMA_NO_NODE); -} - #endif /* CONFIG_MMU */
void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 38ccb909bc9d..c1833b159d3b 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -33,6 +33,7 @@ #include <linux/init.h> #include <linux/mm_types.h> #include <linux/pgtable.h> +#include <linux/memblock.h>
#include <asm/pgalloc.h> #include <linux/io.h> @@ -242,15 +243,13 @@ unsigned long iopa(unsigned long addr)
__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - pte_t *pte; - if (mem_init_done) { - pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - } else { - pte = (pte_t *)early_get_page(); - if (pte) - clear_page(pte); - } - return pte; + if (mem_init_done) + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + else + return memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, + MEMBLOCK_LOW_LIMIT, + memory_start + kernel_tlb, + NUMA_NO_NODE); }
void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
From: Mike Rapoport rppt@linux.ibm.com
mainline inclusion from mainline-v5.15-rc1 commit c803b3c8b3b70f306ee6300bf8acdd70ffd1441a category: bugfix bugzilla: 186414, https://gitee.com/openeuler/kernel/issues/I5K7IY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
There are several places that allocate memory for the memory map: alloc_node_mem_map() for FLATMEM, sparse_buffer_init() and __populate_section_memmap() for SPARSEMEM.
The memory allocated in the FLATMEM case is zeroed and it is never poisoned, regardless of CONFIG_PAGE_POISON setting.
The memory allocated in the SPARSEMEM cases is not zeroed and it is implicitly poisoned inside memblock if CONFIG_PAGE_POISON is set.
Introduce a memmap_alloc() wrapper for the memblock allocators that will be used for both the FLATMEM and SPARSEMEM cases and will make memory map zeroing and poisoning consistent across different memory models.
Link: https://lkml.kernel.org/r/20210714123739.16493-4-rppt@kernel.org Signed-off-by: Mike Rapoport rppt@linux.ibm.com Cc: Michal Simek monstr@monstr.eu Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: mm/page_alloc.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/internal.h | 4 ++++ mm/page_alloc.c | 24 ++++++++++++++++++++++-- mm/sparse.c | 6 ++---- 3 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h index 917b86b2870c..8ef8cdd929fa 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -208,6 +208,10 @@ extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone);
+extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + int nid, bool exact_nid); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d6085d48cb63..ff6fffec8770 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6564,6 +6564,26 @@ void __meminit __weak arch_memmap_init(unsigned long size, int nid, { }
+void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, int nid, bool exact_nid) +{ + void *ptr; + + if (exact_nid) + ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + else + ptr = memblock_alloc_try_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU @@ -7297,8 +7317,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_alloc_node(size, SMP_CACHE_BYTES, - pgdat->node_id); + map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, + pgdat->node_id, false); if (!map) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); diff --git a/mm/sparse.c b/mm/sparse.c index 5a48ea3e9968..d92a29000d66 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -453,8 +453,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map;
- map = memblock_alloc_try_nid_raw(size, size, addr, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + map = memmap_alloc(size, size, addr, nid, false); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", __func__, size, PAGE_SIZE, nid, &addr); @@ -481,8 +480,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) * and we want it to be properly aligned to the section size - this is * especially the case for VMEMMAP which maps memmap to PMDs */ - sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(), - addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; }
From: Mike Rapoport rppt@linux.ibm.com
mainline inclusion from mainline-v5.15-rc1 commit 08678804e0b305bbbf5b756ad365373e5fe885a2 category: bugfix bugzilla: 186414, https://gitee.com/openeuler/kernel/issues/I5K7IY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Functions memblock_alloc_exact_nid_raw() and memblock_alloc_try_nid_raw() are intended for early memory allocation without the overhead of zeroing the allocated memory. Since these functions were used to allocate the memory map, they ended up with the addition of a call to page_init_poison() that poisoned the allocated memory when CONFIG_PAGE_POISON was set.
Since the memory map is allocated using a dedicated memmap_alloc() function that takes care of the poisoning, remove page poisoning from the memblock_alloc_*_raw() functions.
Link: https://lkml.kernel.org/r/20210714123739.16493-5-rppt@kernel.org Signed-off-by: Mike Rapoport rppt@linux.ibm.com Cc: Michal Simek monstr@monstr.eu Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/memblock.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-)
diff --git a/mm/memblock.c b/mm/memblock.c index b93fa16292d0..53e92fc7ef6f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1501,18 +1501,12 @@ void * __init memblock_alloc_exact_nid_raw( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { - void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_);
- ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid, true); - if (ptr && size > 0) - page_init_poison(ptr, size); - - return ptr; + return memblock_alloc_internal(size, align, min_addr, max_addr, nid, + true); }
/** @@ -1539,18 +1533,12 @@ void * __init memblock_alloc_try_nid_raw( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { - void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_);
- ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid, false); - if (ptr && size > 0) - page_init_poison(ptr, size); - - return ptr; + return memblock_alloc_internal(size, align, min_addr, max_addr, nid, + false); }
/**
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK CVE: NA
--------------------------------
This reverts commit e56e8310a3ea2751463ab8ed03dd64baab3fee46. This feature will be reimplemented.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/drop_caches.c | 36 ++------------------------------ include/linux/fs.h | 9 -------- include/linux/page_cache_limit.h | 3 --- kernel/sysctl.c | 8 ------- mm/page_cache_limit.c | 2 -- mm/truncate.c | 34 +++--------------------------- 6 files changed, 5 insertions(+), 87 deletions(-)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c index ff70ef7674e3..f00fcc4a4f72 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -9,17 +9,12 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> - -#ifdef CONFIG_SHRINK_PAGECACHE -#include <linux/page_cache_limit.h> -#endif - #include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches;
-static void drop_pagecache_sb(struct super_block *sb, void *nid) +static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL;
@@ -40,12 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *nid) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock);
- if (!nid) - invalidate_mapping_pages(inode->i_mapping, 0, -1); - else - node_invalidate_mapping_pages(inode->i_mapping, - *(int *)nid, 0, -1); - + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode;
@@ -84,25 +74,3 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, } return 0; } - -#ifdef CONFIG_SHRINK_PAGECACHE -int proc_shrink_node_caches(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (ret || !write) - return ret; - - if (node_to_shrink >= MAX_NUMNODES) - return -EINVAL; - - if (!node_isset(node_to_shrink, node_states[N_MEMORY])) - return 0; - - iterate_supers(drop_pagecache_sb, &node_to_shrink); - - return 0; -} -#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index a7bc1eaa27ee..fa9d89379da1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2680,15 +2680,6 @@ extern bool is_bad_inode(struct inode *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end);
-#ifdef CONFIG_SHRINK_PAGECACHE -unsigned long node_invalidate_mapping_pages(struct address_space *mapping, - int nid, pgoff_t start, pgoff_t end); -#else -static inline unsigned long -node_invalidate_mapping_pages(struct address_space *mapping, int nid, - pgoff_t start, pgoff_t end) { return 0; } -#endif - void invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index 442d6126c529..2df08a0604d8 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -12,7 +12,6 @@ enum page_cache_reclaim_flag { extern int pagecache_reclaim_enable; extern int pagecache_limit_ratio; extern int pagecache_reclaim_ratio; -extern int node_to_shrink;
int proc_page_cache_limit(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -21,8 +20,6 @@ unsigned long __shrink_node_page_cache(int nid, gfp_t mask, void kpagecache_limitd_stop(int nid); int kpagecache_limitd_run(int nid); void wakeup_all_kpagecache_limitd(void); -int proc_shrink_node_caches(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); #else static inline void kpagecache_limitd_stop(int nid) {} static inline int kpagecache_limitd_run(int nid) { return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a0df602c9372..e26bda90e8b0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3240,14 +3240,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = (void *)&one_hundred, }, - { - .procname = "node_drop_caches", - .data = &node_to_shrink, - .maxlen = sizeof(node_to_shrink), - .mode = 0600, - .proc_handler = proc_shrink_node_caches, - .extra1 = SYSCTL_ZERO, - }, #endif #ifdef CONFIG_ASCEND_SHARE_POOL { diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 0ccc1388c8dc..0a3098c9bb33 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -5,13 +5,11 @@ #include <linux/module.h> #include <linux/err.h> #include <linux/swap.h> -#include <linux/fs.h> #include <linux/page_cache_limit.h>
int pagecache_reclaim_enable; int pagecache_limit_ratio; int pagecache_reclaim_ratio; -int node_to_shrink;
static unsigned long pagecache_limit_pages; static unsigned long node_pagecache_limit_pages[MAX_NUMNODES]; diff --git a/mm/truncate.c b/mm/truncate.c index 6d4887a43cd8..98d08f197766 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -465,7 +465,7 @@ void truncate_inode_pages_final(struct address_space *mapping) EXPORT_SYMBOL(truncate_inode_pages_final);
static unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec, int nid) + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; @@ -487,10 +487,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, page); continue; } - - if (nid != NUMA_NO_NODE && page_to_nid(page) != nid) - continue; - index += thp_nr_pages(page) - 1;
ret = invalidate_inode_page(page); @@ -533,34 +529,10 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { - return __invalidate_mapping_pages(mapping, start, end, NULL, NUMA_NO_NODE); + return __invalidate_mapping_pages(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages);
- -/** - * node_invalidate_mapping_pages - Invalidate all the unlocked pages in @nid of one inode - * @mapping: the address_space which holds the pages to invalidate - * @nid: pages belong to this node will be invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * node_invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - * - * Return: the number of the pages that were invalidated - */ -#ifdef CONFIG_SHRINK_PAGECACHE -unsigned long node_invalidate_mapping_pages(struct address_space *mapping, - int nid, pgoff_t start, pgoff_t end) -{ - return __invalidate_mapping_pages(mapping, start, end, NULL, nid); -} -#endif /** * This helper is similar with the above one, except that it accounts for pages * that are likely on a pagevec and count them in @nr_pagevec, which will used by @@ -569,7 +541,7 @@ unsigned long node_invalidate_mapping_pages(struct address_space *mapping, void invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { - __invalidate_mapping_pages(mapping, start, end, nr_pagevec, NUMA_NO_NODE); + __invalidate_mapping_pages(mapping, start, end, nr_pagevec); }
/*
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK CVE: NA
--------------------------------
This reverts commit 9fea105d74885fef299b43e6734d19ba8921242e. This feature will be reimplemented.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 2 -- include/linux/pagemap.h | 2 -- mm/filemap.c | 2 -- 3 files changed, 6 deletions(-)
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index 2df08a0604d8..7906b12af947 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -19,11 +19,9 @@ unsigned long __shrink_node_page_cache(int nid, gfp_t mask, unsigned long nr_to_reclaim, enum page_cache_reclaim_flag flag); void kpagecache_limitd_stop(int nid); int kpagecache_limitd_run(int nid); -void wakeup_all_kpagecache_limitd(void); #else static inline void kpagecache_limitd_stop(int nid) {} static inline int kpagecache_limitd_run(int nid) { return 0; } -static inline void wakeup_all_kpagecache_limitd(void) {} #endif
#endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index dbb25f1dc2e9..0bfa9cce6589 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -15,7 +15,6 @@ #include <linux/bitops.h> #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> -#include <linux/page_cache_limit.h>
struct pagevec;
@@ -778,7 +777,6 @@ static inline int add_to_page_cache(struct page *page, { int error;
- wakeup_all_kpagecache_limitd(); __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) diff --git a/mm/filemap.c b/mm/filemap.c index a00fc493f5cf..4f9cd18f9197 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,7 +42,6 @@ #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> -#include <linux/page_cache_limit.h> #include "internal.h"
#define CREATE_TRACE_POINTS @@ -924,7 +923,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, void *shadow = NULL; int ret;
- wakeup_all_kpagecache_limitd(); __SetPageLocked(page); ret = __add_to_page_cache_locked(page, mapping, offset, gfp_mask, &shadow);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK CVE: NA
--------------------------------
This reverts commit 955d63aec936df0ffbb53118ab28b4c208ac8abf. This feature will be reimplemented.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 4 --- mm/memory_hotplug.c | 3 --- mm/page_cache_limit.c | 45 +++++++------------------------- 3 files changed, 9 insertions(+), 43 deletions(-)
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index 7906b12af947..e4ef5919cb92 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -17,11 +17,7 @@ int proc_page_cache_limit(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); unsigned long __shrink_node_page_cache(int nid, gfp_t mask, unsigned long nr_to_reclaim, enum page_cache_reclaim_flag flag); -void kpagecache_limitd_stop(int nid); -int kpagecache_limitd_run(int nid); #else -static inline void kpagecache_limitd_stop(int nid) {} -static inline int kpagecache_limitd_run(int nid) { return 0; } #endif
#endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 73ea92dae74a..7456d825414d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -38,7 +38,6 @@ #include <linux/rmap.h>
#include <asm/tlbflush.h> -#include <linux/page_cache_limit.h>
#include "internal.h" #include "shuffle.h" @@ -736,7 +735,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid); kcompactd_run(nid); - kpagecache_limitd_run(nid);
writeback_set_ratelimit();
@@ -1487,7 +1485,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) if (arg.status_change_nid >= 0) { kswapd_stop(node); kcompactd_stop(node); - kpagecache_limitd_stop(node); }
writeback_set_ratelimit(); diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 0a3098c9bb33..1581334429e1 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -31,27 +31,18 @@ static unsigned long get_node_total_pages(int nid) return managed_pages; }
-static void setup_node_pagecache_limit(int nid) -{ - unsigned long node_total_pages; - - node_total_pages = get_node_total_pages(nid); - node_pagecache_limit_pages[nid] = node_total_pages * pagecache_limit_ratio / 100; -} - -#define ALL_NODE (-1) -static void setup_pagecache_limit(int nid) +static void setup_pagecache_limit(void) { int i; + unsigned long node_total_pages;
pagecache_limit_pages = pagecache_limit_ratio * totalram_pages() / 100;
- if (nid != ALL_NODE) - setup_node_pagecache_limit(nid); - - else - for (i = 0; i < MAX_NUMNODES; i++) - setup_node_pagecache_limit(i); + for (i = 0; i < MAX_NUMNODES; i++) { + node_total_pages = get_node_total_pages(i); + node_pagecache_limit_pages[i] = node_total_pages * + pagecache_limit_ratio / 100; + } }
int proc_page_cache_limit(struct ctl_table *table, int write, @@ -62,7 +53,7 @@ int proc_page_cache_limit(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (write && !ret) - setup_pagecache_limit(ALL_NODE); + setup_pagecache_limit();
return ret; } @@ -81,8 +72,6 @@ void kpagecache_limitd_stop(int nid) kvfree(pagecache_limitd_wait_queue[nid]); pagecache_limitd_wait_queue[nid] = NULL; } - - setup_pagecache_limit(nid); }
static void wakeup_kpagecache_limitd(int nid) @@ -218,7 +207,7 @@ static int pagecache_limitd(void *arg) return 0; }
-static int __kpagecache_limitd_run(int nid) +int kpagecache_limitd_run(int nid) { int ret = 0; wait_queue_head_t *queue_head = NULL; @@ -247,22 +236,6 @@ static int __kpagecache_limitd_run(int nid) return ret; }
-int kpagecache_limitd_run(int nid) -{ - int ret; - - if (nid < 0 || nid >= MAX_NUMNODES) - return -EINVAL; - - ret = __kpagecache_limitd_run(nid); - if (ret) - return ret; - - setup_pagecache_limit(nid); - - return 0; -} - static int __init kpagecache_limitd_init(void) { int nid;
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK CVE: NA
--------------------------------
This reverts commit 7be2f4c4fdd7938f161ff6e43a0c60d9c9412a62. This feature will be reimplemented.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 9 ------ mm/page_cache_limit.c | 27 +++------------- mm/vmscan.c | 53 ++------------------------------ 3 files changed, 6 insertions(+), 83 deletions(-)
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index e4ef5919cb92..98f12734114b 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -2,21 +2,12 @@ #define _PAGECACHE_H
#ifdef CONFIG_SHRINK_PAGECACHE -enum page_cache_reclaim_flag { - PAGE_CACHE_RECLAIM_NO_UNMAP, - PAGE_CACHE_RECLAIM_UNMAP, - PAGE_CACHE_RECLAIM_WRITEPAGE, - PAGE_CACHE_RECLAIM_NR_FLAGS, -}; - extern int pagecache_reclaim_enable; extern int pagecache_limit_ratio; extern int pagecache_reclaim_ratio;
int proc_page_cache_limit(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -unsigned long __shrink_node_page_cache(int nid, gfp_t mask, - unsigned long nr_to_reclaim, enum page_cache_reclaim_flag flag); #else #endif
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 1581334429e1..33164e19cfa2 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -4,8 +4,6 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/err.h> -#include <linux/swap.h> -#include <linux/page_cache_limit.h>
int pagecache_reclaim_enable; int pagecache_limit_ratio; @@ -144,31 +142,14 @@ static unsigned long node_nr_page_reclaim(int nid) return nr_to_reclaim; }
-static void shrink_node_page_cache(int nid, gfp_t mask) +static void shrink_node_page_cache(int nid) { - int i; unsigned long nr_to_reclaim; - unsigned long nr_reclaimed; - enum page_cache_reclaim_flag flag;
nr_to_reclaim = node_nr_page_reclaim(nid); - if (nr_to_reclaim <= 0) - return; - - flag = 0; - for (i = PAGE_CACHE_RECLAIM_NO_UNMAP; - i < PAGE_CACHE_RECLAIM_NR_FLAGS; i++) { - nr_reclaimed = __shrink_node_page_cache(nid, mask, nr_to_reclaim, flag); - nr_to_reclaim -= nr_reclaimed; - - if (nr_to_reclaim <= 0) - break; - - flag |= i; - } }
-static void shrink_page_cache(gfp_t mask) +static void shrink_page_cache(void) { int nid;
@@ -176,7 +157,7 @@ static void shrink_page_cache(gfp_t mask) return;
for_each_node_state(nid, N_MEMORY) - shrink_node_page_cache(nid, mask); + shrink_node_page_cache(nid); }
static DECLARE_COMPLETION(setup_done); @@ -192,7 +173,7 @@ static int pagecache_limitd(void *arg) set_freezable(); for (;;) { try_to_freeze(); - shrink_page_cache(GFP_KERNEL | __GFP_HIGHMEM); + shrink_page_cache();
prepare_to_wait(pagecache_limitd_wait_queue[nid], &wait, TASK_INTERRUPTIBLE); diff --git a/mm/vmscan.c b/mm/vmscan.c index c851e5f91842..7aea8c2cf0a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -64,10 +64,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h>
-#ifdef CONFIG_SHRINK_PAGECACHE -#include <linux/page_cache_limit.h> -#endif - struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; @@ -128,9 +124,6 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1;
- /* can't shrink slab pages */ - unsigned int no_shrink_slab:1; - /* Allocation order */ s8 order;
@@ -2880,9 +2873,8 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
shrink_lruvec(lruvec, sc);
- if (!sc->no_shrink_slab) - shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, - sc->priority); + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority);
/* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -4600,44 +4592,3 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) return page; } EXPORT_SYMBOL_GPL(get_page_from_vaddr); - -#ifdef CONFIG_SHRINK_PAGECACHE -/* - * return the number of reclaimed pages - */ -unsigned long __shrink_node_page_cache(int nid, gfp_t mask, unsigned long nr_to_reclaim, - enum page_cache_reclaim_flag reclaim_flag) -{ - struct scan_control sc = { - .nr_to_reclaim = nr_to_reclaim, - .gfp_mask = mask, - .may_swap = 0, - .may_unmap = reclaim_flag | PAGE_CACHE_RECLAIM_UNMAP, - .may_writepage = reclaim_flag | PAGE_CACHE_RECLAIM_WRITEPAGE, - .target_mem_cgroup = NULL, - .priority = DEF_PRIORITY, - .reclaim_idx = MAX_NR_ZONES, - .no_shrink_slab = 1, - }; - - struct zonelist *zonelist = node_zonelist(nid, __GFP_THISNODE); - struct reclaim_state *old_rs = current->reclaim_state; - unsigned long nr_reclaimed; - unsigned int noreclaim_flag; - - if (!(mask & __GFP_RECLAIM)) - return 0; - - noreclaim_flag = memalloc_noreclaim_save(); - fs_reclaim_acquire(sc.gfp_mask); - current->reclaim_state = NULL; - - nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - - current->reclaim_state = old_rs; - fs_reclaim_release(sc.gfp_mask); - memalloc_noreclaim_restore(noreclaim_flag); - - return nr_reclaimed; -} -#endif
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK CVE: NA
--------------------------------
This reverts commit 425bce986237e0409673c28774bbe546b433dee7. This feature will be reimplemented.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_cache_limit.c | 52 +------------------------------------------ 1 file changed, 1 insertion(+), 51 deletions(-)
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 33164e19cfa2..4afc08373a35 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -104,60 +104,10 @@ void wakeup_all_kpagecache_limitd(void) wakeup_kpagecache_limitd(nid); }
-static unsigned long node_nr_page_cache(int nid) -{ - struct pglist_data *pgdat; - unsigned long num = 0; - - pgdat = NODE_DATA(nid); - if (!pgdat) - return 0; - - num = node_page_state(pgdat, NR_FILE_PAGES); - num -= node_page_state(pgdat, NR_SHMEM); - - return num; -} - -static unsigned long node_nr_page_reclaim(int nid) -{ - unsigned long nr_page_cache; - unsigned long nr_to_reclaim; - unsigned long total_pages; - - if (!node_pagecache_limit_pages[nid]) - return 0; - - nr_page_cache = node_nr_page_cache(nid); - if (!nr_page_cache) - return 0; - - if (nr_page_cache < node_pagecache_limit_pages[nid]) - return 0; - - total_pages = get_node_total_pages(nid); - nr_to_reclaim = nr_page_cache - node_pagecache_limit_pages[nid]; - nr_to_reclaim += total_pages * pagecache_reclaim_ratio / 100; - - return nr_to_reclaim; -} - -static void shrink_node_page_cache(int nid) -{ - unsigned long nr_to_reclaim; - - nr_to_reclaim = node_nr_page_reclaim(nid); -} - static void shrink_page_cache(void) { - int nid; - - if (!pagecache_reclaim_enable || !pagecache_overlimit()) + if (!pagecache_overlimit()) return; - - for_each_node_state(nid, N_MEMORY) - shrink_node_page_cache(nid); }
static DECLARE_COMPLETION(setup_done);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK CVE: NA
--------------------------------
This reverts commit b072a9d4198f820a00979e0c38ee0cd85a55b779. This feature will be reimplemented.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_cache_limit.c | 133 ------------------------------------------ 1 file changed, 133 deletions(-)
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 4afc08373a35..55fdea087804 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -1,9 +1,5 @@ #include <linux/mm.h> #include <linux/sysctl.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/module.h> -#include <linux/err.h>
int pagecache_reclaim_enable; int pagecache_limit_ratio; @@ -11,8 +7,6 @@ int pagecache_reclaim_ratio;
static unsigned long pagecache_limit_pages; static unsigned long node_pagecache_limit_pages[MAX_NUMNODES]; -static wait_queue_head_t *pagecache_limitd_wait_queue[MAX_NUMNODES]; -static struct task_struct *pagecache_limitd_tasks[MAX_NUMNODES];
static unsigned long get_node_total_pages(int nid) { @@ -55,130 +49,3 @@ int proc_page_cache_limit(struct ctl_table *table, int write,
return ret; } - -void kpagecache_limitd_stop(int nid) -{ - if (nid < 0 || nid >= MAX_NUMNODES) - return; - - if (pagecache_limitd_tasks[nid]) { - kthread_stop(pagecache_limitd_tasks[nid]); - pagecache_limitd_tasks[nid] = NULL; - } - - if (pagecache_limitd_wait_queue[nid]) { - kvfree(pagecache_limitd_wait_queue[nid]); - pagecache_limitd_wait_queue[nid] = NULL; - } -} - -static void wakeup_kpagecache_limitd(int nid) -{ - if (!pagecache_limitd_wait_queue[nid]) - return; - - if (!waitqueue_active(pagecache_limitd_wait_queue[nid])) - return; - - wake_up_interruptible(pagecache_limitd_wait_queue[nid]); -} - -static bool pagecache_overlimit(void) -{ - unsigned long total_pagecache; - - total_pagecache = global_node_page_state(NR_FILE_PAGES); - total_pagecache -= global_node_page_state(NR_SHMEM); - - return total_pagecache > pagecache_limit_pages; -} - -void wakeup_all_kpagecache_limitd(void) -{ - int nid; - - if (!pagecache_reclaim_enable || !pagecache_overlimit()) - return; - - for_each_node_state(nid, N_MEMORY) - wakeup_kpagecache_limitd(nid); -} - -static void shrink_page_cache(void) -{ - if (!pagecache_overlimit()) - return; -} - -static DECLARE_COMPLETION(setup_done); -static int pagecache_limitd(void *arg) -{ - DEFINE_WAIT(wait); - int nid = *(int *)arg; - - if (nid < 0 || nid >= MAX_NUMNODES) - nid = numa_node_id(); - - complete(&setup_done); - set_freezable(); - for (;;) { - try_to_freeze(); - shrink_page_cache(); - - prepare_to_wait(pagecache_limitd_wait_queue[nid], &wait, - TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; - schedule(); - finish_wait(pagecache_limitd_wait_queue[nid], &wait); - } - - finish_wait(pagecache_limitd_wait_queue[nid], &wait); - - return 0; -} - -int kpagecache_limitd_run(int nid) -{ - int ret = 0; - wait_queue_head_t *queue_head = NULL; - - if (pagecache_limitd_tasks[nid] && pagecache_limitd_wait_queue[nid]) - return 0; - - queue_head = kvmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); - if (!queue_head) - return -ENOMEM; - - init_waitqueue_head(queue_head); - pagecache_limitd_wait_queue[nid] = queue_head; - pagecache_limitd_tasks[nid] = kthread_run(pagecache_limitd, - (void *)&nid, "kpagecache_limitd%d", nid); - - if (IS_ERR(pagecache_limitd_tasks[nid])) { - BUG_ON(system_state < SYSTEM_RUNNING); - ret = PTR_ERR(pagecache_limitd_tasks[nid]); - pr_err("Failed to start pagecache_limitd on node %d\n", nid); - pagecache_limitd_tasks[nid] = NULL; - kvfree(queue_head); - } else - wait_for_completion(&setup_done); - - return ret; -} - -static int __init kpagecache_limitd_init(void) -{ - int nid; - int ret; - - for_each_node_state(nid, N_MEMORY) { - ret = kpagecache_limitd_run(nid); - if (ret == -ENOMEM) - break; - } - - return 0; -} - -module_init(kpagecache_limitd_init);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK CVE: NA
--------------------------------
This reverts commit 933db18abca2a5d0a2eaa2fc40a85c2d88cf896a. This feature will be reimplemented.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 14 --------- kernel/sysctl.c | 32 -------------------- mm/Kconfig | 12 -------- mm/Makefile | 1 - mm/page_cache_limit.c | 51 -------------------------------- 5 files changed, 110 deletions(-) delete mode 100644 include/linux/page_cache_limit.h delete mode 100644 mm/page_cache_limit.c
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h deleted file mode 100644 index 98f12734114b..000000000000 --- a/include/linux/page_cache_limit.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _PAGECACHE_H -#define _PAGECACHE_H - -#ifdef CONFIG_SHRINK_PAGECACHE -extern int pagecache_reclaim_enable; -extern int pagecache_limit_ratio; -extern int pagecache_reclaim_ratio; - -int proc_page_cache_limit(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#else -#endif - -#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e26bda90e8b0..c1eebbcd0a1c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -104,9 +104,6 @@ #ifdef CONFIG_LOCKUP_DETECTOR #include <linux/nmi.h> #endif -#ifdef CONFIG_SHRINK_PAGECACHE -#include <linux/page_cache_limit.h> -#endif
#if defined(CONFIG_SYSCTL)
@@ -3212,35 +3209,6 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_ONE, }, #endif -#ifdef CONFIG_SHRINK_PAGECACHE - { - .procname = "cache_reclaim_enable", - .data = &pagecache_reclaim_enable, - .maxlen = sizeof(pagecache_reclaim_enable), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "cache_limit_ratio", - .data = &pagecache_limit_ratio, - .maxlen = sizeof(pagecache_limit_ratio), - .mode = 0600, - .proc_handler = proc_page_cache_limit, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&one_hundred, - }, - { - .procname = "cache_reclaim_ratio", - .data = &pagecache_reclaim_ratio, - .maxlen = sizeof(pagecache_reclaim_ratio), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&one_hundred, - }, -#endif #ifdef CONFIG_ASCEND_SHARE_POOL { .procname = "sharepool_debug_mode", diff --git a/mm/Kconfig b/mm/Kconfig index 4475bd9f8762..9e66dfb15c52 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -500,18 +500,6 @@ config FRONTSWAP
If unsure, say Y to enable frontswap.
-config SHRINK_PAGECACHE - bool "Enable shrinking the page cache" - depends on MMU - default n - help - SHRINK_PAGECACHE means that we do not want to keep the large number - of page cache in the system, even though page cache can greatly improve - the performance of the machine. Large number of page cache may result - in short of memory, which will result OOM at the same time, so in order - to keep page cache in a reasonable range, the number of page cache - should be limited, and that is what SHRINK_PAGECACHE does. - config MEMCG_QOS bool "Enable Memory Cgroup Priority" depends on MEMCG diff --git a/mm/Makefile b/mm/Makefile index e83233177c7a..7465668c4b02 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -126,7 +126,6 @@ obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_PIN_MEMORY) += pin_mem.o -obj-$(CONFIG_SHRINK_PAGECACHE) += page_cache_limit.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c deleted file mode 100644 index 55fdea087804..000000000000 --- a/mm/page_cache_limit.c +++ /dev/null @@ -1,51 +0,0 @@ -#include <linux/mm.h> -#include <linux/sysctl.h> - -int pagecache_reclaim_enable; -int pagecache_limit_ratio; -int pagecache_reclaim_ratio; - -static unsigned long pagecache_limit_pages; -static unsigned long node_pagecache_limit_pages[MAX_NUMNODES]; - -static unsigned long get_node_total_pages(int nid) -{ - int zone_type; - unsigned long managed_pages = 0; - pg_data_t *pgdat = NODE_DATA(nid); - - if (!pgdat) - return 0; - - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - - return managed_pages; -} - -static void setup_pagecache_limit(void) -{ - int i; - unsigned long node_total_pages; - - pagecache_limit_pages = pagecache_limit_ratio * totalram_pages() / 100; - - for (i = 0; i < MAX_NUMNODES; i++) { - node_total_pages = get_node_total_pages(i); - node_pagecache_limit_pages[i] = node_total_pages * - pagecache_limit_ratio / 100; - } -} - -int proc_page_cache_limit(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - if (write && !ret) - setup_pagecache_limit(); - - return ret; -}
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK CVE: NA
--------------------------------
Add periodical memory reclaim support. Three new interfaces are introduced:
1) /proc/sys/vm/cache_reclaim_s --- used to set reclaim interval
2) /proc/sys/vm/cache_reclaim_weight --- used to calculate reclaim amount
3) /proc/sys/vm/cache_reclaim_enable --- used to switch on/off this feature
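As a worked example (assuming SWAP_CLUSTER_MAX is 32, as defined in include/linux/swap.h): on a node with 4 CPUs and cache_reclaim_weight left at its default of 1, each periodic pass requests 1 * 32 * 4 = 128 pages, i.e. 512 KiB with 4 KiB pages, and the pass runs every cache_reclaim_s seconds while cache_reclaim_enable is set.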
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/sysctl/vm.rst | 32 +++++ include/linux/page_cache_limit.h | 7 ++ mm/Kconfig | 13 +++ mm/Makefile | 1 + mm/page_cache_limit.c | 148 ++++++++++++++++++++++++ mm/vmscan.c | 37 ++++++ 6 files changed, 238 insertions(+) create mode 100644 include/linux/page_cache_limit.h create mode 100644 mm/page_cache_limit.c
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5de629b932ae..02092b8de1e9 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -74,6 +74,9 @@ Currently, these files are in /proc/sys/vm: - watermark_boost_factor - watermark_scale_factor - zone_reclaim_mode +- cache_reclaim_s +- cache_reclaim_weight +- cache_reclaim_enable
admin_reserve_kbytes @@ -1026,3 +1029,32 @@ of other processes running on other nodes will not be affected. Allowing regular swap effectively restricts allocations to the local node unless explicitly overridden by memory policies or cpuset configurations. + +cache_reclaim_s +=============== + +Cache_reclaim_s is used to set reclaim interval in periodical memory +reclaim. when periodical memory reclaim is enabled, it will relcaim +memory in every cache_reclaim_s second. + + +cache_reclaim_weight +==================== + +This is reclaim factor in every periodical reclaim. when periodical +memory reclaim is enabled, the reclaim amount in every reclaim can +calculate from: + reclaim_amount = cache_reclaim_weigh * SWAP_CLUSTER_MAX * nr_cpus_node(nid) + +SWAP_CLUSTER_MAX is defined in include/linux/swap.h. +nr_cpus_node is used to obtain the number of CPUs on node nid. + +Memory reclaim use workqueue mechanism, it will block the execution of +subsequent work, if memory reclaim tasks a lot of time, time sensitive +work may be affected. + + +cache_reclaim_enable +==================== + +This is used to switch on/off periodical memory reclaim feature. diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h new file mode 100644 index 000000000000..dcfc54f88acc --- /dev/null +++ b/include/linux/page_cache_limit.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PAGE_CACHE_LIMIT_H +#define _LINUX_PAGE_CACHE_LIMIT_H +#ifdef CONFIG_PAGE_CACHE_LIMIT +extern unsigned long page_cache_shrink_memory(unsigned long nr_to_reclaim); +#endif /* CONFIG_PAGE_CACHE_LIMIT */ +#endif /* _LINUX_PAGE_CACHE_LIMIT_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 9e66dfb15c52..27c0b9de6357 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -536,6 +536,19 @@ config USERSWAP Support for User Swap. This is based on userfaultfd. We can implement our own swapout and swapin functions in usersapce.
+config PAGE_CACHE_LIMIT + bool "Support page cache limit" + depends on MMU && SYSCTL + default n + help + Keeping a number of page cache can improve the performance of system, + but if there is a lot fo page cache in system, that will result in + short of memory, subsequent memory reclamation operations may lead + to performance degradation, so add periodical memory relciam to + avoid too many page cache. + + if unsure, say N to disable the PAGE_CACHE_LIMIT. + config CMA bool "Contiguous Memory Allocator" depends on MMU diff --git a/mm/Makefile b/mm/Makefile index 7465668c4b02..9798d8735cc7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -129,3 +129,4 @@ obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o +obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c new file mode 100644 index 000000000000..51b298c854b4 --- /dev/null +++ b/mm/page_cache_limit.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for periodic memory reclaim and page cache limit + */ + +#include <linux/mm.h> +#include <linux/page_cache_limit.h> +#include <linux/swap.h> +#include <linux/sysctl.h> +#include <linux/workqueue.h> + +static int vm_cache_reclaim_s __read_mostly; +static int vm_cache_reclaim_s_max = 43200; +static int vm_cache_reclaim_weight __read_mostly = 1; +static int vm_cache_reclaim_weight_max = 100; +static int vm_cache_reclaim_enable = 1; + +static void shrink_shepherd(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(shepherd, shrink_shepherd); +static struct work_struct vmscan_works[MAX_NUMNODES]; + +static bool should_periodical_reclaim(void) +{ + return vm_cache_reclaim_s && vm_cache_reclaim_enable; +} + +static unsigned long node_reclaim_num(void) +{ + int nid = numa_node_id(); + + return SWAP_CLUSTER_MAX * nr_cpus_node(nid) * vm_cache_reclaim_weight; +} + +static int cache_reclaim_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + return ret; + + if (should_periodical_reclaim()) + schedule_delayed_work(&shepherd, round_jiffies_relative( + (unsigned long)vm_cache_reclaim_s * HZ)); + + return 0; +} + +static int cache_reclaim_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + return ret; + + if (should_periodical_reclaim()) + mod_delayed_work(system_unbound_wq, &shepherd, + round_jiffies_relative( + (unsigned long)vm_cache_reclaim_s * HZ)); + + return ret; +} + +static struct ctl_table ctl_table[] = { + { + .procname = "cache_reclaim_s", + .data = &vm_cache_reclaim_s, + .maxlen = sizeof(vm_cache_reclaim_s), + .mode = 0644, + .proc_handler = cache_reclaim_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &vm_cache_reclaim_s_max, + }, + { + .procname = "cache_reclaim_weight", + .data = &vm_cache_reclaim_weight, + .maxlen = sizeof(vm_cache_reclaim_weight), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &vm_cache_reclaim_weight_max, + }, + { + .procname = "cache_reclaim_enable", + .data = &vm_cache_reclaim_enable, + .maxlen = sizeof(vm_cache_reclaim_enable), + .mode = 0644, + .proc_handler = cache_reclaim_enable_handler, + 
.extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static struct ctl_table limit_dir_table[] = { + { + .procname = "vm", + .maxlen = 0, + .mode = 0555, + .child = ctl_table, + }, + {} +}; + +static void shrink_shepherd(struct work_struct *w) +{ + int node; + + if (!should_periodical_reclaim()) + return; + + for_each_online_node(node) { + if (!work_pending(&vmscan_works[node])) + queue_work_node(node, system_unbound_wq, &vmscan_works[node]); + } + + queue_delayed_work(system_unbound_wq, &shepherd, + round_jiffies_relative((unsigned long)vm_cache_reclaim_s * HZ)); +} + +static void shrink_page_work(struct work_struct *w) +{ + page_cache_shrink_memory(node_reclaim_num()); +} + +static void shrink_shepherd_timer(void) +{ + int i; + + for (i = 0; i < MAX_NUMNODES; i++) + INIT_WORK(&vmscan_works[i], shrink_page_work); +} + +static int __init shrink_page_init(void) +{ + if (!register_sysctl_table(limit_dir_table)) { + pr_err("register page cache limit sysctl failed."); + return -ENOMEM; + } + + shrink_shepherd_timer(); + + return 0; +} +late_initcall(shrink_page_init) diff --git a/mm/vmscan.c b/mm/vmscan.c index 7aea8c2cf0a8..3ddd6ae8a164 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -59,6 +59,7 @@ #include <linux/swapops.h> #include <linux/balloon_compaction.h>
+#include <linux/page_cache_limit.h> #include "internal.h"
#define CREATE_TRACE_POINTS @@ -4592,3 +4593,39 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) return page; } EXPORT_SYMBOL_GPL(get_page_from_vaddr); + +#ifdef CONFIG_PAGE_CACHE_LIMIT +unsigned long page_cache_shrink_memory(unsigned long nr_to_reclaim) +{ + unsigned long nr_reclaimed; + unsigned int noreclaim_flag; + int nid = numa_node_id(); + struct scan_control sc = { + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .reclaim_idx = ZONE_MOVABLE, + .may_writepage = !laptop_mode, + .nr_to_reclaim = nr_to_reclaim / 2, + .may_unmap = 1, + .may_swap = 1, + .priority = DEF_PRIORITY, + }; + + struct zonelist *zonelist = node_zonelist(nid, sc.gfp_mask); + struct scan_control orig_sc = sc; + + fs_reclaim_acquire(sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); + set_task_reclaim_state(current, &sc.reclaim_state); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + sc = orig_sc; + sc.reclaim_idx--; + nr_reclaimed += do_try_to_free_pages(zonelist, &sc); + + set_task_reclaim_state(current, NULL); + memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); + + return nr_reclaimed; +} +#endif
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4HOXK CVE: NA
--------------------------------
Add /proc/sys/vm/cache_limit_mbytes to set the page cache limit. This interface sets the upper limit of the page cache; if page cache usage exceeds cache_limit_mbytes, memory reclaim is triggered. The reclaim size and reclaim interval are controlled by /proc/sys/vm/cache_reclaim_s and /proc/sys/vm/cache_reclaim_weight, which were introduced in the previous patch.
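For example (illustrative numbers only): with 4 KiB pages, writing 1024 to /proc/sys/vm/cache_limit_mbytes corresponds to a limit of 1024 * (1024 * 1024 / 4096) = 262144 file pages. Once NR_ACTIVE_FILE + NR_INACTIVE_FILE exceeds that value, the periodic worker reclaims page cache on each cache_reclaim_s tick, and the sysctl write path itself retries reclaim a bounded number of times before returning.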
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/sysctl/vm.rst | 8 +++ include/linux/page_cache_limit.h | 3 +- mm/page_cache_limit.c | 71 ++++++++++++++++++++++++- mm/vmscan.c | 5 +- 4 files changed, 83 insertions(+), 4 deletions(-)
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 02092b8de1e9..eb227015a895 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -77,6 +77,7 @@ Currently, these files are in /proc/sys/vm: - cache_reclaim_s - cache_reclaim_weight - cache_reclaim_enable +- cache_limit_mbytes
admin_reserve_kbytes @@ -1058,3 +1059,10 @@ cache_reclaim_enable ====================
This is used to switch on/off periodical memory reclaim feature. + + +cache_limit_mbytes +================== + +This is used to set the upper limit of page cache in megabytes. +Page cache will be reclaimed periodically if page cache is over limit. diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index dcfc54f88acc..64a6e7017045 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -2,6 +2,7 @@ #ifndef _LINUX_PAGE_CACHE_LIMIT_H #define _LINUX_PAGE_CACHE_LIMIT_H #ifdef CONFIG_PAGE_CACHE_LIMIT -extern unsigned long page_cache_shrink_memory(unsigned long nr_to_reclaim); +extern unsigned long page_cache_shrink_memory(unsigned long nr_to_reclaim, + bool may_swap); #endif /* CONFIG_PAGE_CACHE_LIMIT */ #endif /* _LINUX_PAGE_CACHE_LIMIT_H */ diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 51b298c854b4..05eb441f2bf2 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -8,12 +8,14 @@ #include <linux/swap.h> #include <linux/sysctl.h> #include <linux/workqueue.h> +#include "internal.h"
static int vm_cache_reclaim_s __read_mostly; static int vm_cache_reclaim_s_max = 43200; static int vm_cache_reclaim_weight __read_mostly = 1; static int vm_cache_reclaim_weight_max = 100; static int vm_cache_reclaim_enable = 1; +static unsigned long vm_cache_limit_mbytes __read_mostly;
static void shrink_shepherd(struct work_struct *w); static DECLARE_DEFERRABLE_WORK(shepherd, shrink_shepherd); @@ -31,6 +33,31 @@ static unsigned long node_reclaim_num(void) return SWAP_CLUSTER_MAX * nr_cpus_node(nid) * vm_cache_reclaim_weight; }
+static bool page_cache_over_limit(void) +{ + unsigned long lru_file; + unsigned long limit; + + limit = vm_cache_limit_mbytes * ((1024 * 1024UL) / PAGE_SIZE); + lru_file = global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE); + if (lru_file > limit) + return true; + + return false; +} + +static bool should_reclaim_page_cache(void) +{ + if (!should_periodical_reclaim()) + return false; + + if (!vm_cache_limit_mbytes) + return false; + + return true; +} + static int cache_reclaim_enable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -64,6 +91,37 @@ static int cache_reclaim_sysctl_handler(struct ctl_table *table, int write, return ret; }
+static int cache_limit_mbytes_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret; + unsigned long vm_cache_limit_mbytes_max; + unsigned long origin_mbytes = vm_cache_limit_mbytes; + int nr_retries = MAX_RECLAIM_RETRIES; + + vm_cache_limit_mbytes_max = totalram_pages() >> (20 - PAGE_SHIFT); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + return ret; + + if (vm_cache_limit_mbytes > vm_cache_limit_mbytes_max) { + vm_cache_limit_mbytes = origin_mbytes; + return -EINVAL; + } + + if (write) { + while (should_reclaim_page_cache() && page_cache_over_limit() && + nr_retries--) { + if (signal_pending(current)) + return -EINTR; + + page_cache_shrink_memory(node_reclaim_num(), false); + } + } + + return 0; +} + static struct ctl_table ctl_table[] = { { .procname = "cache_reclaim_s", @@ -92,6 +150,13 @@ static struct ctl_table ctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "cache_limit_mbytes", + .data = &vm_cache_limit_mbytes, + .maxlen = sizeof(vm_cache_limit_mbytes), + .mode = 0644, + .proc_handler = cache_limit_mbytes_sysctl_handler, + }, {} };
@@ -123,7 +188,11 @@ static void shrink_shepherd(struct work_struct *w)
static void shrink_page_work(struct work_struct *w) { - page_cache_shrink_memory(node_reclaim_num()); + if (should_reclaim_page_cache()) { + if (page_cache_over_limit()) + page_cache_shrink_memory(node_reclaim_num(), false); + } else if (should_periodical_reclaim()) + page_cache_shrink_memory(node_reclaim_num(), true); }
static void shrink_shepherd_timer(void) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ddd6ae8a164..d96f52b2fbe0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4595,7 +4595,8 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) EXPORT_SYMBOL_GPL(get_page_from_vaddr);
#ifdef CONFIG_PAGE_CACHE_LIMIT -unsigned long page_cache_shrink_memory(unsigned long nr_to_reclaim) +unsigned long page_cache_shrink_memory(unsigned long nr_to_reclaim, + bool may_swap) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -4606,7 +4607,7 @@ unsigned long page_cache_shrink_memory(unsigned long nr_to_reclaim) .may_writepage = !laptop_mode, .nr_to_reclaim = nr_to_reclaim / 2, .may_unmap = 1, - .may_swap = 1, + .may_swap = may_swap, .priority = DEF_PRIORITY, };
From: Binfeng Wu wubinfeng@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5JSWJ CVE: NA
-------------------------------------------------
To support using the HCCS bus in Ascend series accelerators, the SMMU ATOS feature (a software-accessible Address Translation Operations facility) is enabled for a special SMMU, known as the Agent SMMU, in the Ascend accelerator.
In the VM scenario, the hypervisor creates the Stage 1 page table for the Agent SMMU. The Agent SMMU provides an interface for components in the accelerator to translate addresses from IPA to PA, which allows the components to perform DMA on the HCCS bus using PAs.
The original SMMU ATOS feature supports translation of only a single group of addresses at a time. Ascend Agent SMMUs use the IMPLEMENTATION DEFINED region to translate up to 32 groups of addresses at the same time, which greatly improves efficiency.
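A rough usage sketch follows (not part of the patch). The exact layout of struct agent_smmu_atos_data is defined in include/linux/ascend_smmu.h and is not shown in full here, so the field types and the meaning of rnw below are inferred from the driver and should be treated as assumptions; error handling is abbreviated:

        #include <linux/types.h>
        #include <linux/printk.h>
        #include <linux/ascend_smmu.h>

        /* Translate a batch of IPAs for one stream; the driver accepts HTTUI=1 only. */
        static int example_batch_translate(u64 die_id, u32 sid, dma_addr_t *ipa,
                                           u64 *pa, int nr)
        {
                struct agent_smmu_atos_data data = {
                        .device_id = die_id,   /* DIE id of the target Agent SMMU */
                        .sid       = sid,
                        .ssid      = 0,        /* no substream in this example */
                        .httui     = 1,        /* mandatory, see agent_smmu_iova_to_phys() */
                        .rnw       = 1,        /* assumed: read access */
                        .iova      = ipa,      /* input: intermediate physical addresses */
                        .pa        = pa,       /* output: physical addresses */
                        .nr        = nr,
                };
                int succeed = 0;
                int ret;

                ret = agent_smmu_iova_to_phys(&data, &succeed);
                if (ret)
                        pr_err("ATOS translated %d of %d addresses, error %d\n",
                               succeed, nr, ret);
                return ret;
        }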
Reviewed-by: Yingtai Xie xieyingtai@huawei.com Reviewed-by: Xiaoyang Xu xuxiaoyang2@huawei.com Signed-off-by: Binfeng Wu wubinfeng@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/Kconfig | 9 + drivers/iommu/arm/arm-smmu-v3/Makefile | 1 + drivers/iommu/arm/arm-smmu-v3/ascend_smmu.c | 434 ++++++++++++++++++++ include/linux/ascend_smmu.h | 52 +++ 4 files changed, 496 insertions(+) create mode 100644 drivers/iommu/arm/arm-smmu-v3/ascend_smmu.c create mode 100644 include/linux/ascend_smmu.h
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index d97e38bfbe4b..044df15aa0c9 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -327,6 +327,15 @@ config ARM_SMMU_V3_SVA Say Y here if your system supports SVA extensions such as PCIe PASID and PRI.
+config AGENT_SMMU_ATOS + bool "An implementation of ATOS feature support for the ARM SMMUv3" + depends on ARM_SMMU_V3 + help + Support for ARM SMMUv3 ATOS feature which can translating IPA to PA. + + Say Y here if your system will be used in Ascend Advanced Accelerator + with HCCS bus. Or want use the ATOS of SMMU. + config S390_IOMMU def_bool y if S390 && PCI depends on S390 && PCI diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index 54feb1ecccad..1338466d4d0d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o arm_smmu_v3-objs-y += arm-smmu-v3.o arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o arm_smmu_v3-objs := $(arm_smmu_v3-objs-y) +obj-$(CONFIG_AGENT_SMMU_ATOS) += ascend_smmu.o diff --git a/drivers/iommu/arm/arm-smmu-v3/ascend_smmu.c b/drivers/iommu/arm/arm-smmu-v3/ascend_smmu.c new file mode 100644 index 000000000000..adced972ebef --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/ascend_smmu.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Huawei Ascend accelerator common code for SMMUv3 ATOS feature implementations. + * + * Copyright (C) 2020-2021 Huawei Technologies Co., Ltd + * + * Author: Binfeng Wu wubinfeng@huawei.com + * + * This driver is intended to provide an interface for translating IPA to PA + * based on the SMMUv3 ATOS feature. + * + */ + +#include <linux/bitfield.h> +#include <linux/iopoll.h> +#include <linux/platform_device.h> +#include <linux/module.h> +#include <linux/acpi.h> +#include <linux/ascend_smmu.h> + +#define AGENT_SMMU_IDR1 0x4 +#define IDR1_SSIDSIZE GENMASK(10, 6) +#define IDR1_SIDSIZE GENMASK(5, 0) + +#define AGENT_SMMU_CR0 0x20 +#define CR0_SMMUEN (1 << 0) + +#define AGENT_SMMU_ATOS_CTRL 0x100 + +#define ENHANCED_ATOS_UNIT_ADDR 0x1700 /* first unit */ +#define ENHANCED_ATOS_UNIT_SIZE 0x18 + +#define ENHANCED_ATOS_SID 0x0 +#define ENHANCED_ATOS_STREAMID_MASK GENMASK_ULL(31, 0) +#define ENHANCED_ATOS_SUBSTREAMID_MASK GENMASK_ULL(51, 32) +#define ENHANCED_ATOS_SSID_VALID_MASK GENMASK_ULL(52, 52) + +#define ENHANCED_ATOS_ADDR 0x8 +#define ENHANCED_ATOS_ADDR_ADDR_MASK GENMASK_ULL(63, 12) +#define ENHANCED_ATOS_ADDR_TYPE_MASK GENMASK_ULL(11, 10) +#define ENHANCED_ATOS_ADDR_TYPE_S1 0x01 +#define ENHANCED_ATOS_ADDR_PnU_MASK GENMASK_ULL(9, 9) +#define ENHANCED_ATOS_ADDR_RnW_MASK GENMASK_ULL(8, 8) +#define ENHANCED_ATOS_ADDR_InD_MASK GENMASK_ULL(7, 7) +#define ENHANCED_ATOS_ADDR_HTTUI_MASK GENMASK_ULL(6, 6) + +#define ENHANCED_ATOS_PAR 0x10 +#define ENHANCED_ATOS_PAR_FAULT (1 << 0) +#define ENHANCED_ATOS_PAR_SIZE (1 << 11) +#define ENHANCED_ATOS_PAR_ADDR_MASK GENMASK_ULL(51, 12) +#define ENHANCED_ATOS_PAR_FAULTCODE GENMASK_ULL(11, 4) +#define ENHANCED_ATOS_PAR_REASON GENMASK_ULL(2, 1) + +#define AGENT_SMMU_POLL_US 5 +#define AGENT_SMMU_TIMEOUT_US 250 +#define MAX_REGISTERS 32 + +static LIST_HEAD(agent_smmu_list); +static DEFINE_SPINLOCK(agent_smmu_lock); + +struct agent_smmu { + struct device *dev; + void __iomem *base; + unsigned int max_sid; + unsigned int max_ssid; + rwlock_t rw_lock; + DECLARE_BITMAP(regs, MAX_REGISTERS); + + struct list_head list; + u64 device_id; /* DIE id */ +}; + +struct agent_smmu *agent_smmu_unlocked_find(u64 device_id) +{ + struct agent_smmu *temp = NULL; + + list_for_each_entry(temp, &agent_smmu_list, list) { + if (temp->device_id == device_id) { + return temp; + } + } + return NULL; +} + +static int agent_smmu_register(struct agent_smmu *agent) +{ + 
struct device *dev = agent->dev; + + spin_lock(&agent_smmu_lock); + if (agent_smmu_unlocked_find(agent->device_id)) { + dev_err(dev, "already added for %lld.\n", agent->device_id); + spin_unlock(&agent_smmu_lock); + return -EFAULT; + } + list_add_tail(&agent->list, &agent_smmu_list); + spin_unlock(&agent_smmu_lock); + + return 0; +} + +static void agent_smmu_unregister(struct agent_smmu *agent) +{ + spin_lock(&agent_smmu_lock); + list_del(&agent->list); + spin_unlock(&agent_smmu_lock); +} + +static int agent_smmu_platform_probe(struct platform_device *pdev) +{ + struct agent_smmu *agent = NULL; + struct device *dev = &pdev->dev; + struct resource *res = NULL; + u32 reg = 0; + int ret = 0; + acpi_status status = AE_OK; + + agent = devm_kzalloc(dev, sizeof(*agent), GFP_KERNEL); + if (!agent) { + dev_err(dev, "failed to allocate agent smmu.\n"); + return -ENOMEM; + } + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res || resource_size(res) + 1 < ENHANCED_ATOS_UNIT_ADDR + + ENHANCED_ATOS_UNIT_SIZE * MAX_REGISTERS) { + dev_err(dev, "MMIO region is null or too small, check it.\n"); + ret = -EINVAL; + goto err_free; + } + + // agent smmu may probe as smmu in device, so keep using ioreamp + agent->base = ioremap(res->start, resource_size(res)); + if (!agent->base) { + dev_err(dev, "unable to map agent smmu.\n"); + ret = -ENOMEM; + goto err_free; + } + + /* check agent smmu is enabled */ + reg = readl_relaxed(agent->base + AGENT_SMMU_CR0); + if (!(reg & CR0_SMMUEN)) { + dev_err(dev, "agent smmu is not enabled, check it.\n"); + ret = -EPERM; + goto err_iounmap; + } + + status = acpi_evaluate_integer(ACPI_HANDLE(&pdev->dev), METHOD_NAME__UID, + NULL, &agent->device_id); + if (ACPI_FAILURE(status) || agent_smmu_register(agent)) { + dev_err(dev, "agent smmu UID 0x%x has been probed.\n", status); + ret = -EINVAL; + goto err_iounmap; + } + + reg = readl_relaxed(agent->base + AGENT_SMMU_IDR1); + agent->max_sid = (1U << FIELD_GET(IDR1_SIDSIZE, reg)) - 1; + agent->max_ssid = (1U << FIELD_GET(IDR1_SSIDSIZE, reg)) - 1; + bitmap_zero(agent->regs, MAX_REGISTERS); + rwlock_init(&agent->rw_lock); + agent->dev = dev; + platform_set_drvdata(pdev, agent); + + dev_info(dev, "agent smmu 0x%llx probed successfully.\n", agent->device_id); + return ret; +err_iounmap: + iounmap(agent->base); + agent->base = NULL; +err_free: + devm_kfree(dev, agent); + return ret; +} + +static int agent_smmu_platform_remove(struct platform_device *pdev) +{ + struct agent_smmu *agent = platform_get_drvdata(pdev); + + agent_smmu_unregister(agent); + iounmap(agent->base); + agent->dev = NULL; + agent->base = NULL; + dev_info(&pdev->dev, "agent smmu removed successfully.\n"); + return 0; +} + +static void set_registers_unlocked(struct agent_smmu *agent, unsigned long *avl_regs, + unsigned long *loc_regs, int nr) +{ + int idx = 0; + + while (nr > 0) { + idx = find_next_bit(avl_regs, MAX_REGISTERS, idx); + set_bit(idx, loc_regs); + set_bit(idx, agent->regs); + nr--; + idx++; + } +} + +/** + * registers_acquire - take up available registers(some reg may keep unavailable + * state) from agent smmu according to the number of 'need', mark them in + * 'loc_regs' and return the number of registers in procession + * + * @agent: agent smmu + * @loc_regs: bitmap recored user's available registers + * @need: the number of task still need to be processed + */ +static int registers_acquire(struct agent_smmu *agent, unsigned long *loc_regs, + int need) +{ + int rest = 0; + u32 avl_regs_state = 0; + DECLARE_BITMAP(avl_regs, MAX_REGISTERS); + 
+static void write_enhanced_atos(struct agent_smmu *agent, int regs_idx, u64 sid,
+				u64 addr, dma_addr_t iova)
+{
+	void __iomem *unit_base;
+
+	unit_base = agent->base + ENHANCED_ATOS_UNIT_ADDR +
+		    ENHANCED_ATOS_UNIT_SIZE * regs_idx;
+	addr |= iova & ENHANCED_ATOS_ADDR_ADDR_MASK;
+
+	writeq_relaxed(addr, unit_base + ENHANCED_ATOS_ADDR);
+	writeq_relaxed(sid, unit_base + ENHANCED_ATOS_SID);
+}
+
+static int get_section_mask(u64 par, u64 *section_mask)
+{
+	int i = 0;
+
+	// using default page size 4KB according to spec
+	*section_mask = ~((1ULL << 12) - 1);
+
+	// e.g. PAR[Size] is 1 && PAR[14:12] is 0 && PAR[15] is 1, then lowest
+	// bit is 15, so section size is 2^(12+3+1) = 64KB
+	if (par & ENHANCED_ATOS_PAR_SIZE) {
+		par = FIELD_GET(ENHANCED_ATOS_PAR_ADDR_MASK, par);
+		if (!par) {
+			pr_err("agent smmu: error in agent smmu PAR[11]\n");
+			return -EFAULT;
+		}
+
+		par = (par ^ (par - 1)) >> 1;
+		for (i = 0; par; i++) {
+			par >>= 1;
+		}
+		*section_mask = ~((1ULL << (12 + i + 1)) - 1);
+	}
+	return 0;
+}
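To make the size decoding above concrete, a worked example with illustrative values (assuming ENHANCED_ATOS_PAR_ADDR_MASK covers the address bits starting at bit 12, as the driver's 4KB default implies):

	/*
	 * Suppose PAR[Size] = 1, PAR[14:12] = 0b000 and PAR[15] = 1.
	 * FIELD_GET(ENHANCED_ATOS_PAR_ADDR_MASK, par) drops bits [11:0],
	 * so the lowest set bit of the extracted field is bit 3.
	 * (par ^ (par - 1)) >> 1 then leaves exactly 3 bits set, the loop
	 * counts them, and i ends up as 3.
	 * *section_mask = ~((1ULL << (12 + 3 + 1)) - 1) = ~0xffff, i.e. the
	 * translated block is 2^16 bytes = 64KB, and the low 16 bits of the
	 * IOVA are carried over unchanged when the PA is assembled in
	 * read_enhanced_atos() below.
	 */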
+static int read_enhanced_atos(struct agent_smmu *agent, int regs_idx, int idx,
+			      u32 state, struct agent_smmu_atos_data *data)
+{
+	void __iomem *unit_base = NULL;
+	u64 par = 0;
+	int ret = 0;
+	u64 section_mask = 0;
+	u64 section = 0;
+	int i = 0;
+
+	unit_base = agent->base + ENHANCED_ATOS_UNIT_ADDR +
+		    ENHANCED_ATOS_UNIT_SIZE * regs_idx;
+	par = readq_relaxed(unit_base + ENHANCED_ATOS_PAR);
+
+	if (state & (1 << regs_idx)) {
+		return -EBUSY;
+	} else if (par & ENHANCED_ATOS_PAR_FAULT) {
+		data->pa[idx] = par & ENHANCED_ATOS_PAR_FAULTCODE;
+		data->pa[idx] |= par & ENHANCED_ATOS_PAR_REASON;
+		pr_err("agent smmu: error happened, got PAR 0x%llx\n", par);
+		return -EFAULT;
+	} else {
+		ret = get_section_mask(par, &section_mask);
+		if (ret)
+			return ret;
+		// use ENHANCED_ATOS_PAR_ADDR_MASK not section_mask
+		// since ADDR[63:52] is ATTR or IMPDEF which we don't want
+		data->pa[idx] = (par & ENHANCED_ATOS_PAR_ADDR_MASK & section_mask) |
+				(data->iova[idx] & ~section_mask);
+		section = data->iova[idx] & section_mask;
+
+		for (i = idx + 1; i < data->nr; i++) {
+			if ((data->iova[i] & section_mask) != section)
+				break;
+			data->pa[i] = (par & ENHANCED_ATOS_PAR_ADDR_MASK & section_mask) |
+				      (data->iova[i] & ~section_mask);
+		}
+	}
+	return 0;
+}
+
+#define bitmap_for_each_set_bit(i, src, nbits) \
+	for ((i) = 0; ((i) = find_next_bit((src), (nbits), (i))) < (nbits); (i) += 1)
+
+int agent_smmu_iova_to_phys(struct agent_smmu_atos_data *data, int *succeed)
+{
+	struct agent_smmu *agent = NULL;
+	int ret = 0;
+	int i;
+	u64 sid = 0;
+	u64 addr = 0;
+	int idx = 0;
+	u32 state = 0;
+	DECLARE_BITMAP(loc_regs, MAX_REGISTERS);
+	DECLARE_BITMAP(bitmask, MAX_REGISTERS);
+	u32 bitmask_u32;
+
+	if (!data || !data->iova || !data->pa || data->nr <= 0 || !succeed) {
+		return -EINVAL;
+	}
+
+	// now only HTTUI = 1 is allowed
+	if (!data->httui) {
+		pr_err("agent smmu: check httui, make sure it is valid\n");
+		return -EINVAL;
+	}
+
+	spin_lock(&agent_smmu_lock);
+	agent = agent_smmu_unlocked_find(data->device_id);
+	if (!agent || !get_device(agent->dev)) {
+		pr_err("agent smmu: %lld has been removed or hasn't been initialized.\n",
+		       data->device_id);
+		spin_unlock(&agent_smmu_lock);
+		return -EINVAL;
+	}
+	spin_unlock(&agent_smmu_lock);
+
+	if (data->sid > agent->max_sid || data->ssid > agent->max_ssid) {
+		pr_err("agent smmu: sid or ssid out of acceptable range.\n");
+		ret = -EINVAL;
+		goto put_device;
+	}
+
+	*succeed = 0;
+	/* make sure the default return is 0 because 0 makes sense too */
+	for (i = 0; i < data->nr; i++) {
+		data->pa[i] = 0;
+	}
+	/* assemble sid and addr first */
+	sid = FIELD_PREP(ENHANCED_ATOS_STREAMID_MASK, data->sid);
+	sid |= FIELD_PREP(ENHANCED_ATOS_SUBSTREAMID_MASK, data->ssid);
+	sid |= FIELD_PREP(ENHANCED_ATOS_SSID_VALID_MASK, data->ssid ? 1 : 0);
+	addr |= FIELD_PREP(ENHANCED_ATOS_ADDR_TYPE_MASK, ENHANCED_ATOS_ADDR_TYPE_S1);
+	addr |= FIELD_PREP(ENHANCED_ATOS_ADDR_PnU_MASK, data->pnu ? 1 : 0);
+	addr |= FIELD_PREP(ENHANCED_ATOS_ADDR_RnW_MASK, data->rnw ? 1 : 0);
+	addr |= FIELD_PREP(ENHANCED_ATOS_ADDR_InD_MASK, data->ind ? 1 : 0);
+	addr |= FIELD_PREP(ENHANCED_ATOS_ADDR_HTTUI_MASK, data->httui ? 1 : 0);
+	bitmap_zero(loc_regs, MAX_REGISTERS);
+	if (!registers_acquire(agent, loc_regs, data->nr)) {
+		pr_err("agent smmu: busy now, try again later.\n");
+		ret = -EBUSY;
+		goto put_device;
+	}
+
+	idx = *succeed;
+	while (idx < data->nr) {
+		bitmap_zero(bitmask, MAX_REGISTERS);
+
+		bitmap_for_each_set_bit(i, loc_regs, MAX_REGISTERS) {
+			if (idx >= data->nr)
+				break;
+			write_enhanced_atos(agent, i, sid, addr, data->iova[idx++]);
+			bitmap_set(bitmask, i, 1);
+		}
+
+		bitmap_to_arr32(&bitmask_u32, bitmask, MAX_REGISTERS);
+		writel(bitmask_u32, agent->base + AGENT_SMMU_ATOS_CTRL);
+		readl_poll_timeout(agent->base + AGENT_SMMU_ATOS_CTRL, state,
+				   !(state & bitmask_u32), AGENT_SMMU_POLL_US,
+				   AGENT_SMMU_TIMEOUT_US);
+
+		idx = *succeed;
+		bitmap_for_each_set_bit(i, bitmask, MAX_REGISTERS) {
+			if (idx >= data->nr)
+				break;
+
+			if (data->pa[idx] != 0) {
+				idx++;
+				continue;
+			}
+			ret = read_enhanced_atos(agent, i, idx, state, data);
+			if (ret) {
+				*succeed = idx;
+				pr_err("agent smmu: translation failed, reason %d\n", ret);
+				goto free_bits;
+			}
+			idx++;
+		}
+		*succeed = idx;
+	}
+
+free_bits:
+	write_lock(&agent->rw_lock);
+	bitmap_andnot(agent->regs, agent->regs, loc_regs, MAX_REGISTERS);
+	write_unlock(&agent->rw_lock);
+put_device:
+	put_device(agent->dev);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(agent_smmu_iova_to_phys);
+
+static const struct acpi_device_id agent_smmu_acpi_match[] = {
+	{"SMMU0000", 0},
+	{}
+};
+MODULE_DEVICE_TABLE(acpi, agent_smmu_acpi_match);
+
+static struct platform_driver agent_smmu_driver = {
+	.driver = {
+		.name = "agent_smmu_platform",
+		.acpi_match_table = agent_smmu_acpi_match,
+	},
+	.probe = agent_smmu_platform_probe,
+	.remove = agent_smmu_platform_remove,
+};
+module_platform_driver(agent_smmu_driver);
diff --git a/include/linux/ascend_smmu.h b/include/linux/ascend_smmu.h
new file mode 100644
index 000000000000..0dd1ac63f62f
--- /dev/null
+++ b/include/linux/ascend_smmu.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_ASCEND_SMMU_H
+#define __LINUX_ASCEND_SMMU_H
+
+#define INV_REQ			0xff
+#define INV_STAGE		0xfe
+#define INTERNAL_ERR		0xfd
+#define C_BAD_STREAMID		0x02
+#define F_STE_FETCH		0x03
+#define C_BAD_STE		0x04
+#define F_STREAM_DISABLED	0x06
+#define C_BAD_SUBSTREAMID	0x08
+#define F_CD_FETCH		0x09
+#define C_BAD_CD		0x0a
+#define F_WALK_EABT		0x0b
+#define F_TRANSLATION		0x10
+#define F_ADDR_SIZE		0x11
+#define F_ACCESS		0x12
+#define F_PERMISSION		0x13
+#define F_TLB_CONFLICT		0x20
+#define F_CFG_CONFLICT		0x21
+#define F_VMS_FETCH		0x25
+
+/**
+ * struct agent_smmu_atos_data - information required for address translation
+ * @sid: stream id
+ * @ssid: substream id
+ * @pnu: 0 for Unprivileged, 1 for Privileged
+ * @rnw: 0 for Write, 1 for Read
+ * @ind: 0 for Data, 1 for Instruction
+ * @httui: 0 for HTTU might occur, 1 for HTTU inhibited
+ * @nr: number of addresses
+ * @iova: iova addresses to be translated
+ * @pa: translated physical addresses
+ * @device_id: agent smmu uid
+ */
+struct agent_smmu_atos_data {
+	u32 sid;
+	u32 ssid;
+	u32 pnu;
+	u32 rnw;
+	u32 ind;
+	u32 httui;
+	int nr;
+	dma_addr_t *iova;
+	phys_addr_t *pa;
+	u64 device_id;
+};
+
+extern int agent_smmu_iova_to_phys(struct agent_smmu_atos_data *data, int *succeed);
+
+#endif
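To illustrate how the exported interface is meant to be consumed, a minimal hypothetical caller might look like the sketch below; example_translate(), its parameters and the allocation strategy are placeholders, not part of the patch:

	#include <linux/slab.h>
	#include <linux/ascend_smmu.h>

	static int example_translate(u64 device_id, u32 sid, dma_addr_t *iovas, int nr)
	{
		struct agent_smmu_atos_data data = {
			.device_id = device_id,
			.sid = sid,
			.ssid = 0,	/* no substream */
			.pnu = 1,	/* privileged access */
			.rnw = 1,	/* read */
			.ind = 0,	/* data access */
			.httui = 1,	/* only HTTUI = 1 is accepted */
			.nr = nr,
			.iova = iovas,
		};
		int succeed = 0;
		int ret;

		data.pa = kcalloc(nr, sizeof(*data.pa), GFP_KERNEL);
		if (!data.pa)
			return -ENOMEM;

		ret = agent_smmu_iova_to_phys(&data, &succeed);
		if (ret)
			pr_err("translated only %d of %d addresses, err %d\n",
			       succeed, nr, ret);

		/* data.pa[0..succeed-1] now hold the physical addresses */
		kfree(data.pa);
		return ret;
	}

On success agent_smmu_iova_to_phys() returns 0 with *succeed equal to nr; when the translation loop fails part-way, *succeed reports how many leading entries were already translated.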