Fix the following CVEs:
- CVE-2025-38084
- CVE-2025-21693
- CVE-2021-47653

Chengming Zhou (1):
  mm/zswap: change per-cpu mutex and buffer to per-acomp_ctx

Dmitry Nikiforov (1):
  media: davinci: vpif: Fix memory leak in probe error path

James Houghton (1):
  hugetlb: unshare some PMDs when splitting VMAs

Jann Horn (1):
  mm/hugetlb: unshare page tables during VMA split, not before

Johan Hovold (1):
  media: davinci: vpif: fix use-after-free on driver unbind

Yosry Ahmed (3):
  mm: zswap: properly synchronize freeing resources during CPU hotunplug
  mm: zswap: move allocations during CPU init outside the lock
  mm: zswap: fix crypto_free_acomp() deadlock in zswap_cpu_comp_dead()

 drivers/media/platform/davinci/vpif.c |  99 ++++++++++----
 include/linux/cpuhotplug.h            |   2 +-
 include/linux/hugetlb.h               |   6 +
 mm/hugetlb.c                          |  92 +++++++++++++
 mm/mmap.c                             |   8 ++
 mm/zswap.c                            | 186 ++++++++++++++------------
 6 files changed, 280 insertions(+), 113 deletions(-)
--
2.43.0
From: Johan Hovold <johan@kernel.org> mainline inclusion from mainline-v5.18-rc1 commit 43acb728bbc40169d2e2425e84a80068270974be category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBP4IT CVE: CVE-2021-47653 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- The driver allocates and registers two platform device structures during probe, but the devices were never deregistered on driver unbind. This results in a use-after-free on driver unbind as the device structures were allocated using devres and would be freed by driver core when remove() returns. Fix this by adding the missing deregistration calls to the remove() callback and failing probe on registration errors. Note that the platform device structures must be freed using a proper release callback to avoid leaking associated resources like device names. Fixes: 479f7a118105 ("[media] davinci: vpif: adaptions for DT support") Cc: stable@vger.kernel.org # 4.12 Cc: Kevin Hilman <khilman@baylibre.com> Signed-off-by: Johan Hovold <johan@kernel.org> Reviewed-by: Lad Prabhakar <prabhakar.csengg@gmail.com> Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org> Conflicts: drivers/media/platform/davinci/vpif.c [Wupeng Ma: Context conflicts] Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- drivers/media/platform/davinci/vpif.c | 97 ++++++++++++++++++++------- 1 file changed, 71 insertions(+), 26 deletions(-) diff --git a/drivers/media/platform/davinci/vpif.c b/drivers/media/platform/davinci/vpif.c index ee610daf90a3..d4135ae9a940 100644 --- a/drivers/media/platform/davinci/vpif.c +++ b/drivers/media/platform/davinci/vpif.c @@ -47,6 +47,11 @@ EXPORT_SYMBOL_GPL(vpif_lock); void __iomem *vpif_base; EXPORT_SYMBOL_GPL(vpif_base); +struct vpif_data { + struct platform_device *capture; + struct platform_device *display; +}; + /* * vpif_ch_params: video standard configuration parameters for vpif * @@ -423,11 +428,19 @@ int vpif_channel_getfid(u8 channel_id) } EXPORT_SYMBOL(vpif_channel_getfid); +static void vpif_pdev_release(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + + kfree(pdev); +} + static int vpif_probe(struct platform_device *pdev) { static struct resource *res, *res_irq; struct platform_device *pdev_capture, *pdev_display; struct device_node *endpoint = NULL; + struct vpif_data *data; int ret; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); @@ -435,6 +448,12 @@ static int vpif_probe(struct platform_device *pdev) if (IS_ERR(vpif_base)) return PTR_ERR(vpif_base); + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + platform_set_drvdata(pdev, data); + pm_runtime_enable(&pdev->dev); pm_runtime_get(&pdev->dev); @@ -463,49 +482,75 @@ static int vpif_probe(struct platform_device *pdev) goto err_put_rpm; } - pdev_capture = devm_kzalloc(&pdev->dev, sizeof(*pdev_capture), - GFP_KERNEL); - if (pdev_capture) { - pdev_capture->name = "vpif_capture"; - pdev_capture->id = -1; - pdev_capture->resource = res_irq; - pdev_capture->num_resources = 1; - pdev_capture->dev.dma_mask = pdev->dev.dma_mask; - pdev_capture->dev.coherent_dma_mask = pdev->dev.coherent_dma_mask; - pdev_capture->dev.parent = &pdev->dev; - platform_device_register(pdev_capture); - } else { - dev_warn(&pdev->dev, "Unable to allocate memory for pdev_capture.\n"); + pdev_capture = kzalloc(sizeof(*pdev_capture), GFP_KERNEL); + if (!pdev_capture) { + ret = -ENOMEM; + goto 
err_put_rpm; } - pdev_display = devm_kzalloc(&pdev->dev, sizeof(*pdev_display), - GFP_KERNEL); - if (pdev_display) { - pdev_display->name = "vpif_display"; - pdev_display->id = -1; - pdev_display->resource = res_irq; - pdev_display->num_resources = 1; - pdev_display->dev.dma_mask = pdev->dev.dma_mask; - pdev_display->dev.coherent_dma_mask = pdev->dev.coherent_dma_mask; - pdev_display->dev.parent = &pdev->dev; - platform_device_register(pdev_display); - } else { - dev_warn(&pdev->dev, "Unable to allocate memory for pdev_display.\n"); + pdev_capture->name = "vpif_capture"; + pdev_capture->id = -1; + pdev_capture->resource = res_irq; + pdev_capture->num_resources = 1; + pdev_capture->dev.dma_mask = pdev->dev.dma_mask; + pdev_capture->dev.coherent_dma_mask = pdev->dev.coherent_dma_mask; + pdev_capture->dev.parent = &pdev->dev; + pdev_capture->dev.release = vpif_pdev_release; + + ret = platform_device_register(pdev_capture); + if (ret) + goto err_put_pdev_capture; + + pdev_display = kzalloc(sizeof(*pdev_display), GFP_KERNEL); + if (!pdev_display) { + ret = -ENOMEM; + goto err_put_pdev_capture; } + pdev_display->name = "vpif_display"; + pdev_display->id = -1; + pdev_display->resource = res_irq; + pdev_display->num_resources = 1; + pdev_display->dev.dma_mask = pdev->dev.dma_mask; + pdev_display->dev.coherent_dma_mask = pdev->dev.coherent_dma_mask; + pdev_display->dev.parent = &pdev->dev; + pdev_display->dev.release = vpif_pdev_release; + + ret = platform_device_register(pdev_display); + if (ret) + goto err_put_pdev_display; + + data->capture = pdev_capture; + data->display = pdev_display; + return 0; +err_put_pdev_display: + platform_device_put(pdev_display); +err_put_pdev_capture: + platform_device_put(pdev_capture); err_put_rpm: pm_runtime_put(&pdev->dev); pm_runtime_disable(&pdev->dev); + kfree(data); return ret; } static int vpif_remove(struct platform_device *pdev) { + struct vpif_data *data = platform_get_drvdata(pdev); + + if (data->capture) + platform_device_unregister(data->capture); + if (data->display) + platform_device_unregister(data->display); + pm_runtime_put(&pdev->dev); pm_runtime_disable(&pdev->dev); + + kfree(data); + return 0; } -- 2.43.0
From: Dmitry Nikiforov <Dm1tryNk@yandex.ru> stable inclusion from stable-v5.15.187 commit 78b7d79b8626c173fd8ad8b8ce9ae90256618905 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBP4IT CVE: CVE-2021-47653 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=... -------------------------------- [ Upstream commit 024bf40edf1155e7a587f0ec46294049777d9b02 ] If an error occurs during the initialization of `pdev_display`, the allocated platform device `pdev_capture` is not released properly, leading to a memory leak. Adjust error path handling to fix the leak. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 43acb728bbc4 ("media: davinci: vpif: fix use-after-free on driver unbind") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Nikiforov <Dm1tryNk@yandex.ru> Reviewed-by: Johan Hovold <johan@kernel.org> Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl> Signed-off-by: Sasha Levin <sashal@kernel.org> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- drivers/media/platform/davinci/vpif.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/davinci/vpif.c b/drivers/media/platform/davinci/vpif.c index d4135ae9a940..9c10b6f33fd2 100644 --- a/drivers/media/platform/davinci/vpif.c +++ b/drivers/media/platform/davinci/vpif.c @@ -504,7 +504,7 @@ static int vpif_probe(struct platform_device *pdev) pdev_display = kzalloc(sizeof(*pdev_display), GFP_KERNEL); if (!pdev_display) { ret = -ENOMEM; - goto err_put_pdev_capture; + goto err_del_pdev_capture; } pdev_display->name = "vpif_display"; @@ -527,6 +527,8 @@ static int vpif_probe(struct platform_device *pdev) err_put_pdev_display: platform_device_put(pdev_display); +err_del_pdev_capture: + platform_device_del(pdev_capture); err_put_pdev_capture: platform_device_put(pdev_capture); err_put_rpm: -- 2.43.0
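Taken together, the two vpif patches above replace the devres-managed platform devices with ones freed by a release callback and make probe unwind every step on failure. A condensed, illustrative sketch of the resulting error path in vpif_probe(); all identifiers come from the hunks above, and this is not the literal final file:

	ret = platform_device_register(pdev_display);
	if (ret)
		goto err_put_pdev_display;

	data->capture = pdev_capture;
	data->display = pdev_display;
	return 0;

err_put_pdev_display:
	platform_device_put(pdev_display);	/* freed via vpif_pdev_release() */
err_del_pdev_capture:
	platform_device_del(pdev_capture);	/* undo platform_device_register() */
err_put_pdev_capture:
	platform_device_put(pdev_capture);	/* drop the last reference */
err_put_rpm:
	pm_runtime_put(&pdev->dev);
	pm_runtime_disable(&pdev->dev);
	kfree(data);
	return ret;

Note the fall-through: a failed display registration puts the display device, then deletes and puts the already-registered capture device, which is exactly the leak the second patch closes.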
From: Chengming Zhou <zhouchengming@bytedance.com> mainline inclusion from mainline-v6.8-rc1 commit 8ba2f844f050a82624ba3ad5146aa3c116f506f7 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBLDIU CVE: CVE-2025-21693 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- First of all, we need to rename acomp_ctx->dstmem field to buffer, since we are now using for purposes other than compression. Then we change per-cpu mutex and buffer to per-acomp_ctx, since them belong to the acomp_ctx and are necessary parts when used in the compress/decompress contexts. So we can remove the old per-cpu mutex and dstmem. Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-5-9382162bbf05@bytedance.... Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com> Acked-by: Chris Li <chrisl@kernel.org> (Google) Reviewed-by: Nhat Pham <nphamcs@gmail.com> Cc: Barry Song <21cnbao@gmail.com> Cc: Dan Streetman <ddstreet@ieee.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Seth Jennings <sjenning@redhat.com> Cc: Vitaly Wool <vitaly.wool@konsulko.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Conflicts: include/linux/cpuhotplug.h mm/zswap.c [Ma Wupeng: Mainly context conflicts, no remove in enum cpuhp_state to make kabi stable] Signed-off-by: Ma Wupeng <mawupeng1@huawei.com> --- include/linux/cpuhotplug.h | 2 +- mm/zswap.c | 104 ++++++++++++------------------------- 2 files changed, 34 insertions(+), 72 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b540e5a60ea9..ac109c61e882 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -86,7 +86,7 @@ enum cpuhp_state { CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, CPUHP_MM_ZS_PREPARE, - CPUHP_MM_ZSWP_MEM_PREPARE, + CPUHP_MM_ZSWP_MEM_PREPARE, /* keep this to make kabi stable */ CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, diff --git a/mm/zswap.c b/mm/zswap.c index d8e8d0084a22..5e495406555b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -135,8 +135,8 @@ struct crypto_acomp_ctx { struct crypto_acomp *acomp; struct acomp_req *req; struct crypto_wait wait; - u8 *dstmem; - struct mutex *mutex; + u8 *buffer; + struct mutex mutex; }; struct zswap_pool { @@ -402,63 +402,26 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, /********************************* * per-cpu code **********************************/ -static DEFINE_PER_CPU(u8 *, zswap_dstmem); -/* - * If users dynamically change the zpool type and compressor at runtime, i.e. - * zswap is running, zswap can have more than one zpool on one cpu, but they - * are sharing dtsmem. So we need this mutex to be per-cpu. 
- */ -static DEFINE_PER_CPU(struct mutex *, zswap_mutex); - -static int zswap_dstmem_prepare(unsigned int cpu) -{ - struct mutex *mutex; - u8 *dst; - - dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) - return -ENOMEM; - - mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu)); - if (!mutex) { - kfree(dst); - return -ENOMEM; - } - - mutex_init(mutex); - per_cpu(zswap_dstmem, cpu) = dst; - per_cpu(zswap_mutex, cpu) = mutex; - return 0; -} - -static int zswap_dstmem_dead(unsigned int cpu) -{ - struct mutex *mutex; - u8 *dst; - - mutex = per_cpu(zswap_mutex, cpu); - kfree(mutex); - per_cpu(zswap_mutex, cpu) = NULL; - - dst = per_cpu(zswap_dstmem, cpu); - kfree(dst); - per_cpu(zswap_dstmem, cpu) = NULL; - - return 0; -} - static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); struct crypto_acomp *acomp; struct acomp_req *req; + int ret; + + mutex_init(&acomp_ctx->mutex); + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return -ENOMEM; acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); - return PTR_ERR(acomp); + ret = PTR_ERR(acomp); + goto acomp_fail; } acomp_ctx->acomp = acomp; @@ -466,8 +429,8 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); - crypto_free_acomp(acomp_ctx->acomp); - return -ENOMEM; + ret = -ENOMEM; + goto req_fail; } acomp_ctx->req = req; @@ -480,10 +443,13 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); - acomp_ctx->mutex = per_cpu(zswap_mutex, cpu); - acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu); - return 0; + +req_fail: + crypto_free_acomp(acomp_ctx->acomp); +acomp_fail: + kfree(acomp_ctx->buffer); + return ret; } static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) @@ -496,6 +462,7 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) acomp_request_free(acomp_ctx->req); if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) crypto_free_acomp(acomp_ctx->acomp); + kfree(acomp_ctx->buffer); } return 0; @@ -1009,14 +976,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); dlen = PAGE_SIZE; - mutex_lock(acomp_ctx->mutex); + mutex_lock(&acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -1175,13 +1142,17 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(acomp_ctx->mutex); + mutex_lock(&acomp_ctx->mutex); - dst = acomp_ctx->dstmem; + dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); - /* zswap_dstmem is of size (PAGE_SIZE * 2). 
Reflect same in sg_list */ + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ sg_init_one(&output, dst, PAGE_SIZE * 2); acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); /* @@ -1222,7 +1193,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, memcpy(buf, &zhdr, hlen); memcpy(buf + hlen, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); /* populate entry */ entry->offset = offset; @@ -1250,7 +1221,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, return 0; put_dstmem: - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); @@ -1312,13 +1283,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, } acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(acomp_ctx->mutex); + mutex_lock(&acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); if (zpool_can_sleep_mapped(entry->pool->zpool)) zpool_unmap_handle(entry->pool->zpool, entry->handle); @@ -1466,13 +1437,6 @@ static int zswap_setup(void) goto cache_fail; } - ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", - zswap_dstmem_prepare, zswap_dstmem_dead); - if (ret) { - pr_err("dstmem alloc failed\n"); - goto dstmem_fail; - } - ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare", zswap_cpu_comp_prepare, @@ -1505,8 +1469,6 @@ static int zswap_setup(void) if (pool) zswap_pool_destroy(pool); hp_fail: - cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); -dstmem_fail: zswap_entry_cache_destroy(); cache_fail: /* if built-in, we aren't unloaded on failure; don't allow use */ -- 2.43.0
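With this change each per-CPU context carries its own lock and scratch buffer, so the compress/decompress paths lock exactly the context they use. A minimal sketch of the structure and locking pattern, condensed from the hunks above:

struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *buffer;		/* PAGE_SIZE * 2 scratch, replaces per-cpu zswap_dstmem */
	struct mutex mutex;	/* replaces the shared per-cpu zswap_mutex */
};

	/* store path, condensed */
	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
	mutex_lock(&acomp_ctx->mutex);
	dst = acomp_ctx->buffer;
	/* ... sg setup, crypto_acomp_compress(), copy into the zpool ... */
	mutex_unlock(&acomp_ctx->mutex);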
From: Yosry Ahmed <yosryahmed@google.com> mainline inclusion from mainline-v6.13 commit 12dcb0ef540629a281533f9dedc1b6b8e14cfb65 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBLDIU CVE: CVE-2025-21693 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the current CPU at the beginning of the operation is retrieved and used throughout. However, since neither preemption nor migration are disabled, it is possible that the operation continues on a different CPU. If the original CPU is hotunplugged while the acomp_ctx is still in use, we run into a UAF bug as some of the resources attached to the acomp_ctx are freed during hotunplug in zswap_cpu_comp_dead() (i.e. acomp_ctx.buffer, acomp_ctx.req, or acomp_ctx.acomp). The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") when the switch to the crypto_acomp API was made. Prior to that, the per-CPU crypto_comp was retrieved using get_cpu_ptr() which disables preemption and makes sure the CPU cannot go away from under us. Preemption cannot be disabled with the crypto_acomp API as a sleepable context is needed. Use the acomp_ctx.mutex to synchronize CPU hotplug callbacks allocating and freeing resources with compression/decompression paths. Make sure that acomp_ctx.req is NULL when the resources are freed. In the compression/decompression paths, check if acomp_ctx.req is NULL after acquiring the mutex (meaning the CPU was offlined) and retry on the new CPU. The initialization of acomp_ctx.mutex is moved from the CPU hotplug callback to the pool initialization where it belongs (where the mutex is allocated). In addition to adding clarity, this makes sure that CPU hotplug cannot reinitialize a mutex that is already locked by compression/decompression. Previously a fix was attempted by holding cpus_read_lock() [1]. This would have caused a potential deadlock as it is possible for code already holding the lock to fall into reclaim and enter zswap (causing a deadlock). A fix was also attempted using SRCU for synchronization, but Johannes pointed out that synchronize_srcu() cannot be used in CPU hotplug notifiers [2]. Alternative fixes that were considered/attempted and could have worked: - Refcounting the per-CPU acomp_ctx. This involves complexity in handling the race between the refcount dropping to zero in zswap_[de]compress() and the refcount being re-initialized when the CPU is onlined. - Disabling migration before getting the per-CPU acomp_ctx [3], but that's discouraged and is a much bigger hammer than needed, and could result in subtle performance issues. [1]https://lkml.kernel.org/20241219212437.2714151-1-yosryahmed@google.com/ [2]https://lkml.kernel.org/20250107074724.1756696-2-yosryahmed@google.com/ [3]https://lkml.kernel.org/20250107222236.2715883-2-yosryahmed@google.com/ [yosryahmed@google.com: remove comment] Link: https://lkml.kernel.org/r/CAJD7tkaxS1wjn+swugt8QCvQ-rVF5RZnjxwPGX17k8x9zSMan... 
Link: https://lkml.kernel.org/r/20250108222441.3622031-1-yosryahmed@google.com Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") Signed-off-by: Yosry Ahmed <yosryahmed@google.com> Reported-by: Johannes Weiner <hannes@cmpxchg.org> Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/ Reported-by: Sam Sun <samsun1006219@gmail.com> Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tPg... Cc: Barry Song <baohua@kernel.org> Cc: Chengming Zhou <chengming.zhou@linux.dev> Cc: Kanchana P Sridhar <kanchana.p.sridhar@intel.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Vitaly Wool <vitalywool@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Conflicts: mm/zswap.c [Ma Wupeng: heavily conflicts] Signed-off-by: Ma Wupeng <mawupeng1@huawei.com> --- mm/zswap.c | 58 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 5e495406555b..bbaa62c4b7c5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -410,11 +410,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) struct acomp_req *req; int ret; - mutex_init(&acomp_ctx->mutex); - + mutex_lock(&acomp_ctx->mutex); acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; + if (!acomp_ctx->buffer) { + ret = -ENOMEM; + goto buffer_fail; + } acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { @@ -443,12 +444,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); + mutex_unlock(&acomp_ctx->mutex); return 0; req_fail: crypto_free_acomp(acomp_ctx->acomp); acomp_fail: kfree(acomp_ctx->buffer); +buffer_fail: + mutex_unlock(&acomp_ctx->mutex); return ret; } @@ -457,17 +461,45 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + mutex_lock(&acomp_ctx->mutex); if (!IS_ERR_OR_NULL(acomp_ctx)) { if (!IS_ERR_OR_NULL(acomp_ctx->req)) acomp_request_free(acomp_ctx->req); + acomp_ctx->req = NULL; if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) crypto_free_acomp(acomp_ctx->acomp); kfree(acomp_ctx->buffer); } + mutex_unlock(&acomp_ctx->mutex); return 0; } +static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool) +{ + struct crypto_acomp_ctx *acomp_ctx; + + for (;;) { + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + if (likely(acomp_ctx->req)) + return acomp_ctx; + /* + * It is possible that we were migrated to a different CPU after + * getting the per-CPU ctx but before the mutex was acquired. If + * the old CPU got offlined, zswap_cpu_comp_dead() could have + * already freed ctx->req (among other things) and set it to + * NULL. Just try again on the new CPU that we ended up on. 
+ */ + mutex_unlock(&acomp_ctx->mutex); + } +} + +static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx) +{ + mutex_unlock(&acomp_ctx->mutex); +} + /********************************* * pool functions **********************************/ @@ -559,7 +591,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - int ret; + int ret, cpu; if (!zswap_has_pool) { /* if either are unset, pool initialization failed, and we @@ -594,6 +626,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) goto error; } + for_each_possible_cpu(cpu) + mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex); + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); if (ret) @@ -1140,9 +1175,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* compress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); + acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); dst = acomp_ctx->buffer; sg_init_table(&input, 1); @@ -1193,7 +1226,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, memcpy(buf, &zhdr, hlen); memcpy(buf + hlen, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); - mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_unlock(acomp_ctx); /* populate entry */ entry->offset = offset; @@ -1221,7 +1254,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, return 0; put_dstmem: - mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_unlock(acomp_ctx); zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); @@ -1282,19 +1315,18 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, zpool_unmap_handle(entry->pool->zpool, entry->handle); } - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); + acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); - mutex_unlock(&acomp_ctx->mutex); if (zpool_can_sleep_mapped(entry->pool->zpool)) zpool_unmap_handle(entry->pool->zpool, entry->handle); else kfree(tmp); + acomp_ctx_put_unlock(acomp_ctx); BUG_ON(ret); -- 2.43.0
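The helper pair added here turns the per-CPU lookup into a loop that is safe against the CPU being offlined between raw_cpu_ptr() and mutex_lock(): if zswap_cpu_comp_dead() has already freed the resources and set req to NULL, the caller simply retries on whatever CPU it now runs on. In the store/load paths the usage then collapses to a lock/unlock pair, roughly (condensed from the hunks above):

	acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);	/* returns with mutex held */
	/* acomp_ctx->req, ->buffer and ->wait are guaranteed valid here */
	/* ... compression or decompression using acomp_ctx ... */
	acomp_ctx_put_unlock(acomp_ctx);			/* drops the mutex */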
From: Yosry Ahmed <yosryahmed@google.com> mainline inclusion from mainline-v6.13 commit 779b9955f64327c339a16f68055af98252fd3315 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBLDIU CVE: CVE-2025-21693 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- In zswap_cpu_comp_prepare(), allocations are made and assigned to various members of acomp_ctx under acomp_ctx->mutex. However, allocations may recurse into zswap through reclaim, trying to acquire the same mutex and deadlocking. Move the allocations before the mutex critical section. Only the initialization of acomp_ctx needs to be done with the mutex held. Link: https://lkml.kernel.org/r/20250113214458.2123410-1-yosryahmed@google.com Fixes: 12dcb0ef5406 ("mm: zswap: properly synchronize freeing resources during CPU hotunplug") Signed-off-by: Yosry Ahmed <yosryahmed@google.com> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Nhat Pham <nphamcs@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Conflicts: mm/zswap.c [Ma Wupeng: is_sleepable is not involved is current version] Signed-off-by: Ma Wupeng <mawupeng1@huawei.com> --- mm/zswap.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index bbaa62c4b7c5..54cce59fc9b9 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -406,15 +406,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; + struct crypto_acomp *acomp = NULL; + struct acomp_req *req = NULL; + u8 *buffer = NULL; int ret; - mutex_lock(&acomp_ctx->mutex); - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) { + buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!buffer) { ret = -ENOMEM; - goto buffer_fail; + goto fail; } acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); @@ -422,20 +422,25 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); ret = PTR_ERR(acomp); - goto acomp_fail; + goto fail; } - acomp_ctx->acomp = acomp; - req = acomp_request_alloc(acomp_ctx->acomp); + req = acomp_request_alloc(acomp); if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); ret = -ENOMEM; - goto req_fail; + goto fail; } - acomp_ctx->req = req; + /* + * Only hold the mutex after completing allocations, otherwise we may + * recurse into zswap through reclaim and attempt to hold the mutex + * again resulting in a deadlock. 
+ */ + mutex_lock(&acomp_ctx->mutex); crypto_init_wait(&acomp_ctx->wait); + /* * if the backend of acomp is async zip, crypto_req_done() will wakeup * crypto_wait_req(); if the backend of acomp is scomp, the callback @@ -444,15 +449,16 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); + acomp_ctx->buffer = buffer; + acomp_ctx->acomp = acomp; + acomp_ctx->req = req; mutex_unlock(&acomp_ctx->mutex); return 0; -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); -buffer_fail: - mutex_unlock(&acomp_ctx->mutex); +fail: + if (acomp) + crypto_free_acomp(acomp); + kfree(buffer); return ret; } -- 2.43.0
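The fix is the standard "allocate first, publish under the lock" pattern: the allocations can recurse into zswap via reclaim, so nothing that might enter reclaim may run while acomp_ctx->mutex is held. A condensed sketch of zswap_cpu_comp_prepare() after this patch (error handling omitted; see the hunk above):

	/* allocations: no zswap locks held, reclaim may recurse into zswap */
	buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	acomp  = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
	req    = acomp_request_alloc(acomp);

	/* publication: only initialization happens under the mutex */
	mutex_lock(&acomp_ctx->mutex);
	crypto_init_wait(&acomp_ctx->wait);
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &acomp_ctx->wait);
	acomp_ctx->buffer = buffer;
	acomp_ctx->acomp = acomp;
	acomp_ctx->req = req;
	mutex_unlock(&acomp_ctx->mutex);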
From: James Houghton <jthoughton@google.com> stable inclusion from stable-v5.10.239 commit f1082f5f3d02f2f9b4ca36642274529cf5ddaf34 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICIQAH CVE: CVE-2025-38084 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=... -------------------------------- commit b30c14cd61025eeea2f2e8569606cd167ba9ad2d upstream. PMD sharing can only be done in PUD_SIZE-aligned pieces of VMAs; however, it is possible that HugeTLB VMAs are split without unsharing the PMDs first. Without this fix, it is possible to hit the uffd-wp-related WARN_ON_ONCE in hugetlb_change_protection [1]. The key there is that hugetlb_unshare_all_pmds will not attempt to unshare PMDs in non-PUD_SIZE-aligned sections of the VMA. It might seem ideal to unshare in hugetlb_vm_op_open, but we need to unshare in both the new and old VMAs, so unsharing in hugetlb_vm_op_split seems natural. [1]: https://lore.kernel.org/linux-mm/CADrL8HVeOkj0QH5VZZbRzybNE8CG-tEGFshnA+bG9n... Link: https://lkml.kernel.org/r/20230104231910.1464197-1-jthoughton@google.com Fixes: 6dfeaff93be1 ("hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp") Signed-off-by: James Houghton <jthoughton@google.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Acked-by: Peter Xu <peterx@redhat.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> [backport notes: I believe the "Fixes" tag is somewhat wrong - kernels before that commit already had an adjust_range_if_pmd_sharing_possible() that assumes that shared PMDs can't straddle page table boundaries. huge_pmd_unshare() takes different parameter type] Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Conflicts: mm/hugetlb.c [Wupeng Ma: context conflict] Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- mm/hugetlb.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a0a051f9ec29..84389fc34542 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -89,6 +89,8 @@ int sysctl_hugetlb_pmem_allocall; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *info); +static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + unsigned long start, unsigned long end); static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, unsigned long irq_flags) @@ -4212,6 +4214,25 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; + + /* + * PMD sharing is only possible for PUD_SIZE-aligned address ranges + * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this + * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. + */ + if (addr & ~PUD_MASK) { + /* + * hugetlb_vm_op_split is called right before we attempt to + * split the VMA. We will need to unshare PMDs in the old and + * new VMAs, so let's unshare before we split. 
+ */ + unsigned long floor = addr & PUD_MASK; + unsigned long ceil = floor + PUD_SIZE; + + if (floor >= vma->vm_start && ceil <= vma->vm_end) + hugetlb_unshare_pmds(vma, floor, ceil); + } + return 0; } @@ -6263,6 +6284,50 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) } } +static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; + unsigned long address; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + if (start >= end) + return; + + flush_cache_range(vma, start, end); + /* + * No need to call adjust_range_if_pmd_sharing_possible(), because + * we have already done the PUD_SIZE alignment. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + start, end); + mmu_notifier_invalidate_range_start(&range); + i_mmap_lock_write(vma->vm_file->f_mapping); + for (address = start; address < end; address += PUD_SIZE) { + ptep = huge_pte_offset(mm, address, sz); + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); + huge_pmd_unshare(mm, vma, &address, ptep); + spin_unlock(ptl); + } + flush_hugetlb_tlb_range(vma, start, end); + i_mmap_unlock_write(vma->vm_file->f_mapping); + /* + * No need to call mmu_notifier_invalidate_range(), see + * Documentation/mm/mmu_notifier.rst. + */ + mmu_notifier_invalidate_range_end(&range); +} + #ifdef CONFIG_CMA static bool cma_reserve_called __initdata; -- 2.43.0
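The arithmetic at the heart of the fix: PMD sharing can only occur within PUD_SIZE-aligned, PUD_SIZE-sized ranges, so a split at a non-PUD-aligned address must first unshare the PUD that contains the split point. Condensed from the hunk above:

	/* hugetlb_vm_op_split(): addr is the proposed split address */
	if (addr & ~PUD_MASK) {				/* split not PUD-aligned */
		unsigned long floor = addr & PUD_MASK;	/* round down to a PUD */
		unsigned long ceil  = floor + PUD_SIZE;	/* one PUD-sized range */

		/* only act if the whole PUD range lies inside this VMA */
		if (floor >= vma->vm_start && ceil <= vma->vm_end)
			hugetlb_unshare_pmds(vma, floor, ceil);
	}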
From: Jann Horn <jannh@google.com> stable inclusion from stable-v5.10.239 commit e8847d18cd9fff1edbb45e963d9141273c3b539c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICIQAH CVE: CVE-2025-38084 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=... -------------------------------- commit 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 upstream. Currently, __split_vma() triggers hugetlb page table unsharing through vm_ops->may_split(). This happens before the VMA lock and rmap locks are taken - which is too early, it allows racing VMA-locked page faults in our process and racing rmap walks from other processes to cause page tables to be shared again before we actually perform the split. Fix it by explicitly calling into the hugetlb unshare logic from __split_vma() in the same place where THP splitting also happens. At that point, both the VMA and the rmap(s) are write-locked. An annoying detail is that we can now call into the helper hugetlb_unshare_pmds() from two different locking contexts: 1. from hugetlb_split(), holding: - mmap lock (exclusively) - VMA lock - file rmap lock (exclusively) 2. hugetlb_unshare_all_pmds(), which I think is designed to be able to call us with only the mmap lock held (in shared mode), but currently only runs while holding mmap lock (exclusively) and VMA lock Backporting note: This commit fixes a racy protection that was introduced in commit b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that commit claimed to fix an issue introduced in 5.13, but it should actually also go all the way back. [jannh@google.com: v2] Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a... Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a... Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a... 
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Jann Horn <jannh@google.com> Cc: Liam Howlett <liam.howlett@oracle.com> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Reviewed-by: Oscar Salvador <osalvador@suse.de> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: <stable@vger.kernel.org> [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs] Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> [stable backport: code got moved around, VMA splitting is in __vma_adjust, hugetlb lock wasn't used back then] Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Conflicts: mm/hugetlb.c [Wupeng Ma: context conflict] Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- include/linux/hugetlb.h | 6 +++++ mm/hugetlb.c | 53 +++++++++++++++++++++++++++++++---------- mm/mmap.c | 8 +++++++ 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b25c809af612..8381d2ef7aa9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -210,6 +210,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); #else /* !CONFIG_HUGETLB_PAGE */ @@ -392,6 +394,10 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, return 0; } +static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } + +static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} + #endif /* !CONFIG_HUGETLB_PAGE */ /* * hugepages at page global directory. If arch support diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 84389fc34542..6e4f58a0470a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -90,7 +90,7 @@ int sysctl_hugetlb_pmem_allocall; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *info); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, bool take_locks); static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, unsigned long irq_flags) @@ -4214,26 +4214,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; + return 0; +} +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) +{ /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. + * This function is called in the middle of a VMA split operation, with + * MM, VMA and rmap all write-locked to prevent concurrent page table + * walks (except hardware and gup_fast()). */ + mmap_assert_write_locked(vma->vm_mm); + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + if (addr & ~PUD_MASK) { - /* - * hugetlb_vm_op_split is called right before we attempt to - * split the VMA. We will need to unshare PMDs in the old and - * new VMAs, so let's unshare before we split. 
- */ unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; - if (floor >= vma->vm_start && ceil <= vma->vm_end) - hugetlb_unshare_pmds(vma, floor, ceil); + if (floor >= vma->vm_start && ceil <= vma->vm_end) { + /* + * Locking: + * Use take_locks=false here. + * The file rmap lock is already held. + * The hugetlb VMA lock can't be taken when we already + * hold the file rmap lock, and we don't need it because + * its purpose is to synchronize against concurrent page + * table walks, which are not possible thanks to the + * locks held by our caller. + */ + hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false); + } } - - return 0; } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) @@ -6284,9 +6298,16 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) } } +/* + * If @take_locks is false, the caller must ensure that no concurrent page table + * access can happen (except for gup_fast() and hardware page walks). + * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like + * concurrent page fault handling) and the file rmap lock. + */ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, + bool take_locks) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -6310,7 +6331,11 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, end); mmu_notifier_invalidate_range_start(&range); - i_mmap_lock_write(vma->vm_file->f_mapping); + if (take_locks) { + i_mmap_lock_write(vma->vm_file->f_mapping); + } else { + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + } for (address = start; address < end; address += PUD_SIZE) { ptep = huge_pte_offset(mm, address, sz); if (!ptep) @@ -6320,7 +6345,9 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); - i_mmap_unlock_write(vma->vm_file->f_mapping); + if (take_locks) { + i_mmap_unlock_write(vma->vm_file->f_mapping); + } /* * No need to call mmu_notifier_invalidate_range(), see * Documentation/mm/mmu_notifier.rst. diff --git a/mm/mmap.c b/mm/mmap.c index 6bf5d219aece..c0262befcaae 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -842,7 +842,15 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } again: + /* + * Get rid of huge pages and shared page tables straddling the split + * boundary. + */ vma_adjust_trans_huge(orig_vma, start, end, adjust_next); + if (is_vm_hugetlb_page(orig_vma)) { + hugetlb_split(orig_vma, start); + hugetlb_split(orig_vma, end); + } if (file) { mapping = file->f_mapping; -- 2.43.0
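The key relocation in this backport is in __vma_adjust(): the unsharing now happens at the same point where THP is split, i.e. after the VMA and rmap locks are write-held, instead of in the earlier, racy may_split() callback. Condensed from the mm/mmap.c hunk above:

	/*
	 * Get rid of huge pages and shared page tables straddling the split
	 * boundary, with the mmap lock, VMA and rmap locks already write-locked.
	 */
	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
	if (is_vm_hugetlb_page(orig_vma)) {
		hugetlb_split(orig_vma, start);	/* asserts mmap + i_mmap write locks */
		hugetlb_split(orig_vma, end);
	}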
From: Yosry Ahmed <yosry.ahmed@linux.dev> mainline inclusion from mainline-v6.15-rc1 commit c11bcbc0a517acf69282c8225059b2a8ac5fe628 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IBLDIU CVE: CVE-2025-21693 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i... -------------------------------- Currently, zswap_cpu_comp_dead() calls crypto_free_acomp() while holding the per-CPU acomp_ctx mutex. crypto_free_acomp() then holds scomp_lock (through crypto_exit_scomp_ops_async()). On the other hand, crypto_alloc_acomp_node() holds the scomp_lock (through crypto_scomp_init_tfm()), and then allocates memory. If the allocation results in reclaim, we may attempt to hold the per-CPU acomp_ctx mutex. The above dependencies can cause an ABBA deadlock. For example in the following scenario: (1) Task A running on CPU #1: crypto_alloc_acomp_node() Holds scomp_lock Enters reclaim Reads per_cpu_ptr(pool->acomp_ctx, 1) (2) Task A is descheduled (3) CPU #1 goes offline zswap_cpu_comp_dead(CPU #1) Holds per_cpu_ptr(pool->acomp_ctx, 1)) Calls crypto_free_acomp() Waits for scomp_lock (4) Task A running on CPU #2: Waits for per_cpu_ptr(pool->acomp_ctx, 1) // Read on CPU #1 DEADLOCK Since there is no requirement to call crypto_free_acomp() with the per-CPU acomp_ctx mutex held in zswap_cpu_comp_dead(), move it after the mutex is unlocked. Also move the acomp_request_free() and kfree() calls for consistency and to avoid any potential sublte locking dependencies in the future. With this, only setting acomp_ctx fields to NULL occurs with the mutex held. This is similar to how zswap_cpu_comp_prepare() only initializes acomp_ctx fields with the mutex held, after performing all allocations before holding the mutex. Opportunistically, move the NULL check on acomp_ctx so that it takes place before the mutex dereference. Link: https://lkml.kernel.org/r/20250226185625.2672936-1-yosry.ahmed@linux.dev Fixes: 12dcb0ef5406 ("mm: zswap: properly synchronize freeing resources during CPU hotunplug") Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Co-developed-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev> Reported-by: syzbot+1a517ccfcbc6a7ab0f82@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67bcea51.050a0220.bbfd1.0096.GAE@google.com/ Acked-by: Herbert Xu <herbert@gondor.apana.org.au> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev> Reviewed-by: Nhat Pham <nphamcs@gmail.com> Tested-by: Nhat Pham <nphamcs@gmail.com> Cc: David S. 
Miller <davem@davemloft.net> Cc: Eric Biggers <ebiggers@kernel.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Chris Murphy <lists@colorremedies.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Conflicts: mm/zswap.c [Wupeng Ma: context conflicts] Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- mm/zswap.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 54cce59fc9b9..fdddf004bd4f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -466,18 +466,32 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct acomp_req *req; + struct crypto_acomp *acomp; + u8 *buffer; + + if (IS_ERR_OR_NULL(acomp_ctx)) + return 0; mutex_lock(&acomp_ctx->mutex); - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - acomp_ctx->req = NULL; - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } + req = acomp_ctx->req; + acomp = acomp_ctx->acomp; + buffer = acomp_ctx->buffer; + acomp_ctx->req = NULL; + acomp_ctx->acomp = NULL; + acomp_ctx->buffer = NULL; mutex_unlock(&acomp_ctx->mutex); + /* + * Do the actual freeing after releasing the mutex to avoid subtle + * locking dependencies causing deadlocks. + */ + if (!IS_ERR_OR_NULL(req)) + acomp_request_free(req); + if (!IS_ERR_OR_NULL(acomp)) + crypto_free_acomp(acomp); + kfree(buffer); + return 0; } -- 2.43.0
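The fix uses the familiar "detach under the lock, free after dropping it" pattern, so that crypto_free_acomp() (which takes scomp_lock) can never run while acomp_ctx->mutex is held and thus cannot close the ABBA cycle described above. Condensed from the hunk above:

	/* zswap_cpu_comp_dead(), condensed */
	mutex_lock(&acomp_ctx->mutex);
	req = acomp_ctx->req;			/* detach the resources ...         */
	acomp = acomp_ctx->acomp;
	buffer = acomp_ctx->buffer;
	acomp_ctx->req = NULL;			/* ... and mark the context offline */
	acomp_ctx->acomp = NULL;
	acomp_ctx->buffer = NULL;
	mutex_unlock(&acomp_ctx->mutex);

	/* freeing may take scomp_lock; doing it here breaks the deadlock */
	if (!IS_ERR_OR_NULL(req))
		acomp_request_free(req);
	if (!IS_ERR_OR_NULL(acomp))
		crypto_free_acomp(acomp);
	kfree(buffer);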
Feedback: The patch(es) you have sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/19283
Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/NUW...