[PATCH openEuler-26.09 0/3] Fix device memory leaks in XSched
Fix device memory leaks in XSched Liu Kai (3): xsched/dmem: ensure XSched Context creation before device memory allocation xsched/dmem: add size field to xsched_dmem_pool for dmem region tracking xsched/dmem: ensure dmem region release on AI task completion to prevent memory leaks include/linux/xsched.h | 2 ++ include/uapi/linux/xcu_vstream.h | 2 +- kernel/xsched/dmem.c | 38 +++++++++++++++++++++++++++----- kernel/xsched/vstream.c | 22 +++++++++++++++--- 4 files changed, 55 insertions(+), 9 deletions(-) -- 2.34.1
hulk inclusion category: bugfix bugzilla: https://atomgit.com/openeuler/kernel/issues/8697 ---------------------------------------- When AI tasks request device memory, the timing of this request may precede the creation of the XSched Context. This can lead to failures in device memory allocation due to the absence of a valid context. To address this issue, this commit introduces a check during device memory allocation to verify the existence of the XSched Context. If the context is not found, it will be created prior to proceeding with the memory allocation. This ensures that all subsequent operations have access to a valid XSched Context, thereby preventing memory allocation failures caused by missing contexts. In addition, the VSTREAM_ALLOC_HBM command is renamed to VSTREAM_HBM_ALLOC so that its naming is consistent with VSTREAM_HBM_FREE. Fixes: 0bfa64812e4b ("xsched/dmem: introduce xsched_dmem_alloc()") Signed-off-by: Liu Kai <liukai284@huawei.com> --- include/uapi/linux/xcu_vstream.h | 2 +- kernel/xsched/dmem.c | 1 + kernel/xsched/vstream.c | 18 +++++++++++++++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/xcu_vstream.h b/include/uapi/linux/xcu_vstream.h index 14552fae2159..52abcae44928 100644 --- a/include/uapi/linux/xcu_vstream.h +++ b/include/uapi/linux/xcu_vstream.h @@ -22,7 +22,7 @@ typedef enum VSTREAM_COMMAND { VSTREAM_ALLOC = 0, VSTREAM_FREE, VSTREAM_KICK, - VSTREAM_ALLOC_HBM, + VSTREAM_HBM_ALLOC, VSTREAM_HBM_FREE, MAX_COMMAND } vstream_command_t; diff --git a/kernel/xsched/dmem.c b/kernel/xsched/dmem.c index 530fe77ad331..b202aac6a693 100644 --- a/kernel/xsched/dmem.c +++ b/kernel/xsched/dmem.c @@ -21,6 +21,7 @@ #include <linux/xsched.h> #include <linux/types.h> #include <linux/cgroup_dmem.h> +#include <linux/sizes.h> static struct dmem_cgroup_region *hbm_regions[XSCHED_NR_CUS]; diff --git a/kernel/xsched/vstream.c b/kernel/xsched/vstream.c index 7b769a2e2545..f9998912247e 100644 --- a/kernel/xsched/vstream.c +++ b/kernel/xsched/vstream.c @@ -623,17 +623,29 @@ int vstream_kick(struct vstream_args *arg) static int vstream_hbm_alloc(struct 
vstream_args *arg) { + vstream_info_t vstream_info; struct xsched_cu *xcu_found; struct xsched_context *ctx; + int ret; xcu_found = xcu_find(XCU_TYPE_XPU, arg->dev_id, arg->channel_id); if (!xcu_found) return -EINVAL; - ctx = ctx_find_by_tgid_and_xcu(current->tgid, xcu_found); - if (!ctx) { + vstream_info.tgid = current->tgid; + vstream_info.xcu = xcu_found; + vstream_info.dev_id = arg->dev_id; + vstream_info.channel_id = arg->channel_id; + vstream_info.fd = arg->fd; + + /* it will either allocate or find a context */ + mutex_lock(&xcu_found->ctx_list_lock); + ret = alloc_ctx_from_vstream(&vstream_info, &ctx); + mutex_unlock(&xcu_found->ctx_list_lock); + + if (ret) { XSCHED_ERR("Failed to find a context for HBM alloc"); - return -EINVAL; + return ret; } return xsched_dmem_alloc(ctx, arg); -- 2.34.1
hulk inclusion category: bugfix bugzilla: https://atomgit.com/openeuler/kernel/issues/8697 ---------------------------------------- To properly release device memory (dmem) regions allocated from an xsched_dmem_pool, the pool must retain knowledge of the size of each allocated region. Currently, this information is not stored, making it impossible to safely free variable-sized dmem allocations. This commit adds a 'size' member to struct xsched_dmem_pool to record the total size of the underlying dmem region. This enables correct deallocation via the device memory management interface, which requires both the base address and the allocation size. The change is minimal and backward-compatible, as the new field is only used during pool teardown or explicit dmem release paths. Fixes: ff8b804d8161 ("xsched/dmem: introduce xsched_dmem_free()") Signed-off-by: Liu Kai <liukai284@huawei.com> --- kernel/xsched/dmem.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/xsched/dmem.c b/kernel/xsched/dmem.c index b202aac6a693..05e93d564808 100644 --- a/kernel/xsched/dmem.c +++ b/kernel/xsched/dmem.c @@ -27,6 +27,7 @@ static struct dmem_cgroup_region *hbm_regions[XSCHED_NR_CUS]; struct xsched_dmem_pool { uint64_t id; + uint64_t size; struct dmem_cgroup_pool_state *pool; struct list_head pool_node; }; @@ -89,6 +90,7 @@ int xsched_dmem_alloc(struct xsched_context *ctx, struct vstream_args *args) } new_pool->pool = ret_pool; + new_pool->size = args->vh_args.size; /* protect list using ctx_lock */ spin_lock(&ctx->ctx_lock); @@ -98,7 +100,7 @@ int xsched_dmem_alloc(struct xsched_context *ctx, struct vstream_args *args) args->vh_args.pool_id = new_pool->id; XSCHED_DEBUG("charged %llu bytes, new_alloc = %p with id %llu", - args->vh_args.size, new_pool, new_pool->id); + new_pool->size, new_pool, new_pool->id); return 0; @@ -112,10 +114,11 @@ int xsched_dmem_alloc(struct xsched_context *ctx, struct vstream_args *args) int xsched_dmem_free(struct 
xsched_context *ctx, struct vstream_args *args) { struct xsched_dmem_pool *pool, *target = NULL; + uint64_t pool_id = args->vh_args.pool_id; spin_lock(&ctx->ctx_lock); list_for_each_entry(pool, &ctx->pool_list, pool_node) { - if (pool->id == args->vh_args.pool_id) { + if (pool->id == pool_id) { list_del(&pool->pool_node); target = pool; break; @@ -124,13 +127,13 @@ int xsched_dmem_free(struct xsched_context *ctx, struct vstream_args *args) spin_unlock(&ctx->ctx_lock); if (!target) { - XSCHED_ERR("pool with id %llu is not found\n", args->vh_args.pool_id); + XSCHED_ERR("pool with id %llu is not found\n", pool_id); return -EINVAL; } XSCHED_DEBUG("uncharged %llu bytes for pool = %p with id %llu\n", - args->vh_args.size, target, target->id); - dmem_cgroup_uncharge(target->pool, args->vh_args.size); + target->size, target, target->id); + dmem_cgroup_uncharge(target->pool, target->size); kfree(target); return 0; -- 2.34.1
hulk inclusion category: bugfix bugzilla: https://atomgit.com/openeuler/kernel/issues/8697 ---------------------------------------- When AI tasks complete, they may attempt to release associated device memory (dmem) regions. However, if the XSched Context is not available at this point, the dmem regions cannot be properly freed, leading to memory leaks. To address this issue, this commit introduces logic to ensure that all dmem regions are released when an AI task exits. The context lookup in vstream_hbm_free() is also performed under ctx_list_lock to avoid racing with context teardown. These changes prevent memory leaks by ensuring that all allocated dmem resources are properly freed when an AI task completes, improving system stability and resource management. Fixes: ff8b804d8161 ("xsched/dmem: introduce xsched_dmem_free()") Signed-off-by: Liu Kai <liukai284@huawei.com> --- include/linux/xsched.h | 2 ++ kernel/xsched/dmem.c | 24 ++++++++++++++++++++++++ kernel/xsched/vstream.c | 4 ++++ 3 files changed, 30 insertions(+) diff --git a/include/linux/xsched.h b/include/linux/xsched.h index 37e893299a0b..bb11e528bbc4 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -484,11 +484,13 @@ void xsched_quota_refill(struct work_struct *work); int xsched_dmem_init(void); int xsched_dmem_alloc(struct xsched_context *ctx, struct vstream_args *args); int xsched_dmem_free(struct xsched_context *ctx, struct vstream_args *args); +void xsched_dmem_clear(struct xsched_context *ctx); #else static inline int xsched_dmem_alloc( struct xsched_context *ctx, struct vstream_args *args) { return 0; } static inline int xsched_dmem_free( struct xsched_context *ctx, struct vstream_args *args) { return 0; } +static inline void xsched_dmem_clear(struct xsched_context *ctx) { } #endif /* CONFIG_CGROUP_DMEM */ #endif /* !__LINUX_XSCHED_H__ */ diff --git a/kernel/xsched/dmem.c b/kernel/xsched/dmem.c index 05e93d564808..b655058aa5b5 100644 --- a/kernel/xsched/dmem.c +++ b/kernel/xsched/dmem.c @@ -138,3 +138,27 @@ int xsched_dmem_free(struct xsched_context *ctx, struct vstream_args *args) return 0; } 
+ +void xsched_dmem_clear(struct xsched_context *ctx) +{ + struct xsched_dmem_pool *pool, *tmp; + + if (!ctx) + return; + + spin_lock(&ctx->ctx_lock); + + list_for_each_entry_safe(pool, tmp, &ctx->pool_list, pool_node) { + list_del(&pool->pool_node); + spin_unlock(&ctx->ctx_lock); + + XSCHED_DEBUG("uncharged %llu bytes for pool = %p with id %llu\n", + pool->size, pool, pool->id); + dmem_cgroup_uncharge(pool->pool, pool->size); + kfree(pool); + + spin_lock(&ctx->ctx_lock); + } + + spin_unlock(&ctx->ctx_lock); +} diff --git a/kernel/xsched/vstream.c b/kernel/xsched/vstream.c index f9998912247e..4f4f071539f1 100644 --- a/kernel/xsched/vstream.c +++ b/kernel/xsched/vstream.c @@ -80,6 +80,7 @@ static void xsched_task_free(struct kref *kref) usleep_range(100, 200); mutex_lock(&xcu->ctx_list_lock); + xsched_dmem_clear(ctx); list_for_each_entry_safe(vs, tmp, &ctx->vstream_list, ctx_node) { list_del(&vs->ctx_node); kfree(vs); @@ -660,7 +661,10 @@ static int vstream_hbm_free(struct vstream_args *arg) if (!xcu_found) return -EINVAL; + mutex_lock(&xcu_found->ctx_list_lock); ctx = ctx_find_by_tgid_and_xcu(current->tgid, xcu_found); + mutex_unlock(&xcu_found->ctx_list_lock); + if (!ctx) { XSCHED_ERR("Failed to find a context for HBM free"); return -EINVAL; -- 2.34.1
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://atomgit.com/openeuler/kernel/merge_requests/21180 邮件列表地址:https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/AZ7... FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://atomgit.com/openeuler/kernel/merge_requests/21180 Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/AZ7...
participants (2)
-
Liu Kai -
patchwork bot