hulk inclusion category: bugfix bugzilla: https://atomgit.com/openeuler/kernel/issues/8697 ---------------------------------------- When AI tasks complete, they may attempt to release associated device memory (dmem) regions. However, if the XSched Context is not available at this point, the dmem regions cannot be properly freed, leading to memory leaks. To address this issue, this commit introduces logic to ensure that all dmem regions are released when an AI task exits. These changes prevent memory leaks by ensuring that all allocated dmem resources are properly freed when an AI task completes, improving system stability and resource management. Fixes: ff8b804d8161 ("xsched/dmem: introduce xsched_dmem_free()") Signed-off-by: Liu Kai <liukai284@huawei.com> --- include/linux/xsched.h | 2 ++ kernel/xsched/dmem.c | 24 ++++++++++++++++++++++++ kernel/xsched/vstream.c | 4 ++++ 3 files changed, 30 insertions(+) diff --git a/include/linux/xsched.h b/include/linux/xsched.h index 37e893299a0b..bb11e528bbc4 100644 --- a/include/linux/xsched.h +++ b/include/linux/xsched.h @@ -484,11 +484,13 @@ void xsched_quota_refill(struct work_struct *work); int xsched_dmem_init(void); int xsched_dmem_alloc(struct xsched_context *ctx, struct vstream_args *args); int xsched_dmem_free(struct xsched_context *ctx, struct vstream_args *args); +void xsched_dmem_clear(struct xsched_context *ctx); #else static inline int xsched_dmem_alloc( struct xsched_context *ctx, struct vstream_args *args) { return 0; } static inline int xsched_dmem_free( struct xsched_context *ctx, struct vstream_args *args) { return 0; } +static inline void xsched_dmem_clear(struct xsched_context *ctx) { } #endif /* CONFIG_CGROUP_DMEM */ #endif /* !__LINUX_XSCHED_H__ */ diff --git a/kernel/xsched/dmem.c b/kernel/xsched/dmem.c index 05e93d564808..b655058aa5b5 100644 --- a/kernel/xsched/dmem.c +++ b/kernel/xsched/dmem.c @@ -138,3 +138,27 @@ int xsched_dmem_free(struct xsched_context *ctx, struct vstream_args *args) return 0; } 
+
+void xsched_dmem_clear(struct xsched_context *ctx)
+{
+	struct xsched_dmem_pool *pool, *tmp;
+	LIST_HEAD(free_list);
+
+	if (!ctx)
+		return;
+
+	/* Detach the whole list under ctx_lock, then free the entries
+	 * outside it: unlocking mid-walk would let a concurrent free
+	 * invalidate the "tmp" walk cursor (use-after-free). */
+	spin_lock(&ctx->ctx_lock);
+	list_splice_init(&ctx->pool_list, &free_list);
+	spin_unlock(&ctx->ctx_lock);
+
+	list_for_each_entry_safe(pool, tmp, &free_list, pool_node) {
+		list_del(&pool->pool_node);
+		XSCHED_DEBUG("uncharged %llu bytes for pool = %p with id %llu\n",
+			     pool->size, pool, pool->id);
+		dmem_cgroup_uncharge(pool->pool, pool->size);
+		kfree(pool);
+	}
+}
diff --git a/kernel/xsched/vstream.c b/kernel/xsched/vstream.c
index f9998912247e..4f4f071539f1 100644
--- a/kernel/xsched/vstream.c
+++ b/kernel/xsched/vstream.c
@@ -80,6 +80,7 @@ static void xsched_task_free(struct kref *kref)
 	usleep_range(100, 200);
 
 	mutex_lock(&xcu->ctx_list_lock);
+	xsched_dmem_clear(ctx);
 	list_for_each_entry_safe(vs, tmp, &ctx->vstream_list, ctx_node) {
 		list_del(&vs->ctx_node);
 		kfree(vs);
@@ -660,7 +661,10 @@ static int vstream_hbm_free(struct vstream_args *arg)
 	if (!xcu_found)
 		return -EINVAL;
 
+	mutex_lock(&xcu_found->ctx_list_lock);
 	ctx = ctx_find_by_tgid_and_xcu(current->tgid, xcu_found);
+	mutex_unlock(&xcu_found->ctx_list_lock);
+
 	if (!ctx) {
 		XSCHED_ERR("Failed to find a context for HBM free");
 		return -EINVAL;
-- 
2.34.1