Mukul Joshi (1): drm/amdkfd: Rework kfd_locked handling
Zhigang Luo (1): amd/amdkfd: sync all devices to wait all processes being evicted
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  3 ---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  | 27 +++++++++++++++---------
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  8 ++++++-
 4 files changed, 25 insertions(+), 14 deletions(-)
From: Mukul Joshi <mukul.joshi@amd.com>
mainline inclusion
from mainline-v6.5-rc1
commit fe1f05df5919c67c3add49efb55e251a8d78ee4e
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9U8NU
CVE: CVE-2024-36949
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently, even if kfd_locked is set, a process is first created and then removed to work around a race condition in updating the kfd_locked flag. Rework kfd_locked handling to ensure no process is created if kfd_locked is set. This is achieved by updating kfd_locked under kfd_processes_mutex. With that, kfd_locked no longer needs to be an atomic counter and can be a regular integer.
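To make the scheme easier to follow, here is a minimal, self-contained userspace sketch of the same pattern (pthreads; gpu_locked, processes_lock, create_process and the other names are invented for illustration and are not the driver's identifiers): every reader and writer of the counter holds one mutex, so a plain int replaces the atomic, and process creation is refused under that same mutex, so a process is never created first and torn down afterwards.

/*
 * Illustrative userspace analogue only, not the driver's code.
 * gpu_locked plays the role of kfd_locked; processes_lock plays the
 * role of kfd_processes_mutex.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t processes_lock = PTHREAD_MUTEX_INITIALIZER;
static int gpu_locked;	/* > 0 while any device is suspended or in reset */

static int create_process(void)
{
	pthread_mutex_lock(&processes_lock);
	if (gpu_locked > 0) {
		/* Refuse creation; nothing to undo later. */
		pthread_mutex_unlock(&processes_lock);
		return -EAGAIN;
	}
	printf("process created\n");
	pthread_mutex_unlock(&processes_lock);
	return 0;
}

static void device_suspend(void)
{
	int count;

	pthread_mutex_lock(&processes_lock);
	count = ++gpu_locked;
	pthread_mutex_unlock(&processes_lock);
	if (count == 1)
		printf("first device: evict all processes\n");
}

static void device_resume(void)
{
	int count;

	pthread_mutex_lock(&processes_lock);
	count = --gpu_locked;
	pthread_mutex_unlock(&processes_lock);
	if (count == 0)
		printf("last device: restore all processes\n");
}

int main(void)
{
	device_suspend();
	if (create_process() == -EAGAIN)
		printf("creation refused while locked\n");
	device_resume();
	return create_process();
}

Because the check and the counter update share one lock, there is no window in which a creator can observe a stale value of the counter, which is exactly the race the create-then-remove workaround used to paper over.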
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Conflicts:
	drivers/gpu/drm/amd/amdkfd/kfd_device.c
	drivers/gpu/drm/amd/amdkfd/kfd_process.c
	drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
	drivers/gpu/drm/amd/amdkfd/kfd_priv.h
[Some contexts different. No functional impact.]
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  3 ---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  | 24 +++++++++++++++++-------
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  8 +++++++-
 4 files changed, 25 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 0b19edf54178..1e8f7d5f67fb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -122,9 +122,6 @@ static int kfd_open(struct inode *inode, struct file *filep)
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 
-	if (kfd_is_locked())
-		return -EAGAIN;
-
 	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
 		process->pasid, process->is_32bit_user_mode);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 938d0053a820..e3d7a408d659 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -36,7 +36,7 @@
  * once locked, kfd driver will stop any further GPU execution.
  * create process (open) will return -EAGAIN.
  */
-static atomic_t kfd_locked = ATOMIC_INIT(0);
+static int kfd_locked;
 
 #ifdef KFD_SUPPORT_IOMMU_V2
 static const struct kfd_device_info kaveri_device_info = {
@@ -577,7 +577,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 
 int kgd2kfd_post_reset(struct kfd_dev *kfd)
 {
-	int ret, count;
+	int ret;
 
 	if (!kfd->init_complete)
 		return 0;
@@ -587,23 +587,31 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 	ret = kfd_resume(kfd);
 	if (ret)
 		return ret;
-	count = atomic_dec_return(&kfd_locked);
-	WARN_ONCE(count != 0, "KFD reset ref. error");
+
+	mutex_lock(&kfd_processes_mutex);
+	--kfd_locked;
+	mutex_unlock(&kfd_processes_mutex);
 
 	return 0;
 }
 
 bool kfd_is_locked(void)
 {
-	return (atomic_read(&kfd_locked) > 0);
+	lockdep_assert_held(&kfd_processes_mutex);
+	return (kfd_locked > 0);
 }
 
 void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
+	int count;
 	if (!kfd->init_complete)
 		return;
 
+	mutex_lock(&kfd_processes_mutex);
+	count = ++kfd_locked;
+	mutex_unlock(&kfd_processes_mutex);
 	/* For first KFD device suspend all the KFD processes */
-	if (atomic_inc_return(&kfd_locked) == 1)
+	if (count == 1)
 		kfd_suspend_all_processes();
 	kfd->dqm->ops.stop(kfd->dqm);
 
@@ -622,7 +630,9 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
 	if (ret)
 		return ret;
 
-	count = atomic_dec_return(&kfd_locked);
+	mutex_lock(&kfd_processes_mutex);
+	count = --kfd_locked;
+	mutex_unlock(&kfd_processes_mutex);
 	WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
 	if (count == 0)
 		ret = kfd_resume_all_processes();
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 92b285ca73aa..e04b2bf61a3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -148,6 +148,7 @@ extern int noretry;
  * Halt if HWS hang is detected
  */
 extern int halt_if_hws_hang;
+extern struct mutex kfd_processes_mutex;
 
 /**
  * enum kfd_sched_policy
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 4694386cc623..458e7af54a0c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -44,7 +44,7 @@ struct mm_struct;
  * Unique/indexed by mm_struct*
  */
 DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE);
-static DEFINE_MUTEX(kfd_processes_mutex);
+DEFINE_MUTEX(kfd_processes_mutex);
DEFINE_SRCU(kfd_processes_srcu);
@@ -220,6 +220,12 @@ struct kfd_process *kfd_create_process(struct file *filep)
 	 */
 	mutex_lock(&kfd_processes_mutex);
 
+	if (kfd_is_locked()) {
+		mutex_unlock(&kfd_processes_mutex);
+		pr_debug("KFD is locked! Cannot create process");
+		return ERR_PTR(-EINVAL);
+	}
+
 	/* A prior open of /dev/kfd could have already created the process. */
 	process = find_process(thread);
 	if (process)
From: Zhigang Luo <Zhigang.Luo@amd.com>
mainline inclusion
from mainline-v6.10-rc1
commit dfb15c4ab58658aaa6161b546e7eb852ae7cc132
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9U8NU
CVE: CVE-2024-36949
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If more than one device performs a reset in parallel, the first device calls kfd_suspend_all_processes() to evict all processes on all devices, and this call takes time to finish. The other devices start reset and recovery without waiting. If the processes have not been evicted before recovery begins, they are restored, which then causes page faults.
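As an illustration of the fix, the sketch below (again a userspace pthread analogue with invented names such as gpu_locked, processes_lock and evict_all_processes, not the driver's code) keeps the mutex held across the evict-all step, so a second device entering suspend blocks until the first device's eviction has completed instead of starting recovery with processes still resident.

/* Illustrative sketch only; see the actual diff below for the real change. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t processes_lock = PTHREAD_MUTEX_INITIALIZER;
static int gpu_locked;

static void evict_all_processes(void)
{
	/* Stand-in for the slow eviction of every process on every device. */
	printf("evicting all processes...\n");
}

static void device_suspend(void)
{
	pthread_mutex_lock(&processes_lock);
	if (++gpu_locked == 1)
		evict_all_processes();	/* finishes before the lock is dropped */
	pthread_mutex_unlock(&processes_lock);
}

static void device_resume(void)
{
	pthread_mutex_lock(&processes_lock);
	if (--gpu_locked == 0)
		printf("restoring all processes\n");
	pthread_mutex_unlock(&processes_lock);
}

int main(void)
{
	device_suspend();	/* first device: evicts while holding the lock */
	device_suspend();	/* second device: cannot proceed until eviction is done */
	device_resume();
	device_resume();
	return 0;
}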
Signed-off-by: Zhigang Luo <Zhigang.Luo@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Conflicts:
	drivers/gpu/drm/amd/amdkfd/kfd_device.c
[Some contexts different. No functional impact.]
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index e3d7a408d659..9f2eb8cf744a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -603,16 +603,14 @@ bool kfd_is_locked(void)
 
 void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
-	int count;
 	if (!kfd->init_complete)
 		return;
 
 	mutex_lock(&kfd_processes_mutex);
-	count = ++kfd_locked;
-	mutex_unlock(&kfd_processes_mutex);
 	/* For first KFD device suspend all the KFD processes */
-	if (count == 1)
+	if (++kfd_locked == 1)
 		kfd_suspend_all_processes();
+	mutex_unlock(&kfd_processes_mutex);
kfd->dqm->ops.stop(kfd->dqm);
@@ -621,7 +619,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
 
 int kgd2kfd_resume(struct kfd_dev *kfd)
 {
-	int ret, count;
+	int ret;
 
 	if (!kfd->init_complete)
 		return 0;
@@ -631,11 +629,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
 		return ret;
 
 	mutex_lock(&kfd_processes_mutex);
-	count = --kfd_locked;
-	mutex_unlock(&kfd_processes_mutex);
-	WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
-	if (count == 0)
+	if (--kfd_locked == 0)
 		ret = kfd_resume_all_processes();
+	WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
+	mutex_unlock(&kfd_processes_mutex);
 
 	return ret;
 }
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/9744 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/5...