Fix CVE-2024-36949
Mukul Joshi (1): drm/amdkfd: Rework kfd_locked handling
Zhigang Luo (1): amd/amdkfd: sync all devices to wait all processes being evicted
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 7 ------- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 23 ++++++++++++++--------- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 8 +++++++- 4 files changed, 23 insertions(+), 17 deletions(-)
From: Mukul Joshi mukul.joshi@amd.com
mainline inclusion from mainline-v6.5-rc1 commit fe1f05df5919c67c3add49efb55e251a8d78ee4e category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9U8NU CVE: CVE-2024-36949
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently, even if kfd_locked is set, a process is first created and then removed to work around a race condition in updating kfd_locked flag. Rework kfd_locked handling to ensure no processes is created if kfd_locked is set. This is achieved by updating kfd_locked under kfd_processes_mutex. With this there is no need for kfd_locked to be an atomic counter. Instead, it can be a regular integer.
Signed-off-by: Mukul Joshi mukul.joshi@amd.com Reviewed-by: Felix Kuehling Felix.Kuehling@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com
Conflicts: drivers/gpu/drm/amd/amdkfd/kfd_device.c drivers/gpu/drm/amd/amdkfd/kfd_process.c [Some contexts different. No functional impact.] Signed-off-by: Zheng Zucheng zhengzucheng@huawei.com --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 7 ------- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 21 +++++++++++++++------ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 8 +++++++- 4 files changed, 24 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 799a91a064a1..2a99edf8666e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -128,13 +128,6 @@ static int kfd_open(struct inode *inode, struct file *filep) if (IS_ERR(process)) return PTR_ERR(process);
- if (kfd_is_locked()) { - dev_dbg(kfd_device, "kfd is locked!\n" - "process %d unreferenced", process->pasid); - kfd_unref_process(process); - return -EAGAIN; - } - /* filep now owns the reference returned by kfd_create_process */ filep->private_data = process;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 148e43dee657..bf8474d40f10 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -38,7 +38,7 @@ * once locked, kfd driver will stop any further GPU execution. * create process (open) will return -EAGAIN. */ -static atomic_t kfd_locked = ATOMIC_INIT(0); +static int kfd_locked;
#ifdef CONFIG_DRM_AMDGPU_CIK extern const struct kfd2kgd_calls gfx_v7_kfd2kgd; @@ -842,8 +842,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) ret = kfd_resume(kfd); if (ret) return ret; - atomic_dec(&kfd_locked); - + mutex_lock(&kfd_processes_mutex); + --kfd_locked; + mutex_unlock(&kfd_processes_mutex); atomic_set(&kfd->sram_ecc_flag, 0);
kfd_smi_event_update_gpu_reset(kfd, true); @@ -853,18 +854,23 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
bool kfd_is_locked(void) { - return (atomic_read(&kfd_locked) > 0); + lockdep_assert_held(&kfd_processes_mutex); + return (kfd_locked > 0); }
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { + int count; if (!kfd->init_complete) return;
/* for runtime suspend, skip locking kfd */ if (!run_pm) { + mutex_lock(&kfd_processes_mutex); + count = ++kfd_locked; + mutex_unlock(&kfd_processes_mutex); /* For first KFD device suspend all the KFD processes */ - if (atomic_inc_return(&kfd_locked) == 1) + if (count == 1) kfd_suspend_all_processes(); }
@@ -885,7 +891,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
/* for runtime resume, skip unlocking kfd */ if (!run_pm) { - count = atomic_dec_return(&kfd_locked); + mutex_lock(&kfd_processes_mutex); + count = --kfd_locked; + mutex_unlock(&kfd_processes_mutex); + WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); if (count == 0) ret = kfd_resume_all_processes(); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 057c48a9b53a..b3b4da634f71 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -172,6 +172,8 @@ extern int queue_preemption_timeout_ms; /* Enable eviction debug messages */ extern bool debug_evictions;
+extern struct mutex kfd_processes_mutex; + enum cache_policy { cache_policy_coherent, cache_policy_noncoherent diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d243e60c6eef..aa1e9d7a2507 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -48,7 +48,7 @@ struct mm_struct; * Unique/indexed by mm_struct* */ DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); -static DEFINE_MUTEX(kfd_processes_mutex); +DEFINE_MUTEX(kfd_processes_mutex);
DEFINE_SRCU(kfd_processes_srcu);
@@ -761,6 +761,12 @@ struct kfd_process *kfd_create_process(struct file *filep) */ mutex_lock(&kfd_processes_mutex);
+ if (kfd_is_locked()) { + mutex_unlock(&kfd_processes_mutex); + pr_debug("KFD is locked! Cannot create process"); + return ERR_PTR(-EINVAL); + } + /* A prior open of /dev/kfd could have already created the process. */ process = find_process(thread); if (process) {
From: Zhigang Luo Zhigang.Luo@amd.com
mainline inclusion from mainline-v6.10-rc1 commit dfb15c4ab58658aaa6161b546e7eb852ae7cc132 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9U8NU CVE: CVE-2024-36949
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If there are more than one device doing reset in parallel, the first device will call kfd_suspend_all_processes() to evict all processes on all devices, this call takes time to finish. other device will start reset and recover without waiting. if the process has not been evicted before doing recover, it will be restored, then caused page fault.
Signed-off-by: Zhigang Luo Zhigang.Luo@amd.com Reviewed-by: Felix Kuehling felix.kuehling@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com
Conflicts: drivers/gpu/drm/amd/amdkfd/kfd_device.c [Some contexts different. No functional impact.] Signed-off-by: Zheng Zucheng zhengzucheng@huawei.com --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index bf8474d40f10..04126ffbc973 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -860,18 +860,16 @@ bool kfd_is_locked(void)
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { - int count; if (!kfd->init_complete) return;
/* for runtime suspend, skip locking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = ++kfd_locked; - mutex_unlock(&kfd_processes_mutex); /* For first KFD device suspend all the KFD processes */ - if (count == 1) + if (++kfd_locked == 1) kfd_suspend_all_processes(); + mutex_unlock(&kfd_processes_mutex); }
kfd->dqm->ops.stop(kfd->dqm); @@ -880,7 +878,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) { - int ret, count; + int ret;
if (!kfd->init_complete) return 0; @@ -892,12 +890,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) /* for runtime resume, skip unlocking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = --kfd_locked; - mutex_unlock(&kfd_processes_mutex); - - WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); - if (count == 0) + if (--kfd_locked == 0) ret = kfd_resume_all_processes(); + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error"); + mutex_unlock(&kfd_processes_mutex); }
return ret;
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/8969 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/L...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/8969 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/L...