From: Asad Kamal asad.kamal@amd.com
mainline inclusion from mainline-v6.9-rc1 commit 601429cca96b4af3be44172c3b64e4228515dbe1 category: bugfix bugzilla: 189933 CVE: CVE-2024-35931
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
If one of the devices in the hive detects a fatal error, need to send ras recovery reset message to PMFW of all devices in the hive. For that add a flag in hive to indicate that it's undergoing ras recovery
Signed-off-by: Asad Kamal asad.kamal@amd.com Reviewed-by: Hawking Zhang Hawking.Zhang@amd.com Reviewed-by: Lijo Lazar lijo.lazar@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Conflicts: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h drivers/gpu/drm/amd/amdgpu/amdgpu_device.c [adjust context conflicts] Signed-off-by: Liu Chuang liuchuang40@huawei.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3638f0e12a2b..454473c9db6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1568,9 +1568,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ if (hive) + atomic_set(&hive->ras_recovery, 1); if (!ras->disable_ras_err_cnt_harvest) { - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
/* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { @@ -1587,12 +1589,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_ras_log_on_err_counter(remote_adev); }
- amdgpu_put_xgmi_hive(hive); }
if (amdgpu_device_should_recover_gpu(ras->adev)) amdgpu_device_gpu_recover(ras->adev, NULL); atomic_set(&ras->in_recovery, 0); + if (hive) { + atomic_set(&hive->ras_recovery, 0); + amdgpu_put_xgmi_hive(hive); + } }
/* alloc/realloc bps array */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 148560d63554..a508a5237ac2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -42,6 +42,8 @@ struct amdgpu_hive_info { AMDGPU_XGMI_PSTATE_MAX_VEGA20, AMDGPU_XGMI_PSTATE_UNKNOWN } pstate; + + atomic_t ras_recovery; };
struct amdgpu_pcs_ras_field {