From: Mauro Carvalho Chehab mchehab+huawei@kernel.org
stable inclusion from stable-5.10.50 commit e23dc4a3e8ff7679326f00800821b7a322d75fd9 bugzilla: 174522 https://gitee.com/openeuler/kernel/issues/I4DNFY
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit 4cba5473c5ce0f1389d316c5dc6f83a0259df5eb ]
The Venus code has a sort of watchdog that attempts to recover from IP errors, implemented as a delayed work job, which calls venus_sys_error_handler().
Right now, it has several issues:
1. It assumes that PM runtime resume never fails
2. It internally runs two while() loops that also assume that PM runtime will never fail to go idle:
while (pm_runtime_active(core->dev_dec) || pm_runtime_active(core->dev_enc)) msleep(10);
...
while (core->pmdomains[0] && pm_runtime_active(core->pmdomains[0])) usleep_range(1000, 1500);
3. It uses an OR to merge all return codes and then report to the user
4. If the hardware never recovers, it keeps running on every 10ms, flooding the syslog with 2 messages (so, up to 200 messages per second).
Rework the code, in order to prevent that, by:
1. check the return code from PM runtime resume; 2. don't let the while() loops run forever; 3. store the failed event; 4. use warn ratelimited when it fails to recover.
Fixes: af2c3834c8ca ("[media] media: venus: adding core part and helper functions") Reviewed-by: Jonathan Cameron Jonathan.Cameron@huawei.com Signed-off-by: Mauro Carvalho Chehab mchehab+huawei@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Chen Jun chenjun102@huawei.com Acked-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Chen Jun chenjun102@huawei.com --- drivers/media/platform/qcom/venus/core.c | 60 +++++++++++++++++++----- 1 file changed, 47 insertions(+), 13 deletions(-)
diff --git a/drivers/media/platform/qcom/venus/core.c b/drivers/media/platform/qcom/venus/core.c index fd5993b3e674..58ddebbb8446 100644 --- a/drivers/media/platform/qcom/venus/core.c +++ b/drivers/media/platform/qcom/venus/core.c @@ -48,52 +48,86 @@ static const struct hfi_core_ops venus_core_ops = { .event_notify = venus_event_notify, };
+#define RPM_WAIT_FOR_IDLE_MAX_ATTEMPTS 10 + static void venus_sys_error_handler(struct work_struct *work) { struct venus_core *core = container_of(work, struct venus_core, work.work); - int ret = 0; - - pm_runtime_get_sync(core->dev); + int ret, i, max_attempts = RPM_WAIT_FOR_IDLE_MAX_ATTEMPTS; + const char *err_msg = ""; + bool failed = false; + + ret = pm_runtime_get_sync(core->dev); + if (ret < 0) { + err_msg = "resume runtime PM"; + max_attempts = 0; + failed = true; + }
hfi_core_deinit(core, true);
- dev_warn(core->dev, "system error has occurred, starting recovery!\n"); - mutex_lock(&core->lock);
- while (pm_runtime_active(core->dev_dec) || pm_runtime_active(core->dev_enc)) + for (i = 0; i < max_attempts; i++) { + if (!pm_runtime_active(core->dev_dec) && !pm_runtime_active(core->dev_enc)) + break; msleep(10); + }
venus_shutdown(core);
pm_runtime_put_sync(core->dev);
- while (core->pmdomains[0] && pm_runtime_active(core->pmdomains[0])) + for (i = 0; i < max_attempts; i++) { + if (!core->pmdomains[0] || !pm_runtime_active(core->pmdomains[0])) + break; usleep_range(1000, 1500); + }
hfi_reinit(core);
- pm_runtime_get_sync(core->dev); + ret = pm_runtime_get_sync(core->dev); + if (ret < 0) { + err_msg = "resume runtime PM"; + failed = true; + } + + ret = venus_boot(core); + if (ret && !failed) { + err_msg = "boot Venus"; + failed = true; + }
- ret |= venus_boot(core); - ret |= hfi_core_resume(core, true); + ret = hfi_core_resume(core, true); + if (ret && !failed) { + err_msg = "resume HFI"; + failed = true; + }
enable_irq(core->irq);
mutex_unlock(&core->lock);
- ret |= hfi_core_init(core); + ret = hfi_core_init(core); + if (ret && !failed) { + err_msg = "init HFI"; + failed = true; + }
pm_runtime_put_sync(core->dev);
- if (ret) { + if (failed) { disable_irq_nosync(core->irq); - dev_warn(core->dev, "recovery failed (%d)\n", ret); + dev_warn_ratelimited(core->dev, + "System error has occurred, recovery failed to %s\n", + err_msg); schedule_delayed_work(&core->work, msecs_to_jiffies(10)); return; }
+ dev_warn(core->dev, "system error has occurred (recovered)\n"); + mutex_lock(&core->lock); core->sys_error = false; mutex_unlock(&core->lock);