From: Bob Pearson rpearsonhpe@gmail.com
stable inclusion from stable-v5.10.138 commit 18f62a453b7222d78d59735773a777ac18af78a5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60QFD
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit eff6d998ca297cb0b2e53b032a56cf8e04dd8b17 ]
Limit the maximum number of calls to each tasklet from rxe_do_task() before yielding the cpu. When the limit is reached reschedule the tasklet and exit the calling loop. This patch prevents one tasklet from consuming 100% of a cpu core and causing a deadlock or soft lockup.
Link: https://lore.kernel.org/r/20220630190425.2251-9-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson rpearsonhpe@gmail.com Signed-off-by: Jason Gunthorpe jgg@nvidia.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Reviewed-by: Wei Li liwei391@huawei.com --- drivers/infiniband/sw/rxe/rxe_param.h | 6 ++++++ drivers/infiniband/sw/rxe/rxe_task.c | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index f9fb56ec6dfd..dca86422b0a2 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -98,6 +98,12 @@ enum rxe_device_param { RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, RXE_INFLIGHT_SKBS_PER_QP_LOW = 16,
+ /* Max number of interations of each tasklet + * before yielding the cpu to let other + * work make progress + */ + RXE_MAX_ITERATIONS = 1024, + /* Delay before calling arbiter timer */ RXE_NSEC_ARB_TIMER_DELAY = 200,
diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 6951fdcb31bf..568cf56c236b 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -8,7 +8,7 @@ #include <linux/interrupt.h> #include <linux/hardirq.h>
-#include "rxe_task.h" +#include "rxe.h"
int __rxe_do_task(struct rxe_task *task)
@@ -34,6 +34,7 @@ void rxe_do_task(struct tasklet_struct *t) int ret; unsigned long flags; struct rxe_task *task = from_tasklet(task, t, tasklet); + unsigned int iterations = RXE_MAX_ITERATIONS;
spin_lock_irqsave(&task->state_lock, flags); switch (task->state) { @@ -62,13 +63,20 @@ void rxe_do_task(struct tasklet_struct *t) spin_lock_irqsave(&task->state_lock, flags); switch (task->state) { case TASK_STATE_BUSY: - if (ret) + if (ret) { task->state = TASK_STATE_START; - else + } else if (iterations--) { cont = 1; + } else { + /* reschedule the tasklet and exit + * the loop to give up the cpu + */ + tasklet_schedule(&task->tasklet); + task->state = TASK_STATE_START; + } break;
- /* soneone tried to run the task since the last time we called + /* someone tried to run the task since the last time we called * func, so we will call one more time regardless of the * return value */