In the current IOPF (I/O page fault) processing flow of the IOMMU, each IOPF event creates a queue_work item and puts it on the IOPF workqueue. When the system schedules the work item, page table creation is completed. The basic processing flow is as follows:
     +------------------+
     | recv iopf event  |
     +--------+---------+
              |
     +--------v---------+
     | init iopf event  |
     +--------+---------+
              |
     +--------v---------+
     | create queue_work|
     +--------+---------+
              |
     +--------v-------------+
  +->| wait for queue_work  |
  |  | to run               |
  |  +--------+-------------+
  |           |
  |No    +----v----+
  +------+  run ?  |
         +----+----+
              |Yes
     +--------v----------+
  +->| handle queue_work |
  |  | create page table |
  |  +--------+----------+
  |           |
  |No +-------v---------------+
  +---+ processing completed ?|
      +-------+---------------+
              |Yes
     +--------v--------+
     | return response |
     +-----------------+
In this processing flow, there are two stages that may fail to respond for a long time under high system load: 1. While the work item is waiting in the queue to be scheduled. 2. After the work item has been scheduled, during page table creation.
If either stage cannot complete within the hardware device's limited wait time, a hardware error and reset occur.
So this patch adds a timeout mechanism to the IOPF processing flow. First, a timer is created alongside the queue_work event, and its timeout is set; at the same time, the IOPF event is given an initial state.
Second, the timer measures the interval from when the IOPF event starts waiting until its processing completes.
Third, if processing completes within the timeout period, the timer is simply destroyed. If the IOPF event has not completed when the timer expires, the timer callback fires: it immediately returns a failure response for the current IOPF to the device and sets the IOPF event's state to timed-out.
Finally, when the IOPF event is eventually scheduled by the system, its state is checked; if it is already in the timed-out state, the event is skipped without further processing.
Unlike Intel's earlier timeout timer approach (URL: https://lore.kernel.org/lkml/1565900005-62508-3-git-send-email-jacob.jun.pan@linux.intel.com/):
The current timeout timer will create a timer for each IO page fault request. After the page fault request is processed normally or an error is returned after timeout, the timer will be automatically destroyed and will not be reused.
Signed-off-by: Longfang Liu liulongfang@huawei.com --- drivers/iommu/io-pgfault.c | 63 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index cd7ebbfe4a67..0fb3ec1390a8 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -6,9 +6,11 @@ */
#include <linux/iommu.h> +#include <linux/jiffies.h> #include <linux/list.h> #include <linux/sched/mm.h> #include <linux/slab.h> +#include <linux/timer.h> #include <linux/workqueue.h>
#include "iommu-sva.h" @@ -56,11 +58,20 @@ struct iopf_fault { #define IOPF_RESPONSE_MAX_TIMEOUT 10000 static unsigned long iopf_timeout_ms = IOPF_RESPONSE_DEF_TIMEOUT;
+enum iopf_timer_state { + IOPF_TIME_INIT = 0, + IOPF_IN_PROCESS, + IOPF_COMPLETE, + IOPF_TIME_OUT, +}; + struct iopf_group { struct iopf_fault last_fault; struct list_head faults; struct work_struct work; struct device *dev; + struct timer_list iopf_timer; + atomic_t iopf_state; };
static int __init iommu_set_iopf_timeout(char *str) @@ -101,6 +112,32 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, return iommu_page_response(dev, &resp); }
+static void iopf_timer_func(struct timer_list *t) +{ + struct iopf_group *group = from_timer(group, t, iopf_timer); + + /* Workqueue process time out */ + if (atomic_read(&group->iopf_state) < IOPF_COMPLETE) { + atomic_set(&group->iopf_state, IOPF_TIME_OUT); + iopf_complete_group(group->dev, &group->last_fault, + IOMMU_PAGE_RESP_FAILURE); + dev_err(group->dev, "[timer] iopf workqueue process time out.\n"); + } +} + +static void iopf_timer_setup(struct iopf_group *group) +{ + atomic_set(&group->iopf_state, IOPF_TIME_INIT); + + /* IOPF timeout is 0 means no timeout tracking */ + if (iopf_timeout_ms) { + timer_setup(&group->iopf_timer, iopf_timer_func, 0); + group->iopf_timer.expires = jiffies + + msecs_to_jiffies(iopf_timeout_ms); + mod_timer(&group->iopf_timer, group->iopf_timer.expires); + } +} + static void iopf_handler(struct work_struct *work) { struct iopf_group *group; @@ -109,6 +146,17 @@ static void iopf_handler(struct work_struct *work) enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS;
group = container_of(work, struct iopf_group, work); + if (atomic_read(&group->iopf_state) == IOPF_TIME_OUT) { + list_for_each_entry_safe(iopf, next, &group->faults, list) { + if (!(iopf->fault.prm.flags & + IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) + kfree(iopf); + } + dev_err(group->dev, "iopf workqueue wait time out.\n"); + goto iopf_err; + } + atomic_set(&group->iopf_state, IOPF_IN_PROCESS); + domain = iommu_get_domain_for_dev_pasid(group->dev, group->last_fault.fault.prm.pasid, 0); if (!domain || !domain->iopf_handler) @@ -128,7 +176,16 @@ static void iopf_handler(struct work_struct *work) kfree(iopf); }
+ if (atomic_read(&group->iopf_state) == IOPF_TIME_OUT) { + dev_err(group->dev, "iopf handle fault time out.\n"); + goto iopf_err; + } + + atomic_set(&group->iopf_state, IOPF_COMPLETE); iopf_complete_group(group->dev, &group->last_fault, status); + +iopf_err: + del_timer_sync(&group->iopf_timer); kfree(group); }
@@ -183,7 +240,6 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) struct dev_iommu *param = dev->iommu;
lockdep_assert_held(¶m->lock); - if (fault->type != IOMMU_FAULT_PAGE_REQ) /* Not a recoverable page fault */ return -EOPNOTSUPP; @@ -222,6 +278,8 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie)
group->dev = dev; group->last_fault.fault = *fault; + iopf_timer_setup(group); + INIT_LIST_HEAD(&group->faults); list_add(&group->last_fault.list, &group->faults); INIT_WORK(&group->work, iopf_handler); @@ -414,7 +472,8 @@ struct iopf_queue *iopf_queue_alloc(const char *name) * that's dealt with, the high-level function can handle groups out of * order. */ - queue->wq = alloc_workqueue("iopf_queue/%s", WQ_UNBOUND, 0, name); + queue->wq = alloc_workqueue("iopf_queue/%s", WQ_HIGHPRI | + WQ_MEM_RECLAIM | WQ_UNBOUND, 0, name); if (!queue->wq) { kfree(queue); return NULL;