From: Shao Denghui <shaodenghui@huawei.com>
euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I94XYA
CVE: NA
-------------------------------------------------
Currently, a single thread workqueue has only a single pwq, so all works are
queued to the same worker_pool. This is not optimal on NUMA machines, as it
causes workers to jump around across nodes.
This patch adds a new wq flag, __WQ_DYNAMIC. This new kind of single thread
workqueue creates a separate pwq covering the intersecting CPUs for each NUMA
node which has online CPUs in @attrs->cpumask, instead of mapping all entries
of numa_pwq_tbl[] to the same pwq. After this, we can specify the @cpu
argument of queue_work_on(), so the work is executed on the same NUMA node as
the specified @cpu. Note that this kind of wq supports only a single work
item; with multiple work items, execution order cannot be guaranteed.
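
As an illustration (not part of this patch), a caller could use the new flag
roughly as follows; my_wq, my_work_fn and my_dispatch are hypothetical names:

  #include <linux/smp.h>
  #include <linux/workqueue.h>

  static struct workqueue_struct *my_wq;	/* hypothetical */
  static struct work_struct my_work;

  static void my_work_fn(struct work_struct *work)
  {
  	/* Runs on a worker on the same NUMA node as the queueing CPU. */
  }

  static int __init my_init(void)
  {
  	/*
  	 * WQ_UNBOUND with max_active == 1 is treated as ordered by
  	 * alloc_workqueue(); __WQ_DYNAMIC then makes it create one
  	 * pwq per NUMA node instead of a single pwq.
  	 */
  	my_wq = alloc_workqueue("my_wq", WQ_UNBOUND | __WQ_DYNAMIC, 1);
  	if (!my_wq)
  		return -ENOMEM;
  	INIT_WORK(&my_work, my_work_fn);
  	return 0;
  }

  /* e.g. from the softirq receive path: stay on the local node */
  static void my_dispatch(void)
  {
  	queue_work_on(raw_smp_processor_id(), my_wq, &my_work);
  }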
Signed-off-by: Biaoxiang Ye <yebiaoxiang@huawei.com>
Signed-off-by: shaodenghui <shaodenghui@huawei.com>
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 21 +++++++++++++++++++--
 lib/Kconfig               | 16 ++++++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 9619098755fb..485c0f5b2518 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -415,6 +415,7 @@ enum {
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
 	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
 	__WQ_ORDERED_EXPLICIT	= 1 << 19, /* internal: alloc_ordered_workqueue() */
+	__WQ_DYNAMIC		= 1 << 25, /* internal: only supports a single ordered work */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
 	WQ_UNBOUND_MAX_ACTIVE	= WQ_MAX_ACTIVE,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1888741f5edd..74431968a05c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4355,6 +4355,10 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	 * it even if we don't use it immediately.
 	 */
 	copy_workqueue_attrs(new_attrs, attrs);
+#ifdef CONFIG_KWORKER_NUMA_AFFINITY
+	if (wq->flags & __WQ_DYNAMIC)
+		new_attrs->ordered = false;
+#endif
 	wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
 	cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
 	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
@@ -4591,10 +4595,19 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 	cpus_read_lock();
 	if (wq->flags & __WQ_ORDERED) {
 		ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+#ifdef CONFIG_KWORKER_NUMA_AFFINITY
+		if (!(wq->flags & __WQ_DYNAMIC)) {
+			/* there should only be single pwq for ordering guarantee */
+			WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+				      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+			     "ordering guarantee broken for workqueue %s\n", wq->name);
+		}
+#else
 		/* there should only be single pwq for ordering guarantee */
 		WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
-			wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
-			"ordering guarantee broken for workqueue %s\n", wq->name);
+			      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+		     "ordering guarantee broken for workqueue %s\n", wq->name);
+#endif
 	} else {
 		ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
 	}
@@ -5799,7 +5812,11 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
 
 		/* creating multiple pwqs breaks ordering guarantee */
 		if (!list_empty(&wq->pwqs)) {
+#ifdef CONFIG_KWORKER_NUMA_AFFINITY
+			if ((wq->flags & __WQ_ORDERED_EXPLICIT) && !(wq->flags & __WQ_DYNAMIC))
+#else
 			if (wq->flags & __WQ_ORDERED_EXPLICIT)
+#endif
 				continue;
 			wq->flags &= ~__WQ_ORDERED;
 		}
diff --git a/lib/Kconfig b/lib/Kconfig
index c686f4adc124..81c5a44c51b7 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -19,6 +19,22 @@ config RAID6_PQ_BENCHMARK
 	  Benchmark all available RAID6 PQ functions on init and choose the
 	  fastest one.
+config KWORKER_NUMA_AFFINITY
+	bool "kworker NUMA affinity"
+	default n
+	help
+	  kworker: fix the IPSAN performance degradation problem.
+
+	  When a downstream FS tests IPSAN, the performance on ARM is found
+	  to be much worse than on x86, and the IPSAN test results fluctuate
+	  greatly. Analysis shows that when iscsi issues I/O, the task is
+	  sent to a kworker for processing by iscsi_xmitworker.
+
+	  With this option enabled, the workqueue created by iscsi can
+	  identify the CPU of the soft interrupt currently being processed
+	  by iscsi and automatically schedule the work to the NUMA node
+	  corresponding to that CPU.
+
 config LINEAR_RANGES
 	tristate
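
For context, the dispatch described in the Kconfig help text might look
roughly like the following sketch; my_iscsi_conn and its fields are
hypothetical and not part of this patch:

  /*
   * Hypothetical sketch: remember the CPU that handled the iscsi softirq
   * and queue the xmit work on it, so a __WQ_DYNAMIC workqueue selects
   * the pwq of that CPU's NUMA node.
   */
  struct my_iscsi_conn {
  	struct workqueue_struct	*xmit_wq;	/* created with __WQ_DYNAMIC */
  	struct work_struct	xmit_work;
  	int			recv_cpu;
  };

  static void my_iscsi_softirq_rx(struct my_iscsi_conn *conn)
  {
  	/* softirq context: raw_smp_processor_id() is stable here */
  	conn->recv_cpu = raw_smp_processor_id();
  	queue_work_on(conn->recv_cpu, conn->xmit_wq, &conn->xmit_work);
  }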