Kernel
[PATCH openEuler-1.0-LTS] drm/amd/display: Skip finding free audio for unknown engine_id
by Zhao Wenhui 05 Aug '24
From: Alex Hung <alex.hung(a)amd.com>
stable inclusion
from stable-v4.19.318
commit 9eb4db08a808e3a3ba59193aeb84a57a6dc4d8c9
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGT0E
CVE: CVE-2024-42119
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
[ Upstream commit 1357b2165d9ad94faa4c4a20d5e2ce29c2ff29c3 ]
[WHY]
ENGINE_ID_UNKNOWN = -1 and cannot be used as an array index. It also
means the engine is uninitialized and does not need a free audio
resource.
[HOW]
Skip and return NULL.
This fixes 2 OVERRUN issues reported by Coverity.
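For context, a minimal, self-contained userspace sketch of the bug class (illustrative names only, not the DC driver code): an enum that uses -1 as its "unknown" sentinel must be rejected before it reaches any array-indexing path, because table[-1] reads before the start of the array, which is the kind of OVERRUN Coverity reports.
#include <stdio.h>
enum engine_id { ENGINE_ID_UNKNOWN = -1, ENGINE_A = 0, ENGINE_B, ENGINE_COUNT };
/* Guard the sentinel first, as the hunk below does, then index safely. */
static const int *lookup_audio(const int *table, enum engine_id id)
{
        if (id == ENGINE_ID_UNKNOWN)
                return NULL;
        return &table[id];
}
int main(void)
{
        int audio[ENGINE_COUNT] = { 10, 20 };
        printf("%p\n", (void *)lookup_audio(audio, ENGINE_ID_UNKNOWN)); /* NULL, skipped */
        printf("%d\n", *lookup_audio(audio, ENGINE_A));                 /* 10 */
        return 0;
}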
Reviewed-by: Rodrigo Siqueira <rodrigo.siqueira(a)amd.com>
Acked-by: Wayne Lin <wayne.lin(a)amd.com>
Signed-off-by: Alex Hung <alex.hung(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
Signed-off-by: Zhao Wenhui <zhaowenhui8(a)huawei.com>
---
drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
index 6896d69b8c24..8b4337794d1e 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
@@ -1703,6 +1703,9 @@ static struct audio *find_first_free_audio(
{
int i, available_audio_count;
+ if (id == ENGINE_ID_UNKNOWN)
+ return NULL;
+
available_audio_count = pool->audio_count;
for (i = 0; i < available_audio_count; i++) {
--
2.34.1
[PATCH OLK-6.6] wifi: mac80211: Avoid address calculations via out of bounds array indexing
by Dong Chenchen 05 Aug '24
From: Kenton Groombridge <concord(a)gentoo.org>
mainline inclusion
from mainline-v6.10-rc5
commit 2663d0462eb32ae7c9b035300ab6b1523886c718
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGEKT
CVE: CVE-2024-41071
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
req->n_channels must be set before req->channels[] can be used.
This patch fixes one of the issues encountered in [1].
[ 83.964255] UBSAN: array-index-out-of-bounds in net/mac80211/scan.c:364:4
[ 83.964258] index 0 is out of range for type 'struct ieee80211_channel *[]'
[...]
[ 83.964264] Call Trace:
[ 83.964267] <TASK>
[ 83.964269] dump_stack_lvl+0x3f/0xc0
[ 83.964274] __ubsan_handle_out_of_bounds+0xec/0x110
[ 83.964278] ieee80211_prep_hw_scan+0x2db/0x4b0
[ 83.964281] __ieee80211_start_scan+0x601/0x990
[ 83.964291] nl80211_trigger_scan+0x874/0x980
[ 83.964295] genl_family_rcv_msg_doit+0xe8/0x160
[ 83.964298] genl_rcv_msg+0x240/0x270
[...]
[1] https://bugzilla.kernel.org/show_bug.cgi?id=218810
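The ordering matters because, assuming req.channels[] is a flexible array member whose valid length is the n_channels field (which is what the UBSAN report above is checking), a store to channels[i] is only in bounds once n_channels covers index i. A minimal userspace sketch of that invariant, with made-up names:
#include <stdlib.h>
#include <string.h>
/* Illustrative layout, not the cfg80211 struct: the flexible array is
 * only valid up to whatever n_channels says. */
struct scan_req {
        unsigned int n_channels;
        int channels[];
};
static struct scan_req *scan_req_build(const int *src, unsigned int n)
{
        struct scan_req *req = malloc(sizeof(*req) + n * sizeof(req->channels[0]));
        if (!req)
                return NULL;
        /* Publish the bound first, then fill the slots it covers,
         * the same ordering the patch below enforces. */
        req->n_channels = n;
        memcpy(req->channels, src, n * sizeof(req->channels[0]));
        return req;
}
int main(void)
{
        int chans[] = { 1, 6, 11 };
        free(scan_req_build(chans, 3));
        return 0;
}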
Co-authored-by: Kees Cook <keescook(a)chromium.org>
Signed-off-by: Kees Cook <kees(a)kernel.org>
Signed-off-by: Kenton Groombridge <concord(a)gentoo.org>
Link: https://msgid.link/20240605152218.236061-1-concord@gentoo.org
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Conflicts:
net/mac80211/scan.c
[commit 5add321c329b removed scan_width support; it is not merged here,
which led to conflicts.]
Signed-off-by: Dong Chenchen <dongchenchen2(a)huawei.com>
---
net/mac80211/scan.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index b68214f15983..108918e81ce0 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -346,7 +346,8 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata)
struct cfg80211_scan_request *req;
struct cfg80211_chan_def chandef;
u8 bands_used = 0;
- int i, ielen, n_chans;
+ int i, ielen;
+ u32 *n_chans;
u32 flags = 0;
req = rcu_dereference_protected(local->scan_req,
@@ -356,34 +357,34 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata)
return false;
if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) {
+ local->hw_scan_req->req.n_channels = req->n_channels;
+
for (i = 0; i < req->n_channels; i++) {
local->hw_scan_req->req.channels[i] = req->channels[i];
bands_used |= BIT(req->channels[i]->band);
}
-
- n_chans = req->n_channels;
} else {
do {
if (local->hw_scan_band == NUM_NL80211_BANDS)
return false;
- n_chans = 0;
+ n_chans = &local->hw_scan_req->req.n_channels;
+ *n_chans = 0;
for (i = 0; i < req->n_channels; i++) {
if (req->channels[i]->band !=
local->hw_scan_band)
continue;
- local->hw_scan_req->req.channels[n_chans] =
+ local->hw_scan_req->req.channels[(*n_chans)++] =
req->channels[i];
- n_chans++;
+
bands_used |= BIT(req->channels[i]->band);
}
local->hw_scan_band++;
- } while (!n_chans);
+ } while (!*n_chans);
}
- local->hw_scan_req->req.n_channels = n_chans;
ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
--
2.25.1
05 Aug '24
From: Waiman Long <longman(a)redhat.com>
stable inclusion
from stable-v5.10.222
commit 0100aeb8a12d51950418e685f879cc80cb8e5982
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGEL6
CVE: CVE-2024-41055
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
---------------------------
[ Upstream commit 82f0b6f041fad768c28b4ad05a683065412c226e ]
Commit 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing
memory_section->usage") changed pfn_section_valid() to add a READ_ONCE()
call around "ms->usage" to fix a race with section_deactivate() where
ms->usage can be cleared. The READ_ONCE() call, by itself, is not enough
to prevent NULL pointer dereference. We need to check its value before
dereferencing it.
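Put differently, the fix takes one snapshot of the possibly-NULL pointer and tests that snapshot before dereferencing it, rather than dereferencing the READ_ONCE() result directly. A generic userspace sketch of the pattern (illustrative names, C11 atomics standing in for READ_ONCE()):
#include <stdatomic.h>
#include <stdbool.h>
struct usage { unsigned long subsection_map; };
/* May be cleared to NULL concurrently, like memory_section->usage. */
static _Atomic(struct usage *) section_usage;
static bool subsection_bit_set(unsigned int idx)
{
        /* one snapshot, analogous to READ_ONCE(ms->usage) */
        struct usage *u = atomic_load_explicit(&section_usage, memory_order_relaxed);
        /* check the snapshot before dereferencing it */
        return u ? !!(u->subsection_map & (1UL << idx)) : false;
}
int main(void)
{
        struct usage u = { .subsection_map = 0x2 };
        atomic_store(&section_usage, &u);
        return subsection_bit_set(1) ? 0 : 1;
}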
Link: https://lkml.kernel.org/r/20240626001639.1350646-1-longman@redhat.com
Fixes: 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing memory_section->usage")
Signed-off-by: Waiman Long <longman(a)redhat.com>
Cc: Charan Teja Kalla <quic_charante(a)quicinc.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
Signed-off-by: Tong Tiangen <tongtiangen(a)huawei.com>
---
include/linux/mmzone.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5398656315e1..1751f7e4a60f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1391,8 +1391,9 @@ static inline int subsection_map_index(unsigned long pfn)
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
int idx = subsection_map_index(pfn);
+ struct mem_section_usage *usage = READ_ONCE(ms->usage);
- return test_bit(idx, READ_ONCE(ms->usage)->subsection_map);
+ return usage ? test_bit(idx, usage->subsection_map) : 0;
}
#else
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
--
2.25.1
[PATCH openEuler-22.03-LTS-SP1 v2] drm/shmem-helper: Fix BUG_ON() on mmap(PROT_WRITE, MAP_PRIVATE)
by Pu Lehui 05 Aug '24
From: "Wachowski, Karol" <karol.wachowski(a)intel.com>
mainline inclusion
from mainline-v6.10-rc2
commit 39bc27bd688066a63e56f7f64ad34fae03fbe3b8
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IACS4Z
CVE: CVE-2024-39497
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
A missing check for copy-on-write (COW) mappings in drm_gem_shmem_mmap
allows users to call mmap with the PROT_WRITE and MAP_PRIVATE flags,
causing a kernel panic due to the BUG_ON in vmf_insert_pfn_prot:
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
Return -EINVAL early if COW mapping is detected.
This bug affects all drm drivers using default shmem helpers.
It can be reproduced by this simple example:
void *ptr = mmap(0, size, PROT_WRITE, MAP_PRIVATE, fd, mmap_offset);
ptr[0] = 0;
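The new guard relies on is_cow_mapping() from mm/internal.h (hence the unusual include in this backport). Roughly speaking, a mapping counts as copy-on-write when it is writable in principle but not shared; the userspace sketch below mirrors that test with illustrative flag values so the PROT_WRITE/MAP_PRIVATE reproducer above can be reasoned about without kernel headers:
#include <stdbool.h>
#include <stdio.h>
/* Illustrative flag bits; the values are arbitrary, not the kernel's. */
#define VM_SHARED   0x1UL
#define VM_MAYWRITE 0x2UL
/* Approximation of is_cow_mapping(): may-write but not shared. */
static bool is_cow_mapping(unsigned long vm_flags)
{
        return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
int main(void)
{
        /* mmap(PROT_WRITE, MAP_PRIVATE) yields a may-write, non-shared VMA */
        printf("private writable -> cow: %d\n", is_cow_mapping(VM_MAYWRITE));
        printf("shared  writable -> cow: %d\n", is_cow_mapping(VM_SHARED | VM_MAYWRITE));
        return 0;
}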
Fixes: 2194a63a818d ("drm: Add library for shmem backed GEM objects")
Cc: Noralf Trønnes <noralf(a)tronnes.org>
Cc: Eric Anholt <eric(a)anholt.net>
Cc: Rob Herring <robh(a)kernel.org>
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: Maxime Ripard <mripard(a)kernel.org>
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: David Airlie <airlied(a)gmail.com>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v5.2+
Signed-off-by: Wachowski, Karol <karol.wachowski(a)intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz(a)linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20240520100514.925681-1-jacek…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Conflicts:
drivers/gpu/drm/drm_gem_shmem_helper.c
[The conflicts were due to commits 21aa27ddc582 and 97a7e4733b9b not being merged]
Signed-off-by: Pu Lehui <pulehui(a)huawei.com>
---
drivers/gpu/drm/drm_gem_shmem_helper.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c
index cfacce0418a4..8d1abf82d2b9 100644
--- a/drivers/gpu/drm/drm_gem_shmem_helper.c
+++ b/drivers/gpu/drm/drm_gem_shmem_helper.c
@@ -17,6 +17,8 @@
#include <drm/drm_prime.h>
#include <drm/drm_print.h>
+#include "../../../mm/internal.h" /* is_cow_mapping() */
+
/**
* DOC: overview
*
@@ -613,6 +615,9 @@ int drm_gem_shmem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
return dma_buf_mmap(obj->dma_buf, vma, 0);
}
+ if (is_cow_mapping(vma->vm_flags))
+ return -EINVAL;
+
shmem = to_drm_gem_shmem_obj(obj);
ret = drm_gem_shmem_get_pages(shmem);
--
2.34.1
[PATCH openEuler-22.03-LTS-SP1] serial: 8250_omap: Implementation of Errata i2310
by felix 05 Aug '24
From: Udit Kumar <u-kumar1(a)ti.com>
stable inclusion
from stable-v5.10.221
commit cb879300669881970eabebe64bd509dbbe42b9de
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGEP6
CVE: CVE-2024-42095
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
commit 9d141c1e615795eeb93cd35501ad144ee997a826 upstream.
As per Errata i2310 [0], an erroneous timeout can be triggered. If this
erroneous interrupt is not cleared, it may lead to a storm of interrupts;
therefore, apply the Errata i2310 solution.
[0] https://www.ti.com/lit/pdf/sprz536 page 23
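The workaround in the hunk below is a save/override/clear/restore sequence on the UART registers: save EFR2 and the two RX timeout counter registers (TO_L/TO_H), force the timeout counters to their maximum (0xFF), switch EFR2 to the alternate timeout behaviour, read IIR so the pending (possibly erroneous) RX timeout interrupt is acknowledged, then restore the saved values. A condensed, self-contained restatement of that sequence with stubbed register accessors (only the TO_L/TO_H offsets match the patch; the EFR2 offset and bit value here are placeholders):
#include <stdint.h>
#include <stdio.h>
/* TO_L/TO_H match the defines added by the patch; the rest are placeholders. */
#define UART_OMAP_TO_L                 0x26
#define UART_OMAP_TO_H                 0x27
#define UART_OMAP_EFR2                 0x23
#define UART_OMAP_EFR2_TIMEOUT_BEHAVE  0x40
#define UART_IIR                       0x02
static uint8_t regs[256];
static uint8_t serial_in(int reg)             { return regs[reg]; }
static void    serial_out(int reg, uint8_t v) { regs[reg] = v; }
/* Errata i2310 sequence: save, override, clear, restore. */
static void clear_erroneous_rx_timeout(void)
{
        uint8_t efr2 = serial_in(UART_OMAP_EFR2);
        uint8_t to_h = serial_in(UART_OMAP_TO_H);
        uint8_t to_l = serial_in(UART_OMAP_TO_L);
        serial_out(UART_OMAP_TO_H, 0xFF);                          /* max out timeout counters */
        serial_out(UART_OMAP_TO_L, 0xFF);
        serial_out(UART_OMAP_EFR2, UART_OMAP_EFR2_TIMEOUT_BEHAVE);
        serial_in(UART_IIR);                                       /* acknowledge the interrupt */
        serial_out(UART_OMAP_EFR2, efr2);                          /* restore original state */
        serial_out(UART_OMAP_TO_H, to_h);
        serial_out(UART_OMAP_TO_L, to_l);
}
int main(void)
{
        clear_erroneous_rx_timeout();
        puts("errata i2310 sequence applied");
        return 0;
}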
Fixes: b67e830d38fa ("serial: 8250: 8250_omap: Fix possible interrupt storm on K3 SoCs")
Cc: stable(a)vger.kernel.org
Signed-off-by: Udit Kumar <u-kumar1(a)ti.com>
Link: https://lore.kernel.org/r/20240619105903.165434-1-u-kumar1@ti.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Felix Fu <fuzhen5(a)huawei.com>
---
drivers/tty/serial/8250/8250_omap.c | 25 ++++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 25765ebb756a..955642e90ede 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -164,6 +164,10 @@ static void uart_write(struct omap8250_priv *priv, u32 reg, u32 val)
writel(val, priv->membase + (reg << OMAP_UART_REGSHIFT));
}
+/* Timeout low and High */
+#define UART_OMAP_TO_L 0x26
+#define UART_OMAP_TO_H 0x27
+
/*
* Called on runtime PM resume path from omap8250_restore_regs(), and
* omap8250_set_mctrl().
@@ -647,13 +651,24 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id)
/*
* On K3 SoCs, it is observed that RX TIMEOUT is signalled after
- * FIFO has been drained, in which case a dummy read of RX FIFO
- * is required to clear RX TIMEOUT condition.
+ * FIFO has been drained or erroneously.
+ * So apply solution of Errata i2310 as mentioned in
+ * https://www.ti.com/lit/pdf/sprz536
*/
if (priv->habit & UART_RX_TIMEOUT_QUIRK &&
- (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT &&
- serial_port_in(port, UART_OMAP_RX_LVL) == 0) {
- serial_port_in(port, UART_RX);
+ (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT) {
+ unsigned char efr2, timeout_h, timeout_l;
+
+ efr2 = serial_in(up, UART_OMAP_EFR2);
+ timeout_h = serial_in(up, UART_OMAP_TO_H);
+ timeout_l = serial_in(up, UART_OMAP_TO_L);
+ serial_out(up, UART_OMAP_TO_H, 0xFF);
+ serial_out(up, UART_OMAP_TO_L, 0xFF);
+ serial_out(up, UART_OMAP_EFR2, UART_OMAP_EFR2_TIMEOUT_BEHAVE);
+ serial_in(up, UART_IIR);
+ serial_out(up, UART_OMAP_EFR2, efr2);
+ serial_out(up, UART_OMAP_TO_H, timeout_h);
+ serial_out(up, UART_OMAP_TO_L, timeout_l);
}
/* Stop processing interrupts on input overrun */
--
2.34.1
[PATCH OLK-5.10 v2] drm/shmem-helper: Fix BUG_ON() on mmap(PROT_WRITE, MAP_PRIVATE)
by Pu Lehui 05 Aug '24
From: "Wachowski, Karol" <karol.wachowski(a)intel.com>
mainline inclusion
from mainline-v6.10-rc2
commit 39bc27bd688066a63e56f7f64ad34fae03fbe3b8
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IACS4Z
CVE: CVE-2024-39497
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
A missing check for copy-on-write (COW) mappings in drm_gem_shmem_mmap
allows users to call mmap with the PROT_WRITE and MAP_PRIVATE flags,
causing a kernel panic due to the BUG_ON in vmf_insert_pfn_prot:
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
Return -EINVAL early if COW mapping is detected.
This bug affects all drm drivers using default shmem helpers.
It can be reproduced by this simple example:
void *ptr = mmap(0, size, PROT_WRITE, MAP_PRIVATE, fd, mmap_offset);
ptr[0] = 0;
Fixes: 2194a63a818d ("drm: Add library for shmem backed GEM objects")
Cc: Noralf Trønnes <noralf(a)tronnes.org>
Cc: Eric Anholt <eric(a)anholt.net>
Cc: Rob Herring <robh(a)kernel.org>
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: Maxime Ripard <mripard(a)kernel.org>
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: David Airlie <airlied(a)gmail.com>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v5.2+
Signed-off-by: Wachowski, Karol <karol.wachowski(a)intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz(a)linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20240520100514.925681-1-jacek…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Conflicts:
drivers/gpu/drm/drm_gem_shmem_helper.c
[The conflicts were due to commits 21aa27ddc582 and 97a7e4733b9b not being merged]
Signed-off-by: Pu Lehui <pulehui(a)huawei.com>
---
drivers/gpu/drm/drm_gem_shmem_helper.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c
index e8f07305e279..6c9e4857fdb8 100644
--- a/drivers/gpu/drm/drm_gem_shmem_helper.c
+++ b/drivers/gpu/drm/drm_gem_shmem_helper.c
@@ -17,6 +17,8 @@
#include <drm/drm_prime.h>
#include <drm/drm_print.h>
+#include "../../../mm/internal.h" /* is_cow_mapping() */
+
/**
* DOC: overview
*
@@ -630,6 +632,9 @@ int drm_gem_shmem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
return ret;
}
+ if (is_cow_mapping(vma->vm_flags))
+ return -EINVAL;
+
shmem = to_drm_gem_shmem_obj(obj);
ret = drm_gem_shmem_get_pages(shmem);
--
2.34.1
[PATCH openEuler-1.0-LTS] drm/nouveau/dispnv04: fix null pointer dereference in nv17_tv_get_ld_modes
by Yuan Can 05 Aug '24
From: Ma Ke <make24(a)iscas.ac.cn>
stable inclusion
from stable-v4.19.317
commit 9289cd3450d1da3e271ef4b054d4d2932c41243e
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAGENV
CVE: CVE-2024-41095
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
commit 66edf3fb331b6c55439b10f9862987b0916b3726 upstream.
In nv17_tv_get_ld_modes(), the return value of drm_mode_duplicate() is
assigned to mode, which can lead to a NULL pointer dereference if
drm_mode_duplicate() fails. Add a check to avoid the NULL pointer
dereference.
Cc: stable(a)vger.kernel.org
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240625081828.2620794-1-make…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Yuan Can <yuancan(a)huawei.com>
---
drivers/gpu/drm/nouveau/dispnv04/tvnv17.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
index 8fd8124d72ba..a01613ad1608 100644
--- a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
+++ b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
@@ -208,6 +208,8 @@ static int nv17_tv_get_ld_modes(struct drm_encoder *encoder,
struct drm_display_mode *mode;
mode = drm_mode_duplicate(encoder->dev, tv_mode);
+ if (!mode)
+ continue;
mode->clock = tv_norm->tv_enc_mode.vrefresh *
mode->htotal / 1000 *
--
2.17.1
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IAGRKP
CVE: NA
--------------------------------
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
---
block/Kconfig | 2 +
block/Makefile | 1 +
block/bfq-iosched.c | 11 +-
block/blk-core.c | 15 +
block/blk-flush.c | 5 +
block/blk-io-hierarchy/Kconfig | 156 +++++++
block/blk-io-hierarchy/Makefile | 8 +
block/blk-io-hierarchy/debugfs.c | 230 ++++++++++
block/blk-io-hierarchy/iodump.c | 753 +++++++++++++++++++++++++++++++
block/blk-io-hierarchy/iodump.h | 100 ++++
block/blk-io-hierarchy/stats.c | 331 ++++++++++++++
block/blk-io-hierarchy/stats.h | 323 +++++++++++++
block/blk-mq-debugfs.c | 16 +-
block/blk-mq-debugfs.h | 8 +
block/blk-mq-sched.c | 7 +-
block/blk-mq-tag.c | 13 +-
block/blk-mq.c | 51 ++-
block/blk-mq.h | 36 +-
block/blk-sysfs.c | 16 +
block/blk-throttle.c | 21 +
block/blk-wbt.c | 12 +-
block/blk.h | 58 +++
block/kyber-iosched.c | 8 +-
block/mq-deadline.c | 15 +-
include/linux/blk_types.h | 41 +-
include/linux/blkdev.h | 9 +
26 files changed, 2202 insertions(+), 44 deletions(-)
create mode 100644 block/blk-io-hierarchy/Kconfig
create mode 100644 block/blk-io-hierarchy/Makefile
create mode 100644 block/blk-io-hierarchy/debugfs.c
create mode 100644 block/blk-io-hierarchy/iodump.c
create mode 100644 block/blk-io-hierarchy/iodump.h
create mode 100644 block/blk-io-hierarchy/stats.c
create mode 100644 block/blk-io-hierarchy/stats.h
diff --git a/block/Kconfig b/block/Kconfig
index da71e56f8682..770cd3fa1367 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -213,6 +213,8 @@ config BLK_BIO_DISPATCH_ASYNC
feature will require special care in the driver to work. If unsure,
say N here.
+source "block/blk-io-hierarchy/Kconfig"
+
menu "Partition Types"
source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 572b33f32c07..bb711b0c307a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
+obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk-io-hierarchy/
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 473d9e31ff87..2cb1bca71d39 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -140,6 +140,7 @@
#include "blk-mq-sched.h"
#include "bfq-iosched.h"
#include "blk-wbt.h"
+#include "blk-io-hierarchy/stats.h"
#define BFQ_BFQQ_FNS(name) \
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
@@ -1882,8 +1883,10 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
ret = blk_mq_sched_try_merge(q, bio, &free);
spin_unlock_irq(&bfqd->lock);
- if (free)
+ if (free) {
+ rq_hierarchy_end_io_acct(free, STAGE_BFQ);
blk_mq_free_request(free);
+ }
return ret;
}
@@ -4168,6 +4171,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
idle_timer_disabled ? in_serv_queue : NULL,
idle_timer_disabled);
+ if (rq)
+ rq_hierarchy_end_io_acct(rq, STAGE_BFQ);
return rq;
}
@@ -4750,6 +4755,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
spin_lock_irq(&bfqd->lock);
if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
spin_unlock_irq(&bfqd->lock);
+ rq_list_hierarchy_end_io_acct(&free, STAGE_BFQ);
blk_mq_free_requests(&free);
return;
}
@@ -4797,6 +4803,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *list, bool at_head)
{
+ rq_list_hierarchy_start_io_acct(list, STAGE_BFQ);
while (!list_empty(list)) {
struct request *rq;
@@ -5394,6 +5401,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
struct bfq_queue *bfqq, *n;
struct request_queue *q = bfqd->queue;
+ blk_mq_unregister_hierarchy(q, STAGE_BFQ);
hrtimer_cancel(&bfqd->idle_slice_timer);
spin_lock_irq(&bfqd->lock);
@@ -5560,6 +5568,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
wbt_disable_default(q);
+ blk_mq_register_hierarchy(q, STAGE_BFQ);
return 0;
out_free:
diff --git a/block/blk-core.c b/block/blk-core.c
index acf5585b0557..03b8c2367164 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -43,6 +43,7 @@
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-io-hierarchy/stats.h"
#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
@@ -1001,6 +1002,15 @@ void blk_exit_queue(struct request_queue *q)
bdi_put(q->backing_dev_info);
}
+static void blk_mq_unregister_default_hierarchy(struct request_queue *q)
+{
+ blk_mq_unregister_hierarchy(q, STAGE_GETTAG);
+ blk_mq_unregister_hierarchy(q, STAGE_PLUG);
+ blk_mq_unregister_hierarchy(q, STAGE_HCTX);
+ blk_mq_unregister_hierarchy(q, STAGE_REQUEUE);
+ blk_mq_unregister_hierarchy(q, STAGE_RQ_DRIVER);
+}
+
/**
* blk_cleanup_queue - shutdown a request queue
* @q: request queue to shutdown
@@ -1088,6 +1098,7 @@ void blk_cleanup_queue(struct request_queue *q)
blk_exit_queue(q);
if (q->mq_ops) {
+ blk_mq_unregister_default_hierarchy(q);
blk_mq_cancel_work_sync(q);
blk_mq_exit_queue(q);
}
@@ -3919,6 +3930,8 @@ void blk_start_plug(struct blk_plug *plug)
INIT_LIST_HEAD(&plug->list);
INIT_LIST_HEAD(&plug->mq_list);
INIT_LIST_HEAD(&plug->cb_list);
+ plug->cur_ktime = 0;
+
/*
* Store ordering should not be needed here, since a potential
* preempt will imply a full memory barrier
@@ -4060,6 +4073,8 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
if (q)
queue_unplugged(q, depth, from_schedule);
+
+ plug->cur_ktime = 0;
}
void blk_finish_plug(struct blk_plug *plug)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c1bfcde165af..384fce3b6bf6 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -75,6 +75,7 @@
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
+#include "blk-io-hierarchy/stats.h"
/* PREFLUSH/FUA sequences */
enum {
@@ -187,6 +188,7 @@ static bool blk_flush_complete_seq(struct request *rq,
if (list_empty(pending))
fq->flush_pending_since = jiffies;
list_move_tail(&rq->flush.list, pending);
+ rq_hierarchy_start_io_acct(rq, STAGE_HCTX);
break;
case REQ_FSEQ_DATA:
@@ -245,6 +247,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
* avoiding use-after-free.
*/
WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
+ blk_mq_put_alloc_task(flush_rq);
if (fq->rq_status != BLK_STS_OK) {
error = fq->rq_status;
fq->rq_status = BLK_STS_OK;
@@ -274,6 +277,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
+ rq_hierarchy_end_io_acct(rq, STAGE_HCTX);
queued |= blk_flush_complete_seq(rq, fq, seq, error);
}
@@ -377,6 +381,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->rq_flags |= RQF_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
+ blk_mq_get_alloc_task(flush_rq, first_rq->bio);
/*
* Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
diff --git a/block/blk-io-hierarchy/Kconfig b/block/blk-io-hierarchy/Kconfig
new file mode 100644
index 000000000000..ce72d0593fce
--- /dev/null
+++ b/block/blk-io-hierarchy/Kconfig
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig BLK_IO_HIERARCHY_STATS
+ bool "Enable hierarchy io stats"
+ default n
+ depends on BLK_DEBUG_FS=y
+ help
+ Enabling this lets the block layer to record additional information
+ in different io stages. Such information can be helpful to debug
+ performance and problems like io hang.
+
+ If unsure, say N.
+
+if BLK_IO_HIERARCHY_STATS
+
+config HIERARCHY_BIO
+ bool "Support to record stats for bio lifetime"
+ default n
+ select BLK_BIO_ALLOC_TIME
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for bio. Such information can be helpful to debug performance and
+ problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_IO_DUMP
+ bool "Support to dump io that is throttled"
+ default n
+ select BLK_BIO_ALLOC_TIME
+ select BLK_BIO_ALLOC_TASK
+ depends on BLK_DEV_IO_TRACE
+ help
+ Enable this will create new debugfs entries to show user the detailed
+ information of IO that are submitted and not done yet, and user can
+ filter the result by IO stage or IO latency.
+
+ If unsure, say N.
+
+config HIERARCHY_THROTTLE
+ bool "Enable hierarchy stats layer blk-throttle"
+ default n
+ depends on BLK_DEV_THROTTLING=y
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for blk-throttle. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_WBT
+ bool "Enable hierarchy stats layer blk-wbt"
+ default n
+ depends on BLK_WBT
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for blk-wbt. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_IOCOST
+ bool "Enable hierarchy stats layer blk-iocost"
+ default n
+ depends on BLK_CGROUP_IOCOST
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for blk-iocost. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_GETTAG
+ bool "Enable hierarchy stats layer get-tag"
+ default n
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for getting tag. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_PLUG
+ bool "Enable hierarchy stats layer plug"
+ default n
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for plug. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_DEADLINE
+ bool "Enable hierarchy stats layer mq-deadline"
+ default n
+ depends on MQ_IOSCHED_DEADLINE
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for mq-deadline. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_BFQ
+ bool "Enable hierarchy stats layer bfq"
+ default n
+ depends on IOSCHED_BFQ
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for bfq. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_KYBER
+ bool "Enable hierarchy stats layer kyber"
+ default n
+ depends on MQ_IOSCHED_KYBER
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for kyber. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_HCTX
+ bool "Enable hierarchy stats layer hctx"
+ default n
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for hctx. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_REQUEUE
+ bool "Enable hierarchy stats layer requeue"
+ default n
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for requeue. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+config HIERARCHY_RQ_DRIVER
+ bool "Enable hierarchy stats layer rq_driver"
+ default n
+ help
+ Enabling this lets blk hierarchy stats to record additional information
+ for requeue driver. Such information can be helpful to debug performance
+ and problems like io hang.
+
+ If unsure, say N.
+
+endif
diff --git a/block/blk-io-hierarchy/Makefile b/block/blk-io-hierarchy/Makefile
new file mode 100644
index 000000000000..9b989d379e58
--- /dev/null
+++ b/block/blk-io-hierarchy/Makefile
@@ -0,0 +1,8 @@
+#
+# Make file for blk_io_hierarchy_stats
+#
+
+obj-$(CONFIG_BLK_IO_HIERARCHY_STATS) += blk_io_hierarchy_stats.o
+
+blk_io_hierarchy_stats-y := stats.o debugfs.o
+obj-$(CONFIG_HIERARCHY_IO_DUMP) += iodump.o
diff --git a/block/blk-io-hierarchy/debugfs.c b/block/blk-io-hierarchy/debugfs.c
new file mode 100644
index 000000000000..cb7ff2866c49
--- /dev/null
+++ b/block/blk-io-hierarchy/debugfs.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/blkdev.h>
+
+#include "../blk-mq-debugfs.h"
+#include "stats.h"
+#include "iodump.h"
+
+static const char *stage_name[NR_STAGE_GROUPS] = {
+#ifdef CONFIG_HIERARCHY_THROTTLE
+ [STAGE_THROTTLE] = "throtl",
+#endif
+#ifdef CONFIG_HIERARCHY_WBT
+ [STAGE_WBT] = "wbt",
+#endif
+#ifdef CONFIG_HIERARCHY_IOCOST
+ [STAGE_IOCOST] = "iocost",
+#endif
+#ifdef CONFIG_HIERARCHY_GETTAG
+ [STAGE_GETTAG] = "gettag",
+#endif
+#ifdef CONFIG_HIERARCHY_PLUG
+ [STAGE_PLUG] = "plug",
+#endif
+#ifdef CONFIG_HIERARCHY_DEADLINE
+ [STAGE_DEADLINE] = "deadline",
+#endif
+#ifdef CONFIG_HIERARCHY_BFQ
+ [STAGE_BFQ] = "bfq",
+#endif
+#ifdef CONFIG_HIERARCHY_KYBER
+ [STAGE_KYBER] = "kyber",
+#endif
+#ifdef CONFIG_HIERARCHY_HCTX
+ [STAGE_HCTX] = "hctx",
+#endif
+#ifdef CONFIG_HIERARCHY_REQUEUE
+ [STAGE_REQUEUE] = "requeue",
+#endif
+#ifdef CONFIG_HIERARCHY_RQ_DRIVER
+ [STAGE_RQ_DRIVER] = "rq_driver",
+#endif
+#ifdef CONFIG_HIERARCHY_BIO
+ [STAGE_BIO] = "bio",
+#endif
+};
+
+const char *hierarchy_stage_name(enum stage_group stage)
+{
+ return stage_name[stage];
+}
+
+static int __hierarchy_stats_show(void *data, struct seq_file *m)
+{
+ struct hierarchy_stage *hstage = data;
+ u64 dispatched[NEW_NR_STAT_GROUPS] = {0};
+ u64 completed[NEW_NR_STAT_GROUPS] = {0};
+ u64 latency[NEW_NR_STAT_GROUPS] = {0};
+ int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ struct hierarchy_stats *stat = per_cpu_ptr(hstage->hstats, cpu);
+
+ for (i = 0; i < NEW_NR_STAT_GROUPS; ++i) {
+ dispatched[i] += stat->dispatched[i];
+ completed[i] += stat->completed[i];
+ latency[i] += stage_is_rq(hstage->stage) ?
+ stat->jiffies[i] : stat->nsecs[i];
+ }
+ }
+
+ if (stage_is_rq(hstage->stage))
+ for (i = 0; i < NEW_NR_STAT_GROUPS; ++i)
+ latency[i] =
+ jiffies_to_msecs(latency[i]) * NSEC_PER_MSEC;
+
+ seq_printf(m, "%llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
+ dispatched[STAT_READ], completed[STAT_READ],
+ latency[STAT_READ], dispatched[STAT_WRITE],
+ completed[STAT_WRITE], latency[STAT_WRITE],
+ dispatched[STAT_DISCARD], completed[STAT_DISCARD],
+ latency[STAT_DISCARD], dispatched[STAT_FLUSH],
+ completed[STAT_FLUSH], latency[STAT_FLUSH]);
+
+ hierarchy_show_slow_io(hstage, m);
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static void *hierarchy_stats_start(struct seq_file *m, loff_t *pos)
+{
+ int ret;
+ enum stage_group stage = *pos;
+ struct blk_io_hierarchy_stats *stats = m->private;
+
+ ret = blk_queue_enter(stats->q, 0);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (stage < 0 || stage >= NR_STAGE_GROUPS)
+ return NULL;
+
+ return pos;
+}
+
+static void *hierarchy_stats_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ enum stage_group stage = ++(*pos);
+
+ if (stage >= 0 && stage < NR_STAGE_GROUPS)
+ return pos;
+
+ return NULL;
+}
+
+static void hierarchy_stats_stop(struct seq_file *m, void *v)
+{
+ struct blk_io_hierarchy_stats *stats = m->private;
+
+ if (!IS_ERR(v))
+ blk_queue_exit(stats->q);
+}
+
+static int hierarchy_stats_show(struct seq_file *m, void *v)
+{
+ enum stage_group stage = (*(loff_t *)v);
+ struct blk_io_hierarchy_stats *stats = m->private;
+ struct hierarchy_stage *hstage = stats->hstage[stage];
+
+ if (!hstage)
+ return 0;
+
+ seq_printf(m, "%s ", hierarchy_stage_name(stage));
+ __hierarchy_stats_show(hstage, m);
+ return 0;
+}
+
+static const struct seq_operations hierarchy_stats_ops = {
+ .start = hierarchy_stats_start,
+ .next = hierarchy_stats_next,
+ .stop = hierarchy_stats_stop,
+ .show = hierarchy_stats_show,
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_debugfs_attrs[] = {
+ {"stats", 0400, __hierarchy_stats_show},
+ {},
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_stats_attr[] = {
+ {"stats", 0400, .seq_ops = &hierarchy_stats_ops},
+ {},
+};
+
+static void hierarchy_register_stage(struct blk_io_hierarchy_stats *stats,
+ enum stage_group stage)
+{
+ struct hierarchy_stage *hstage = stats->hstage[stage];
+ struct dentry *dir;
+
+ if (!stage_name[stage] || hstage->debugfs_dir)
+ return;
+
+ dir = debugfs_create_dir(stage_name[stage], stats->debugfs_dir);
+ if (IS_ERR(dir))
+ return;
+
+ hstage->debugfs_dir = dir;
+ debugfs_create_files(dir, hstage, hierarchy_debugfs_attrs);
+ io_hierarchy_register_iodump(hstage);
+}
+
+static void hierarchy_unregister_stage(struct blk_io_hierarchy_stats *stats,
+ enum stage_group stage)
+{
+ struct hierarchy_stage *hstage = stats->hstage[stage];
+
+ if (!stage_name[stage] || !hstage->debugfs_dir)
+ return;
+
+ debugfs_remove_recursive(hstage->debugfs_dir);
+ hstage->debugfs_dir = NULL;
+}
+
+void blk_mq_debugfs_register_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!blk_mq_hierarchy_registered(q, stage) ||
+ !blk_mq_debugfs_enabled(q))
+ return;
+
+ hierarchy_register_stage(stats, stage);
+}
+
+void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!blk_mq_hierarchy_registered(q, stage) ||
+ !blk_mq_debugfs_enabled(q))
+ return;
+
+ hierarchy_unregister_stage(stats, stage);
+}
+
+void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!blk_mq_debugfs_enabled(q))
+ return;
+
+ debugfs_create_files(stats->debugfs_dir, stats, hierarchy_stats_attr);
+}
diff --git a/block/blk-io-hierarchy/iodump.c b/block/blk-io-hierarchy/iodump.c
new file mode 100644
index 000000000000..49ad2292873c
--- /dev/null
+++ b/block/blk-io-hierarchy/iodump.c
@@ -0,0 +1,753 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/blktrace_api.h>
+#include <linux/blk-cgroup.h>
+#include <linux/sched/task.h>
+
+#include "iodump.h"
+#include "../blk.h"
+#include "../blk-mq-debugfs.h"
+
+#define RWB_LEN 6
+#define PATH_LEN 64
+#define ms_to_ns(time) (time * NSEC_PER_MSEC)
+#define DEFAULT_THRESHOLD 1000
+
+static DEFINE_MUTEX(dump_mutex);
+
+struct bio_dump_data {
+ u64 stat_time;
+ struct list_head head;
+ spinlock_t lock;
+};
+
+struct rq_dump_data {
+ struct request_queue *q;
+ enum stage_group stage;
+ unsigned int tag;
+ unsigned int total_tags;
+ bool shared;
+ bool has_elevator;
+ bool enter_queue;
+};
+
+#ifdef CONFIG_HIERARCHY_BIO
+struct pos_data {
+ enum stage_group stage;
+ unsigned int count;
+};
+
+struct bio_stage_dump_data {
+ union {
+ loff_t pos;
+ struct pos_data pdata;
+ };
+ struct rq_dump_data rq_ddata;
+ u64 stat_time;
+};
+#endif
+
+static struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
+{
+ return q->queue_hw_ctx[id];
+}
+
+int blk_io_hierarchy_iodump_init(struct request_queue *q,
+ struct hierarchy_stage *hstage)
+{
+ hstage->threshold = DEFAULT_THRESHOLD;
+
+ if (stage_is_bio(hstage->stage)) {
+ struct bio_dump_data *bio_ddata =
+ kmalloc(sizeof(*bio_ddata), GFP_KERNEL);
+
+ if (!bio_ddata)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&bio_ddata->head);
+ spin_lock_init(&bio_ddata->lock);
+ hstage->dump_data = bio_ddata;
+ return 0;
+ }
+
+ if (stage_is_rq(hstage->stage)) {
+ struct rq_dump_data *rq_ddata =
+ kzalloc(sizeof(*rq_ddata), GFP_KERNEL);
+
+ if (!rq_ddata)
+ return -ENOMEM;
+
+ rq_ddata->q = q;
+ rq_ddata->stage = hstage->stage;
+ hstage->dump_data = rq_ddata;
+ return 0;
+ }
+
+#ifdef CONFIG_HIERARCHY_BIO
+ BUILD_BUG_ON(sizeof(struct pos_data) != sizeof(loff_t));
+
+ if (hstage->stage == STAGE_BIO) {
+ struct bio_stage_dump_data *bstage_ddata =
+ kzalloc(sizeof(*bstage_ddata), GFP_KERNEL);
+
+ if (!bstage_ddata)
+ return -ENOMEM;
+
+ bstage_ddata->rq_ddata.q = q;
+ bstage_ddata->rq_ddata.stage = hstage->stage;
+ hstage->dump_data = bstage_ddata;
+ return 0;
+ }
+#endif
+
+ return -EINVAL;
+}
+
+void blk_io_hierarchy_iodump_exit(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct hierarchy_stage *hstage = q->io_hierarchy_stats->hstage[stage];
+
+ if (stage_is_bio(hstage->stage)) {
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ WARN(!list_empty(&bio_ddata->head),
+ "blk-io-hierarchy: disk %s stage %s unregistered whih throttled IO.\n",
+ kobject_name(q->kobj.parent), hierarchy_stage_name(stage));
+ }
+
+ kfree(hstage->dump_data);
+ hstage->dump_data = NULL;
+}
+
+void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+ unsigned long flags;
+ struct bio_hierarchy_data *data = bio->hdata;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ spin_lock_irqsave(&bio_ddata->lock, flags);
+ list_add_tail(&data->hierarchy_list, &bio_ddata->head);
+ spin_unlock_irqrestore(&bio_ddata->lock, flags);
+}
+
+void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+ unsigned long flags;
+ struct bio_hierarchy_data *data = bio->hdata;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ spin_lock_irqsave(&bio_ddata->lock, flags);
+ list_del_init(&data->hierarchy_list);
+ spin_unlock_irqrestore(&bio_ddata->lock, flags);
+}
+
+void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata)
+{
+ hdata->bio = bio;
+ INIT_LIST_HEAD(&hdata->hierarchy_list);
+}
+
+static void *bio_hierarchy_list_start(struct seq_file *m, loff_t *pos)
+ __acquires(&bio_ddata->lock)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ spin_lock_irq(&bio_ddata->lock);
+ bio_ddata->stat_time = blk_time_get_ns();
+
+ return seq_list_start(&bio_ddata->head, *pos);
+}
+
+static void *bio_hierarchy_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ return seq_list_next(v, &bio_ddata->head, pos);
+}
+
+static void bio_hierarchy_list_stop(struct seq_file *m, void *v)
+ __releases(&hstage->lock)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ spin_unlock_irq(&bio_ddata->lock);
+}
+
+static void __hierarchy_show_bio(struct seq_file *m,
+ struct bio_hierarchy_data *data,
+ enum stage_group stage, u64 duration)
+{
+ char rwbs[RWB_LEN];
+ char path[PATH_LEN] = {0};
+ struct bio *bio = data->bio;
+ struct task_struct *task = get_pid_task(bio->pid, PIDTYPE_PID);
+
+ blk_fill_rwbs(rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ cgroup_path(bio->bi_css->cgroup, path, PATH_LEN);
+
+ seq_printf(m, "%s-%d %s stage %s bio %s %lu + %u cgroup %s started %llu ns ago\n",
+ task ? task->comm : "null", task ? task->pid : 0,
+ bio->bi_disk->disk_name, hierarchy_stage_name(stage),
+ rwbs, bio->bi_iter.bi_sector, bio_sectors(bio), path,
+ duration);
+
+ if (task)
+ put_task_struct(task);
+}
+
+static u64 get_duration(u64 a, u64 b)
+{
+ return a > b ? a - b : 0;
+}
+
+static void hierarchy_show_bio(struct seq_file *m,
+ struct bio_hierarchy_data *data)
+{
+ u64 duration;
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_dump_data *bio_ddata = hstage->dump_data;
+
+ duration = get_duration(bio_ddata->stat_time, data->time);
+ if (hstage->threshold > ns_to_ms(duration))
+ return;
+
+ __hierarchy_show_bio(m, data, hstage->stage, duration);
+}
+
+static int bio_hierarchy_list_show(struct seq_file *m, void *v)
+{
+ struct bio_hierarchy_data *data =
+ list_entry(v, struct bio_hierarchy_data, hierarchy_list);
+
+ hierarchy_show_bio(m, data);
+ return 0;
+}
+
+static const struct seq_operations hierarchy_bio_dump_ops = {
+ .start = bio_hierarchy_list_start,
+ .next = bio_hierarchy_list_next,
+ .stop = bio_hierarchy_list_stop,
+ .show = bio_hierarchy_list_show,
+};
+
+static int threshold_show(void *data, struct seq_file *m)
+{
+ struct hierarchy_stage *hstage = data;
+
+ seq_printf(m, "%lu\n", hstage->threshold);
+ return 0;
+}
+
+/*
+ * max size needed by different bases to express U64
+ * HEX: "0xFFFFFFFFFFFFFFFF" --> 18
+ * DEC: "18446744073709551615" --> 20
+ * OCT: "01777777777777777777777" --> 23
+ * pick the max one to define MAX_BUF_LEN
+ */
+#define MAX_BUF_LEN 24
+static ssize_t threshold_store(void *data, const char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ int err;
+ unsigned long val;
+ char b[MAX_BUF_LEN + 1];
+ struct hierarchy_stage *hstage = data;
+
+ if (count > MAX_BUF_LEN)
+ return -EINVAL;
+
+ if (copy_from_user(b, buf, count))
+ return -EFAULT;
+
+ b[count] = 0;
+ err = kstrtoul(b, 0, &val);
+ if (!err)
+ hstage->threshold = val;
+
+ return err ? err : count;
+}
+
+static void rq_hierarchy_init_dump_data(struct rq_dump_data *rq_ddata)
+{
+ struct request_queue *q = rq_ddata->q;
+
+ rq_ddata->shared = blk_mq_is_sbitmap_shared(q->tag_set->flags);
+ rq_ddata->has_elevator = !!q->elevator;
+
+ if (rq_ddata->has_elevator)
+ rq_ddata->total_tags = q->nr_hw_queues * q->nr_requests;
+ else
+ rq_ddata->total_tags = q->nr_hw_queues * q->tag_set->queue_depth;
+}
+
+static bool __rq_hierarchy_start(struct rq_dump_data *rq_ddata,
+ unsigned int tag)
+{
+ /*
+ * Grab .q_usage_counter so request pool won't go away, then no
+ * request use-after-free is possible during iteration. If queue is
+ * frozen, there won't be any inflight requests.
+ */
+ if (!percpu_ref_tryget(&rq_ddata->q->q_usage_counter)) {
+ rq_ddata->enter_queue = false;
+ return false;
+ }
+
+ rq_ddata->enter_queue = true;
+ rq_hierarchy_init_dump_data(rq_ddata);
+ rq_ddata->tag = tag;
+
+ return tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues;
+}
+
+static bool __rq_hierarchy_next(struct rq_dump_data *rq_ddata)
+{
+ rq_ddata->tag++;
+
+ return rq_ddata->tag < rq_ddata->total_tags + rq_ddata->q->nr_hw_queues;
+}
+
+static void __rq_hierarchy_stop(struct rq_dump_data *rq_ddata)
+{
+ if (rq_ddata->enter_queue) {
+ percpu_ref_put(&rq_ddata->q->q_usage_counter);
+ rq_ddata->enter_queue = false;
+ }
+}
+
+static void *rq_hierarchy_start(struct seq_file *m, loff_t *pos)
+ __acquires(&dump_mutex)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct rq_dump_data *rq_ddata = hstage->dump_data;
+
+ mutex_lock(&dump_mutex);
+
+ if (__rq_hierarchy_start(rq_ddata, *pos))
+ return rq_ddata;
+
+ return NULL;
+}
+
+static void *rq_hierarchy_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct rq_dump_data *rq_ddata = v;
+
+ if (__rq_hierarchy_next(rq_ddata)) {
+ *pos = rq_ddata->tag;
+ return rq_ddata;
+ }
+
+ (*pos)++;
+ return NULL;
+}
+
+static void rq_hierarchy_stop(struct seq_file *m, void *v)
+ __releases(&dump_mutex)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct rq_dump_data *rq_ddata = hstage->dump_data;
+
+ __rq_hierarchy_stop(rq_ddata);
+ mutex_unlock(&dump_mutex);
+}
+
+static struct request *hierarchy_find_and_get_rq(struct rq_dump_data *rq_ddata)
+{
+ struct request *rq;
+ struct blk_mq_hw_ctx *hctx;
+ struct request_queue *q = rq_ddata->q;
+ unsigned int nr_tag = rq_ddata->tag;
+ unsigned int hctx_id;
+
+ if (nr_tag >= rq_ddata->total_tags) {
+ hctx_id = nr_tag - rq_ddata->total_tags;
+ if (hctx_id >= q->nr_hw_queues)
+ return NULL;
+
+ hctx = queue_hctx(q, hctx_id);
+ rq = hctx->fq->flush_rq;
+ } else if (rq_ddata->shared) {
+ return NULL;
+ } else if (rq_ddata->has_elevator) {
+ hctx_id = nr_tag / q->nr_requests;
+ if (hctx_id >= q->nr_hw_queues)
+ return NULL;
+
+ hctx = queue_hctx(q, hctx_id);
+ rq = hctx->sched_tags->static_rqs[nr_tag % q->nr_requests];
+ } else {
+ hctx_id = nr_tag / q->tag_set->queue_depth;
+ if (hctx_id >= q->nr_hw_queues)
+ return NULL;
+
+ hctx = queue_hctx(q, hctx_id);
+ if (!hctx->tags)
+ return NULL;
+
+ rq = hctx->tags->static_rqs[nr_tag % q->tag_set->queue_depth];
+ }
+
+ /*
+ * fast path to avoid refcount cas operations for the request that
+ * is from other shared request_queue or other stages.
+ */
+ if (rq->q != q || (rq_ddata->stage != STAGE_BIO &&
+ READ_ONCE(rq->stage) != rq_ddata->stage))
+ return NULL;
+
+ if (!refcount_inc_not_zero(&rq->ref))
+ return NULL;
+
+ /* Check again after request is pinned, in case request is reused. */
+ if (rq->q != q) {
+ blk_mq_put_rq_ref(rq);
+ return NULL;
+ }
+
+ if (rq_ddata->stage == STAGE_BIO)
+ return rq;
+
+ /*
+ * Barrier is paired with the smp_store_release() in
+ * rq_hierarchy_start_io_acct(), so that if stage is read, uninitialized
+ * hierarchy_time won't be read.
+ */
+ if (smp_load_acquire(&rq->stage) != rq_ddata->stage) {
+ blk_mq_put_rq_ref(rq);
+ return NULL;
+ }
+
+ return rq;
+}
+
+static void hierarchy_show_rq(struct seq_file *m, struct request *rq,
+ u64 duration)
+{
+ struct task_struct *task = get_pid_task(rq->pid, PIDTYPE_PID);
+ const char *name = hierarchy_stage_name(rq->stage);
+
+ seq_printf(m, "%s-%d %s stage %s ", task ? task->comm : "null",
+ task ? task->pid : 0,
+ rq->rq_disk ? rq->rq_disk->disk_name : "?",
+ name ? name : "?");
+ debugfs_rq_show(m, rq);
+ seq_printf(m, " started %llu ns ago}\n", duration);
+
+ if (task)
+ put_task_struct(task);
+}
+
+static int rq_hierarchy_show(struct seq_file *m, void *v)
+{
+ u64 duration;
+ unsigned long htime;
+ struct hierarchy_stage *hstage = m->private;
+ struct request *rq = hierarchy_find_and_get_rq(v);
+
+ if (!rq)
+ return 0;
+
+ htime = READ_ONCE(rq->hierarchy_time);
+ htime = time_after(jiffies, htime) ? jiffies - htime : 0;
+ duration = jiffies_to_msecs(htime);
+ if (hstage->threshold <= duration)
+ hierarchy_show_rq(m, rq, ms_to_ns(duration));
+
+ blk_mq_put_rq_ref(rq);
+ return 0;
+}
+
+static const struct seq_operations hierarchy_rq_dump_ops = {
+ .start = rq_hierarchy_start,
+ .next = rq_hierarchy_next,
+ .stop = rq_hierarchy_stop,
+ .show = rq_hierarchy_show,
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_threshold_attr[] = {
+ {
+ "threshold",
+ 0600,
+ threshold_show,
+ threshold_store,
+ },
+ {},
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_bio_dump_attr[] = {
+ {
+ "io_dump",
+ 0400,
+ .seq_ops = &hierarchy_bio_dump_ops,
+ },
+ {},
+};
+
+static const struct blk_mq_debugfs_attr hierarchy_rq_dump_attr[] = {
+ {
+ "io_dump",
+ 0400,
+ .seq_ops = &hierarchy_rq_dump_ops,
+ },
+ {},
+};
+
+#ifdef CONFIG_HIERARCHY_BIO
+static struct bio_dump_data *get_bio_stage_ddata(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+ struct hierarchy_stage *hstage = READ_ONCE(stats->hstage[stage]);
+
+ if (!hstage)
+ return NULL;
+
+ return hstage->dump_data;
+}
+
+static void bio_stage_start_next_stage(struct bio_stage_dump_data *bstage_ddata,
+ loff_t *pos)
+{
+ struct pos_data *pdata = &bstage_ddata->pdata;
+
+ pdata->stage++;
+ if (!stage_is_bio(pdata->stage))
+ pdata->stage = STAGE_BIO;
+ pdata->count = 0;
+
+ *pos = bstage_ddata->pos;
+}
+
+static void bio_stage_start_next_io(struct bio_stage_dump_data *bstage_ddata,
+ loff_t *pos)
+{
+ struct pos_data *pdata = &bstage_ddata->pdata;
+
+ if (stage_is_bio(pdata->stage))
+ pdata->count++;
+ else
+ pdata->count = bstage_ddata->rq_ddata.tag;
+
+ *pos = bstage_ddata->pos;
+}
+
+static void __bio_stage_hierarchy_stop(struct bio_stage_dump_data *bstage_ddata)
+{
+ struct pos_data *pdata = &bstage_ddata->pdata;
+ struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+
+ if (stage_is_bio(pdata->stage)) {
+ struct bio_dump_data *bio_ddata =
+ get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+
+ spin_unlock_irq(&bio_ddata->lock);
+ }
+
+ if (rq_ddata->enter_queue) {
+ percpu_ref_put(&rq_ddata->q->q_usage_counter);
+ rq_ddata->enter_queue = false;
+ }
+}
+
+void *__bio_stage_hierarchy_start(struct bio_stage_dump_data *bstage_ddata,
+ loff_t *pos)
+{
+ struct pos_data *pdata = &bstage_ddata->pdata;
+ struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+
+retry:
+ if (stage_is_bio(pdata->stage)) {
+ struct list_head *list;
+ struct bio_dump_data *bio_ddata =
+ get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+
+ if (!bio_ddata) {
+ bio_stage_start_next_stage(bstage_ddata, pos);
+ goto retry;
+ }
+
+ spin_lock_irq(&bio_ddata->lock);
+ list = seq_list_start(&bio_ddata->head, pdata->count);
+ if (list)
+ return list;
+
+ spin_unlock_irq(&bio_ddata->lock);
+ bio_stage_start_next_stage(bstage_ddata, pos);
+ goto retry;
+ }
+
+ if (pdata->stage == STAGE_BIO &&
+ __rq_hierarchy_start(rq_ddata, pdata->count))
+ return bstage_ddata;
+
+ return NULL;
+}
+
+static void *bio_stage_hierarchy_start(struct seq_file *m, loff_t *pos)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+
+ mutex_lock(&dump_mutex);
+ bstage_ddata->pos = *pos;
+ bstage_ddata->stat_time = blk_time_get_ns();
+
+ return __bio_stage_hierarchy_start(bstage_ddata, pos);
+}
+
+static void *bio_stage_hierarchy_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+ struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+ struct pos_data *pdata = &bstage_ddata->pdata;
+
+ if (stage_is_bio(pdata->stage)) {
+ struct bio_dump_data *bio_ddata =
+ get_bio_stage_ddata(rq_ddata->q, pdata->stage);
+ struct list_head *list = ((struct list_head *)v)->next;
+
+ if (list != &bio_ddata->head) {
+ bio_stage_start_next_io(bstage_ddata, pos);
+ return list;
+ }
+
+ spin_unlock_irq(&bio_ddata->lock);
+
+ bio_stage_start_next_stage(bstage_ddata, pos);
+ return __bio_stage_hierarchy_start(bstage_ddata, pos);
+ }
+
+ if (pdata->stage == STAGE_BIO &&
+ __rq_hierarchy_next(rq_ddata)) {
+ bio_stage_start_next_io(bstage_ddata, pos);
+ return bstage_ddata;
+ }
+
+ (*pos)++;
+ return NULL;
+}
+
+static void bio_stage_hierarchy_stop(struct seq_file *m, void *v)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+
+ __bio_stage_hierarchy_stop(bstage_ddata);
+ mutex_unlock(&dump_mutex);
+}
+
+static int bio_stage_hierarchy_show(struct seq_file *m, void *v)
+{
+ struct hierarchy_stage *hstage = m->private;
+ struct bio_stage_dump_data *bstage_ddata = hstage->dump_data;
+ struct rq_dump_data *rq_ddata = &bstage_ddata->rq_ddata;
+ struct pos_data *pdata = &bstage_ddata->pdata;
+ u64 duration;
+
+ if (stage_is_bio(pdata->stage)) {
+ struct bio_hierarchy_data *data = list_entry(
+ v, struct bio_hierarchy_data, hierarchy_list);
+
+ duration = get_duration(bstage_ddata->stat_time,
+ data->bio->bi_alloc_time_ns);
+ if (hstage->threshold <= ns_to_ms(duration))
+ __hierarchy_show_bio(m, data, pdata->stage, duration);
+ } else if (pdata->stage == STAGE_BIO) {
+ struct request *rq = hierarchy_find_and_get_rq(rq_ddata);
+
+ if (rq) {
+ duration = get_duration(bstage_ddata->stat_time,
+ rq->bi_alloc_time_ns);
+ if (hstage->threshold <= ns_to_ms(duration))
+ hierarchy_show_rq(m, rq, duration);
+ blk_mq_put_rq_ref(rq);
+ }
+ }
+
+ return 0;
+}
+
+static const struct seq_operations bio_stage_hierarchy_ops = {
+ .start = bio_stage_hierarchy_start,
+ .next = bio_stage_hierarchy_next,
+ .stop = bio_stage_hierarchy_stop,
+ .show = bio_stage_hierarchy_show,
+};
+
+static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = {
+ {
+ "io_dump",
+ 0400,
+ .seq_ops = &bio_stage_hierarchy_ops,
+ },
+ {},
+};
+
+#else /* CONFIG_HIERARCHY_BIO */
+static const struct blk_mq_debugfs_attr bio_stage_dump_attr[] = {
+ {},
+};
+
+#endif
+
+void io_hierarchy_register_iodump(struct hierarchy_stage *hstage)
+{
+ const struct blk_mq_debugfs_attr *attr;
+
+ if (stage_is_bio(hstage->stage))
+ attr = hierarchy_bio_dump_attr;
+ else if (stage_is_rq(hstage->stage))
+ attr = hierarchy_rq_dump_attr;
+ else if (hstage->stage == STAGE_BIO)
+ attr = bio_stage_dump_attr;
+ else
+ attr = NULL;
+
+ debugfs_create_files(hstage->debugfs_dir, hstage,
+ hierarchy_threshold_attr);
+ if (attr)
+ debugfs_create_files(hstage->debugfs_dir, hstage, attr);
+}
+
+void hierarchy_account_slow_io(struct hierarchy_stage *hstage,
+ enum stat_group op, unsigned long duration)
+{
+ if (hstage->threshold <= duration)
+ this_cpu_inc(hstage->hstats->slow[op]);
+}
+
+void hierarchy_show_slow_io(struct hierarchy_stage *hstage, struct seq_file *m)
+{
+ u64 slow[NEW_NR_STAT_GROUPS] = {0};
+ int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ struct hierarchy_stats *stat = per_cpu_ptr(hstage->hstats, cpu);
+
+ for (i = 0; i < NEW_NR_STAT_GROUPS; ++i)
+ slow[i] += stat->slow[i];
+ }
+
+ seq_printf(m, " %llu %llu %llu %llu", slow[STAT_READ], slow[STAT_WRITE],
+ slow[STAT_DISCARD], slow[STAT_FLUSH]);
+}
diff --git a/block/blk-io-hierarchy/iodump.h b/block/blk-io-hierarchy/iodump.h
new file mode 100644
index 000000000000..2f9e159f2588
--- /dev/null
+++ b/block/blk-io-hierarchy/iodump.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef BLK_IO_HIERARCHY_IODUMP_H
+#define BLK_IO_HIERARCHY_IODUMP_H
+
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+
+#include "stats.h"
+
+#define ns_to_ms(time) div_u64(time, NSEC_PER_MSEC)
+
+int blk_io_hierarchy_iodump_init(struct request_queue *q,
+ struct hierarchy_stage *hstage);
+void blk_io_hierarchy_iodump_exit(struct request_queue *q,
+ enum stage_group stage);
+void hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio);
+void hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio);
+void bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata);
+void io_hierarchy_register_iodump(struct hierarchy_stage *hstage);
+
+void hierarchy_account_slow_io(struct hierarchy_stage *hstage,
+ enum stat_group op, unsigned long duration);
+void hierarchy_show_slow_io(struct hierarchy_stage *hstage, struct seq_file *m);
+
+static inline void
+hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage,
+ enum stat_group op, u64 duration)
+{
+ hierarchy_account_slow_io(hstage, op, ns_to_ms(duration));
+}
+
+static inline void
+hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage,
+ enum stat_group op, unsigned long duration)
+{
+ hierarchy_account_slow_io(hstage, op, jiffies_to_msecs(duration));
+}
+
+#else
+static inline int
+blk_io_hierarchy_iodump_init(struct request_queue *q,
+ struct hierarchy_stage *hstage)
+{
+ return 0;
+}
+
+static inline void
+blk_io_hierarchy_iodump_exit(struct request_queue *q, enum stage_group stage)
+{
+}
+
+static inline void
+hierarchy_add_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+}
+
+static inline void
+hierarchy_remove_bio(struct hierarchy_stage *hstage, struct bio *bio)
+{
+}
+
+static inline void
+bio_hierarchy_data_init(struct bio *bio, struct bio_hierarchy_data *hdata)
+{
+}
+
+static inline void
+io_hierarchy_register_iodump(struct hierarchy_stage *hstage)
+{
+}
+
+static inline void
+hierarchy_account_slow_io_ns(struct hierarchy_stage *hstage,
+ enum stat_group op, u64 duration)
+{
+}
+
+static inline void
+hierarchy_account_slow_io_jiffies(struct hierarchy_stage *hstage,
+ enum stat_group op, unsigned long duration)
+{
+}
+
+static inline void
+hierarchy_show_slow_io(struct hierarchy_stage *hstage, struct seq_file *m)
+{
+}
+#endif
+#endif
diff --git a/block/blk-io-hierarchy/stats.c b/block/blk-io-hierarchy/stats.c
new file mode 100644
index 000000000000..52a23413f468
--- /dev/null
+++ b/block/blk-io-hierarchy/stats.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "stats.h"
+#include "iodump.h"
+#include "../blk.h"
+#include "../blk-mq-debugfs.h"
+
+#define io_hierarchy_add(statsp, field, group, nr) \
+ this_cpu_add((statsp)->field[group], nr)
+#define io_hierarchy_inc(statsp, field, group) \
+ io_hierarchy_add(statsp, field, group, 1)
+
+#define PRE_ALLOC_BIO_CNT 8
+
+static mempool_t *hdata_pool;
+
+void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats;
+ enum stage_group stage;
+
+ stats = q->io_hierarchy_stats;
+ if (!stats || !blk_mq_debugfs_enabled(q))
+ return;
+
+ stats->debugfs_dir = debugfs_create_dir("blk_io_hierarchy",
+ q->debugfs_dir);
+ blk_mq_debugfs_create_default_hierarchy_attr(q);
+
+ for (stage = 0; stage < NR_STAGE_GROUPS; ++stage)
+ blk_mq_debugfs_register_hierarchy(q, stage);
+}
+
+static void bio_alloc_hierarchy_data(struct bio *bio)
+{
+ if (!bio->hdata) {
+ struct bio_hierarchy_data *hdata =
+ mempool_alloc(hdata_pool, GFP_NOIO);
+
+ bio_hierarchy_data_init(bio, hdata);
+ bio->hdata = hdata;
+ }
+}
+
+void bio_free_hierarchy_data(struct bio *bio)
+{
+ if (!bio->hdata)
+ return;
+
+ mempool_free(bio->hdata, hdata_pool);
+ bio->hdata = NULL;
+}
+
+void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats;
+ enum stage_group stage;
+
+ stats = q->io_hierarchy_stats;
+ if (!stats || !blk_mq_debugfs_enabled(q))
+ return;
+
+ for (stage = 0; stage < NR_STAGE_GROUPS; ++stage)
+ blk_mq_debugfs_unregister_hierarchy(q, stage);
+
+ debugfs_remove_recursive(stats->debugfs_dir);
+ stats->debugfs_dir = NULL;
+}
+
+int blk_io_hierarchy_stats_alloc(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats;
+
+ if (!q->mq_ops)
+ return 0;
+
+ stats = kzalloc(sizeof(struct blk_io_hierarchy_stats), GFP_KERNEL);
+ if (!stats)
+ return -ENOMEM;
+
+ stats->q = q;
+ q->io_hierarchy_stats = stats;
+
+ return 0;
+}
+
+void blk_io_hierarchy_stats_free(struct request_queue *q)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!stats)
+ return;
+
+ q->io_hierarchy_stats = NULL;
+ kfree(stats);
+}
+
+bool blk_mq_hierarchy_registered(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+
+ if (!stats)
+ return false;
+
+ return stats->hstage[stage] != NULL;
+}
+EXPORT_SYMBOL_GPL(blk_mq_hierarchy_registered);
+
+void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+ struct hierarchy_stage *hstage;
+
+ if (!stats || !hierarchy_stage_name(stage))
+ return;
+
+ if (blk_mq_hierarchy_registered(q, stage)) {
+ pr_warn("blk-io-hierarchy: disk %s is registering stage %s again.",
+ kobject_name(q->kobj.parent),
+ hierarchy_stage_name(stage));
+ return;
+ }
+
+ /*
+	 * Allocate memory before freezing the queue to prevent deadlock if
+	 * new IO is issued by memory reclaim.
+ */
+ hstage = kmalloc(sizeof(*hstage), GFP_KERNEL);
+ if (!hstage)
+ return;
+
+ hstage->hstats = alloc_percpu(struct hierarchy_stats);
+ if (!hstage->hstats) {
+ kfree(hstage);
+ return;
+ }
+
+ hstage->stage = stage;
+ hstage->debugfs_dir = NULL;
+ if (blk_io_hierarchy_iodump_init(q, hstage) < 0) {
+ free_percpu(hstage->hstats);
+ kfree(hstage);
+ return;
+ }
+
+ blk_mq_freeze_queue(q);
+
+ WRITE_ONCE(stats->hstage[stage], hstage);
+ blk_mq_debugfs_register_hierarchy(q, stage);
+
+ blk_mq_unfreeze_queue(q);
+}
+EXPORT_SYMBOL_GPL(blk_mq_register_hierarchy);
+
+void blk_mq_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+ struct blk_io_hierarchy_stats *stats = q->io_hierarchy_stats;
+ struct hierarchy_stage *hstage;
+
+ if (!blk_mq_hierarchy_registered(q, stage))
+ return;
+
+ blk_mq_debugfs_unregister_hierarchy(q, stage);
+ blk_io_hierarchy_iodump_exit(q, stage);
+
+ hstage = stats->hstage[stage];
+ stats->hstage[stage] = NULL;
+ free_percpu(hstage->hstats);
+ kfree(hstage);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unregister_hierarchy);
+
+static enum stat_group bio_hierarchy_op(struct bio *bio)
+{
+ if (op_is_discard(bio->bi_opf))
+ return STAT_DISCARD;
+
+ if (op_is_flush(bio->bi_opf) &&
+ !(bio_sectors(bio) || bio_flagged(bio, BIO_HAS_DATA)))
+ return STAT_FLUSH;
+
+ if (op_is_write(bio->bi_opf))
+ return STAT_WRITE;
+
+ return STAT_READ;
+}
+
+
+void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ struct hierarchy_stage *hstage;
+
+ if (!blk_mq_hierarchy_registered(q, stage))
+ return;
+
+ hstage = q->io_hierarchy_stats->hstage[stage];
+ bio_alloc_hierarchy_data(bio);
+ io_hierarchy_inc(hstage->hstats, dispatched, bio_hierarchy_op(bio));
+ bio->hdata->time = blk_time_get_ns();
+ hierarchy_add_bio(hstage, bio);
+}
+
+void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage,
+ u64 time)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ struct hierarchy_stage *hstage;
+ u64 duration;
+ enum stat_group op;
+
+ if (!blk_mq_hierarchy_registered(q, stage))
+ return;
+
+ op = bio_hierarchy_op(bio);
+ duration = time - bio->hdata->time;
+ hstage = q->io_hierarchy_stats->hstage[stage];
+
+ hierarchy_remove_bio(hstage, bio);
+ io_hierarchy_inc(hstage->hstats, completed, op);
+ io_hierarchy_add(hstage->hstats, nsecs, op, duration);
+ hierarchy_account_slow_io_ns(hstage, op, duration);
+}
+
+static enum stat_group rq_hierarchy_op(struct request *rq)
+{
+ if (op_is_discard(rq->cmd_flags))
+ return STAT_DISCARD;
+
+ if (is_flush_rq(rq))
+ return STAT_FLUSH;
+
+ if (op_is_write(rq->cmd_flags))
+ return STAT_WRITE;
+
+ return STAT_READ;
+}
+
+void __rq_hierarchy_start_io_acct(struct request *rq,
+ struct hierarchy_stage *hstage)
+{
+ io_hierarchy_inc(hstage->hstats, dispatched, rq_hierarchy_op(rq));
+ WRITE_ONCE(rq->hierarchy_time, jiffies);
+
+ /*
+	 * Paired with the barrier in hierarchy_show_rq_fn() to make sure
+	 * hierarchy_time is set before stage.
+ */
+ smp_store_release(&rq->stage, hstage->stage);
+}
+EXPORT_SYMBOL_GPL(__rq_hierarchy_start_io_acct);
+
+void __rq_hierarchy_end_io_acct(struct request *rq,
+ struct hierarchy_stage *hstage)
+{
+ unsigned long duration = jiffies - rq->hierarchy_time;
+ enum stat_group op = rq_hierarchy_op(rq);
+
+ io_hierarchy_inc(hstage->hstats, completed, op);
+ io_hierarchy_add(hstage->hstats, jiffies, op, duration);
+ hierarchy_account_slow_io_jiffies(hstage, op, duration);
+ WRITE_ONCE(rq->stage, NR_RQ_STAGE_GROUPS);
+}
+EXPORT_SYMBOL_GPL(__rq_hierarchy_end_io_acct);
+
+#ifdef CONFIG_HIERARCHY_BIO
+void bio_hierarchy_start(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_disk;
+ struct hierarchy_stage *hstage;
+
+ if (bio_flagged(bio, BIO_HIERARCHY_ACCT))
+ return;
+
+ if (!blk_mq_hierarchy_registered(disk->queue, STAGE_BIO))
+ return;
+
+ bio_set_flag(bio, BIO_HIERARCHY_ACCT);
+ if (bio_has_data(bio))
+ bio_set_flag(bio, BIO_HAS_DATA);
+ hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO];
+ io_hierarchy_inc(hstage->hstats, dispatched, bio_hierarchy_op(bio));
+}
+
+void __bio_hierarchy_end(struct bio *bio, u64 now)
+{
+ struct gendisk *disk = bio->bi_disk;
+ struct hierarchy_stage *hstage;
+ u64 duration;
+ enum stat_group op;
+
+ op = bio_hierarchy_op(bio);
+ duration = now - bio->bi_alloc_time_ns;
+ hstage = disk->queue->io_hierarchy_stats->hstage[STAGE_BIO];
+
+ io_hierarchy_inc(hstage->hstats, completed, op);
+ io_hierarchy_add(hstage->hstats, nsecs, op, duration);
+ hierarchy_account_slow_io_ns(hstage, op, duration);
+
+ bio_clear_flag(bio, BIO_HIERARCHY_ACCT);
+ bio_clear_flag(bio, BIO_HAS_DATA);
+}
+
+#endif
+
+static int __init hierarchy_stats_init(void)
+{
+ hdata_pool = mempool_create_kmalloc_pool(PRE_ALLOC_BIO_CNT,
+ sizeof(struct bio_hierarchy_data));
+ if (!hdata_pool)
+ panic("Failed to create hdata_pool\n");
+
+ return 0;
+}
+module_init(hierarchy_stats_init);
diff --git a/block/blk-io-hierarchy/stats.h b/block/blk-io-hierarchy/stats.h
new file mode 100644
index 000000000000..ed3e5ddc084a
--- /dev/null
+++ b/block/blk-io-hierarchy/stats.h
@@ -0,0 +1,323 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef BLK_IO_HIERARCHY_STATS_H
+#define BLK_IO_HIERARCHY_STATS_H
+
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+
+#include <linux/blkdev.h>
+#include <linux/blk_types.h>
+#include "../blk.h"
+
+struct bio_hierarchy_data {
+ u64 time;
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+ struct bio *bio;
+ struct list_head hierarchy_list;
+#endif
+};
+
+struct hierarchy_stats {
+ union {
+ /* for bio based stages. */
+ u64 nsecs[NEW_NR_STAT_GROUPS];
+ /* for request based stages. */
+ unsigned long jiffies[NEW_NR_STAT_GROUPS];
+ };
+ unsigned long dispatched[NEW_NR_STAT_GROUPS];
+ unsigned long completed[NEW_NR_STAT_GROUPS];
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+ unsigned long slow[NEW_NR_STAT_GROUPS];
+#endif
+
+};
+
+struct hierarchy_stage {
+ enum stage_group stage;
+ struct dentry *debugfs_dir;
+ struct hierarchy_stats __percpu *hstats;
+#ifdef CONFIG_HIERARCHY_IO_DUMP
+ unsigned long threshold;
+ void *dump_data;
+#endif
+};
+
+struct blk_io_hierarchy_stats {
+ struct request_queue *q;
+ struct dentry *debugfs_dir;
+ struct hierarchy_stage *hstage[NR_STAGE_GROUPS];
+};
+
+static inline bool stage_is_bio(enum stage_group stage)
+{
+ return stage >= 0 && stage < NR_BIO_STAGE_GROUPS;
+}
+
+static inline bool stage_is_rq(enum stage_group stage)
+{
+ return stage >= NR_BIO_STAGE_GROUPS && stage < NR_RQ_STAGE_GROUPS;
+}
+
+const char *hierarchy_stage_name(enum stage_group stage);
+int blk_io_hierarchy_stats_alloc(struct request_queue *q);
+void blk_io_hierarchy_stats_free(struct request_queue *q);
+
+/* APIs for stage registration */
+bool blk_mq_hierarchy_registered(struct request_queue *q,
+ enum stage_group stage);
+void blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage);
+void blk_mq_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage);
+
+/* APIs for disk level debugfs */
+void blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q);
+void blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q);
+void blk_mq_debugfs_create_default_hierarchy_attr(struct request_queue *q);
+
+/* APIs for stage level debugfs */
+void blk_mq_debugfs_register_hierarchy(struct request_queue *q,
+ enum stage_group stage);
+void blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage);
+
+/* APIs for bio based stage io accounting */
+void bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage);
+void __bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage,
+ u64 time);
+void bio_free_hierarchy_data(struct bio *bio);
+
+static inline void bio_hierarchy_end_io_acct(struct bio *bio,
+ enum stage_group stage)
+{
+ __bio_hierarchy_end_io_acct(bio, stage, blk_time_get_ns());
+}
+
+static inline void bio_list_hierarchy_end_io_acct(struct bio_list *list,
+ enum stage_group stage)
+{
+ u64 time = blk_time_get_ns();
+ struct bio *bio;
+
+ bio_list_for_each(bio, list)
+ __bio_hierarchy_end_io_acct(bio, stage, time);
+}
+
+/* APIs for request based stage io accounting */
+void __rq_hierarchy_start_io_acct(struct request *rq,
+ struct hierarchy_stage *hstage);
+void __rq_hierarchy_end_io_acct(struct request *rq,
+ struct hierarchy_stage *hstage);
+
+static inline void rq_hierarchy_start_io_acct(struct request *rq,
+ enum stage_group stage)
+{
+ if (!blk_mq_hierarchy_registered(rq->q, stage))
+ return;
+
+ __rq_hierarchy_start_io_acct(
+ rq, rq->q->io_hierarchy_stats->hstage[stage]);
+}
+
+static inline void rq_hierarchy_end_io_acct(struct request *rq,
+ enum stage_group stage)
+{
+ if (!blk_mq_hierarchy_registered(rq->q, stage))
+ return;
+
+ __rq_hierarchy_end_io_acct(
+ rq, rq->q->io_hierarchy_stats->hstage[stage]);
+}
+
+static inline void rq_list_hierarchy_start_io_acct(struct list_head *head,
+ enum stage_group stage)
+{
+ struct request *rq;
+ struct hierarchy_stage *hstage;
+
+ if (list_empty(head))
+ return;
+
+ rq = list_first_entry(head, struct request, queuelist);
+ if (!blk_mq_hierarchy_registered(rq->q, stage))
+ return;
+
+ hstage = rq->q->io_hierarchy_stats->hstage[stage];
+ list_for_each_entry(rq, head, queuelist)
+ __rq_hierarchy_start_io_acct(rq, hstage);
+}
+
+static inline void rq_list_hierarchy_end_io_acct(struct list_head *head,
+ enum stage_group stage)
+{
+ struct request *rq;
+ struct hierarchy_stage *hstage;
+
+ if (list_empty(head))
+ return;
+
+ rq = list_first_entry(head, struct request, queuelist);
+ if (!blk_mq_hierarchy_registered(rq->q, stage))
+ return;
+
+ hstage = rq->q->io_hierarchy_stats->hstage[stage];
+ list_for_each_entry(rq, head, queuelist)
+ __rq_hierarchy_end_io_acct(rq, hstage);
+}
+
+#ifdef CONFIG_HIERARCHY_BIO
+void bio_hierarchy_start(struct bio *bio);
+void __bio_hierarchy_end(struct bio *bio, u64 now);
+
+static inline void bio_hierarchy_end(struct bio *bio)
+{
+ if (!bio_flagged(bio, BIO_HIERARCHY_ACCT))
+ return;
+
+ if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO))
+ return;
+
+ __bio_hierarchy_end(bio, blk_time_get_ns());
+}
+
+static inline void req_bio_hierarchy_end(struct request *rq, struct bio *bio)
+{
+ u64 now;
+
+ if (!bio_flagged(bio, BIO_HIERARCHY_ACCT))
+ return;
+
+ if (!blk_mq_hierarchy_registered(bio->bi_disk->queue, STAGE_BIO))
+ return;
+
+ now = rq->io_end_time_ns;
+ if (!now) {
+ now = blk_time_get_ns();
+ rq->io_end_time_ns = now;
+ }
+
+ __bio_hierarchy_end(bio, now);
+}
+#endif
+
+#else /* CONFIG_BLK_IO_HIERARCHY_STATS */
+
+static inline int
+blk_io_hierarchy_stats_alloc(struct request_queue *q)
+{
+ return 0;
+}
+
+static inline void
+blk_io_hierarchy_stats_free(struct request_queue *q)
+{
+}
+
+static inline bool
+blk_mq_hierarchy_registered(struct request_queue *q, enum stage_group stage)
+{
+ return false;
+}
+
+static inline void
+blk_mq_register_hierarchy(struct request_queue *q, enum stage_group stage)
+{
+}
+
+static inline void
+blk_mq_unregister_hierarchy(struct request_queue *q, enum stage_group stage)
+{
+}
+
+static inline void
+blk_mq_debugfs_register_hierarchy_stats(struct request_queue *q)
+{
+}
+
+static inline void
+blk_mq_debugfs_unregister_hierarchy_stats(struct request_queue *q)
+{
+}
+
+static inline void
+blk_mq_debugfs_register_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+}
+
+static inline void
+blk_mq_debugfs_unregister_hierarchy(struct request_queue *q,
+ enum stage_group stage)
+{
+}
+
+static inline void
+bio_hierarchy_start_io_acct(struct bio *bio, enum stage_group stage)
+{
+}
+
+static inline void
+bio_hierarchy_end_io_acct(struct bio *bio, enum stage_group stage)
+{
+}
+
+static inline void
+bio_list_hierarchy_end_io_acct(struct bio_list *list, enum stage_group stage)
+{
+}
+
+static inline void
+bio_free_hierarchy_data(struct bio *bio)
+{
+}
+
+static inline void
+rq_hierarchy_start_io_acct(struct request *rq, enum stage_group stage)
+{
+}
+
+static inline void
+rq_hierarchy_end_io_acct(struct request *rq, enum stage_group stage)
+{
+}
+
+static inline void
+rq_list_hierarchy_start_io_acct(struct list_head *head, enum stage_group stage)
+{
+}
+
+static inline void
+rq_list_hierarchy_end_io_acct(struct list_head *head, enum stage_group stage)
+{
+}
+
+#endif /* CONFIG_BLK_IO_HIERARCHY_STATS */
+
+#if !defined(CONFIG_BLK_IO_HIERARCHY_STATS) || !defined(CONFIG_HIERARCHY_BIO)
+static inline void
+bio_hierarchy_start(struct bio *bio)
+{
+}
+
+static inline void
+bio_hierarchy_end(struct bio *bio)
+{
+}
+
+static inline void
+req_bio_hierarchy_end(struct request *rq, struct bio *bio)
+{
+}
+#endif
+
+#endif /* BLK_IO_HIERARCHY_STATS_H */
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f0865b6ea1e1..a0909e56d669 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -23,6 +23,7 @@
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
+#include "blk-io-hierarchy/stats.h"
static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
{
@@ -355,9 +356,8 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
return blk_mq_rq_state_name_array[rq_state];
}
-int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
+void debugfs_rq_show(struct seq_file *m, struct request *rq)
{
- const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
seq_printf(m, "%p {.op=", rq);
@@ -374,6 +374,13 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq)));
seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
rq->internal_tag);
+}
+
+int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
+{
+ const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
+
+ debugfs_rq_show(m, rq);
if (mq_ops->show_rq)
mq_ops->show_rq(m, rq);
seq_puts(m, "}\n");
@@ -811,8 +818,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{},
};
-static bool debugfs_create_files(struct dentry *parent, void *data,
- const struct blk_mq_debugfs_attr *attr)
+bool debugfs_create_files(struct dentry *parent, void *data,
+ const struct blk_mq_debugfs_attr *attr)
{
if (IS_ERR_OR_NULL(parent))
return false;
@@ -861,6 +868,7 @@ int blk_mq_debugfs_register(struct request_queue *q)
goto err;
}
+ blk_mq_debugfs_register_hierarchy_stats(q);
return 0;
err:
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a9160be12be0..73a3796bd03c 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -31,6 +31,14 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q);
int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
+bool debugfs_create_files(struct dentry *parent, void *data,
+ const struct blk_mq_debugfs_attr *attr);
+void debugfs_rq_show(struct seq_file *m, struct request *rq);
+
+static inline bool blk_mq_debugfs_enabled(struct request_queue *q)
+{
+ return !IS_ERR_OR_NULL(q->debugfs_dir);
+}
#else
static inline int blk_mq_debugfs_register(struct request_queue *q)
{
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 0fb33abac3f6..1c8befbe7b69 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -15,6 +15,7 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"
+#include "blk-io-hierarchy/stats.h"
void blk_mq_sched_free_hctx_data(struct request_queue *q,
void (*exit)(struct blk_mq_hw_ctx *))
@@ -250,6 +251,7 @@ int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
+ rq_list_hierarchy_end_io_acct(&rq_list, STAGE_HCTX);
if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
if (has_sched_dispatch)
ret = blk_mq_do_dispatch_sched(hctx);
@@ -389,10 +391,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
bool ret = false;
- if (e && e->type->ops.mq.bio_merge) {
- blk_mq_put_ctx(ctx);
+ if (e && e->type->ops.mq.bio_merge)
return e->type->ops.mq.bio_merge(hctx, bio);
- }
if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
!list_empty_careful(&ctx->rq_list)) {
@@ -402,7 +402,6 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
spin_unlock(&ctx->lock);
}
- blk_mq_put_ctx(ctx);
return ret;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index bee92ab06a5e..f7b21d7f136e 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-io-hierarchy/stats.h"
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
@@ -113,7 +114,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
struct sbq_wait_state *ws;
DEFINE_WAIT(wait);
unsigned int tag_offset;
- bool drop_ctx;
int tag;
if (data->flags & BLK_MQ_REQ_RESERVED) {
@@ -135,8 +135,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (data->flags & BLK_MQ_REQ_NOWAIT)
return BLK_MQ_TAG_FAIL;
+ if (data->bio)
+ bio_hierarchy_start_io_acct(data->bio, STAGE_GETTAG);
ws = bt_wait_ptr(bt, data->hctx);
- drop_ctx = data->ctx == NULL;
do {
struct sbitmap_queue *bt_prev;
@@ -162,9 +163,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != -1)
break;
- if (data->ctx)
- blk_mq_put_ctx(data->ctx);
-
bt_prev = bt;
io_schedule();
@@ -189,10 +187,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx);
} while (1);
- if (drop_ctx && data->ctx)
- blk_mq_put_ctx(data->ctx);
-
finish_wait(&ws->wait, &wait);
+ if (data->bio)
+ bio_hierarchy_end_io_acct(data->bio, STAGE_GETTAG);
found_tag:
return tag + tag_offset;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index aa4b3c608249..7f24ff0692d4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -36,6 +36,7 @@
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-io-hierarchy/stats.h"
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
static void blk_mq_poll_stats_start(struct request_queue *q);
@@ -368,6 +369,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->part = NULL;
rq->start_time_ns = ktime_get_ns();
rq->io_start_time_ns = 0;
+ blk_mq_get_alloc_task(rq, data->bio);
+
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
@@ -400,13 +403,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
struct elevator_queue *e = q->elevator;
struct request *rq;
unsigned int tag;
- bool put_ctx_on_error = false;
+ bool clear_ctx_on_error = false;
blk_queue_enter_live(q);
data->q = q;
if (likely(!data->ctx)) {
data->ctx = blk_mq_get_ctx(q);
- put_ctx_on_error = true;
+ clear_ctx_on_error = true;
}
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
@@ -430,10 +433,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
tag = blk_mq_get_tag(data);
if (tag == BLK_MQ_TAG_FAIL) {
- if (put_ctx_on_error) {
- blk_mq_put_ctx(data->ctx);
+ if (clear_ctx_on_error)
data->ctx = NULL;
- }
blk_queue_exit(q);
return NULL;
}
@@ -470,8 +471,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
if (!rq)
return ERR_PTR(-EWOULDBLOCK);
- blk_mq_put_ctx(alloc_data.ctx);
-
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
@@ -532,6 +531,8 @@ static void __blk_mq_free_request(struct request *rq)
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
const int sched_tag = rq->internal_tag;
+ blk_mq_put_alloc_task(rq);
+
if (rq->tag != -1)
blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
if (sched_tag != -1)
@@ -583,6 +584,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
+ if (blk_mq_request_started(rq))
+ rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER);
blk_account_io_done(rq, now);
if (rq->end_io) {
@@ -722,6 +725,7 @@ void blk_mq_start_request(struct request *rq)
blk_mq_sched_started_request(rq);
trace_block_rq_issue(q, rq);
+ rq_hierarchy_start_io_acct(rq, STAGE_RQ_DRIVER);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
rq->io_start_time_ns = ktime_get_ns();
@@ -762,6 +766,7 @@ static void __blk_mq_requeue_request(struct request *rq)
rq->rq_flags &= ~RQF_TIMED_OUT;
if (q->dma_drain_size && blk_rq_bytes(rq))
rq->nr_phys_segments--;
+ rq_hierarchy_end_io_acct(rq, STAGE_RQ_DRIVER);
}
}
@@ -787,6 +792,7 @@ static void blk_mq_requeue_work(struct work_struct *work)
spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
spin_unlock_irq(&q->requeue_lock);
+ rq_list_hierarchy_end_io_acct(&rq_list, STAGE_REQUEUE);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
@@ -826,6 +832,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
*/
BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
+ rq_hierarchy_start_io_acct(rq, STAGE_REQUEUE);
spin_lock_irqsave(&q->requeue_lock, flags);
if (at_head) {
rq->rq_flags |= RQF_SOFTBARRIER;
@@ -1317,6 +1324,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
if (!list_empty(list)) {
bool needs_restart;
+ rq_list_hierarchy_start_io_acct(list, STAGE_HCTX);
spin_lock(&hctx->lock);
list_splice_tail_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
@@ -1726,6 +1734,7 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+ rq_hierarchy_start_io_acct(rq, STAGE_HCTX);
spin_lock(&hctx->lock);
if (at_head)
list_add(&rq->queuelist, &hctx->dispatch);
@@ -1792,6 +1801,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
if (rq->mq_ctx != this_ctx) {
if (this_ctx) {
trace_block_unplug(this_q, depth, !from_schedule);
+ rq_list_hierarchy_end_io_acct(&ctx_list,
+ STAGE_PLUG);
blk_mq_sched_insert_requests(this_q, this_ctx,
&ctx_list,
from_schedule);
@@ -1812,6 +1823,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
if (this_ctx) {
trace_block_unplug(this_q, depth, !from_schedule);
+ rq_list_hierarchy_end_io_acct(&ctx_list, STAGE_PLUG);
blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
from_schedule);
}
@@ -1975,7 +1987,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
- struct blk_mq_alloc_data data = { .flags = 0 };
+ struct blk_mq_alloc_data data = {
+ .flags = 0,
+ .bio = bio,
+ };
struct request *rq;
unsigned int request_count = 0;
struct blk_plug *plug;
@@ -2019,7 +2034,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
plug = current->plug;
if (unlikely(is_flush_fua)) {
- blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
/* bypass scheduler for flush rq */
@@ -2028,7 +2042,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
} else if (plug && q->nr_hw_queues == 1) {
struct request *last = NULL;
- blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
/*
@@ -2051,6 +2064,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
trace_block_plug(q);
}
+ rq_hierarchy_start_io_acct(rq, STAGE_PLUG);
list_add_tail(&rq->queuelist, &plug->mq_list);
} else if (plug && !blk_queue_nomerges(q)) {
blk_mq_bio_to_request(rq, bio);
@@ -2066,23 +2080,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
same_queue_rq = NULL;
if (same_queue_rq)
list_del_init(&same_queue_rq->queuelist);
+ rq_hierarchy_start_io_acct(rq, STAGE_PLUG);
list_add_tail(&rq->queuelist, &plug->mq_list);
- blk_mq_put_ctx(data.ctx);
-
if (same_queue_rq) {
data.hctx = blk_mq_map_queue(q,
same_queue_rq->mq_ctx->cpu);
+ rq_hierarchy_end_io_acct(same_queue_rq, STAGE_PLUG);
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
} else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
!data.hctx->dispatch_busy)) {
- blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
} else {
- blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_sched_insert_request(rq, false, true, true);
}
@@ -2324,6 +2336,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
if (list_empty(&tmp))
return 0;
+ rq_list_hierarchy_start_io_acct(&tmp, STAGE_HCTX);
spin_lock(&hctx->lock);
list_splice_tail_init(&tmp, &hctx->dispatch);
spin_unlock(&hctx->lock);
@@ -2758,6 +2771,9 @@ void blk_mq_release(struct request_queue *q)
struct blk_mq_hw_ctx *hctx, *next;
int i;
+ blk_mq_unregister_hierarchy(q, STAGE_BIO);
+ blk_io_hierarchy_stats_free(q);
+
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -2895,11 +2911,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;
+ if (blk_io_hierarchy_stats_alloc(q))
+ goto err_exit;
+
q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
blk_mq_poll_stats_bkt,
BLK_MQ_POLL_STATS_BKTS, q);
if (!q->poll_cb)
- goto err_exit;
+ goto err_hierarchy_stats;
if (blk_mq_alloc_ctxs(q))
goto err_exit;
@@ -2972,6 +2991,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->nr_hw_queues = 0;
err_sys_init:
blk_mq_sysfs_deinit(q);
+err_hierarchy_stats:
+ blk_io_hierarchy_stats_free(q);
err_exit:
q->mq_ops = NULL;
return ERR_PTR(-ENOMEM);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c6ec9aa12fb2..1bba4eb18332 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -125,12 +125,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
*/
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
- return __blk_mq_get_ctx(q, get_cpu());
-}
-
-static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
-{
- put_cpu();
+ return __blk_mq_get_ctx(q, raw_smp_processor_id());
}
struct blk_mq_alloc_data {
@@ -142,6 +137,7 @@ struct blk_mq_alloc_data {
/* input & output parameter */
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
+ struct bio *bio;
};
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
@@ -234,4 +230,32 @@ static inline void blk_mq_free_requests(struct list_head *list)
}
}
+static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
+{
+ return false;
+}
+
+#ifdef CONFIG_BLK_BIO_ALLOC_TASK
+static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio)
+{
+ rq->pid = bio ? get_pid(bio->pid) : get_pid(task_pid(current));
+}
+
+static inline void blk_mq_put_alloc_task(struct request *rq)
+{
+ if (rq->pid) {
+ put_pid(rq->pid);
+ rq->pid = NULL;
+ }
+}
+#else
+static inline void blk_mq_get_alloc_task(struct request *rq, struct bio *bio)
+{
+}
+
+static inline void blk_mq_put_alloc_task(struct request *rq)
+{
+}
+#endif
+
#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1c4d795bbdc4..719687a394ea 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -17,6 +17,7 @@
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-wbt.h"
+#include "blk-io-hierarchy/stats.h"
struct queue_sysfs_entry {
struct attribute attr;
@@ -924,6 +925,19 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
+static void blk_mq_register_default_hierarchy(struct request_queue *q)
+{
+ if (!q->mq_ops)
+ return;
+
+ blk_mq_register_hierarchy(q, STAGE_GETTAG);
+ blk_mq_register_hierarchy(q, STAGE_PLUG);
+ blk_mq_register_hierarchy(q, STAGE_HCTX);
+ blk_mq_register_hierarchy(q, STAGE_REQUEUE);
+ blk_mq_register_hierarchy(q, STAGE_RQ_DRIVER);
+ blk_mq_register_hierarchy(q, STAGE_BIO);
+}
+
/**
* blk_register_queue - register a block layer queue with sysfs
* @disk: Disk of which the request queue should be registered with sysfs.
@@ -973,6 +987,8 @@ int blk_register_queue(struct gendisk *disk)
has_elevator = true;
}
+ blk_mq_register_default_hierarchy(q);
+
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register_queue(q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 598191286557..446864c27c3b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -14,6 +14,7 @@
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include "blk.h"
+#include "blk-io-hierarchy/stats.h"
/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;
@@ -1350,6 +1351,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(q->queue_lock);
+ bio_list_hierarchy_end_io_acct(&bio_list_on_stack, STAGE_THROTTLE);
+
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
while((bio = bio_list_pop(&bio_list_on_stack)))
@@ -2333,6 +2336,20 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
tg->last_low_overflow_time[rw] = jiffies;
+ /*
+	 * This is the slow path now; bio_hierarchy_start_io_acct() might spend
+	 * some time allocating memory. However, it's safe because 'tg' is
+	 * pinned by this bio, and IO charging is still accurate because the
+	 * slice was already started from tg_may_dispatch().
+ */
+ spin_unlock_irq(q->queue_lock);
+ rcu_read_unlock();
+
+ bio_hierarchy_start_io_acct(bio, STAGE_THROTTLE);
+
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+
td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
throttled = true;
@@ -2561,6 +2578,8 @@ void blk_throtl_exit(struct request_queue *q)
del_timer_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+ blk_mq_unregister_hierarchy(q, STAGE_THROTTLE);
+
free_percpu(q->td->latency_buckets[READ]);
free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
@@ -2593,6 +2612,8 @@ void blk_throtl_register_queue(struct request_queue *q)
td->track_bio_latency = !queue_is_rq_based(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
+
+ blk_mq_register_hierarchy(q, STAGE_THROTTLE);
}
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 94b5eff0cd3a..87d7816af6e0 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -29,6 +29,7 @@
#include "blk-wbt.h"
#include "blk-rq-qos.h"
+#include "blk-io-hierarchy/stats.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
@@ -532,11 +533,12 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
* Block if we will exceed our limit, or if we are currently waiting for
* the timer to kick off queuing again.
*/
-static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
- unsigned long rw, spinlock_t *lock)
+static void __wbt_wait(struct rq_wb *rwb, struct bio *bio,
+ enum wbt_flags wb_acct, spinlock_t *lock)
__releases(lock)
__acquires(lock)
{
+ unsigned long rw = bio->bi_opf;
struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
struct wbt_wait_data data = {
.wq = {
@@ -554,6 +556,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
return;
+ bio_hierarchy_start_io_acct(bio, STAGE_WBT);
has_sleeper = !__prepare_to_wait_exclusive(&rqw->wait, &data.wq,
TASK_UNINTERRUPTIBLE);
do {
@@ -588,6 +591,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
} while (1);
finish_wait(&rqw->wait, &data.wq);
+ bio_hierarchy_end_io_acct(bio, STAGE_WBT);
}
static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -652,7 +656,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
return;
}
- __wbt_wait(rwb, flags, bio->bi_opf, lock);
+ __wbt_wait(rwb, bio, flags, lock);
if (!blk_stat_is_active(rwb->cb))
rwb_arm_timer(rwb);
@@ -770,6 +774,7 @@ static void wbt_exit(struct rq_qos *rqos)
struct rq_wb *rwb = RQWB(rqos);
struct request_queue *q = rqos->q;
+ blk_mq_unregister_hierarchy(q, STAGE_WBT);
blk_stat_remove_callback(q, rwb->cb);
blk_stat_free_callback(rwb->cb);
kfree(rwb);
@@ -845,6 +850,7 @@ int wbt_init(struct request_queue *q)
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_set_queue_depth(q, blk_queue_depth(q));
+ blk_mq_register_hierarchy(q, STAGE_WBT);
blk_mq_unfreeze_queue(q);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
diff --git a/block/blk.h b/block/blk.h
index 965e9c507654..162b42388610 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -175,6 +175,51 @@ static inline void blk_queue_enter_live(struct request_queue *q)
percpu_ref_get(&q->q_usage_counter);
}
+#ifdef CONFIG_BLK_BIO_ALLOC_TIME
+static inline u64 blk_time_get_ns(void);
+static inline void blk_rq_init_bi_alloc_time(struct request *rq,
+ struct request *first_rq)
+{
+ rq->bi_alloc_time_ns = first_rq ? first_rq->bi_alloc_time_ns :
+ blk_time_get_ns();
+}
+
+/*
+ * Used in the following cases to update the request's bi_alloc_time_ns:
+ *
+ * 1) A new @rq is allocated for @bio;
+ * 2) @bio is merged into @rq; in this case @merged_rq should be NULL;
+ * 3) @merged_rq is merged into @rq; in this case @bio should be NULL.
+ */
+static inline void blk_rq_update_bi_alloc_time(struct request *rq,
+ struct bio *bio,
+ struct request *merged_rq)
+{
+ if (bio) {
+ if (rq->bi_alloc_time_ns > bio->bi_alloc_time_ns)
+ rq->bi_alloc_time_ns = bio->bi_alloc_time_ns;
+ return;
+ }
+
+ if (!merged_rq)
+ return;
+
+ if (rq->bi_alloc_time_ns > merged_rq->bi_alloc_time_ns)
+ rq->bi_alloc_time_ns = merged_rq->bi_alloc_time_ns;
+}
+#else /* CONFIG_BLK_BIO_ALLOC_TIME */
+static inline void blk_rq_init_bi_alloc_time(struct request *rq,
+ struct request *first_rq)
+{
+}
+
+static inline void blk_rq_update_bi_alloc_time(struct request *rq,
+ struct bio *bio,
+ struct request *merged_rq)
+{
+}
+#endif
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
@@ -479,4 +524,17 @@ static inline void blk_free_queue_dispatch_async(struct request_queue *q)
}
#endif
+static inline u64 blk_time_get_ns(void)
+{
+ struct blk_plug *plug = current->plug;
+
+ if (!plug || !in_task())
+ return ktime_get_ns();
+
+ if (!plug->cur_ktime)
+ plug->cur_ktime = ktime_get_ns();
+
+ return plug->cur_ktime;
+}
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 833e9eaae640..04ff97c076fb 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -30,6 +30,7 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
+#include "blk-io-hierarchy/stats.h"
/* Scheduling domains. */
enum {
@@ -365,6 +366,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
blk_stat_add_callback(q, kqd->cb);
+ blk_mq_register_hierarchy(q, STAGE_KYBER);
return 0;
}
@@ -374,6 +376,7 @@ static void kyber_exit_sched(struct elevator_queue *e)
struct request_queue *q = kqd->q;
int i;
+ blk_mq_unregister_hierarchy(kqd->q, STAGE_KYBER);
blk_stat_remove_callback(q, kqd->cb);
for (i = 0; i < KYBER_NUM_DOMAINS; i++)
@@ -517,7 +520,6 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx_q, struct bio *bio)
spin_lock(&kcq->lock);
merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
spin_unlock(&kcq->lock);
- blk_mq_put_ctx(ctx);
return merged;
}
@@ -533,6 +535,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
struct kyber_hctx_data *khd = hctx->sched_data;
struct request *rq, *next;
+ rq_list_hierarchy_start_io_acct(rq_list, STAGE_KYBER);
list_for_each_entry_safe(rq, next, rq_list, queuelist) {
unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
@@ -772,6 +775,9 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
rq = NULL;
out:
spin_unlock(&khd->lock);
+
+ if (rq)
+ rq_hierarchy_end_io_acct(rq, STAGE_KYBER);
return rq;
}
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 7ad820050675..aa51abb3eaa4 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -22,6 +22,7 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
+#include "blk-io-hierarchy/stats.h"
/*
* See Documentation/block/deadline-iosched.txt
@@ -61,6 +62,8 @@ struct deadline_data {
spinlock_t lock;
spinlock_t zone_lock;
struct list_head dispatch;
+
+ struct request_queue *q;
};
static inline struct rb_root *
@@ -386,6 +389,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
rq = __dd_dispatch_request(dd);
spin_unlock(&dd->lock);
+ if (rq)
+ rq_hierarchy_end_io_acct(rq, STAGE_DEADLINE);
return rq;
}
@@ -396,6 +401,7 @@ static void dd_exit_queue(struct elevator_queue *e)
BUG_ON(!list_empty(&dd->fifo_list[READ]));
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+ blk_mq_unregister_hierarchy(dd->q, STAGE_DEADLINE);
kfree(dd);
}
@@ -427,11 +433,13 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
dd->writes_starved = writes_starved;
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
+ dd->q = q;
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
INIT_LIST_HEAD(&dd->dispatch);
q->elevator = eq;
+ blk_mq_register_hierarchy(q, STAGE_DEADLINE);
return 0;
}
@@ -469,8 +477,10 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
ret = blk_mq_sched_try_merge(q, bio, &free);
spin_unlock(&dd->lock);
- if (free)
+ if (free) {
+ rq_hierarchy_end_io_acct(free, STAGE_DEADLINE);
blk_mq_free_request(free);
+ }
return ret;
}
@@ -493,6 +503,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_req_zone_write_unlock(rq);
if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
+ rq_list_hierarchy_end_io_acct(&free, STAGE_DEADLINE);
blk_mq_free_requests(&free);
return;
}
@@ -527,6 +538,8 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
+ rq_list_hierarchy_start_io_acct(list, STAGE_DEADLINE);
+
spin_lock(&dd->lock);
while (!list_empty(list)) {
struct request *rq;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8075b9955bb3..c2867571bcc7 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -203,6 +203,9 @@ struct bio {
struct bio_set *bi_pool;
+ u64 bi_alloc_time_ns;
+ struct bio_hierarchy_data *hdata;
+ struct pid *pid;
KABI_RESERVE(1)
KABI_RESERVE(2)
KABI_RESERVE(3)
@@ -234,6 +237,13 @@ struct bio {
* of this bio. */
#define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */
#define BIO_TRACKED 12 /* set if bio goes through the rq_qos path */
+#ifdef CONFIG_BLK_IO_HIERARCHY_STATS
+#define BIO_HAS_DATA 13 /* bio contains data. */
+#define BIO_HIERARCHY_ACCT 14 /*
+ * This bio is already accounted by
+ * blk-io-hierarchy; don't account it again.
+ */
+#endif
/* See BVEC_POOL_OFFSET below before adding new flags */
@@ -368,7 +378,36 @@ enum stat_group {
STAT_WRITE,
STAT_DISCARD,
- NR_STAT_GROUPS
+ NR_STAT_GROUPS,
+ STAT_FLUSH = NR_STAT_GROUPS,
+ NEW_NR_STAT_GROUPS,
+};
+
+enum stage_group {
+#ifdef CONFIG_BLK_DEV_THROTTLING
+ STAGE_THROTTLE,
+#endif
+#ifdef CONFIG_BLK_WBT
+ STAGE_WBT,
+#endif
+ STAGE_GETTAG,
+ NR_BIO_STAGE_GROUPS,
+ STAGE_PLUG = NR_BIO_STAGE_GROUPS,
+#if IS_ENABLED(CONFIG_MQ_IOSCHED_DEADLINE)
+ STAGE_DEADLINE,
+#endif
+#if IS_ENABLED(CONFIG_IOSCHED_BFQ)
+ STAGE_BFQ,
+#endif
+#if IS_ENABLED(CONFIG_MQ_IOSCHED_KYBER)
+ STAGE_KYBER,
+#endif
+ STAGE_HCTX,
+ STAGE_REQUEUE,
+ STAGE_RQ_DRIVER,
+ NR_RQ_STAGE_GROUPS,
+ STAGE_BIO = NR_RQ_STAGE_GROUPS,
+ NR_STAGE_GROUPS,
};
#define bio_op(bio) \
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c848f4205729..713c42987851 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -277,6 +277,12 @@ struct request {
#ifdef CONFIG_BLK_CGROUP
struct request_list *rl; /* rl this rq is alloced from */
#endif
+
+ enum stage_group stage;
+ unsigned long hierarchy_time;
+ u64 io_end_time_ns;
+ u64 bi_alloc_time_ns;
+ struct pid *pid;
};
static inline bool blk_op_is_scsi(unsigned int op)
@@ -703,6 +709,8 @@ struct request_queue {
#define BLK_MAX_WRITE_HINTS 5
u64 write_hints[BLK_MAX_WRITE_HINTS];
+
+ struct blk_io_hierarchy_stats *io_hierarchy_stats;
};
#define QUEUE_FLAG_QUEUED 0 /* uses generic tag queueing */
@@ -1372,6 +1380,7 @@ struct blk_plug {
struct list_head list; /* requests */
struct list_head mq_list; /* blk-mq requests */
struct list_head cb_list; /* md requires an unplug callback */
+ u64 cur_ktime;
};
#define BLK_MAX_REQUEST_COUNT 16
#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
--
2.39.2