[PATCH OLK-6.6 8/8] blk-iocost: fix busy_level reset when no IOs complete

7 May 2026

From: Jialin Wang <wjl.linux@gmail.com>

mainline inclusion
from mainline-v7.1-rc1
commit f91ffe89b2016d280995a9c28d73288b02d83615
category: bugfix
bugzilla: https://atomgit.com/openeuler/kernel/issues/9104

Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...

--------------------------------

When a disk is saturated, it is common for no IOs to complete within a
timer period. Currently, in this case, rq_wait_pct and missed_ppm are
calculated as 0, the iocost incorrectly interprets this as meeting QoS
targets and resets busy_level to 0.

This reset prevents busy_level from reaching the threshold (4) needed
to reduce vrate. On certain cloud storage, such as Azure Premium SSD,
we observed that iocost may fail to reduce vrate for tens of seconds
during saturation, failing to mitigate noisy neighbor issues.

Fix this by tracking the number of IO completions (nr_done) in a period.
If nr_done is 0 and there are lagging IOs, the saturation status is
unknown, so we keep busy_level unchanged.

The issue is consistently reproducible on Azure Standard_D8as_v5 (Dasv5)
VMs with 512GB Premium SSD (P20) using the script below. It was not
observed on GCP n2d VMs (with 100G pd-ssd and 1.5T local-ssd), and no
regressions were found with this patch. In this script, cgA performs
large IOs with iodepth=128, while cgB performs small IOs with iodepth=1
rate_iops=100 rw=randrw. With iocost enabled, we expect it to throttle
cgA, the submission latency (slat) of cgA should be significantly higher,
cgB can reach 200 IOPS and the completion latency (clat) should below.

  BLK_DEVID="8:0"
  MODEL="rbps=173471131 rseqiops=3566 rrandiops=3566 wbps=173333269 wseqiops=3566 wrandiops=3566"
  QOS="rpct=90 rlat=3500 wpct=90 wlat=3500 min=80 max=10000"

  echo "$BLK_DEVID ctrl=user model=linear $MODEL" > /sys/fs/cgroup/io.cost.model
  echo "$BLK_DEVID enable=1 ctrl=user $QOS" > /sys/fs/cgroup/io.cost.qos

  CG_A="/sys/fs/cgroup/cgA"
  CG_B="/sys/fs/cgroup/cgB"

  FILE_A="/path/to/sda/A.fio.testfile"
  FILE_B="/path/to/sda/B.fio.testfile"
  RESULT_DIR="./iocost_results_$(date +%Y%m%d_%H%M%S)"

  mkdir -p "$CG_A" "$CG_B" "$RESULT_DIR"

  get_result() {
    local file=$1
    local label=$2

    local results=$(jq -r '
    .jobs[0].mixed |
    ( .iops | tonumber | round ) as $iops |
    ( .bw_bytes / 1024 / 1024 ) as $bps |
    ( .slat_ns.mean / 1000000 ) as $slat |
    ( .clat_ns.mean / 1000000 ) as $avg |
    ( .clat_ns.max / 1000000 ) as $max |
    ( .clat_ns.percentile["90.000000"] / 1000000 ) as $p90 |
    ( .clat_ns.percentile["99.000000"] / 1000000 ) as $p99 |
    ( .clat_ns.percentile["99.900000"] / 1000000 ) as $p999 |
    ( .clat_ns.percentile["99.990000"] / 1000000 ) as $p9999 |
    "\($iops)|\($bps)|\($slat)|\($avg)|\($max)|\($p90)|\($p99)|\($p999)|\($p9999)"
    ' "$file")

    IFS='|' read -r iops bps slat avg max p90 p99 p999 p9999 <<<"$results"
    printf "%-8s %-6s %-7.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f\n" \
           "$label" "$iops" "$bps" "$slat" "$avg" "$max" "$p90" "$p99" "$p999" "$p9999"
  }

  run_fio() {
    local cg_path=$1
    local filename=$2
    local name=$3
    local bs=$4
    local qd=$5
    local out=$6
    shift 6
    local extra=$@

    (
      pid=$(sh -c 'echo $PPID')
      echo $pid >"${cg_path}/cgroup.procs"
      fio --name="$name" --filename="$filename" --direct=1 --rw=randrw --rwmixread=50 \
          --ioengine=libaio --bs="$bs" --iodepth="$qd" --size=4G --runtime=10 \
          --time_based --group_reporting --unified_rw_reporting=mixed \
          --output-format=json --output="$out" $extra >/dev/null 2>&1
    ) &
  }

  echo "Starting Test ..."

  for bs_b in "4k" "32k" "256k"; do
    echo "Running iteration: BS=$bs_b"
    out_a="${RESULT_DIR}/cgA_1m.json"
    out_b="${RESULT_DIR}/cgB_${bs_b}.json"

    # cgA: Heavy background (BS 1MB, QD 128)
    run_fio "$CG_A" "$FILE_A" "cgA" "1m" 128 "$out_a"
    # cgB: Latency sensitive (Variable BS, QD 1, Read/Write IOPS limit 100)
    run_fio "$CG_B" "$FILE_B" "cgB" "$bs_b" 1 "$out_b" "--rate_iops=100"

    wait
    SUMMARY_DATA+="$(get_result "$out_a" "cgA-1m")"$'\n'
    SUMMARY_DATA+="$(get_result "$out_b" "cgB-$bs_b")"$'\n\n'
  done

  echo -e "\nFinal Results Summary:\n"

  printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n" \
          "" "" "" "slat" "clat" "clat" "clat" "clat" "clat" "clat"
  printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n\n" \
          "CGROUP" "IOPS" "MB/s" "avg(ms)" "avg(ms)" "max(ms)" "P90(ms)" "P99" "P99.9" "P99.99"
  echo "$SUMMARY_DATA"

  echo "Results saved in $RESULT_DIR"

Before:
                          slat     clat     clat     clat     clat     clat     clat
  CGROUP   IOPS   MB/s    avg(ms)  avg(ms)  max(ms)  P90(ms)  P99      P99.9    P99.99

  cgA-1m   166    166.37  3.44     748.95   1298.29  977.27   1233.13  1300.23  1300.23
  cgB-4k   5      0.02    0.02     181.74   761.32   742.39   759.17   759.17   759.17

  cgA-1m   167    166.51  1.98     748.68   1549.41  809.50   1451.23  1551.89  1551.89
  cgB-32k  6      0.18    0.02     169.98   761.76   742.39   759.17   759.17   759.17

  cgA-1m   166    165.55  2.89     750.89   1540.37  851.44   1451.23  1535.12  1535.12
  cgB-256k 5      1.30    0.02     191.35   759.51   750.78   759.17   759.17   759.17

After:
                          slat     clat     clat     clat     clat     clat     clat
  CGROUP   IOPS   MB/s    avg(ms)  avg(ms)  max(ms)  P90(ms)  P99      P99.9    P99.99

  cgA-1m   162    162.48  6.14     749.69   850.02   826.28   834.67   843.06   851.44
  cgB-4k   199    0.78    0.01     1.95     42.12    2.57     7.50     34.87    42.21

  cgA-1m   146    146.20  6.83     833.04   908.68   893.39   901.78   910.16   910.16
  cgB-32k  200    6.25    0.01     2.32     31.40    3.06     7.50     16.58    31.33

  cgA-1m   110    110.46  9.04     1082.67  1197.91  1182.79  1199.57  1199.57  1199.57
  cgB-256k 200    49.98   0.02     3.69     22.20    4.88     9.11     20.05    22.15

Signed-off-by: Jialin Wang <wjl.linux@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://patch.msgid.link/20260331100509.182882-1-wjl.linux@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
---
 block/blk-iocost.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 0019b67aed15..133d3f5828ee 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -1595,11 +1595,12 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
 	iocg_unlock(iocg, pay_debt, &flags);
 
 	return HRTIMER_NORESTART;
 }
 
-static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
+static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p,
+			 u32 *nr_done)
 {
 	u32 nr_met[2] = { };
 	u32 nr_missed[2] = { };
 	u64 rq_wait_ns = 0;
 	int cpu, rw;
@@ -1632,10 +1633,12 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
 			missed_ppm_ar[rw] = 0;
 	}
 
 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
 				   ioc->period_us * NSEC_PER_USEC);
+
+	*nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE];
 }
 
 /* was iocg idle this period? */
 static bool iocg_is_idle(struct ioc_gq *iocg)
 {
@@ -2249,16 +2252,16 @@ static void ioc_timer_fn(struct timer_list *timer)
 	LIST_HEAD(surpluses);
 	int nr_debtors, nr_shortages = 0, nr_lagging = 0;
 	u64 usage_us_sum = 0;
 	u32 ppm_rthr;
 	u32 ppm_wthr;
-	u32 missed_ppm[2], rq_wait_pct;
+	u32 missed_ppm[2], rq_wait_pct, nr_done;
 	u64 period_vtime;
 	int prev_busy_level;
 
 	/* how were the latencies during the period? */
-	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done);
 
 	/* take care of active iocgs */
 	spin_lock_irq(&ioc->lock);
 
 	ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
@@ -2396,13 +2399,21 @@ static void ioc_timer_fn(struct timer_list *timer)
 	 * too much IO and should lower vtime rate.  If we're not missing
 	 * and experiencing shortages but not surpluses, we're too stingy
 	 * and should increase vtime rate.
 	 */
 	prev_busy_level = ioc->busy_level;
-	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
-	    missed_ppm[READ] > ppm_rthr ||
-	    missed_ppm[WRITE] > ppm_wthr) {
+	if (!nr_done && nr_lagging) {
+		/*
+		 * When there are lagging IOs but no completions, we don't
+		 * know if the IO latency will meet the QoS targets. The
+		 * disk might be saturated or not. We should not reset
+		 * busy_level to 0 (which would prevent vrate from scaling
+		 * up or down), but rather to keep it unchanged.
+		 */
+	} else if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
+		   missed_ppm[READ] > ppm_rthr ||
+		   missed_ppm[WRITE] > ppm_wthr) {
 		/* clearly missing QoS targets, slow down vrate */
 		ioc->busy_level = max(ioc->busy_level, 0);
 		ioc->busy_level++;
 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
-- 
2.52.0