From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6NBAZ
CVE: NA
--------------------------------
If alua_rtpg_queue() fails when called from alua_activate(), 'qdata' is not freed, which causes the following memleak:
unreferenced object 0xffff88810b2c6980 (size 32):
  comm "kworker/u16:2", pid 635322, jiffies 4355801099 (age 1216426.076s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    40 39 24 c1 ff ff ff ff 00 f8 ea 0a 81 88 ff ff  @9$.............
  backtrace:
    [<0000000098f3a26d>] alua_activate+0xb0/0x320
    [<000000003b529641>] scsi_dh_activate+0xb2/0x140
    [<000000007b296db3>] activate_path_work+0xc6/0xe0 [dm_multipath]
    [<000000007adc9ace>] process_one_work+0x3c5/0x730
    [<00000000c457a985>] worker_thread+0x93/0x650
    [<00000000cb80e628>] kthread+0x1ba/0x210
    [<00000000a1e61077>] ret_from_fork+0x22/0x30
Fix the problem by freeing 'qdata' in the error path.
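The rule being enforced is the usual ownership contract: alua_rtpg_queue() consumes 'qdata' only when it succeeds, so a failed queue attempt leaves the caller responsible for the buffer. A minimal userspace sketch of that contract (queue_work_item() is a hypothetical stand-in for alua_rtpg_queue()):

#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for alua_rtpg_queue(): returns true when it
 * takes ownership of 'qdata' (queued successfully), false otherwise. */
static bool queue_work_item(void *qdata)
{
	(void)qdata;
	return false;		/* simulate the failure path */
}

int main(void)
{
	void *qdata = malloc(32);

	if (!qdata)
		return 1;
	if (!queue_work_item(qdata)) {
		/* Queueing failed: ownership was never transferred, so
		 * the caller must free qdata (the missing kfree). */
		free(qdata);
		puts("queue failed, qdata freed");
	}
	return 0;
}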
Fixes: 625fe857e4fa ("scsi: scsi_dh_alua: Check scsi_device_get() return value")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 drivers/scsi/device_handler/scsi_dh_alua.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
index fe8a5e5c0df8..bf0b3178f84d 100644
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
@@ -1036,10 +1036,12 @@ static int alua_activate(struct scsi_device *sdev,
 	rcu_read_unlock();
 	mutex_unlock(&h->init_mutex);

-	if (alua_rtpg_queue(pg, sdev, qdata, true))
+	if (alua_rtpg_queue(pg, sdev, qdata, true)) {
 		fn = NULL;
-	else
+	} else {
+		kfree(qdata);
 		err = SCSI_DH_DEV_OFFLINED;
+	}
 	kref_put(&pg->kref, release_port_group);
 out:
 	if (fn)
From: wenxu <wenxu@ucloud.cn>

mainline inclusion
from mainline-v5.11-rc1
commit fa6d639930ee5cd3f932cc314f3407f07a06582d
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I64END
CVE: CVE-2022-4269
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h...
--------------------------------
This patch is a preparation for the next one: it moves the transmit-or-receive decision out of skb_tc_reinsert() and into a new helper, tcf_mirred_forward().
Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Conflicts:
	include/net/sch_generic.h
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 include/net/sch_generic.h |  5 -----
 net/sched/act_mirred.c    | 21 +++++++++++++++------
 2 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 73c699355470..8271b10bc30e 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1329,11 +1329,6 @@ void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
 void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
 				struct tcf_block *block);

-static inline int skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res)
-{
-	return res->ingress ? netif_receive_skb(skb) : dev_queue_xmit(skb);
-}
-
 /* Make sure qdisc is no longer in SCHED state. */
 static inline void qdisc_synchronize(const struct Qdisc *q)
 {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 24d561d8d9c9..53594b0466eb 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -206,6 +206,18 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	return err;
 }

+static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
+{
+	int err;
+
+	if (!want_ingress)
+		err = dev_queue_xmit(skb);
+	else
+		err = netif_receive_skb(skb);
+
+	return err;
+}
+
 static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
@@ -295,18 +307,15 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 		/* let's the caller reinsert the packet, if possible */
 		if (use_reinsert) {
 			res->ingress = want_ingress;
-			if (skb_tc_reinsert(skb, res))
+			err = tcf_mirred_forward(res->ingress, skb);
+			if (err)
 				tcf_action_inc_overlimit_qstats(&m->common);
 			__this_cpu_dec(mirred_rec_level);
 			return TC_ACT_CONSUMED;
 		}
 	}

-	if (!want_ingress)
-		err = dev_queue_xmit(skb2);
-	else
-		err = netif_receive_skb(skb2);
-
+	err = tcf_mirred_forward(want_ingress, skb2);
 	if (err) {
 out:
 		tcf_action_inc_overlimit_qstats(&m->common);
From: Davide Caratti <dcaratti@redhat.com>

mainline inclusion
from mainline-v6.3-rc1
commit 78dcdffe0418ac8f3f057f26fe71ccf4d8ed851f
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I64END
CVE: CVE-2022-4269
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h...
--------------------------------
With commit e2ca070f89ec ("net: sched: protect against stack overflow in TC act_mirred"), act_mirred protected itself against excessive stack growth using a per-CPU counter of nested calls to tcf_mirred_act(), capped at MIRRED_RECURSION_LIMIT. However, such protection does not detect recursion/loops in case the packet is enqueued to the backlog (for example, when the mirred target device has RPS or skb timestamping enabled). Change the wording from "recursion" to "nesting" to make it clearer to readers.
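The guard itself is a small per-CPU nesting counter. A runnable userspace analogue (thread-local storage standing in for the kernel's per-CPU variable; names are hypothetical):

#include <stdio.h>

#define NEST_LIMIT 4

/* Userspace analogue of the per-CPU nesting guard; the kernel uses
 * __this_cpu_inc_return()/__this_cpu_dec() on a per-CPU variable. */
static _Thread_local unsigned int nest_level;

static int act(int depth)
{
	if (++nest_level > NEST_LIMIT) {
		printf("nesting limit hit at depth %u, dropping\n",
		       nest_level);
		nest_level--;
		return -1;
	}
	if (depth > 0)
		act(depth - 1);	/* a mirred->mirred chain re-enters here */
	nest_level--;
	return 0;
}

int main(void)
{
	act(10);
	return 0;
}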
CC: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 net/sched/act_mirred.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 53594b0466eb..01a44c3e8d6d 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -28,8 +28,8 @@ static LIST_HEAD(mirred_list);
 static DEFINE_SPINLOCK(mirred_list_lock);

-#define MIRRED_RECURSION_LIMIT	4
-static DEFINE_PER_CPU(unsigned int, mirred_rec_level);
+#define MIRRED_NEST_LIMIT	4
+static DEFINE_PER_CPU(unsigned int, mirred_nest_level);

 static bool tcf_mirred_is_act_redirect(int action)
 {
@@ -225,7 +225,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 	struct sk_buff *skb2 = skb;
 	bool m_mac_header_xmit;
 	struct net_device *dev;
-	unsigned int rec_level;
+	unsigned int nest_level;
 	int retval, err = 0;
 	bool use_reinsert;
 	bool want_ingress;
@@ -236,11 +236,11 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 	int mac_len;
 	bool at_nh;

-	rec_level = __this_cpu_inc_return(mirred_rec_level);
-	if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) {
+	nest_level = __this_cpu_inc_return(mirred_nest_level);
+	if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
 		net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
 				     netdev_name(skb->dev));
-		__this_cpu_dec(mirred_rec_level);
+		__this_cpu_dec(mirred_nest_level);
 		return TC_ACT_SHOT;
 	}

@@ -310,7 +310,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 			err = tcf_mirred_forward(res->ingress, skb);
 			if (err)
 				tcf_action_inc_overlimit_qstats(&m->common);
-			__this_cpu_dec(mirred_rec_level);
+			__this_cpu_dec(mirred_nest_level);
 			return TC_ACT_CONSUMED;
 		}
 	}
@@ -322,7 +322,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 		if (tcf_mirred_is_act_redirect(m_eaction))
 			retval = TC_ACT_SHOT;
 	}
-	__this_cpu_dec(mirred_rec_level);
+	__this_cpu_dec(mirred_nest_level);

 	return retval;
 }
From: Davide Caratti <dcaratti@redhat.com>

mainline inclusion
from mainline-v6.3-rc1
commit ca22da2fbd693b54dc8e3b7b54ccc9f7e9ba3640
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I64END
CVE: CVE-2022-4269
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h...
--------------------------------
William reports kernel soft-lockups on some OVS topologies when TC mirred egress->ingress action is hit by local TCP traffic [1]. The same can also be reproduced with SCTP (thanks Xin for verifying), when client and server reach themselves through mirred egress to ingress, and one of the two peers sends a "heartbeat" packet (from within a timer).
Enqueueing to backlog proved to fix this soft lockup; however, as Cong noticed [2], we should preserve - when possible - the current mirred behavior that counts as "overlimits" any eventual packet drop subsequent to the mirred forwarding action [3]. A compromise solution might use the backlog only when tcf_mirred_act() has a nest level greater than one: change tcf_mirred_forward() accordingly.
Also, add a kselftest that can reproduce the lockup and verifies TC mirred ability to account for further packet drops after TC mirred egress->ingress (when the nest level is 1).
[1] https://lore.kernel.org/netdev/33dc43f587ec1388ba456b4915c75f02a8aae226.1663...
[2] https://lore.kernel.org/netdev/Y0w%2FWWY60gqrtGLp@pop-os.localdomain/
[3] such behavior is not guaranteed: for example, if RPS or skb RX
    timestamping is enabled on the mirred target device, the kernel
    can defer receiving the skb and return NET_RX_SUCCESS inside
    tcf_mirred_forward().
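For readability, here is the consolidated tcf_mirred_forward() as it ends up after this patch, stitched together from the diffs in this series (an excerpt of kernel code, not a standalone compilable example):

static bool is_mirred_nested(void)
{
	return unlikely(__this_cpu_read(mirred_nest_level) > 1);
}

static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
{
	int err;

	if (!want_ingress)
		err = dev_queue_xmit(skb);
	else if (is_mirred_nested())
		/* nested mirred call: defer to the backlog via netif_rx()
		 * so that egress->ingress loops cannot soft-lock the CPU */
		err = netif_rx(skb);
	else
		err = netif_receive_skb(skb);

	return err;
}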
Reported-by: William Zhao <wizhao@redhat.com>
CC: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Conflicts:
	net/sched/act_mirred.c
	tools/testing/selftests/net/forwarding/tc_actions.sh
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 net/sched/act_mirred.c                        |  7 +++
 .../selftests/net/forwarding/tc_actions.sh    | 49 ++++++++++++++++++-
 2 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 01a44c3e8d6d..296af520817d 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -206,12 +206,19 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	return err;
 }

+static bool is_mirred_nested(void)
+{
+	return unlikely(__this_cpu_read(mirred_nest_level) > 1);
+}
+
 static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
 {
 	int err;

 	if (!want_ingress)
 		err = dev_queue_xmit(skb);
+	else if (is_mirred_nested())
+		err = netif_rx(skb);
 	else
 		err = netif_receive_skb(skb);

diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index d9eca227136b..e396e24d30e0 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -3,7 +3,8 @@

 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
 	mirred_egress_mirror_test matchall_mirred_egress_mirror_test \
-	gact_trap_test"
+	gact_trap_test \
+	mirred_egress_to_ingress_tcp_test"
 NUM_NETIFS=4
 source tc_common.sh
 source lib.sh
@@ -153,6 +154,52 @@ gact_trap_test()
 	log_test "trap ($tcflags)"
 }

+mirred_egress_to_ingress_tcp_test()
+{
+	local tmpfile=$(mktemp) tmpfile1=$(mktemp)
+
+	RET=0
+	dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$tmpfile
+	tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
+		$tcflags ip_proto tcp src_ip 192.0.2.1 dst_ip 192.0.2.2 \
+			action ct commit nat src addr 192.0.2.2 pipe \
+			action ct clear pipe \
+			action ct commit nat dst addr 192.0.2.1 pipe \
+			action ct clear pipe \
+			action skbedit ptype host pipe \
+			action mirred ingress redirect dev $h1
+	tc filter add dev $h1 protocol ip pref 101 handle 101 egress flower \
+		$tcflags ip_proto icmp \
+			action mirred ingress redirect dev $h1
+	tc filter add dev $h1 protocol ip pref 102 handle 102 ingress flower \
+		ip_proto icmp \
+			action drop
+
+	ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $tmpfile1 &
+	local rpid=$!
+	ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$tmpfile
+	wait -n $rpid
+	cmp -s $tmpfile $tmpfile1
+	check_err $? "server output check failed"
+
+	$MZ $h1 -c 10 -p 64 -a $h1mac -b $h1mac -A 192.0.2.1 -B 192.0.2.1 \
+		-t icmp "ping,id=42,seq=5" -q
+	tc_check_packets "dev $h1 egress" 101 10
+	check_err $? "didn't mirred redirect ICMP"
+	tc_check_packets "dev $h1 ingress" 102 10
+	check_err $? "didn't drop mirred ICMP"
+	local overlimits=$(tc_rule_stats_get ${h1} 101 egress .overlimits)
+	test ${overlimits} = 10
+	check_err $? "wrong overlimits, expected 10 got ${overlimits}"
+
+	tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower
+	tc filter del dev $h1 egress protocol ip pref 101 handle 101 flower
+	tc filter del dev $h1 ingress protocol ip pref 102 handle 102 flower
+
+	rm -f $tmpfile $tmpfile1
+	log_test "mirred_egress_to_ingress_tcp ($tcflags)"
+}
+
 setup_prepare()
 {
 	h1=${NETIFS[p1]}
From: Brian Foster <bfoster@redhat.com>

mainline inclusion
from mainline-v5.13-rc1
commit 16eaab839a9273ed156ebfccbd40c15d1e72f3d8
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Introduce an in-core counter to track the sum of all allocbt blocks used by the filesystem. This value is currently tracked per-ag via the ->agf_btreeblks field in the AGF, which also happens to include rmapbt blocks. A global, in-core count of allocbt blocks is required to identify the subset of global ->m_fdblocks that consists of unavailable blocks currently used for allocation btrees. To support this calculation at block reservation time, construct a similar global counter for allocbt blocks, populate it on first read of each AGF and update it as allocbt blocks are used and released.
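A worked example of the AGF read-time computation in the diff below, using hypothetical per-AG numbers:

#include <stdio.h>
#include <stdbool.h>

/* pagf_btreeblks counts non-root bno/cnt/rmap btree blocks, while
 * agf_rmap_blocks counts rmapbt blocks including the root; the values
 * here are illustrative, not taken from a real filesystem. */
int main(void)
{
	long pagf_btreeblks = 10;	/* hypothetical AGF contents */
	long agf_rmap_blocks = 3;	/* rmapbt root + 2 other blocks */
	bool has_rmapbt = true;

	long allocbt_blks = pagf_btreeblks;
	if (has_rmapbt)
		allocbt_blks -= agf_rmap_blocks - 1;
	if (allocbt_blks > 0)
		printf("m_allocbt_blks += %ld\n", allocbt_blks);	/* 8 */
	return 0;
}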
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 fs/xfs/libxfs/xfs_alloc.c       | 14 ++++++++++++++
 fs/xfs/libxfs/xfs_alloc_btree.c |  2 ++
 fs/xfs/xfs_mount.h              |  6 ++++++
 3 files changed, 22 insertions(+)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 6f54f639913f..e18f3c089eea 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2983,6 +2983,7 @@ xfs_alloc_read_agf(
 	struct xfs_agf		*agf;		/* ag freelist header */
 	struct xfs_perag	*pag;		/* per allocation group data */
 	int			error;
+	int			allocbt_blks;

 	trace_xfs_alloc_read_agf(mp, agno);

@@ -3013,6 +3014,19 @@ xfs_alloc_read_agf(
 		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
 		pag->pagf_init = 1;
 		pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
+
+		/*
+		 * Update the in-core allocbt counter. Filter out the rmapbt
+		 * subset of the btreeblks counter because the rmapbt is managed
+		 * by perag reservation. Subtract one for the rmapbt root block
+		 * because the rmap counter includes it while the btreeblks
+		 * counter only tracks non-root blocks.
+		 */
+		allocbt_blks = pag->pagf_btreeblks;
+		if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
+		if (allocbt_blks > 0)
+			atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
 	}
 #ifdef DEBUG
 	else if (!XFS_FORCED_SHUTDOWN(mp)) {
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index dbe302d1cb8d..a43e4c50e69b 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -71,6 +71,7 @@ xfs_allocbt_alloc_block(
 		return 0;
 	}

+	atomic64_inc(&cur->bc_mp->m_allocbt_blks);
 	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false);

 	new->s = cpu_to_be32(bno);
@@ -94,6 +95,7 @@ xfs_allocbt_free_block(
 	if (error)
 		return error;

+	atomic64_dec(&cur->bc_mp->m_allocbt_blks);
 	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
 			      XFS_EXTENT_BUSY_SKIP_DISCARD);
 	return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 91a66b7b8815..80ec62ba6bd8 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -187,6 +187,12 @@ typedef struct xfs_mount {
 	 * extents or anything related to the rt device.
 	 */
 	struct percpu_counter	m_delalloc_blks;
+	/*
+	 * Global count of allocation btree blocks in use across all AGs. Only
+	 * used when perag reservation is enabled. Helps prevent block
+	 * reservation from attempting to reserve allocation btree blocks.
+	 */
+	atomic64_t		m_allocbt_blks;

 	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
 	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
From: Brian Foster <bfoster@redhat.com>

mainline inclusion
from mainline-v5.13-rc1
commit fd43cf600cf61c66ae0a1021aca2f636115c7fcb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The blocks used for allocation btrees (bnobt and countbt) are technically considered free space. This is because as free space is used, allocbt blocks are removed and naturally become available for traditional allocation. However, this means that a significant portion of free space may consist of in-use btree blocks if free space is severely fragmented.
On large filesystems with large perag reservations, this can lead to a rare but nasty condition where a significant amount of physical free space is available, but the majority of actual usable blocks consist of in-use allocbt blocks. We have a record of a (~12TB, 32 AG) filesystem with multiple AGs in a state with ~2.5GB or so free blocks tracked across ~300 total allocbt blocks, but effectively at 100% full because the free space is entirely consumed by refcountbt perag reservation.
Such a large perag reservation is by design on large filesystems. The problem is that because the free space is so fragmented, this AG contributes the 300 or so allocbt blocks to the global counters as free space. If this pattern repeats across enough AGs, the filesystem lands in a state where global block reservation can outrun physical block availability. For example, a streaming buffered write on the affected filesystem continues to allow delayed allocation beyond the point where writeback starts to fail due to physical block allocation failures. The expected behavior is for the delalloc block reservation to fail gracefully with -ENOSPC before physical block allocation failure is a possibility.
To address this problem, set aside in-use allocbt blocks at reservation time and thus ensure they cannot be reserved until truly available for physical allocation. This allows alloc btree metadata to continue to reside in free space, but dynamically adjusts reservation availability based on internal state. Note that the logic requires that the allocbt counter is fully populated at reservation time before it is fully effective. We currently rely on the mount time AGF scan in the perag reservation initialization code for this dependency on filesystems where it's most important (i.e. with active perag reservations).
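A toy illustration of how the stricter check changes the outcome of a reservation, with hypothetical counter values:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Hypothetical numbers illustrating the reservation check in
 * xfs_mod_fdblocks() before and after this patch. */
int main(void)
{
	int64_t fdblocks = 1000;	/* "free" blocks, includes allocbt */
	uint64_t alloc_set_aside = 100;	/* m_alloc_set_aside */
	uint64_t allocbt_blks = 300;	/* in-use allocbt blocks */
	int64_t delta = -650;		/* requested reservation */

	bool old_ok = fdblocks + delta >= (int64_t)alloc_set_aside;
	bool new_ok = fdblocks + delta >=
		      (int64_t)(alloc_set_aside + allocbt_blks);

	printf("old check: %s, new check: %s\n",
	       old_ok ? "space!" : "-ENOSPC",	/* space! */
	       new_ok ? "space!" : "-ENOSPC");	/* -ENOSPC */
	return 0;
}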
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 fs/xfs/xfs_mount.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6504503f07b3..adc49fe31bdd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1244,6 +1244,7 @@ xfs_mod_fdblocks(
 	int64_t			lcounter;
 	long long		res_used;
 	s32			batch;
+	uint64_t		set_aside;

 	if (delta > 0) {
 		/*
@@ -1283,8 +1284,20 @@ xfs_mod_fdblocks(
 	else
 		batch = XFS_FDBLOCKS_BATCH;

+	/*
+	 * Set aside allocbt blocks because these blocks are tracked as free
+	 * space but not available for allocation. Technically this means that a
+	 * single reservation cannot consume all remaining free space, but the
+	 * ratio of allocbt blocks to usable free blocks should be rather small.
+	 * The tradeoff without this is that filesystems that maintain high
+	 * perag block reservations can over reserve physical block availability
+	 * and fail physical allocation, which leads to much more serious
+	 * problems (i.e. transaction abort, pagecache discards, etc.) than
+	 * slightly premature -ENOSPC.
+	 */
+	set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
 	percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
-	if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
+	if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
 				     XFS_FDBLOCKS_BATCH) >= 0) {
 		/* we had space! */
 		return 0;
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.18-rc1 commit c8c568259772751a14e969b7230990508de73d9d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
xfs_reserve_blocks controls the size of the user-visible free space reserve pool. Given the difference between the current and requested pool sizes, it will try to reserve free space from fdblocks. However, the amount requested from fdblocks is also constrained by the amount of space that we think xfs_mod_fdblocks will give us. If we forget to subtract m_allocbt_blks before calling xfs_mod_fdblocks, it will return ENOSPC and we'll hang the kernel at mount due to the infinite loop.
In commit fd43cf600cf6, we decided that xfs_mod_fdblocks should not hand out the "free space" used by the free space btrees, because some portion of the free space btrees hold in reserve space for future btree expansion. Unfortunately, xfs_reserve_blocks' estimation of the number of blocks that it could request from xfs_mod_fdblocks was not updated to include m_allocbt_blks, so if space is extremely low, the caller hangs.
Fix this by creating a function to estimate the number of blocks that can be reserved from fdblocks, which needs to exclude the set-aside and m_allocbt_blks.
Found by running xfs/306 (which formats a single-AG 20MB filesystem) with an fstests configuration that specifies a 1k blocksize and a specially crafted log size that will consume 7/8 of the space (17920 blocks, specifically) in that AG.
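A toy simulation of the hang and the fix, with hypothetical block counts (the real loop lives in xfs_reserve_blocks()):

#include <stdio.h>
#include <stdint.h>

/* The reserve-pool loop asks for what it *thinks* is available, but
 * xfs_mod_fdblocks() also holds back m_allocbt_blks, so the request
 * never succeeds and the loop never terminates. Numbers are made up. */
int main(void)
{
	int64_t fdblocks = 120, set_aside = 100, allocbt_blks = 30;

	/* Old estimate forgot allocbt_blks: */
	int64_t want = fdblocks - set_aside;			/* 20 */
	int ok = fdblocks - want >= set_aside + allocbt_blks;	/* 0 */
	printf("request %lld: %s -> loop retries forever\n",
	       (long long)want, ok ? "granted" : "ENOSPC");

	/* Fixed estimate uses xfs_fdblocks_unavailable(): */
	want = fdblocks - (set_aside + allocbt_blks);	/* -10, clamp to 0 */
	if (want < 0)
		want = 0;
	printf("fixed request %lld: nothing to move, no hang\n",
	       (long long)want);
	return 0;
}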
Cc: Brian Foster <bfoster@redhat.com>
Fixes: fd43cf600cf6 ("xfs: set aside allocation btree blocks from block reservation")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Conflicts:
	fs/xfs/xfs_mount.h
[ 15f04fdc75aaaa1c("xfs: remove infinite loop when reserving free block pool")
  applied earlier. ]
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 fs/xfs/xfs_mount.c | 2 +-
 fs/xfs/xfs_mount.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index adc49fe31bdd..743c7ffdc316 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1295,7 +1295,7 @@ xfs_mod_fdblocks(
 	 * problems (i.e. transaction abort, pagecache discards, etc.) than
 	 * slightly premature -ENOSPC.
 	 */
-	set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+	set_aside = xfs_fdblocks_unavailable(mp);
 	percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
 	if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
 				     XFS_FDBLOCKS_BATCH) >= 0) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 80ec62ba6bd8..3dc54ed4315a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -478,7 +478,7 @@ static inline uint64_t
 xfs_fdblocks_unavailable(
 	struct xfs_mount	*mp)
 {
-	return mp->m_alloc_set_aside;
+	return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
 }

 extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.18-rc1 commit 85bcfa26f9a3782be37d4feafd49668b98b8bdbe category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
On a modern filesystem, we don't allow userspace to allocate blocks for data storage from the per-AG space reservations, the user-controlled reservation pool that prevents ENOSPC in the middle of internal operations, or the internal per-AG set-aside that prevents unwanted filesystem shutdowns due to ENOSPC during a bmap btree split.
Since we now consider freespace btree blocks as unavailable for allocation for data storage, we shouldn't report those blocks via statfs either. This makes the numbers that we return via the statfs f_bavail and f_bfree fields a more conservative estimate of actual free space.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 fs/xfs/xfs_fsops.c | 2 +-
 fs/xfs/xfs_super.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 81cb2b3b20f2..0e899eafbc16 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -291,7 +291,7 @@ xfs_fs_counts(
 	cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
 	cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
 	cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
-						mp->m_alloc_set_aside;
+						xfs_fdblocks_unavailable(mp);

 	spin_lock(&mp->m_sb_lock);
 	cnt->freertx = mp->m_sb.sb_frextents;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bc4ea6f13dad..9b4ff8247de2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -809,7 +809,8 @@ xfs_fs_statfs(
 	spin_unlock(&mp->m_sb_lock);

 	/* make sure statp->f_bfree does not underflow */
-	statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0);
+	statp->f_bfree = max_t(int64_t, 0,
+				fdblocks - xfs_fdblocks_unavailable(mp));
 	statp->f_bavail = statp->f_bfree;

 	fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
From: Li Lingfeng <lilingfeng3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6QMTU
--------------------------------
Recently, a null-ptr-deref problem occurred when submitting IO to an nvme disk:
[34432.226539] ==========================================================
[34432.226579] BUG: KASAN: null-ptr-deref in trace_event_raw_event_nvme_complete_rq+0x13c/0x270 [nvme_core]
[34432.226584] Read of size 2 at addr 0000000000000002 by task loop0/32
[34432.226586]
[34432.226594] CPU: 0 PID: 3242729 Comm: loop0 Kdump: loaded Tainted:
[34432.226598] Hardware name: Huawei TaiShan 2280 V2/BC82AMDD
[34432.226602] Call trace:
[34432.226610]  dump_backtrace+0x0/0x2fc
[34432.226615]  show_stack+0x20/0x30
[34432.226623]  dump_stack+0x104/0x17c
[34432.226630]  __kasan_report+0x138/0x140
[34432.226634]  kasan_report+0x44/0xdc
[34432.226639]  __asan_load2+0x90/0xd0
[34432.226662]  trace_event_raw_event_nvme_complete_rq+0x13c/0x270
[34432.226684]  nvme_complete_rq+0x228/0x480 [nvme_core]
[34432.226698]  nvme_pci_complete_rq+0x184/0x1b4 [nvme]
[34432.226706]  nvme_irq+0x270/0x500 [nvme]
[34432.226714]  __handle_irq_event_percpu+0x8c/0x324
[34432.226719]  handle_irq_event_percpu+0x88/0x11c
[34432.226724]  handle_irq_event+0x110/0x2b0
[34432.226729]  handle_fasteoi_irq+0x1e4/0x3f4
[34432.226734]  __handle_domain_irq+0xbc/0x130
[34432.226739]  gic_handle_irq+0x78/0x460
[34432.226743]  el1_irq+0xb8/0x140
[34432.226750]  __slab_alloc+0x38/0x70
[34432.226756]  kmem_cache_alloc+0x6b8/0x904
[34432.226762]  mempool_alloc_slab+0x3c/0x60
[34432.226766]  mempool_alloc+0xf0/0x440
[34432.226772]  bio_alloc_bioset+0x208/0x2f0
[34432.226899]  io_submit_init_bio+0x3c/0x190 [ext4]
[34432.226991]  ext4_bio_write_page+0x540/0xbd0 [ext4]
[34432.227082]  mpage_submit_page+0xb0/0x120 [ext4]
[34432.227173]  mpage_process_page_bufs+0x25c/0x2b4 [ext4]
[34432.227265]  mpage_prepare_extent_to_map+0x3b8/0x75c [ext4]
[34432.227356]  ext4_writepages+0x454/0xcb4 [ext4]
[34432.227361]  do_writepages+0xc4/0x1c0
...
This can be reproduced by the following steps:
1) modprobe nvme
2) echo nvme:* > /sys/kernel/debug/tracing/set_event
3) dd if=/dev/random of=/dev/nvmexxx bs=1M count=1024
Generating the command_id via nvme_cid() in the trace event, instead of reading nvme_req(req)->cmd->common.command_id, fixes the problem, since nvme_req(req)->cmd can sometimes be NULL.
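A simplified userspace sketch of the idea behind nvme_cid() (illustrative field names, not the kernel's exact helper, which in mainline also mixes a generation counter into the upper bits of the ID):

#include <stdint.h>
#include <stdio.h>

/* The command ID is derived from the request tag, which is always
 * valid, instead of dereferencing the submitted command, which may
 * already be NULL by the time the completion tracepoint fires. */
struct request_sketch {
	int tag;
};

static uint16_t nvme_cid_sketch(const struct request_sketch *rq)
{
	return (uint16_t)rq->tag;
}

int main(void)
{
	struct request_sketch rq = { .tag = 42 };

	printf("cid = %u\n", nvme_cid_sketch(&rq));
	return 0;
}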
Fixes: bbd033cb95d9 ("nvme: use command_id instead of req->tag in trace_nvme_complete_rq()")
Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 drivers/nvme/host/trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index aa8b0f86b2be..d4eacb0b43ce 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -98,7 +98,7 @@ TRACE_EVENT(nvme_complete_rq,
     TP_fast_assign(
 	__entry->ctrl_id = nvme_req(req)->ctrl->instance;
 	__entry->qid = nvme_req_qid(req);
-	__entry->cid = nvme_req(req)->cmd->common.command_id;
+	__entry->cid = nvme_cid(req);
 	__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
 	__entry->retries = nvme_req(req)->retries;
 	__entry->flags = nvme_req(req)->flags;
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

stable inclusion
from stable-v5.10.169
commit 6416c2108ba54d569e4c98d3b62ac78cb12e7107
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6OOP3
CVE: CVE-2023-1513
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 2c10b61421a28e95a46ab489fd56c0f442ff6952 upstream.
When calling the KVM_GET_DEBUGREGS ioctl, on some configurations, there might be some uninitialized portions of the kvm_debugregs structure that could be copied to userspace. Prevent this, as is done in the other kvm ioctls, by setting the whole structure to 0 before copying anything into it.
Bonus is that this reduces the lines of code as the explicit flag setting and reserved space zeroing out can be removed.
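A self-contained illustration of the zero-then-fill pattern the fix applies; the struct below is a simplified stand-in for struct kvm_debugregs, and the values are illustrative:

#include <string.h>
#include <stdio.h>

/* memset the whole struct first so padding and untouched fields can't
 * leak stack data when the struct is later copied to userspace. */
struct debugregs_sketch {
	unsigned long long db[4];
	unsigned long long dr6;
	unsigned long long dr7;
	unsigned long long flags;
	unsigned long long reserved[9];
};

static void get_debugregs(struct debugregs_sketch *out)
{
	memset(out, 0, sizeof(*out));	/* the one-line fix */
	out->db[0] = 0x1;		/* then fill only meaningful fields */
	out->dr6 = 0xffff0ff0;
	out->dr7 = 0x400;
	/* flags/reserved stay zero; no explicit clearing needed anymore */
}

int main(void)
{
	struct debugregs_sketch regs;

	get_debugregs(&regs);
	printf("dr7=%llx flags=%llx\n", regs.dr7, regs.flags);
	return 0;
}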
Cc: Sean Christopherson <seanjc@google.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: stable <stable@kernel.org>
Reported-by: Xingyuan Mo <hdthky0@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Message-Id: <20230214103304.3689213-1-gregkh@linuxfoundation.org>
Tested-by: Xingyuan Mo <hdthky0@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Guo Mengqi <guomengqi3@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 arch/x86/kvm/x86.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 53ef53d5b414..b2ce17f29c75 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4692,12 +4692,11 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 {
 	unsigned long val;

+	memset(dbgregs, 0, sizeof(*dbgregs));
 	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
 	kvm_get_dr(vcpu, 6, &val);
 	dbgregs->dr6 = val;
 	dbgregs->dr7 = vcpu->arch.dr7;
-	dbgregs->flags = 0;
-	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
 }

 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
From: Yang Guo <guoyang2@huawei.com>

mainline inclusion
from mainline-v6.1-rc1
commit af246cc6d0ed11318223606128bb0b09866c4c08
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6RQVI
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
CNTPCT_LO and CNTVCT_LO were defined with their offsets swapped by mistake in commit 8b82c4f883a7, so fix them according to the Arm ARM DDI 0487I.a, Table I2-4 "CNTBaseN memory map", as follows:
Offset    Register         Type    Description
0x000     CNTPCT[31:0]     RO      Physical Count register.
0x004     CNTPCT[63:32]    RO
0x008     CNTVCT[31:0]     RO      Virtual Count register.
0x00C     CNTVCT[63:32]    RO
Fixes: 8b82c4f883a7 ("clocksource/drivers/arm_arch_timer: Move MMIO timer programming over to CVAL")
Cc: stable@vger.kernel.org
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Yang Guo <guoyang2@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Link: https://lore.kernel.org/r/20220927033221.49589-1-zhangshaokun@hisilicon.com
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Yu Liao <liaoyu15@huawei.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 drivers/clocksource/arm_arch_timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 447e2a688113..432aeb7051fa 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -41,8 +41,8 @@
 #define CNTACR_RWVT	BIT(4)
 #define CNTACR_RWPT	BIT(5)

-#define CNTVCT_LO	0x00
-#define CNTPCT_LO	0x08
+#define CNTPCT_LO	0x00
+#define CNTVCT_LO	0x08
 #define CNTFRQ		0x10
 #define CNTP_CVAL_LO	0x20
 #define CNTP_CTL	0x2c
From: Ma Wupeng <mawupeng1@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6RKHX
CVE: NA
--------------------------------
After a fork operation, the child process erroneously reports a reliable page count (reliable_nr_page) twice that of its parent process.

Upon examining the mm_struct structure, it was discovered that reliable_nr_page should be initialized to 0 in mm_init(), just as the RSS counters are. The problem seen during fork is merely one example of what happens when it is not.

To resolve this issue, set reliable_nr_page to 0 during the mm_init() operation.
Fixes: d81e9624de21 ("proc: Count reliable memory usage of reliable tasks")
Signed-off-by: Ma Wupeng <mawupeng1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 include/linux/mem_reliable.h | 9 +++++++++
 kernel/fork.c                | 1 +
 2 files changed, 10 insertions(+)
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h
index ddadf2803742..79228a1b2f0b 100644
--- a/include/linux/mem_reliable.h
+++ b/include/linux/mem_reliable.h
@@ -135,6 +135,14 @@ static inline void reliable_page_counter(struct page *page,
 	if (page_reliable(page))
 		atomic_long_add(val, &mm->reliable_nr_page);
 }
+
+static inline void reliable_clear_page_counter(struct mm_struct *mm)
+{
+	if (!mem_reliable_is_enabled())
+		return;
+
+	atomic_long_set(&mm->reliable_nr_page, 0);
+}
 #else
 #define reliable_enabled 0
 #define pagecache_use_reliable_mem 0
@@ -178,6 +186,7 @@ static inline void reliable_page_counter(struct page *page,
 					 struct mm_struct *mm, int val) {}
 static inline void reliable_report_usage(struct seq_file *m,
 					 struct mm_struct *mm) {}
+static inline void reliable_clear_page_counter(struct mm_struct *mm) {}
 #endif

 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index f0aa2da990b8..908c4e2e7896 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1049,6 +1049,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	atomic_set(&mm->has_pinned, 0);
 	atomic64_set(&mm->pinned_vm, 0);
 	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
+	reliable_clear_page_counter(mm);
 	spin_lock_init(&mm->page_table_lock);
 	spin_lock_init(&mm->arg_lock);
 	mm_init_cpumask(mm);
From: Baokun Li <libaokun1@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 188500, https://gitee.com/openeuler/kernel/issues/I6RJ0V
CVE: NA
--------------------------------
We got a WARNING in ext4_add_complete_io:
==================================================================
WARNING: at fs/ext4/page-io.c:231 ext4_put_io_end_defer+0x182/0x250
CPU: 10 PID: 77 Comm: ksoftirqd/10 Tainted: 6.3.0-rc2 #85
RIP: 0010:ext4_put_io_end_defer+0x182/0x250 [ext4]
[...]
Call Trace:
 <TASK>
 ext4_end_bio+0xa8/0x240 [ext4]
 bio_endio+0x195/0x310
 blk_update_request+0x184/0x770
 scsi_end_request+0x2f/0x240
 scsi_io_completion+0x75/0x450
 scsi_finish_command+0xef/0x160
 scsi_complete+0xa3/0x180
 blk_complete_reqs+0x60/0x80
 blk_done_softirq+0x25/0x40
 __do_softirq+0x119/0x4c8
 run_ksoftirqd+0x42/0x70
 smpboot_thread_fn+0x136/0x3c0
 kthread+0x140/0x1a0
 ret_from_fork+0x2c/0x50
==================================================================
The above issue may happen as follows:
            cpu1                                  cpu2
----------------------------|----------------------------
mount -o dioread_lock
ext4_writepages
 ext4_do_writepages
  *if (ext4_should_dioread_nolock(inode))*
  // rsv_blocks is not assigned here
                              mount -o remount,dioread_nolock
  ext4_journal_start_with_reserve
   __ext4_journal_start
    __ext4_journal_start_sb
     jbd2__journal_start
      *if (rsv_blocks)*
      // h_rsv_handle is not initialized here
  mpage_map_and_submit_extent
   mpage_map_one_extent
    dioread_nolock = ext4_should_dioread_nolock(inode)
    if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN))
     mpd->io_submit.io_end->handle = handle->h_rsv_handle
     ext4_set_io_unwritten_flag
      io_end->flag |= EXT4_IO_END_UNWRITTEN
// now io_end->handle is NULL but has EXT4_IO_END_UNWRITTEN flag

scsi_finish_command
 scsi_io_completion
  scsi_io_completion_action
   scsi_end_request
    blk_update_request
     req_bio_endio
      bio_endio
       bio->bi_end_io > ext4_end_bio
        ext4_put_io_end_defer
         ext4_add_complete_io
         // trigger WARN_ON(!io_end->handle && sbi->s_journal);
The immediate cause of this problem is that ext4_should_dioread_nolock() returns inconsistent values in ext4_do_writepages() and mpage_map_one_extent(). Four conditions in this function can be changed after the filesystem is mounted to cause this problem, and they fall into two categories:
(1) journal_data and EXT4_EXTENTS_FL, which can be changed by ioctl
(2) DELALLOC and DIOREAD_NOLOCK, which can be changed by remount
The two in the first category have been fixed by commit c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages") and commit cb85f4d23f79 ("ext4: fix race between writepages and enabling EXT4_EXTENTS_FL") respectively.
The two cases in the second category have not yet been fixed, and the above issue is caused by this situation. Following the fix for the first category, we grab s_writepages_rwsem while applying options during remount to avoid racing with writepages ops and triggering this problem.
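A runnable userspace analogue of the locking scheme (a pthread rwlock standing in for the percpu rw_semaphore; names are illustrative, not ext4's):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* The "writepages" reader samples the option twice under a read lock,
 * while "remount" flips it under a write lock, so the two samples can
 * never disagree. */
static pthread_rwlock_t writepages_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static bool dioread_nolock;

static void *writepages(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&writepages_rwsem);
	bool first = dioread_nolock;	/* ext4_do_writepages() sample */
	bool second = dioread_nolock;	/* mpage_map_one_extent() sample */
	pthread_rwlock_unlock(&writepages_rwsem);
	printf("samples agree: %d\n", first == second);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writepages, NULL);
	pthread_rwlock_wrlock(&writepages_rwsem);	/* remount side */
	dioread_nolock = !dioread_nolock;		/* parse_options() */
	pthread_rwlock_unlock(&writepages_rwsem);
	pthread_join(t, NULL);
	return 0;
}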
Fixes: 6b523df4fb5a ("ext4: use transaction reservation for extent conversion in ext4_end_io")
Cc: stable@vger.kernel.org
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Yang Erkun <yangerkun@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 fs/ext4/ext4.h  |  3 ++-
 fs/ext4/super.c | 13 +++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f1d36671bc2b..5d5ae6f44510 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1618,7 +1618,8 @@ struct ext4_sb_info {

 	/*
 	 * Barrier between writepages ops and changing any inode's JOURNAL_DATA
-	 * or EXTENTS flag.
+	 * or EXTENTS flag or between writepages ops and changing DELALLOC or
+	 * DIOREAD_NOLOCK mount options on remount.
 	 */
 	struct percpu_rw_semaphore s_writepages_rwsem;
 	struct dax_device *s_daxdev;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9e34f90552ee..3278d46a7d65 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5984,10 +5984,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	vfs_flags = SB_LAZYTIME | SB_I_VERSION;
 	sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);

+	/*
+	 * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
+	 * two calls to ext4_should_dioread_nolock() to return inconsistent
+	 * values, triggering WARN_ON in ext4_add_complete_io(). we grab
+	 * here s_writepages_rwsem to avoid race between writepages ops and
+	 * remount.
+	 */
+	percpu_down_write(&sbi->s_writepages_rwsem);
 	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
 		err = -EINVAL;
+		percpu_up_write(&sbi->s_writepages_rwsem);
 		goto restore_opts;
 	}
+	percpu_up_write(&sbi->s_writepages_rwsem);

 	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
 	    test_opt(sb, JOURNAL_CHECKSUM)) {
@@ -6205,6 +6215,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	return 0;

 restore_opts:
+	percpu_down_write(&sbi->s_writepages_rwsem);
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.s_mount_opt;
 	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -6213,6 +6224,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	sbi->s_commit_interval = old_opts.s_commit_interval;
 	sbi->s_min_batch_time = old_opts.s_min_batch_time;
 	sbi->s_max_batch_time = old_opts.s_max_batch_time;
+	percpu_up_write(&sbi->s_writepages_rwsem);
+
 	if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
 		ext4_release_system_zone(sb);
 #ifdef CONFIG_QUOTA
From: Zhang Yi <yi.zhang@huawei.com>

mainline inclusion
from mainline-v6.3-rc1
commit 240930fb7e6b52229bdee5b1423bfeab0002fed2
category: perf
bugzilla: https://gitee.com/openeuler/kernel/issues/I6S63P
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
In the dio write path, we only take the shared inode lock for the case of aligned overwriting of initialized blocks inside EOF. But overwriting preallocated blocks may only require splitting unwritten extents, and since that procedure is already protected by the i_data_sem lock, it is safe to release the exclusive inode lock and take the shared inode lock instead.
This can give a significant speed-up for multi-threaded writes. Tested on an Intel Xeon Gold 6140 and an NVMe SSD with the fio parameters below.
direct=1
ioengine=libaio
iodepth=10
numjobs=10
runtime=60
rw=randwrite
size=100G
And the test results are:

Before:
  bs=4k:  IOPS=11.1k, BW=43.2MiB/s
  bs=16k: IOPS=11.1k, BW=173MiB/s
  bs=64k: IOPS=11.2k, BW=697MiB/s

After:
  bs=4k:  IOPS=41.4k, BW=162MiB/s
  bs=16k: IOPS=41.3k, BW=646MiB/s
  bs=64k: IOPS=13.5k, BW=843MiB/s
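A toy classifier summarizing the post-patch locking decision (simplified; the real ext4_dio_write_checks() also handles IOCB_NOWAIT, IS_NOSEC, unaligned IO, etc.):

#include <stdbool.h>
#include <stdio.h>

enum lock_mode { ILOCK_SHARED, ILOCK_EXCL };

static enum lock_mode dio_lock_mode(bool overwrite, bool unwritten,
				    bool extend)
{
	if (extend || !overwrite)
		return ILOCK_EXCL;
	/* Initialized and unwritten overwrites both run under the shared
	 * lock now; unwritten ones just skip the "overwrite" iomap ops
	 * because the extent split still happens under i_data_sem. */
	(void)unwritten;
	return ILOCK_SHARED;
}

int main(void)
{
	printf("unwritten overwrite: %s\n",
	       dio_lock_mode(true, true, false) == ILOCK_SHARED
			? "shared" : "exclusive");
	return 0;
}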
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20221226062015.3479416-1-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Conflicts:
	fs/ext4/file.c
[ 2f63296578cad1a("iomap: pass a flags argument to iomap_dio_rw") is not
  applied. ]
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 fs/ext4/file.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6f78f7fbf419..32de10ef895b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -190,8 +190,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
 	return false;
 }

-/* Is IO overwriting allocated and initialized blocks? */
-static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+/* Is IO overwriting allocated or initialized blocks? */
+static bool ext4_overwrite_io(struct inode *inode,
+			      loff_t pos, loff_t len, bool *unwritten)
 {
 	struct ext4_map_blocks map;
 	unsigned int blkbits = inode->i_blkbits;
@@ -205,12 +206,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
 	blklen = map.m_len;

 	err = ext4_map_blocks(NULL, inode, &map, 0);
+	if (err != blklen)
+		return false;
 	/*
 	 * 'err==len' means that all of the blocks have been preallocated,
-	 * regardless of whether they have been initialized or not. To exclude
-	 * unwritten extents, we need to check m_flags.
+	 * regardless of whether they have been initialized or not. We need to
+	 * check m_flags to distinguish the unwritten extents.
 	 */
-	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+	*unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
+	return true;
 }

 static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
@@ -421,11 +425,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
 * - For extending writes case we don't take the shared lock, since it requires
 *   updating inode i_disksize and/or orphan handling with exclusive lock.
 *
- * - shared locking will only be true mostly with overwrites. Otherwise we will
- *   switch to exclusive i_rwsem lock.
+ * - shared locking will only be true mostly with overwrites, including
+ *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
+ *   we protect splitting extents by i_data_sem in ext4_inode_info, so we can
+ *   also release exclusive i_rwsem lock.
+ *
+ * - Otherwise we will switch to exclusive i_rwsem lock.
 */
 static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
-				     bool *ilock_shared, bool *extend)
+				     bool *ilock_shared, bool *extend,
+				     bool *unwritten)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
@@ -449,7 +458,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
 	 * in file_modified().
 	 */
 	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
-	     !ext4_overwrite_io(inode, offset, count))) {
+	     !ext4_overwrite_io(inode, offset, count, unwritten))) {
 		if (iocb->ki_flags & IOCB_NOWAIT) {
 			ret = -EAGAIN;
 			goto out;
 		}
@@ -481,7 +490,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	loff_t offset = iocb->ki_pos;
 	size_t count = iov_iter_count(from);
 	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
-	bool extend = false, unaligned_io = false;
+	bool extend = false, unaligned_io = false, unwritten = false;
 	bool ilock_shared = true;

 	/*
@@ -524,7 +533,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		return ext4_buffered_write_iter(iocb, from);
 	}

-	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+	ret = ext4_dio_write_checks(iocb, from,
+				    &ilock_shared, &extend, &unwritten);
 	if (ret <= 0)
 		return ret;

@@ -568,7 +578,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		ext4_journal_stop(handle);
 	}

-	if (ilock_shared)
+	if (ilock_shared && !unwritten)
 		iomap_ops = &ext4_iomap_overwrite_ops;
 	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
 			   is_sync_kiocb(iocb) || unaligned_io || extend);
From: "Gustavo A. R. Silva" gustavoars@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 69508d43334e3b09c344f662272bcf24a5b508ed category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6SFHJ CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Make use of the struct_size() and flex_array_size() helpers instead of open-coded versions, in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows.
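For reference, a small userspace illustration of what the two helpers compute (simplified; the kernel versions saturate to SIZE_MAX on overflow rather than wrapping, which is precisely why they are safer than the open-coded arithmetic):

#include <stdio.h>
#include <stddef.h>

/* A simplified stand-in for struct qdisc_size_table. */
struct qdisc_size_table_sketch {
	int refcnt;
	unsigned short data[];	/* flexible array member */
};

int main(void)
{
	size_t tsize = 16;

	/* struct_size(stab, data, tsize): header plus tsize elements */
	size_t alloc = sizeof(struct qdisc_size_table_sketch) +
		       tsize * sizeof(unsigned short);
	/* flex_array_size(stab, data, tsize): just the array part */
	size_t copy = tsize * sizeof(unsigned short);

	printf("alloc=%zu copy=%zu\n", alloc, copy);
	return 0;
}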
Link: https://github.com/KSPP/linux/issues/160
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20210928193107.GA262595@embeddedor
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Conflicts:
	net/sched/sch_api.c
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 net/sched/sch_api.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 54e2309315eb..b12a3502fca5 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -507,7 +507,8 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
 	list_for_each_entry(stab, &qdisc_stab_list, list) {
 		if (memcmp(&stab->szopts, s, sizeof(*s)))
 			continue;
-		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
+		if (tsize > 0 &&
+		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
 			continue;
 		stab->refcnt++;
 		return stab;
@@ -519,14 +520,14 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
 		return ERR_PTR(-EINVAL);
 	}

-	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
+	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
 	if (!stab)
 		return ERR_PTR(-ENOMEM);

 	stab->refcnt = 1;
 	stab->szopts = *s;
 	if (tsize > 0)
-		memcpy(stab->data, tab, tsize * sizeof(u16));
+		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));

 	list_add_tail(&stab->list, &qdisc_stab_list);
From: "Gustavo A. R. Silva" gustavoars@kernel.org
mainline inclusion from mainline-v5.16-rc1 commit 129291980f4901ae68ee3ba4344bdc38cf5f800d category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6SFHJ CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Make use of the struct_size() helper instead of an open-coded version, in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows.
Link: https://github.com/KSPP/linux/issues/160
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20210929201718.GA342296@embeddedor
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com>
---
 net/sched/sch_netem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index adc5407fd5d5..b72b308fe406 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -785,7 +785,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
 	if (!n || n > NETEM_DIST_MAX)
 		return -EINVAL;

-	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
+	d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
 	if (!d)
 		return -ENOMEM;