From: Li Lingfeng lilingfeng3@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5DI4S
CVE: NA
Reference: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/commit/...
---------------------------
We don't really need the field names to be globally unique; it is enough that they are unique within the given struct. Since structs do not generally span multiple files, using the line number is enough to ensure a unique identifier. It means that we can't use two KABI_RENAME macros on the same line, but that's not happening anyway.
This allows pahole to deduplicate the type info of structs using KABI macros, lowering the size of vmlinuz from 26M to 8.5M.
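As a stand-alone illustration (a demo file of our own, not part of the patch) of why pasting __LINE__ is enough: two uses on different lines expand to different member names, while two uses on the same line would collide, which is exactly the limitation stated above.

  #define ___PASTE(a, b)  a##b
  #define __PASTE(a, b)   ___PASTE(a, b)          /* as in compiler_types.h */
  #define KABI_UNIQUE_ID  __PASTE(kabi_hidden_, __LINE__)

  struct demo {
          struct { int old_a; } KABI_UNIQUE_ID;   /* -> kabi_hidden_<line> */
          struct { int old_b; } KABI_UNIQUE_ID;   /* different line, different name */
  };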
Signed-off-by: Li Lingfeng lilingfeng3@huawei.com
Reviewed-by: Zhang Yi yi.zhang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 include/linux/kabi.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/linux/kabi.h b/include/linux/kabi.h index a52d9fa72cfa..fe3213c0f576 100644 --- a/include/linux/kabi.h +++ b/include/linux/kabi.h @@ -393,6 +393,8 @@ # define __KABI_CHECK_SIZE(_item, _size) #endif
+#define KABI_UNIQUE_ID __PASTE(kabi_hidden_, __LINE__) + # define _KABI_DEPRECATE(_type, _orig) _type kabi_reserved_##_orig # define _KABI_DEPRECATE_FN(_type, _orig, _args...) \ _type (* kabi_reserved_##_orig)(_args) @@ -402,7 +404,7 @@ _new; \ struct { \ _orig; \ - } __UNIQUE_ID(kabi_hide); \ + } KABI_UNIQUE_ID; \ __KABI_CHECK_SIZE_ALIGN(_orig, _new); \ } #else
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I58VE5
CVE: NA
--------------------------------
If a new configuration is submitted while a bio is throttled, the new waiting time is recalculated regardless of the time the bio may already have waited:
tg_conf_updated
 throtl_start_new_slice
 tg_update_disptime
 throtl_schedule_next_dispatch
An io hang can then be triggered by repeatedly submitting new configurations before the throttled bio is dispatched.
Fix the problem by respecting the time the throttled bio has already waited. To do that, instead of starting a new slice in tg_conf_updated(), just update 'bytes_disp' and 'io_disp' based on the new configuration.
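For illustration, the rescaling rule as a stand-alone helper (it mirrors throtl_update_bytes_disp() in the diff below; div64_u64() and U64_MAX are the kernel's own):

  static u64 scale_disp(u64 dispatched, u64 new_limit, u64 old_limit)
  {
          if (new_limit == old_limit || !dispatched)
                  return dispatched;

          /* The multiplication would overflow: return 0, which only
           * lets bios be dispatched a little earlier. */
          if (div64_u64(U64_MAX, dispatched) < new_limit)
                  return 0;

          return div64_u64(dispatched * new_limit, old_limit);
  }

Scaling 'bytes_disp' and 'io_disp' proportionally preserves the wait a throttled bio has already served instead of resetting its slice.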
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/blk-throttle.c | 79 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 66 insertions(+), 13 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 95a13da1f343..0427c9c63e81 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1421,7 +1421,57 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; }
-static void tg_conf_updated(struct throtl_grp *tg, bool global) +static u64 throtl_update_bytes_disp(u64 dispatched, u64 new_limit, + u64 old_limit) +{ + if (new_limit == old_limit) + return dispatched; + + if (!dispatched) + return 0; + + /* + * In the case that multiply will overflow, just return 0. It will only + * let bios to be dispatched earlier. + */ + if (div64_u64(U64_MAX, dispatched) < new_limit) + return 0; + + dispatched *= new_limit; + return div64_u64(dispatched, old_limit); +} + +static u32 throtl_update_io_disp(u32 dispatched, u32 new_limit, u32 old_limit) +{ + if (new_limit == old_limit) + return dispatched; + + if (!dispatched) + return 0; + /* + * In the case that multiply will overflow, just return 0. It will only + * let bios to be dispatched earlier. + */ + if (UINT_MAX / dispatched < new_limit) + return 0; + + dispatched *= new_limit; + return dispatched / old_limit; +} + +static void throtl_update_slice(struct throtl_grp *tg, u64 *old_limits) +{ + tg->bytes_disp[READ] = throtl_update_bytes_disp(tg->bytes_disp[READ], + tg_bps_limit(tg, READ), old_limits[0]); + tg->bytes_disp[WRITE] = throtl_update_bytes_disp(tg->bytes_disp[WRITE], + tg_bps_limit(tg, WRITE), old_limits[1]); + tg->io_disp[READ] = throtl_update_io_disp(tg->io_disp[READ], + tg_iops_limit(tg, READ), (u32)old_limits[2]); + tg->io_disp[WRITE] = throtl_update_io_disp(tg->io_disp[WRITE], + tg_iops_limit(tg, WRITE), (u32)old_limits[3]); +} + +static void tg_conf_updated(struct throtl_grp *tg, u64 *old_limits, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; @@ -1460,16 +1510,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) parent_tg->latency_target); }
- /* - * We're already holding queue_lock and know @tg is valid. Let's - * apply the new config directly. - * - * Restart the slices for both READ and WRITES. It might happen - * that a group's limit are dropped suddenly and we don't want to - * account recently dispatched IO with new low rate. - */ - throtl_start_new_slice(tg, READ); - throtl_start_new_slice(tg, WRITE); + throtl_update_slice(tg, old_limits);
if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); @@ -1502,6 +1543,14 @@ static inline int throtl_restart_syscall_when_busy(int errno) return ret; }
+static void tg_get_limits(struct throtl_grp *tg, u64 *limits) +{ + limits[0] = tg_bps_limit(tg, READ); + limits[1] = tg_bps_limit(tg, WRITE); + limits[2] = tg_iops_limit(tg, READ); + limits[3] = tg_iops_limit(tg, WRITE); +} + static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { @@ -1510,6 +1559,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, struct throtl_grp *tg; int ret; u64 v; + u64 old_limits[4];
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); if (ret) @@ -1526,13 +1576,14 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, v = U64_MAX;
tg = blkg_to_tg(ctx.blkg); + tg_get_limits(tg, old_limits);
if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; else *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
- tg_conf_updated(tg, false); + tg_conf_updated(tg, old_limits, false); ret = 0; out_finish: blkg_conf_finish(&ctx); @@ -1703,6 +1754,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; + u64 old_limits[4]; unsigned long idle_time; unsigned long latency_time; int ret; @@ -1721,6 +1773,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, v[1] = tg->bps_conf[WRITE][index]; v[2] = tg->iops_conf[READ][index]; v[3] = tg->iops_conf[WRITE][index]; + tg_get_limits(tg, old_limits);
idle_time = tg->idletime_threshold_conf; latency_time = tg->latency_target_conf; @@ -1807,7 +1860,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg->td->limit_index = LIMIT_LOW; } else tg->td->limit_index = LIMIT_MAX; - tg_conf_updated(tg, index == LIMIT_LOW && + tg_conf_updated(tg, old_limits, index == LIMIT_LOW && tg->td->limit_valid[LIMIT_LOW]); ret = 0; out_finish:
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA
--------------------------------
commit 7ec2ec682568 ("block: update io_ticks when io hang") fixed that %util in iostat is zero when io hangs; however, avgqu-sz is still zero even though it represents the number of ios that are hung. On the other hand, for some slow devices, if an io is started before and completed after diskstats is read, avgqu-sz will be miscalculated.
To fix the problem, update 'nsecs[]' when part_stat_show() or diskstats_show() is called. To do that, add 'stat_time' to struct hd_struct and 'stat_time_ns' to struct request to record the time, and during iteration update 'nsecs[]' for each inflight request.
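As a sketch of the mechanism (names follow the diffs below; illustrative, not the verbatim kernel code): the stat reader stamps the partition, walks the inflight requests and folds each request's not-yet-sampled runtime into 'nsecs[]'; completion performs the same folding up to 'now'. cmpxchg64() arbitrates so that a racing reader and completer account each interval only once:

  /* Fold rq's runtime since it was last sampled into nsecs[] and
   * atomically move the per-request sample point forward to 'now'. */
  static void fold_rq_time(struct request *rq, u64 now)
  {
          u64 last = READ_ONCE(rq->stat_time_ns);

          /* Another path already advanced stat_time_ns and thus
           * accounted this interval: nothing to do. */
          if (cmpxchg64(&rq->stat_time_ns, last, now) != last)
                  return;

          part_stat_add(rq->part, nsecs[op_stat_group(req_op(rq))],
                        last ? now - last : now - rq->start_time_ns);
  }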
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/blk-core.c        | 15 ++++++++--
 block/blk-mq.c          | 45 ++++++++++++++++++++++++++++++
 block/blk-mq.h          |  2 ++
 block/genhd.c           | 61 ++++++++++++++++++++++++++---------------
 block/partitions/core.c |  2 ++
 include/linux/blkdev.h  |  3 +-
 include/linux/genhd.h   |  2 ++
 7 files changed, 105 insertions(+), 25 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index bbd3d4560458..f5f419dd8ab6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1292,13 +1292,24 @@ void blk_account_io_done(struct request *req, u64 now) !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; + u64 stat_time;
part_stat_lock(); part = req->part; - update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); - part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + stat_time = READ_ONCE(req->stat_time_ns); + /* + * This might fail if 'req->stat_time_ns' is updated + * in blk_mq_check_inflight_with_stat(). + */ + if (likely(cmpxchg64(&req->stat_time_ns, stat_time, now) + == stat_time)) { + u64 duation = stat_time ? now - stat_time : + now - req->start_time_ns; + + part_stat_add(req->part, nsecs[sgrp], duation); + } part_stat_unlock();
hd_struct_put(part); diff --git a/block/blk-mq.c b/block/blk-mq.c index 83193e44aada..2cf387697a41 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -102,6 +102,50 @@ struct mq_inflight { unsigned int inflight[2]; };
+static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, + bool reserved) +{ + struct mq_inflight *mi = priv; + + if ((!mi->part->partno || rq->part == mi->part) && + blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { + u64 stat_time; + + mi->inflight[rq_data_dir(rq)]++; + if (!rq->part) + return true; + + stat_time = READ_ONCE(rq->stat_time_ns); + /* + * This might fail if 'req->stat_time_ns' is updated in + * blk_account_io_done(). + */ + if (likely(cmpxchg64(&rq->stat_time_ns, stat_time, + rq->part->stat_time) == stat_time)) { + int sgrp = op_stat_group(req_op(rq)); + u64 duation = stat_time ? + rq->part->stat_time - stat_time : + rq->part->stat_time - rq->start_time_ns; + + part_stat_add(rq->part, nsecs[sgrp], duation); + } + } + + return true; +} + +unsigned int blk_mq_in_flight_with_stat(struct request_queue *q, + struct hd_struct *part) +{ + struct mq_inflight mi = { .part = part }; + + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_with_stat, &mi); + + return mi.inflight[0] + mi.inflight[1]; +} + + static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) @@ -328,6 +372,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; + rq->stat_time_ns = 0; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index bb58c68b8274..6897b4c09b7c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -187,6 +187,8 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +unsigned int blk_mq_in_flight_with_stat(struct request_queue *q, + struct hd_struct *part);
static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { diff --git a/block/genhd.c b/block/genhd.c index f94152e99876..16ad881172d0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1293,25 +1293,52 @@ ssize_t part_size_show(struct device *dev, (unsigned long long)part_nr_sects_read(p)); }
+static void part_set_stat_time(struct hd_struct *hd) +{ + u64 now = ktime_get_ns(); + +again: + hd->stat_time = now; + if (hd->partno) { + hd = &part_to_disk(hd)->part0; + goto again; + } +} + +static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat, + unsigned int *inflight) +{ + struct request_queue *q = part_to_disk(hd)->queue; + + if (queue_is_mq(q)) { + part_stat_lock(); + spin_lock(&hd->bd_stat_lock); + part_set_stat_time(hd); + *inflight = blk_mq_in_flight_with_stat(q, hd); + spin_unlock(&hd->bd_stat_lock); + part_stat_unlock(); + } else { + *inflight = part_in_flight(hd); + } + + if (*inflight) { + part_stat_lock(); + update_io_ticks(hd, jiffies, true); + part_stat_unlock(); + } + + part_stat_read_all(hd, stat); +} + ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; struct disk_stats stat; unsigned int inflight;
- if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p); - else - inflight = part_in_flight(p); + part_get_stat_info(p, &stat, &inflight);
- if (inflight) { - part_stat_lock(); - update_io_ticks(p, jiffies, true); - part_stat_unlock(); - } - part_stat_read_all(p, &stat); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -1628,17 +1655,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); - else - inflight = part_in_flight(hd); - - if (inflight) { - part_stat_lock(); - update_io_ticks(hd, jiffies, true); - part_stat_unlock(); - } - part_stat_read_all(hd, &stat); + part_get_stat_info(hd, &stat, &inflight); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " "%lu %lu %lu %u " diff --git a/block/partitions/core.c b/block/partitions/core.c index 569b0ca9f6e1..92c723c19bb0 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -415,6 +415,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, p->nr_sects = len; p->partno = partno; p->read_only = get_disk_ro(disk) | test_bit(partno, disk->user_ro_bitmap); + p->stat_time = 0; + spin_lock_init(&p->bd_stat_lock);
if (info) { struct partition_meta_info *pinfo; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a4b2a01a462..46feee6bd45c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -207,7 +207,8 @@ struct request { u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; - + /* Time that I/O was counted in part_get_stat_info(). */ + u64 stat_time_ns; #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 09da27361620..07122c79210c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -63,6 +63,8 @@ struct hd_struct { seqcount_t nr_sects_seq; #endif unsigned long stamp; + spinlock_t bd_stat_lock; + u64 stat_time; struct disk_stats __percpu *dkstats; struct percpu_ref ref;
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA
--------------------------------
Since there are no reserved fields left in struct request, declare a wrapper structure to fix the KABI breakage.
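The pattern, condensed from the hunks below: struct request itself stays ABI-stable, the new field rides in a container, and container_of() recovers it:

  struct request_wrapper {
          struct request rq;      /* must remain the first member */
          u64 stat_time_ns;       /* lives outside the frozen ABI */
  };

  #define request_to_wrapper(_rq) \
          container_of(_rq, struct request_wrapper, rq)

This works because the block layer allocates all requests itself: blk_mq_alloc_rqs() sizes each slot as sizeof(struct request_wrapper) plus cmd_size, and blk_mq_rq_to_pdu()/blk_mq_rq_from_pdu() are adjusted so driver payloads still sit directly after the (now larger) allocation.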
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/blk-core.c       |  7 ++++---
 block/blk-mq.c         | 12 +++++++-----
 include/linux/blk-mq.h | 13 +++++++++++--
 include/linux/blkdev.h |  2 --
 4 files changed, 22 insertions(+), 12 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index f5f419dd8ab6..8b93366c76e4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1293,17 +1293,18 @@ void blk_account_io_done(struct request *req, u64 now) const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; u64 stat_time; + struct request_wrapper *rq_wrapper = request_to_wrapper(req);
part_stat_lock(); part = req->part; update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); - stat_time = READ_ONCE(req->stat_time_ns); + stat_time = READ_ONCE(rq_wrapper->stat_time_ns); /* - * This might fail if 'req->stat_time_ns' is updated + * This might fail if 'stat_time_ns' is updated * in blk_mq_check_inflight_with_stat(). */ - if (likely(cmpxchg64(&req->stat_time_ns, stat_time, now) + if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now) == stat_time)) { u64 duation = stat_time ? now - stat_time : now - req->start_time_ns; diff --git a/block/blk-mq.c b/block/blk-mq.c index 2cf387697a41..76ff229e7fb2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -111,17 +111,19 @@ static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx, if ((!mi->part->partno || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { u64 stat_time; + struct request_wrapper *rq_wrapper;
mi->inflight[rq_data_dir(rq)]++; if (!rq->part) return true;
- stat_time = READ_ONCE(rq->stat_time_ns); + rq_wrapper = request_to_wrapper(rq); + stat_time = READ_ONCE(rq_wrapper->stat_time_ns); /* - * This might fail if 'req->stat_time_ns' is updated in + * This might fail if 'stat_time_ns' is updated in * blk_account_io_done(). */ - if (likely(cmpxchg64(&rq->stat_time_ns, stat_time, + if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, rq->part->stat_time) == stat_time)) { int sgrp = op_stat_group(req_op(rq)); u64 duation = stat_time ? @@ -368,11 +370,11 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, #ifdef CONFIG_BLK_RQ_ALLOC_TIME rq->alloc_time_ns = alloc_time_ns; #endif + request_to_wrapper(rq)->stat_time_ns = 0; if (blk_mq_need_time_stamp(rq)) rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; - rq->stat_time_ns = 0; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; @@ -2555,7 +2557,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ - rq_size = round_up(sizeof(struct request) + set->cmd_size, + rq_size = round_up(sizeof(struct request_wrapper) + set->cmd_size, cache_line_size()); left = rq_size * depth;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c9210fb70e4d..ac83257972a0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -304,6 +304,15 @@ struct blk_mq_queue_data { KABI_RESERVE(1) };
+struct request_wrapper { + struct request rq; + + /* Time that I/O was counted in part_get_stat_info(). */ + u64 stat_time_ns; +}; + +#define request_to_wrapper(_rq) container_of(_rq, struct request_wrapper, rq) + typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); @@ -595,7 +604,7 @@ static inline bool blk_should_fake_timeout(struct request_queue *q) */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { - return pdu - sizeof(struct request); + return pdu - sizeof(struct request_wrapper); }
/** @@ -609,7 +618,7 @@ static inline struct request *blk_mq_rq_from_pdu(void *pdu) */ static inline void *blk_mq_rq_to_pdu(struct request *rq) { - return rq + 1; + return request_to_wrapper(rq) + 1; }
static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46feee6bd45c..49540ce9e325 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -207,8 +207,6 @@ struct request { u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; - /* Time that I/O was counted in part_get_stat_info(). */ - u64 stat_time_ns; #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA
--------------------------------
Use a reserved field to fix the KABI breakage for the field 'stat_time'. However, for the field 'bd_stat_lock', spinlock_t can be up to 64 bytes, so the reserved fields are not enough; and since struct hd_struct is embedded in other structs, declaring a wrapper is infeasible. To fix the KABI breakage for 'bd_stat_lock', use 'dev->mutex' instead.
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/genhd.c           | 4 ++--
 block/partitions/core.c | 1 -
 include/linux/genhd.h   | 4 +---
 3 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c index 16ad881172d0..cc114dd0265b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1312,10 +1312,10 @@ static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat,
if (queue_is_mq(q)) { part_stat_lock(); - spin_lock(&hd->bd_stat_lock); + mutex_lock(&part_to_dev(hd)->mutex); part_set_stat_time(hd); *inflight = blk_mq_in_flight_with_stat(q, hd); - spin_unlock(&hd->bd_stat_lock); + mutex_unlock(&part_to_dev(hd)->mutex); part_stat_unlock(); } else { *inflight = part_in_flight(hd); diff --git a/block/partitions/core.c b/block/partitions/core.c index 92c723c19bb0..8f32f3cd0ede 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -416,7 +416,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, p->partno = partno; p->read_only = get_disk_ro(disk) | test_bit(partno, disk->user_ro_bitmap); p->stat_time = 0; - spin_lock_init(&p->bd_stat_lock);
if (info) { struct partition_meta_info *pinfo; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 07122c79210c..05927a1c6b5b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -63,8 +63,6 @@ struct hd_struct { seqcount_t nr_sects_seq; #endif unsigned long stamp; - spinlock_t bd_stat_lock; - u64 stat_time; struct disk_stats __percpu *dkstats; struct percpu_ref ref;
@@ -78,7 +76,7 @@ struct hd_struct { #endif struct rcu_work rcu_work;
- KABI_RESERVE(1) + KABI_USE(1, u64 stat_time) KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4)
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA
--------------------------------
part_get_stat_info() calls mutex_lock() inside part_stat_lock(), which is wrong because part_stat_lock() disables preemption.
Fix the problem by taking the mutex first.
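The resulting ordering, sketched (in this tree part_stat_lock() boils down to preempt_disable(), so the sleepable mutex has to be taken outside it):

  mutex_lock(&part_to_dev(hd)->mutex);    /* may sleep: take it first */
  part_stat_lock();                       /* disables preemption */
  part_set_stat_time(hd);
  *inflight = blk_mq_in_flight_with_stat(q, hd);
  part_stat_unlock();
  mutex_unlock(&part_to_dev(hd)->mutex);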
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/genhd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c index cc114dd0265b..fcd6210417bc 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1311,12 +1311,12 @@ static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat, struct request_queue *q = part_to_disk(hd)->queue;
if (queue_is_mq(q)) { - part_stat_lock(); mutex_lock(&part_to_dev(hd)->mutex); + part_stat_lock(); part_set_stat_time(hd); *inflight = blk_mq_in_flight_with_stat(q, hd); - mutex_unlock(&part_to_dev(hd)->mutex); part_stat_unlock(); + mutex_unlock(&part_to_dev(hd)->mutex); } else { *inflight = part_in_flight(hd); }
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA
--------------------------------
There is a problem where iostat can read a smaller 'nsecs' value than it read last time:
1) io is started after 'hd->stat_time' is set.
2) The following concurrent scenario:

t1                                        t2
blk_mq_end_request
 time1 -> before hd->stat_time
blk_account_io_done
                                          part_get_stat_info
                                           part_set_stat_time
                                            hd->stat_time = time2 -> time1 < time2
                                           blk_mq_in_flight_with_stat
                                            blk_mq_check_inflight_with_stat
                                             cmpxchg64() -> set stat_time_ns to time2
 cmpxchg64() -> set stat_time to time1
 duation = time1 - time2; -> time1 < time2
 part_stat_add(xx, nsecs, duation) -> problematic
3) A similar concurrent scenario the other way around.

Fix the problem by not adding 'duation' when the calculation might underflow.
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/blk-core.c | 3 ++-
 block/blk-mq.c   | 12 ++++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 8b93366c76e4..715b61c239ea 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1304,7 +1304,8 @@ void blk_account_io_done(struct request *req, u64 now) * This might fail if 'stat_time_ns' is updated * in blk_mq_check_inflight_with_stat(). */ - if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now) + if (likely(now > stat_time && + cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now) == stat_time)) { u64 duation = stat_time ? now - stat_time : now - req->start_time_ns; diff --git a/block/blk-mq.c b/block/blk-mq.c index 76ff229e7fb2..5031c4fc4412 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -117,14 +117,22 @@ static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx, if (!rq->part) return true;
+ /* + * If the request is started after 'part->stat_time' is set, + * don't update 'nsces' here. + */ + if (rq->part->stat_time <= rq->start_time_ns) + return true; + rq_wrapper = request_to_wrapper(rq); stat_time = READ_ONCE(rq_wrapper->stat_time_ns); /* * This might fail if 'stat_time_ns' is updated in * blk_account_io_done(). */ - if (likely(cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, - rq->part->stat_time) == stat_time)) { + if (likely(rq->part->stat_time > stat_time && + cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, + rq->part->stat_time) == stat_time)) { int sgrp = op_stat_group(req_op(rq)); u64 duation = stat_time ? rq->part->stat_time - stat_time :
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I57S8D
CVE: NA
--------------------------------
Some 32-bit platforms don't support cmpxchg64(); using it in generic code causes a compile error.
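The guard, sketched against the blk-core.c hunk below: keep the cmpxchg64()-based accounting under CONFIG_64BIT and fall back to the old completion-time accounting elsewhere:

  #ifdef CONFIG_64BIT
          stat_time = READ_ONCE(rq_wrapper->stat_time_ns);
          if (likely(now > stat_time &&
                     cmpxchg64(&rq_wrapper->stat_time_ns, stat_time, now)
                     == stat_time))
                  part_stat_add(req->part, nsecs[sgrp], stat_time ?
                                now - stat_time : now - req->start_time_ns);
  #else
          /* no generic cmpxchg64(): account everything at completion */
          part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
  #endif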
Fixes: 4c8f034bf1e6 ("[Huawei] block: update nsecs[] in part_stat_show() and diskstats_show()")
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/blk-core.c | 6 ++++++
 block/blk-mq.c   | 3 ++-
 block/blk-mq.h   | 2 ++
 block/genhd.c    | 8 ++++++--
 4 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 715b61c239ea..109fb2750453 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1292,13 +1292,16 @@ void blk_account_io_done(struct request *req, u64 now) !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; +#ifdef CONFIG_64BIT u64 stat_time; struct request_wrapper *rq_wrapper = request_to_wrapper(req); +#endif
part_stat_lock(); part = req->part; update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); +#ifdef CONFIG_64BIT stat_time = READ_ONCE(rq_wrapper->stat_time_ns); /* * This might fail if 'stat_time_ns' is updated @@ -1312,6 +1315,9 @@ void blk_account_io_done(struct request *req, u64 now)
part_stat_add(req->part, nsecs[sgrp], duation); } +#else + part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); +#endif part_stat_unlock();
hd_struct_put(part); diff --git a/block/blk-mq.c b/block/blk-mq.c index 5031c4fc4412..1941ffc4db85 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -102,6 +102,7 @@ struct mq_inflight { unsigned int inflight[2]; };
+#ifdef CONFIG_64BIT static bool blk_mq_check_inflight_with_stat(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) @@ -154,7 +155,7 @@ unsigned int blk_mq_in_flight_with_stat(struct request_queue *q,
return mi.inflight[0] + mi.inflight[1]; } - +#endif
static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, diff --git a/block/blk-mq.h b/block/blk-mq.h index 6897b4c09b7c..ad2d74f887f2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -187,8 +187,10 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +#ifdef CONFIG_64BIT unsigned int blk_mq_in_flight_with_stat(struct request_queue *q, struct hd_struct *part); +#endif
static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { diff --git a/block/genhd.c b/block/genhd.c index fcd6210417bc..8b37fcfa10d1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1293,6 +1293,7 @@ ssize_t part_size_show(struct device *dev, (unsigned long long)part_nr_sects_read(p)); }
+#ifdef CONFIG_64BIT static void part_set_stat_time(struct hd_struct *hd) { u64 now = ktime_get_ns(); @@ -1304,12 +1305,13 @@ static void part_set_stat_time(struct hd_struct *hd) goto again; } } +#endif
static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat, unsigned int *inflight) { +#ifdef CONFIG_64BIT struct request_queue *q = part_to_disk(hd)->queue; - if (queue_is_mq(q)) { mutex_lock(&part_to_dev(hd)->mutex); part_stat_lock(); @@ -1320,7 +1322,9 @@ static void part_get_stat_info(struct hd_struct *hd, struct disk_stats *stat, } else { *inflight = part_in_flight(hd); } - +#else + *inflight = part_in_flight(hd); +#endif if (*inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true);
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: 186896, https://gitee.com/src-openeuler/kernel/issues/I5FRAP
CVE: NA
--------------------------------
In filemap_read(), the first page will not be marked accessed if the previous page is the same as the current page:
	if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
		folio_mark_accessed(fbatch.folios[0]);
However, 'prev_pos' is set to 'ki_pos + copied' during the last read, which means 'prev_pos' can be equal to 'ki_pos' in this read, so the previous page can be miscalculated.
Fix the problem by setting 'prev_pos' to the start offset of the last read, so that 'prev_pos >> PAGE_SHIFT' refers to the previous page as expected.
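A concrete case (PAGE_SIZE of 4096, numbers illustrative): a read that copies bytes 0..4095 used to leave prev_pos = 4096, so a following read at ki_pos = 4096 evaluated 4096 >> PAGE_SHIFT == 4096 >> PAGE_SHIFT and skipped folio_mark_accessed() on page 1 even though only page 0 had been touched. With prev_pos recorded as the start offset 0, the comparison becomes 4096 >> PAGE_SHIFT != 0 >> PAGE_SHIFT and page 1 is marked accessed as expected.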
Fixes: 06c0444290ce ("mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_contig")
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 mm/filemap.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c index 3958fc3280d8..9653c1e0faef 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2549,10 +2549,11 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, flush_dcache_page(pages[i]);
copied = copy_page_to_iter(pages[i], offset, bytes, iter); - - written += copied; - iocb->ki_pos += copied; - ra->prev_pos = iocb->ki_pos; + if (copied) { + ra->prev_pos = iocb->ki_pos; + written += copied; + iocb->ki_pos += copied; + }
if (copied < bytes) { error = -EFAULT;
From: Guo Mengqi guomengqi3@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5EOOG
CVE: NA
--------------------------------
This reverts commit da76349ca8776aa7f8b186010005fb563fb163bb. However, the struct iommu_fault_param and struct iommu_fault_event field changes are kept (as _KABI_DEPRECATE placeholders) to avoid a KABI change.
Signed-off-by: Guo Mengqi guomengqi3@huawei.com
Reviewed-by: Weilong Chen chenweilong@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/iommu/iommu.c | 55 -------------------------------------------
 include/linux/iommu.h |  4 ++--
 2 files changed, 2 insertions(+), 57 deletions(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9116c93945d0..97953fa27630 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1084,39 +1084,6 @@ int iommu_group_unregister_notifier(struct iommu_group *group, } EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
-static void iommu_dev_fault_timer_fn(struct timer_list *t) -{ - struct iommu_fault_param *fparam = from_timer(fparam, t, timer); - struct iommu_fault_event *evt; - struct iommu_fault_page_request *prm; - - u64 now; - - now = get_jiffies_64(); - - /* The goal is to ensure driver or guest page fault handler(via vfio) - * send page response on time. Otherwise, limited queue resources - * may be occupied by some irresponsive guests or drivers. - * When per device pending fault list is not empty, we periodically checks - * if any anticipated page response time has expired. - * - * TODO: - * We could do the following if response time expires: - * 1. send page response code FAILURE to all pending PRQ - * 2. inform device driver or vfio - * 3. drain in-flight page requests and responses for this device - * 4. clear pending fault list such that driver can unregister fault - * handler(otherwise blocked when pending faults are present). - */ - list_for_each_entry(evt, &fparam->faults, list) { - prm = &evt->fault.prm; - if (time_after64(now, evt->expire)) - pr_err("Page response time expired!, pasid %d gid %d exp %llu now %llu\n", - prm->pasid, prm->grpid, evt->expire, now); - } - mod_timer(t, now + prq_timeout); -} - /** * iommu_register_device_fault_handler() - Register a device fault handler * @dev: the device @@ -1164,9 +1131,6 @@ int iommu_register_device_fault_handler(struct device *dev, mutex_init(¶m->fault_param->lock); INIT_LIST_HEAD(¶m->fault_param->faults);
- if (prq_timeout) - timer_setup(¶m->fault_param->timer, iommu_dev_fault_timer_fn, - TIMER_DEFERRABLE); done_unlock: mutex_unlock(¶m->lock);
@@ -1306,9 +1270,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) struct dev_iommu *param = dev->iommu; struct iommu_fault_event *evt_pending = NULL; struct iommu_fault_param *fparam; - struct timer_list *tmr; int ret = 0; - u64 exp;
if (!param || !evt || WARN_ON_ONCE(!iommu_fault_valid(&evt->fault))) return -EINVAL; @@ -1329,17 +1291,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) ret = -ENOMEM; goto done_unlock; } - /* Keep track of response expiration time */ - exp = get_jiffies_64() + prq_timeout; - evt_pending->expire = exp; mutex_lock(&fparam->lock); - if (list_empty(&fparam->faults)) { - /* First pending event, start timer */ - tmr = &fparam->timer; - WARN_ON(timer_pending(tmr)); - mod_timer(tmr, exp); - } - list_add_tail(&evt_pending->list, &fparam->faults); mutex_unlock(&fparam->lock); } @@ -1417,13 +1369,6 @@ int iommu_page_response(struct device *dev, break; }
- /* stop response timer if no more pending request */ - if (list_empty(¶m->fault_param->faults) && - timer_pending(¶m->fault_param->timer)) { - pr_debug("no pending PRQ, stop timer\n"); - del_timer(¶m->fault_param->timer); - } - done_unlock: mutex_unlock(¶m->fault_param->lock); return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8baf5ed66a84..092384b71ab2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -393,7 +393,7 @@ struct iommu_device { struct iommu_fault_event { struct iommu_fault fault; struct list_head list; - u64 expire; + _KABI_DEPRECATE(u64, expire); };
/** @@ -408,7 +408,7 @@ struct iommu_fault_param { iommu_dev_fault_handler_t handler; void *data; struct list_head faults; - struct timer_list timer; + _KABI_DEPRECATE(struct timer_list, timer); struct mutex lock; };
From: Amir Goldstein amir73il@gmail.com
mainline inclusion
from mainline-5.13-rc1
commit 9591c3a34f7722bd77f42c98d76fd5a5bad465f0
category: feature
bugzilla: 187162, https://gitee.com/openeuler/kernel/issues/I5FYKV
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------------------------
Some filesystems use a digest of their uuid for f_fsid. Create a simple wrapper for this open-coded folding.

Filesystems that have a non-null uuid but use the block device number for f_fsid may also consider using this helper.
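With the helper in place, a filesystem's ->statfs reduces to a one-liner (the ext4 hunk below is representative); uuid_to_fsid() folds the 16-byte uuid by XORing its two halves read as little-endian 64-bit words, exactly the open-coded expression it replaces:

  buf->f_fsid = uuid_to_fsid(es->s_uuid);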
[JK: Added missing asm/byteorder.h include]
Link: https://lore.kernel.org/r/20210322173944.449469-2-amir73il@gmail.com
Acked-by: Damien Le Moal damien.lemoal@wdc.com
Reviewed-by: Christian Brauner christian.brauner@ubuntu.com
Signed-off-by: Amir Goldstein amir73il@gmail.com
Signed-off-by: Jan Kara jack@suse.cz
Signed-off-by: Baokun Li libaokun1@huawei.com
Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com
Reviewed-by: Zhang Yi yi.zhang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/ext2/super.c        | 5 +----
 fs/ext4/super.c        | 5 +----
 fs/zonefs/super.c      | 5 +----
 include/linux/statfs.h | 8 ++++++++
 4 files changed, 11 insertions(+), 12 deletions(-)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b6314d3c6a87..74e110289413 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1403,7 +1403,6 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) struct super_block *sb = dentry->d_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; - u64 fsid;
spin_lock(&sbi->s_lock);
@@ -1457,9 +1456,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_ffree = ext2_count_free_inodes(sb); es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT2_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(es->s_uuid); spin_unlock(&sbi->s_lock); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9c004269f8d6..7cf82474b4de 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6245,7 +6245,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; - u64 fsid; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
@@ -6266,9 +6265,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(es->s_uuid);
#ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e60759d8bb5f..2d1f5d4d12e0 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1169,7 +1169,6 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); enum zonefs_ztype t; - u64 fsid;
buf->f_type = ZONEFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1192,9 +1191,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
spin_unlock(&sbi->s_lock);
- fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^ - le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b);
return 0; } diff --git a/include/linux/statfs.h b/include/linux/statfs.h index 20f695b90aab..02c862686ea3 100644 --- a/include/linux/statfs.h +++ b/include/linux/statfs.h @@ -4,6 +4,7 @@
#include <linux/types.h> #include <asm/statfs.h> +#include <asm/byteorder.h>
struct kstatfs { long f_type; @@ -50,4 +51,11 @@ static inline __kernel_fsid_t u64_to_fsid(u64 v) return (__kernel_fsid_t){.val = {(u32)v, (u32)(v>>32)}}; }
+/* Fold 16 bytes uuid to 64 bit fsid */ +static inline __kernel_fsid_t uuid_to_fsid(__u8 *uuid) +{ + return u64_to_fsid(le64_to_cpup((void *)uuid) ^ + le64_to_cpup((void *)(uuid + sizeof(u64)))); +} + #endif
From: Amir Goldstein amir73il@gmail.com
mainline inclusion
from mainline-5.13-rc1
commit 59cda49ecf6c9a32fae4942420701b6e087204f6
category: feature
bugzilla: 187162, https://gitee.com/openeuler/kernel/issues/I5FYKV
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------------------------
Since kernel v5.1, fanotify_init(2) supports the flag FAN_REPORT_FID for identifying objects using file handle and fsid in events.
fanotify_mark(2) fails with -ENODEV when trying to set a mark on filesystems that report a null f_fsid in statfs(2).
Use the digest of uuid as f_fsid for tmpfs to uniquely identify tmpfs objects as best as possible and allow setting an fanotify mark that reports events with file handles on tmpfs.
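By way of illustration, a user-space sequence that used to fail at the mark step with ENODEV on tmpfs and now succeeds (event mask and path are our example choices, not from the patch):

  #include <fcntl.h>
  #include <sys/fanotify.h>

  int main(void)
  {
          int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);

          if (fd < 0)
                  return 1;
          /* Failed with ENODEV while tmpfs reported a null f_fsid. */
          if (fanotify_mark(fd, FAN_MARK_ADD, FAN_CREATE | FAN_ONDIR,
                            AT_FDCWD, "/dev/shm") < 0)
                  return 1;
          return 0;
  }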
Link: https://lore.kernel.org/r/20210322173944.449469-3-amir73il@gmail.com
Acked-by: Hugh Dickins hughd@google.com
Reviewed-by: Christian Brauner christian.brauner@ubuntu.com
Signed-off-by: Amir Goldstein amir73il@gmail.com
Signed-off-by: Jan Kara jack@suse.cz
Signed-off-by: Baokun Li libaokun1@huawei.com
Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com
Reviewed-by: Zhang Yi yi.zhang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 mm/shmem.c | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/mm/shmem.c b/mm/shmem.c index 9df016296347..201086030e72 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2869,6 +2869,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ + + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + return 0; }
From: Liu Shixin liushixin2@huawei.com
maillist inclusion
category: bugfix
bugzilla: 186821, https://gitee.com/openeuler/kernel/issues/I5G69G
Reference: https://lore.kernel.org/all/20220707020938.2122198-1-liushixin2@huawei.com/
--------------------------------
Release the refcount after xas_set() to fix a UAF which may cause a panic like this:
page:ffffea000491fa40 refcount:1 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1247e9
head:ffffea000491fa00 order:3 compound_mapcount:0 compound_pincount:0
memcg:ffff888104f91091
flags: 0x2fffff80010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff)
...
page dumped because: VM_BUG_ON_PAGE(PageTail(page))
------------[ cut here ]------------
kernel BUG at include/linux/page-flags.h:632!
invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN
CPU: 1 PID: 7642 Comm: sh Not tainted 5.15.51-dirty #26
...
Call Trace:
 <TASK>
 __invalidate_mapping_pages+0xe7/0x540
 drop_pagecache_sb+0x159/0x320
 iterate_supers+0x120/0x240
 drop_caches_sysctl_handler+0xaa/0xe0
 proc_sys_call_handler+0x2b4/0x480
 new_sync_write+0x3d6/0x5c0
 vfs_write+0x446/0x7a0
 ksys_write+0x105/0x210
 do_syscall_64+0x35/0x80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f52b5733130
...
This problem was fixed upstream by commit 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache"), which deletes the related code.
Fixes: 5c211ba29deb ("mm: add and use find_lock_entries")
Signed-off-by: Liu Shixin liushixin2@huawei.com
Acked-by: Matthew Wilcox (Oracle) willy@infradead.org
Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org

Conflicts:
	mm/filemap.c

Signed-off-by: Liu Shixin liushixin2@huawei.com
Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 mm/filemap.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c index 9653c1e0faef..ebae261f9df9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1951,7 +1951,11 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
rcu_read_lock(); while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + unsigned long next_idx = xas.xa_index + 1; + if (!xa_is_value(page)) { + if (PageTransHuge(page)) + next_idx = page->index + thp_nr_pages(page); if (page->index < start) goto put; VM_BUG_ON_PAGE(page->index != xas.xa_index, page); @@ -1973,13 +1977,11 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, put: put_page(page); next: - if (!xa_is_value(page) && PageTransHuge(page)) { - unsigned int nr_pages = thp_nr_pages(page); - + if (next_idx != xas.xa_index + 1) { /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ - xas_set(&xas, page->index + nr_pages); - if (xas.xa_index < nr_pages) + if (next_idx < xas.xa_index) break; + xas_set(&xas, next_idx); } } rcu_read_unlock();
From: Luo Meng luomeng12@huawei.com
mainline inclusion
from mainline-v5.19-rc3
commit d14f5efadd846dbde561bd734318de6a9e6b26e6
category: bugfix
bugzilla: 186471 https://gitee.com/openeuler/kernel/issues/I5DRKR
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------------------------------------
When shmem_reconfigure() calls __percpu_counter_compare(), the second parameter is unsigned long long, but in the definition of __percpu_counter_compare() the second parameter is s64. So when __percpu_counter_compare() executes abs(count - rhs), UBSAN shows the following warning:
================================================================================
UBSAN: Undefined behaviour in lib/percpu_counter.c:209:6
signed integer overflow:
0 - -9223372036854775808 cannot be represented in type 'long long int'
CPU: 1 PID: 9636 Comm: syz-executor.2 Tainted: G ---------r- - 4.18.0 #2
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
Call Trace:
 __dump_stack home/install/linux-rh-3-10/lib/dump_stack.c:77 [inline]
 dump_stack+0x125/0x1ae home/install/linux-rh-3-10/lib/dump_stack.c:117
 ubsan_epilogue+0xe/0x81 home/install/linux-rh-3-10/lib/ubsan.c:159
 handle_overflow+0x19d/0x1ec home/install/linux-rh-3-10/lib/ubsan.c:190
 __percpu_counter_compare+0x124/0x140 home/install/linux-rh-3-10/lib/percpu_counter.c:209
 percpu_counter_compare home/install/linux-rh-3-10/./include/linux/percpu_counter.h:50 [inline]
 shmem_remount_fs+0x1ce/0x6b0 home/install/linux-rh-3-10/mm/shmem.c:3530
 do_remount_sb+0x11b/0x530 home/install/linux-rh-3-10/fs/super.c:888
 do_remount home/install/linux-rh-3-10/fs/namespace.c:2344 [inline]
 do_mount+0xf8d/0x26b0 home/install/linux-rh-3-10/fs/namespace.c:2844
 ksys_mount+0xad/0x120 home/install/linux-rh-3-10/fs/namespace.c:3075
 __do_sys_mount home/install/linux-rh-3-10/fs/namespace.c:3089 [inline]
 __se_sys_mount home/install/linux-rh-3-10/fs/namespace.c:3086 [inline]
 __x64_sys_mount+0xbf/0x160 home/install/linux-rh-3-10/fs/namespace.c:3086
 do_syscall_64+0xca/0x5c0 home/install/linux-rh-3-10/arch/x86/entry/common.c:298
 entry_SYSCALL_64_after_hwframe+0x6a/0xdf
RIP: 0033:0x46b5e9
Code: 5d db fa ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 2b db fa ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f54d5f22c68 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
RAX: ffffffffffffffda RBX: 000000000077bf60 RCX: 000000000046b5e9
RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000000
RBP: 000000000077bf60 R08: 0000000020000140 R09: 0000000000000000
R10: 00000000026740a4 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ffd1fb1592f R14: 00007f54d5f239c0 R15: 000000000077bf6c
================================================================================
[akpm@linux-foundation.org: tweak error message text]
Link: https://lkml.kernel.org/r/20220513025225.2678727-1-luomeng12@huawei.com
Signed-off-by: Luo Meng luomeng12@huawei.com
Cc: Hugh Dickins hughd@google.com
Cc: Yu Kuai yukuai3@huawei.com
Signed-off-by: Andrew Morton akpm@linux-foundation.org

Conflicts:
	mm/shmem.c

Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com
Reviewed-by: Zhang Yi yi.zhang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 mm/shmem.c | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/mm/shmem.c b/mm/shmem.c index 201086030e72..f043682ba567 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3553,6 +3553,10 @@ static int shmem_reconfigure(struct fs_context *fc)
spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; + if (ctx->blocks > S64_MAX) { + err = "Number of blocks too large"; + goto out; + } if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size";
From: ZhaoLong Wang wangzhaolong1@huawei.com
hulk inclusion
category: bugfix
bugzilla: 187184, https://gitee.com/openeuler/kernel/issues/I5G9EK
CVE: NA
--------------------------------
An undefined-behaviour issue has not been completely fixed since commit 5f62fbf88c99 ("tmpfs: fix undefined-behaviour in shmem_reconfigure()"). That commit added a check in shmem_reconfigure() to avoid the UBSAN problem on the remount path; however, the check was not added to the mount path, which causes inconsistent results between mount and remount. The operations to reproduce the problem from user space are as follows:
If nr_blocks is set to 0x8000000000000000, the mount succeeds:
# mount tmpfs /dev/shm/ -t tmpfs -o nr_blocks=0x8000000000000000
However, when -o remount is used, the mount fails because of the check in shmem_reconfigure():

# mount tmpfs /dev/shm/ -t tmpfs -o remount,nr_blocks=0x8000000000000000
mount: /dev/shm: mount point not mounted or bad option.

Therefore, add the check to shmem_parse_one() and remove it from shmem_reconfigure() to avoid this problem.
Signed-off-by: ZhaoLong Wang wangzhaolong1@huawei.com
Reviewed-by: Zhang Yi yi.zhang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 mm/shmem.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index f043682ba567..ad2d68150ed2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3432,7 +3432,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); - if (*rest) + if (*rest || ctx->blocks > S64_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; @@ -3553,10 +3553,7 @@ static int shmem_reconfigure(struct fs_context *fc)
spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (ctx->blocks > S64_MAX) { - err = "Number of blocks too large"; - goto out; - } + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size";
From: Jan Beulich jbeulich@suse.com
stable inclusion
from stable-v5.10.129
commit 547b7c640df545a344358ede93e491a89194cdfa
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GBF2?from=project-issue
CVE: CVE-2022-33743
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
commit f63c2c2032c2e3caad9add3b82cc6e91c376fd26 upstream.
The commit referenced below moved the invocation past the "next" label, without any explanation. In fact this allows misbehaving backends undue control over the domain the frontend runs in, as earlier detected errors require the skb to not be freed (it may be retained for later processing via xennet_move_rx_slot(), or it may simply be unsafe to have it freed).
This is CVE-2022-33743 / XSA-405.
Fixes: 6c5aa6fc4def ("xen networking: add basic XDP support for xen-netfront")
Signed-off-by: Jan Beulich jbeulich@suse.com
Reviewed-by: Juergen Gross jgross@suse.com
Signed-off-by: Juergen Gross jgross@suse.com
Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com
Reviewed-by: Yue Haibing yuehaibing@huawei.com
Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com
Reviewed-by: Wei Yongjun weiyongjun1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/net/xen-netfront.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 1a69b5246133..a7ebf17f75a3 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1057,8 +1057,10 @@ static int xennet_get_responses(struct netfront_queue *queue, } } rcu_read_unlock(); -next: + __skb_queue_tail(list, skb); + +next: if (!(rx->flags & XEN_NETRXF_more_data)) break;
From: Pablo Neira Ayuso pablo@netfilter.org
mainline inclusion
from mainline-v5.19-rc6
commit 7e6bc1f6cabcd30aba0b11219d8e01b952eacbb6
category: bugfix
bugzilla: 187147, https://gitee.com/src-openeuler/kernel/issues/I5GCQH
CVE: CVE-2022-34918
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Make sure element data type and length do not mismatch the one specified by the set declaration.
Fixes: 7d7402642eaf ("netfilter: nf_tables: variable sized set element keys / data")
Reported-by: Hugues ANGUELKOV hanguelkov@randorisec.fr
Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org
Signed-off-by: Lu Wei luwei32@huawei.com
Reviewed-by: Yue Haibing yuehaibing@huawei.com
Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com
Reviewed-by: Wei Yongjun weiyongjun1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 net/netfilter/nf_tables_api.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ea162e36e0e4..560a93aad5b6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4880,13 +4880,20 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, struct nft_data *data, struct nlattr *attr) { + u32 dtype; int err;
err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); if (err < 0) return err;
- if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { + if (set->dtype == NFT_DATA_VERDICT) + dtype = NFT_DATA_VERDICT; + else + dtype = NFT_DATA_VALUE; + + if (dtype != desc->type || + set->dlen != desc->len) { nft_data_release(data, desc->type); return -EINVAL; }
From: Roger Pau Monne roger.pau@citrix.com
stable inclusion
from stable-v5.10.129
commit cfea428030be836d79a7690968232bb7fa4410f1
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GLXT
CVE: CVE-2022-26365
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
commit 2f446ffe9d737e9a844b97887919c4fda18246e7 upstream.
When allocating pages to be used for shared communication with the backend always zero them, this avoids leaking unintended data present on the pages.
This is CVE-2022-26365, part of XSA-403.
Signed-off-by: Roger Pau Monné roger.pau@citrix.com
Reviewed-by: Jan Beulich jbeulich@suse.com
Reviewed-by: Juergen Gross jgross@suse.com
Signed-off-by: Juergen Gross jgross@suse.com
Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/block/xen-blkfront.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 47d4bb23d6f3..fffb7c3118b1 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -311,7 +311,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) goto out_of_memory;
if (info->feature_persistent) { - granted_page = alloc_page(GFP_NOIO); + granted_page = alloc_page(GFP_NOIO | __GFP_ZERO); if (!granted_page) { kfree(gnt_list_entry); goto out_of_memory; @@ -1753,7 +1753,7 @@ static int setup_blkring(struct xenbus_device *dev, for (i = 0; i < info->nr_ring_pages; i++) rinfo->ring_ref[i] = GRANT_INVALID_REF;
- sring = alloc_pages_exact(ring_size, GFP_NOIO); + sring = alloc_pages_exact(ring_size, GFP_NOIO | __GFP_ZERO); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; @@ -2293,7 +2293,8 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { - struct page *indirect_page = alloc_page(GFP_KERNEL); + struct page *indirect_page = alloc_page(GFP_KERNEL | + __GFP_ZERO); if (!indirect_page) goto out_of_memory; list_add(&indirect_page->lru, &rinfo->indirect_pages);
From: Roger Pau Monne roger.pau@citrix.com
stable inclusion
from stable-v5.10.129
commit 728d68bfe68d92eae1407b8a9edc7817d6227404
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GLYP
CVE: CVE-2022-33740
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
commit 307c8de2b02344805ebead3440d8feed28f2f010 upstream.
When allocating pages to be used for shared communication with the backend always zero them, this avoids leaking unintended data present on the pages.
This is CVE-2022-33740, part of XSA-403.
Signed-off-by: Roger Pau Monné roger.pau@citrix.com
Reviewed-by: Jan Beulich jbeulich@suse.com
Reviewed-by: Juergen Gross jgross@suse.com
Signed-off-by: Juergen Gross jgross@suse.com
Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/net/xen-netfront.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index a7ebf17f75a3..881bc68e5402 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -273,7 +273,8 @@ static struct sk_buff *xennet_alloc_one_rx_buffer(struct netfront_queue *queue) if (unlikely(!skb)) return NULL;
- page = page_pool_dev_alloc_pages(queue->page_pool); + page = page_pool_alloc_pages(queue->page_pool, + GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO); if (unlikely(!page)) { kfree_skb(skb); return NULL;
From: Roger Pau Monne roger.pau@citrix.com
stable inclusion from stable-v5.10.129 commit 4923217af5742a796821272ee03f8d6de15c0cca category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GLZZ CVE: CVE-2022-33741
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
commit 4491001c2e0fa69efbb748c96ec96b100a5cdb7e upstream.
Bounce all data on the skbs to be transmitted into zeroed pages if the backend is untrusted. This avoids leaking data present in the pages shared with the backend but not part of the skb fragments. This requires introducing a new helper to allocate skbs with a size that is a multiple of XEN_PAGE_SIZE, so we don't leak contiguous data on the granted pages.
Whether the backend is to be trusted can be controlled through a module parameter, or via the xenstore frontend path written by the toolstack when adding the device.
This is CVE-2022-33741, part of XSA-403.
Signed-off-by: Roger Pau Monné roger.pau@citrix.com Reviewed-by: Juergen Gross jgross@suse.com Signed-off-by: Juergen Gross jgross@suse.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/xen-netfront.c | 49 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-)
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 881bc68e5402..569f3c8e7b75 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -66,6 +66,10 @@ module_param_named(max_queues, xennet_max_queues, uint, 0644); MODULE_PARM_DESC(max_queues, "Maximum number of queues per virtual interface");
+static bool __read_mostly xennet_trusted = true; +module_param_named(trusted, xennet_trusted, bool, 0644); +MODULE_PARM_DESC(trusted, "Is the backend trusted"); + #define XENNET_TIMEOUT (5 * HZ)
static const struct ethtool_ops xennet_ethtool_ops; @@ -175,6 +179,9 @@ struct netfront_info { /* Is device behaving sane? */ bool broken;
+ /* Should skbs be bounced into a zeroed buffer? */ + bool bounce; + atomic_t rx_gso_checksum_fixup; };
@@ -670,6 +677,33 @@ static int xennet_xdp_xmit(struct net_device *dev, int n, return n - drops; }
+struct sk_buff *bounce_skb(const struct sk_buff *skb) +{ + unsigned int headerlen = skb_headroom(skb); + /* Align size to allocate full pages and avoid contiguous data leaks */ + unsigned int size = ALIGN(skb_end_offset(skb) + skb->data_len, + XEN_PAGE_SIZE); + struct sk_buff *n = alloc_skb(size, GFP_ATOMIC | __GFP_ZERO); + + if (!n) + return NULL; + + if (!IS_ALIGNED((uintptr_t)n->head, XEN_PAGE_SIZE)) { + WARN_ONCE(1, "misaligned skb allocated\n"); + kfree_skb(n); + return NULL; + } + + /* Set the data pointer */ + skb_reserve(n, headerlen); + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); + + skb_copy_header(n, skb); + return n; +}
#define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1)
@@ -723,9 +757,13 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev
/* The first req should be at least ETH_HLEN size or the packet will be * dropped by netback. + * + * If the backend is not trusted bounce all data to zeroed pages to + * avoid exposing contiguous data on the granted page not belonging to + * the skb. */ - if (unlikely(PAGE_SIZE - offset < ETH_HLEN)) { - nskb = skb_copy(skb, GFP_ATOMIC); + if (np->bounce || unlikely(PAGE_SIZE - offset < ETH_HLEN)) { + nskb = bounce_skb(skb); if (!nskb) goto drop; dev_consume_skb_any(skb); @@ -2251,6 +2289,10 @@ static int talk_to_netback(struct xenbus_device *dev,
info->netdev->irq = 0;
+ /* Check if backend is trusted. */ + info->bounce = !xennet_trusted || + !xenbus_read_unsigned(dev->nodename, "trusted", 1); + /* Check if backend supports multiple queues */ max_queues = xenbus_read_unsigned(info->xbdev->otherend, "multi-queue-max-queues", 1); @@ -2417,6 +2459,9 @@ static int xennet_connect(struct net_device *dev) return err; if (np->netback_has_xdp_headroom) pr_info("backend supports XDP headroom\n"); + if (np->bounce) + dev_info(&np->xbdev->dev, + "bouncing transmitted data to zeroed pages\n");
/* talk_to_netback() sets the correct number of queues */ num_queues = dev->real_num_tx_queues;
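The interesting detail in bounce_skb() is the size rounding: the linear buffer is rounded up to a whole number of XEN_PAGE_SIZE pages and allocated with __GFP_ZERO, so whatever tail of the last granted page the skb does not use is guaranteed to be zero padding instead of adjacent heap data. A worked sketch of just that arithmetic, assuming 4 KiB Xen pages (the values in the comment are made-up examples):

#include <linux/kernel.h>       /* ALIGN() */

#define XEN_PAGE_SIZE 4096      /* assumption: 4 KiB Xen pages */

/* Sketch of the size computation used by bounce_skb(). */
static unsigned int bounce_size(unsigned int end_offset,
                                unsigned int data_len)
{
        return ALIGN(end_offset + data_len, XEN_PAGE_SIZE);
}

/*
 * e.g. bounce_size(1000, 3000) == 4096 (one granted page) and
 * bounce_size(1000, 3200) == 8192 (two granted pages); everything
 * past skb->len in the bounce buffer is __GFP_ZERO padding.
 */

The IS_ALIGNED() check on n->head matters for the same reason: if the allocator ever handed back a buffer that did not start on a page boundary, the skb could straddle an extra page and the rounding above would no longer bound what the grant exposes, so the copy is refused instead.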
From: Roger Pau Monne roger.pau@citrix.com
stable inclusion from stable-v5.10.129 commit cbbd2d2531539212ff090aecbea9877c996e6ce6 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5GM0S CVE: CVE-2022-33742
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
commit 2400617da7eebf9167d71a46122828bc479d64c9 upstream.
Split the current bounce buffering logic used with persistent grants into its own option, and allow enabling it independently of persistent grants. This allows reusing the same code paths to perform the bounce buffering required to avoid leaking contiguous data in shared pages that are not part of the request fragments.
Whether the backend is to be trusted can be controlled through a module parameter, or via the xenstore frontend path written by the toolstack when adding the device.
This is CVE-2022-33742, part of XSA-403.
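As in netfront, the trust decision is a two-step check: a global module parameter that defaults to trusted, overridden per device by a "trusted" node (default 1) in the frontend's xenstore directory. A minimal sketch of that decision, with a made-up function name:

#include <linux/module.h>
#include <xen/xenbus.h>

static bool __read_mostly trusted = true;
module_param(trusted, bool, 0644);
MODULE_PARM_DESC(trusted, "Is the backend trusted");

/*
 * Sketch only: bounce data through zeroed pages unless both the
 * module parameter and the per-device xenstore "trusted" node
 * (which defaults to 1 when absent) mark the backend as trusted.
 */
static bool backend_needs_bounce(struct xenbus_device *dev)
{
        return !trusted ||
               !xenbus_read_unsigned(dev->nodename, "trusted", 1);
}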
Signed-off-by: Roger Pau Monné roger.pau@citrix.com Reviewed-by: Juergen Gross jgross@suse.com Signed-off-by: Juergen Gross jgross@suse.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/block/xen-blkfront.c | 49 +++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 15 deletions(-)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index fffb7c3118b1..abbb68b6d9bd 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -151,6 +151,10 @@ static unsigned int xen_blkif_max_ring_order; module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
+static bool __read_mostly xen_blkif_trusted = true; +module_param_named(trusted, xen_blkif_trusted, bool, 0644); +MODULE_PARM_DESC(trusted, "Is the backend trusted"); + #define BLK_RING_SIZE(info) \ __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages)
@@ -208,6 +212,7 @@ struct blkfront_info unsigned int feature_discard:1; unsigned int feature_secdiscard:1; unsigned int feature_persistent:1; + unsigned int bounce:1; unsigned int discard_granularity; unsigned int discard_alignment; /* Number of 4KB segments handled */ @@ -310,7 +315,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) if (!gnt_list_entry) goto out_of_memory;
- if (info->feature_persistent) { + if (info->bounce) { granted_page = alloc_page(GFP_NOIO | __GFP_ZERO); if (!granted_page) { kfree(gnt_list_entry); @@ -330,7 +335,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) list_for_each_entry_safe(gnt_list_entry, n, &rinfo->grants, node) { list_del(&gnt_list_entry->node); - if (info->feature_persistent) + if (info->bounce) __free_page(gnt_list_entry->page); kfree(gnt_list_entry); i--; @@ -376,7 +381,7 @@ static struct grant *get_grant(grant_ref_t *gref_head, /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); - if (info->feature_persistent) + if (info->bounce) grant_foreign_access(gnt_list_entry, info); else { /* Grant access to the GFN passed by the caller */ @@ -400,7 +405,7 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); - if (!info->feature_persistent) { + if (!info->bounce) { struct page *indirect_page;
/* Fetch a pre-allocated page to use for indirect grefs */ @@ -715,7 +720,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri .grant_idx = 0, .segments = NULL, .rinfo = rinfo, - .need_copy = rq_data_dir(req) && info->feature_persistent, + .need_copy = rq_data_dir(req) && info->bounce, };
/* @@ -1035,11 +1040,12 @@ static void xlvbd_flush(struct blkfront_info *info) { blk_queue_write_cache(info->rq, info->feature_flush ? true : false, info->feature_fua ? true : false); - pr_info("blkfront: %s: %s %s %s %s %s\n", + pr_info("blkfront: %s: %s %s %s %s %s %s %s\n", info->gd->disk_name, flush_info(info), "persistent grants:", info->feature_persistent ? "enabled;" : "disabled;", "indirect descriptors:", - info->max_indirect_segments ? "enabled;" : "disabled;"); + info->max_indirect_segments ? "enabled;" : "disabled;", + "bounce buffer:", info->bounce ? "enabled" : "disabled;"); }
static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -1273,7 +1279,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n;
- BUG_ON(info->feature_persistent); + BUG_ON(info->bounce); list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); @@ -1290,7 +1296,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) 0, 0UL); rinfo->persistent_gnts_c--; } - if (info->feature_persistent) + if (info->bounce) __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1311,7 +1317,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) for (j = 0; j < segs; j++) { persistent_gnt = rinfo->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); - if (info->feature_persistent) + if (info->bounce) __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1501,7 +1507,7 @@ static int blkif_completion(unsigned long *id, data.s = s; num_sg = s->num_sg;
- if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { + if (bret->operation == BLKIF_OP_READ && info->bounce) { for_each_sg(s->sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE);
@@ -1560,7 +1566,7 @@ static int blkif_completion(unsigned long *id, * Add the used indirect page back to the list of * available pages for indirect grefs. */ - if (!info->feature_persistent) { + if (!info->bounce) { indirect_page = s->indirect_grants[i]->page; list_add(&indirect_page->lru, &rinfo->indirect_pages); } @@ -1857,6 +1863,10 @@ static int talk_to_blkback(struct xenbus_device *dev, if (!info) return -ENODEV;
+ /* Check if backend is trusted. */ + info->bounce = !xen_blkif_trusted || + !xenbus_read_unsigned(dev->nodename, "trusted", 1); + max_page_order = xenbus_read_unsigned(info->xbdev->otherend, "max-ring-page-order", 0); ring_page_order = min(xen_blkif_max_ring_order, max_page_order); @@ -2283,10 +2293,10 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) if (err) goto out_of_memory;
- if (!info->feature_persistent && info->max_indirect_segments) { + if (!info->bounce && info->max_indirect_segments) { /* - * We are using indirect descriptors but not persistent - * grants, we need to allocate a set of pages that can be + * We are using indirect descriptors but don't have a bounce + * buffer, we need to allocate a set of pages that can be * used for mapping indirect grefs */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); @@ -2387,6 +2397,8 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) info->feature_persistent = !!xenbus_read_unsigned(info->xbdev->otherend, "feature-persistent", 0); + if (info->feature_persistent) + info->bounce = true;
indirect_segments = xenbus_read_unsigned(info->xbdev->otherend, "feature-max-indirect-segments", 0); @@ -2760,6 +2772,13 @@ static void blkfront_delay_work(struct work_struct *work) struct blkfront_info *info; bool need_schedule_work = false;
+ /* + * Note that when using bounce buffers but not persistent grants + * there's no need to run blkfront_delay_work because grants are + * revoked in blkif_completion or else an error is reported and the + * connection is closed. + */ + mutex_lock(&blkfront_mutex);
list_for_each_entry(info, &info_list, info_list) {