From: Zheng Liang <zhengliang6@huawei.com>

mainline inclusion
from mainline commit 2fc428f6b7ca80794cb9928c90d4de524366659f
category: bugfix
bugzilla: 185657 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
In bfq_pd_alloc(), bfqg_stats_init() initializes the bfqg stats. If blkg_rwstat_init() succeeds for bfqg_stats->bytes but then fails for bfqg_stats->ios, bfqg_stats_init() returns an error and the bfqg is freed. However, the bytes counter's blkg_rwstat->cpu_cnt is never deleted from the percpu_counters list, so a later traversal of that list hits a use-after-free.
We should use blkg_rwstat_exit() to clean up bfqg_stats->bytes in the above scenario.
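The hazard, in outline (an illustrative sketch, not the kernel source; it relies only on the fact that blkg_rwstat_init() registers per-CPU counters on the global percpu_counters list and blkg_rwstat_exit() unregisters them):

	/* Sketch of the buggy error path that this patch fixes. */
	static int stats_init_buggy(struct bfqg_stats *stats, gfp_t gfp)
	{
		if (blkg_rwstat_init(&stats->bytes, gfp) ||	/* bytes gets linked */
		    blkg_rwstat_init(&stats->ios, gfp))		/* suppose ios fails */
			return -ENOMEM;	/* BUG: bytes stays on the percpu_counters
					 * list; freeing the containing bfqg leaves
					 * a dangling node behind -> UAF */
		return 0;
	}

The fix below funnels every failure through a single error label that calls bfqg_stats_exit(), which tears down whatever was initialized so far.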
Fixes: fd41e60331b ("bfq-iosched: stop using blkg->stat_bytes and ->stat_ios")
Signed-off-by: Zheng Liang <zhengliang6@huawei.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20211018024225.1493938-1-zhengliang6@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 block/bfq-cgroup.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b791e2041e49..a6bcc779c912 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
 {
 	if (blkg_rwstat_init(&stats->bytes, gfp) ||
 	    blkg_rwstat_init(&stats->ios, gfp))
-		return -ENOMEM;
+		goto error;
 
 #ifdef CONFIG_BFQ_CGROUP_DEBUG
 	if (blkg_rwstat_init(&stats->merged, gfp) ||
@@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
 	    bfq_stat_init(&stats->dequeue, gfp) ||
 	    bfq_stat_init(&stats->group_wait_time, gfp) ||
 	    bfq_stat_init(&stats->idle_time, gfp) ||
-	    bfq_stat_init(&stats->empty_time, gfp)) {
-		bfqg_stats_exit(stats);
-		return -ENOMEM;
-	}
+	    bfq_stat_init(&stats->empty_time, gfp))
+		goto error;
 #endif
 
 	return 0;
+
+error:
+	bfqg_stats_exit(stats);
+	return -ENOMEM;
 }
 
 static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
From: yangerkun <yangerkun@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 182995 https://gitee.com/openeuler/kernel/issues/I4DDEL
---------------------------
Like ext4_ext_rm_leaf, we can ensure that there are enough credits before every call that will consume credits. As part of this fix we fold the functionality of ext4_access_path() into ext4_ext_shift_path_extents(). This change is needed as a preparation for the next bugfix patch.
Cc: stable@kernel.org
Link: https://lore.kernel.org/r/20210903062748.4118886-3-yangerkun@huawei.com
Signed-off-by: yangerkun <yangerkun@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 fs/ext4/extents.c | 49 +++++++++++++++--------------------------------
 1 file changed, 15 insertions(+), 34 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74b69d83c198..a16be67a1fe8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5014,36 +5014,6 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo);
 }
 
-/*
- * ext4_access_path:
- * Function to access the path buffer for marking it dirty.
- * It also checks if there are sufficient credits left in the journal handle
- * to update path.
- */
-static int
-ext4_access_path(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path)
-{
-	int credits, err;
-
-	if (!ext4_handle_valid(handle))
-		return 0;
-
-	/*
-	 * Check if need to extend journal credits
-	 * 3 for leaf, sb, and inode plus 2 (bmap and group
-	 * descriptor) for each block group; assume two block
-	 * groups
-	 */
-	credits = ext4_writepage_trans_blocks(inode);
-	err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
-	if (err < 0)
-		return err;
-
-	err = ext4_ext_get_access(handle, inode, path);
-	return err;
-}
-
 /*
  * ext4_ext_shift_path_extents:
  * Shift the extents of a path structure lying between path[depth].p_ext
@@ -5058,6 +5028,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 	int depth, err = 0;
 	struct ext4_extent *ex_start, *ex_last;
 	bool update = false;
+	int credits, restart_credits;
 	depth = path->p_depth;
 
 	while (depth >= 0) {
@@ -5067,13 +5038,23 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 			return -EFSCORRUPTED;
 
 		ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+		/* leaf + sb + inode */
+		credits = 3;
+		if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+			update = true;
+			/* extent tree + sb + inode */
+			credits = depth + 2;
+		}
 
-		err = ext4_access_path(handle, inode, path + depth);
+		restart_credits = ext4_writepage_trans_blocks(inode);
+		err = ext4_datasem_ensure_credits(handle, inode, credits,
+				restart_credits, 0);
 		if (err)
 			goto out;
 
-		if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
-			update = true;
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
 
 		while (ex_start <= ex_last) {
 			if (SHIFT == SHIFT_LEFT) {
@@ -5104,7 +5085,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 		}
 
 		/* Update index too */
-		err = ext4_access_path(handle, inode, path + depth);
+		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			goto out;
 
From: yangerkun <yangerkun@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 182995 https://gitee.com/openeuler/kernel/issues/I4DDEL
---------------------------
After we drop i_data_sem, we need to reload the ext4_ext_path structure, since the extent tree can change once i_data_sem is released.
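In outline, the fix below makes ext4_ext_shift_path_extents() return -EAGAIN once the journal handle had to be restarted (which drops i_data_sem), while the caller remembers its position in tmp and restarts the walk with a fresh path lookup (a sketch of the control flow in the diff, not complete code):

	ext4_lblk_t tmp = EXT_MAX_BLOCKS;
again:
	/* ... ext4_find_extent() gives us a fresh path for *iterator ... */
	tmp = *iterator;
	ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT);
	if (ret == -EAGAIN)	/* handle restarted, cached path is stale */
		goto again;	/* re-walk the tree, resuming from tmp */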
This addresses the BUG:
[52117.465187] ------------[ cut here ]------------
[52117.465686] kernel BUG at fs/ext4/extents.c:1756!
...
[52117.478306] Call Trace:
[52117.478565]  ext4_ext_shift_extents+0x3ee/0x710
[52117.479020]  ext4_fallocate+0x139c/0x1b40
[52117.479405]  ? __do_sys_newfstat+0x6b/0x80
[52117.479805]  vfs_fallocate+0x151/0x4b0
[52117.480177]  ksys_fallocate+0x4a/0xa0
[52117.480533]  __x64_sys_fallocate+0x22/0x30
[52117.480930]  do_syscall_64+0x35/0x80
[52117.481277]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[52117.481769] RIP: 0033:0x7fa062f855ca
Cc: stable@kernel.org
Link: https://lore.kernel.org/r/20210903062748.4118886-4-yangerkun@huawei.com
Signed-off-by: yangerkun <yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 fs/ext4/extents.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a16be67a1fe8..f1548e52496c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5049,8 +5049,11 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
 		restart_credits = ext4_writepage_trans_blocks(inode);
 		err = ext4_datasem_ensure_credits(handle, inode, credits,
 				restart_credits, 0);
-		if (err)
+		if (err) {
+			if (err > 0)
+				err = -EAGAIN;
 			goto out;
+		}
 
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
@@ -5124,6 +5127,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	int ret = 0, depth;
 	struct ext4_extent *extent;
 	ext4_lblk_t stop, *iterator, ex_start, ex_end;
+	ext4_lblk_t tmp = EXT_MAX_BLOCKS;
 
 	/* Let path point to the last extent */
 	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
@@ -5177,11 +5181,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	 * till we reach stop. In case of right shift, iterator points to stop
 	 * and it is decreased till we reach start.
 	 */
+again:
 	if (SHIFT == SHIFT_LEFT)
 		iterator = &start;
 	else
 		iterator = &stop;
 
+	if (tmp != EXT_MAX_BLOCKS)
+		*iterator = tmp;
+
 	/*
 	 * Its safe to start updating extents. Start and stop are unsigned, so
 	 * in case of right shift if extent with 0 block is reached, iterator
@@ -5210,6 +5218,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 			}
 		}
 
+		tmp = *iterator;
		if (SHIFT == SHIFT_LEFT) {
 			extent = EXT_LAST_EXTENT(path[depth].p_hdr);
 			*iterator = le32_to_cpu(extent->ee_block) +
@@ -5228,6 +5237,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 		}
 		ret = ext4_ext_shift_path_extents(path, shift, inode,
 				handle, SHIFT);
+		/* iterator can be NULL which means we should break */
+		if (ret == -EAGAIN)
+			goto again;
 		if (ret)
 			break;
 	}
From: yangerkun <yangerkun@huawei.com>

hulk inclusion
category: performance
bugzilla: 174005 https://gitee.com/openeuler/kernel/issues/I4DDEL
---------------------------
This reverts commit b2d85dfb8c1de87700afae78df99715d3a0788a5, a local patch that tried to avoid the needless RCU wait during loop device setup. Mainline now has its own solution for this, so revert the local patch.
Signed-off-by: yangerkun <yangerkun@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 block/elevator.c       | 14 ++++----------
 drivers/block/loop.c   |  5 -----
 include/linux/blkdev.h |  2 --
 3 files changed, 4 insertions(+), 17 deletions(-)
diff --git a/block/elevator.c b/block/elevator.c
index 0753255a8164..2a525863d4e9 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -665,7 +665,6 @@ void elevator_init_mq(struct request_queue *q)
 {
 	struct elevator_type *e;
 	int err;
-	bool no_init_io;
 
 	if (!elv_support_iosched(q))
 		return;
@@ -682,18 +681,13 @@ void elevator_init_mq(struct request_queue *q)
 	if (!e)
 		return;
 
-	no_init_io = blk_queue_no_init_io(q);
-	if (!no_init_io) {
-		blk_mq_freeze_queue(q);
-		blk_mq_quiesce_queue(q);
-	}
+	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
 
 	err = blk_mq_init_sched(q, e);
 
-	if (!no_init_io) {
-		blk_mq_unquiesce_queue(q);
-		blk_mq_unfreeze_queue(q);
-	}
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q);
 
 	if (err) {
 		pr_warn("\"%s\" elevator initialization failed, "
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 24f4192a2ffb..5dd8bd480e29 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2176,11 +2176,6 @@ static int loop_add(struct loop_device **l, int i)
 	disk->private_data	= lo;
 	disk->queue		= lo->lo_queue;
 	sprintf(disk->disk_name, "loop%d", i);
-	/*
-	 * There won't be io before add_disk, QUEUE_FLAG_NO_INIT_IO can help
-	 * to save time while elevator_init_mq.
-	 */
-	blk_queue_flag_set(QUEUE_FLAG_NO_INIT_IO, lo->lo_queue);
 	add_disk(disk);
 	*l = lo;
 	return lo->lo_number;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8ea21d8eb1bf..7517381b5e15 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -622,7 +622,6 @@ struct request_queue {
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT	29	/* device supports NOWAIT */
-#define QUEUE_FLAG_NO_INIT_IO	30	/* no IO can happen while elevator_init_mq */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_SAME_COMP) |		\
@@ -669,7 +668,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_fua(q)	test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
 #define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
 #define blk_queue_nowait(q)	test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
-#define blk_queue_no_init_io(q)	test_bit(QUEUE_FLAG_NO_INIT_IO, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
From: Bart Van Assche <bvanassche@acm.org>

mainline inclusion
from mainline-5.15-rc1
commit 90b7198001f23ea37d3b46dc631bdaa2357a20b1
category: performance
bugzilla: 174005 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
elevator_get_default() uses the following algorithm to select an I/O scheduler from inside add_disk():
 - In case of a single hardware queue, or if hardware queues are shared across multiple request queues (BLK_MQ_F_TAG_HCTX_SHARED), use mq-deadline.
 - Otherwise, use 'none'.
This is a good choice for most but not for all block drivers. Make it possible to override the selection of mq-deadline with a new flag, namely BLK_MQ_F_NO_SCHED_BY_DEFAULT.
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Martijn Coenen <maco@android.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20210805174200.3250718-2-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Conflicts:
	block/elevator.c

Signed-off-by: yangerkun <yangerkun@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 block/elevator.c       | 3 +++
 include/linux/blk-mq.h | 6 ++++++
 2 files changed, 9 insertions(+)
diff --git a/block/elevator.c b/block/elevator.c
index 2a525863d4e9..4ce6b22813a1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -624,6 +624,9 @@ static inline bool elv_support_iosched(struct request_queue *q)
  */
 static struct elevator_type *elevator_get_default(struct request_queue *q)
 {
+	if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
+		return NULL;
+
 	if (q->nr_hw_queues != 1)
 		return NULL;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 92bbc9a72355..eee2c8a16601 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -399,7 +399,13 @@ enum {
 	BLK_MQ_F_STACKING	= 1 << 2,
 	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
+	/* Do not allow an I/O scheduler to be configured. */
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
+	/*
+	 * Select 'none' during queue registration in case of a single hwq
+	 * or shared hwqs instead of 'mq-deadline'.
+	 */
+	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 7,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
 
From: Bart Van Assche <bvanassche@acm.org>

mainline inclusion
from mainline-5.15-rc1
commit 2112f5c1330a671fa852051d85cb9eadc05d7eb7
category: performance
bugzilla: 174005 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
We noticed that the user interface of Android devices becomes very slow under memory pressure. This is because Android uses the zram driver on top of the loop driver for swapping; under memory pressure the swap code alternates between reads and writes quickly; mq-deadline is the default scheduler for loop devices; and mq-deadline delays writes by five seconds for such a workload with default settings. Fix this by making the kernel select I/O scheduler 'none' from inside add_disk() for loop devices. This default can be overridden at any time from user space, e.g. via a udev rule. Compared with changing the I/O scheduler from user space from 'mq-deadline' to 'none', this approach has the advantage that synchronize_rcu() does not get called.
This patch changes the default I/O scheduler for loop devices from 'mq-deadline' into 'none'.
Additionally, this patch reduces the Android boot time on my test setup by 0.5 seconds compared to configuring the loop I/O scheduler from user space.
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Martijn Coenen <maco@android.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20210805174200.3250718-3-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: yangerkun <yangerkun@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 drivers/block/loop.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5dd8bd480e29..84f3191f4566 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2117,7 +2117,8 @@ static int loop_add(struct loop_device **l, int i)
 	lo->tag_set.queue_depth = 128;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
+	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING |
+		BLK_MQ_F_NO_SCHED_BY_DEFAULT;
 	lo->tag_set.driver_data = lo;
 
 	err = blk_mq_alloc_tag_set(&lo->tag_set);
From: Theodore Ts'o <tytso@mit.edu>

mainline inclusion
from mainline-5.15-rc1
commit 308c57ccf4318236be75dfa251c84713e694457b
category: bugfix
bugzilla: 167373 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
If the underlying storage device is using thin-provisioning, it's possible for a zeroout operation to return ENOSPC.
Commit df22291ff0fd ("ext4: Retry block allocation if we have free blocks left") added logic to retry block allocation since we might get free block after we commit a transaction. But the ENOSPC from thin-provisioning will confuse ext4, and lead to an infinite loop.
Since using zeroout instead of splitting the extent node is an optimization, if it fails, we might as well fall back to splitting the extent node.
Reported-by: yangerkun <yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: yangerkun <yangerkun@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 fs/ext4/extents.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f1548e52496c..398df993ccd8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3612,7 +3612,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				split_map.m_len - ee_block);
 			err = ext4_ext_zeroout(inode, &zero_ex1);
 			if (err)
-				goto out;
+				goto fallback;
 			split_map.m_len = allocated;
 		}
 		if (split_map.m_lblk - ee_block + split_map.m_len <
@@ -3626,7 +3626,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					      ext4_ext_pblock(ex));
 			err = ext4_ext_zeroout(inode, &zero_ex2);
 			if (err)
-				goto out;
+				goto fallback;
 		}
 
 		split_map.m_len += split_map.m_lblk - ee_block;
@@ -3635,6 +3635,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		}
 	}
 
+fallback:
 	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
 				flags);
 	if (err > 0)
From: yangerkun <yangerkun@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 181448 https://gitee.com/openeuler/kernel/issues/I4DDEL
---------------------------
Example of triggering a use-after-free in an overlayfs-on-ext4 setup:
aio_read
  ovl_read_iter
    vfs_iter_read
      ext4_file_read_iter
        ext4_dio_read_iter
          iomap_dio_rw -> -EIOCBQUEUED
          /*
           * Here IO is completed in a separate thread,
           * ovl_aio_cleanup_handler() frees aio_req which has iocb embedded
           */
          file_accessed(iocb->ki_filp); /**BOOM**/
Fix by introducing a refcount in ovl_aio_req similarly to aio_kiocb. This guarantees that iocb is only freed after vfs_read/write_iter() returns on underlying fs.
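The resulting ownership scheme, in outline (a sketch of the submission side shown in the diff below):

	refcount_set(&aio_req->ref, 2);		/* submitter + completion */
	ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
	ovl_aio_put(aio_req);			/* drop the submitter's reference */
	/*
	 * ovl_aio_cleanup_handler() drops the completion reference; aio_req
	 * (and the embedded iocb) is freed only once both are gone, so the
	 * completion thread can no longer free it under the submitter.
	 */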
Fixes: 2406a307ac7d ("ovl: implement async IO routines")
Signed-off-by: yangerkun <yangerkun@huawei.com>
Link: https://lore.kernel.org/r/20210930032228.3199690-3-yangerkun@huawei.com/
Cc: <stable@vger.kernel.org> # v5.6
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 fs/overlayfs/file.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index f7135777cb4e..d1ae643a999a 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -17,6 +17,7 @@
 
 struct ovl_aio_req {
 	struct kiocb iocb;
+	refcount_t ref;
 	struct kiocb *orig_iocb;
 	struct fd fd;
 };
@@ -257,6 +258,14 @@ static rwf_t ovl_iocb_to_rwf(int ifl)
 	return flags;
 }
 
+static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
+{
+	if (refcount_dec_and_test(&aio_req->ref)) {
+		fdput(aio_req->fd);
+		kmem_cache_free(ovl_aio_request_cachep, aio_req);
+	}
+}
+
 static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
 {
 	struct kiocb *iocb = &aio_req->iocb;
@@ -273,8 +282,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
 	}
 
 	orig_iocb->ki_pos = iocb->ki_pos;
-	fdput(aio_req->fd);
-	kmem_cache_free(ovl_aio_request_cachep, aio_req);
+	ovl_aio_put(aio_req);
 }
 
 static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
@@ -324,7 +332,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		aio_req->orig_iocb = iocb;
 		kiocb_clone(&aio_req->iocb, iocb, real.file);
 		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+		refcount_set(&aio_req->ref, 2);
 		ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
+		ovl_aio_put(aio_req);
 		if (ret != -EIOCBQUEUED)
 			ovl_aio_cleanup_handler(aio_req);
 	}
@@ -395,7 +405,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 		kiocb_clone(&aio_req->iocb, iocb, real.file);
 		aio_req->iocb.ki_flags = ifl;
 		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+		refcount_set(&aio_req->ref, 2);
 		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
+		ovl_aio_put(aio_req);
 		if (ret != -EIOCBQUEUED)
 			ovl_aio_cleanup_handler(aio_req);
 	}
From: Jakub Kicinski <kuba@kernel.org>

mainline inclusion
from mainline-v5.11-rc4
commit c269a24ce057abfc31130960e96ab197ef6ab196
category: bugfix
bugzilla: 181892 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
There are two flavors of handling netdev registration:
 - ones called without holding rtnl_lock: register_netdev() and unregister_netdev(); and
 - those called with rtnl_lock held: register_netdevice() and unregister_netdevice().
While the semantics of the former are pretty clear, the same can't be said about the latter. The netdev_todo mechanism is utilized to perform some of the device unregistering tasks and it hooks into rtnl_unlock() so the locked variants can't actually finish the work. In general free_netdev() does not mix well with locked calls. Most drivers operating under rtnl_lock set dev->needs_free_netdev to true and expect core to make the free_netdev() call some time later.
The part where this becomes most problematic is error paths. There is no way to unwind the state cleanly after a call to register_netdevice(), since unreg can't be performed fully without dropping locks.
Make free_netdev() more lenient, and defer the freeing if device is being unregistered. This allows error paths to simply call free_netdev() both after register_netdevice() failed, and after a call to unregister_netdevice() but before dropping rtnl_lock.
Simplify the error paths which are currently doing gymnastics around free_netdev() handling.
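With this change a locked error path can be written uniformly; a sketch of the shape the rtnetlink code below takes:

	rtnl_lock();
	err = register_netdevice(dev);
	if (err < 0)
		free_netdev(dev);	/* defers itself via needs_free_netdev
					 * if the unwind is still in progress */
	rtnl_unlock();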
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Xu Jia <xujia39@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 net/8021q/vlan.c     |  4 +---
 net/core/dev.c       | 11 +++++++++++
 net/core/rtnetlink.c | 23 ++++++-----------------
 3 files changed, 18 insertions(+), 20 deletions(-)
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 15bbfaf943fd..8b644113715e 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -284,9 +284,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 	return 0;
 
 out_free_newdev:
-	if (new_dev->reg_state == NETREG_UNINITIALIZED ||
-	    new_dev->reg_state == NETREG_UNREGISTERED)
-		free_netdev(new_dev);
+	free_netdev(new_dev);
 	return err;
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 93d54e30c159..736972a8f981 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10655,6 +10655,17 @@ void free_netdev(struct net_device *dev)
 	struct napi_struct *p, *n;
 
 	might_sleep();
+
+	/* When called immediately after register_netdevice() failed the unwind
+	 * handling may still be dismantling the device. Handle that case by
+	 * deferring the free.
+	 */
+	if (dev->reg_state == NETREG_UNREGISTERING) {
+		ASSERT_RTNL();
+		dev->needs_free_netdev = true;
+		return;
+	}
+
 	netif_free_tx_queues(dev);
 	netif_free_rx_queues(dev);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 27ffa83ffeb3..e08b506163f5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3438,26 +3438,15 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	dev->ifindex = ifm->ifi_index;
 
-	if (ops->newlink) {
+	if (ops->newlink)
 		err = ops->newlink(link_net ? : net, dev, tb, data, extack);
-		/* Drivers should call free_netdev() in ->destructor
-		 * and unregister it on failure after registration
-		 * so that device could be finally freed in rtnl_unlock.
-		 */
-		if (err < 0) {
-			/* If device is not registered at all, free it now */
-			if (dev->reg_state == NETREG_UNINITIALIZED ||
-			    dev->reg_state == NETREG_UNREGISTERED)
-				free_netdev(dev);
-			goto out;
-		}
-	} else {
+	else
 		err = register_netdevice(dev);
-		if (err < 0) {
-			free_netdev(dev);
-			goto out;
-		}
+	if (err < 0) {
+		free_netdev(dev);
+		goto out;
 	}
+
 	err = rtnl_configure_link(dev, ifm);
 	if (err < 0)
 		goto out_unregister;
From: Andrii Nakryiko <andrii@kernel.org>

mainline inclusion
from mainline-v5.15-rc1
commit c7603cfa04e7c3a435b31d065f7cbdc829428f6e
category: bugfix
bugzilla: 182996 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed the problem with cgroup-local storage use in BPF by pre-allocating per-CPU array of 8 cgroup storage pointers to accommodate possible BPF program preemptions and nested executions.
While this seems to work good in practice, it introduces new and unnecessary failure mode in which not all BPF programs might be executed if we fail to find an unused slot for cgroup storage, however unlikely it is. It might also not be so unlikely when/if we allow sleepable cgroup BPF programs in the future.
Further, the way that cgroup storage is implemented as an ambiently-available property during entire BPF program execution is a convenient way to pass extra information to a BPF program and its helpers without requiring user code to pass around extra arguments explicitly. So it would be good to have a generic solution that can allow implementing this without arbitrary restrictions. Ideally, such a solution would work for both preemptable and sleepable BPF programs in exactly the same way.
This patch introduces such solution, bpf_run_ctx. It adds one pointer field (bpf_ctx) to task_struct. This field is maintained by BPF_PROG_RUN family of macros in such a way that it always stays valid throughout BPF program execution. BPF program preemption is handled by remembering previous current->bpf_ctx value locally while executing nested BPF program and restoring old value after nested BPF program finishes. This is handled by two helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are supposed to be used before and after BPF program runs, respectively.
Restoring old value of the pointer handles preemption, while bpf_run_ctx pointer being a property of current task_struct naturally solves this problem for sleepable BPF programs by "following" BPF program execution as it is scheduled in and out of CPU. It would even allow CPU migration of BPF programs, even though it's not currently allowed by BPF infra.
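The resulting discipline around one program invocation looks like this (a sketch using the names introduced by this patch; item stands for the bpf_prog_array_item being run):

	struct bpf_cg_run_ctx run_ctx;
	struct bpf_run_ctx *old_run_ctx;

	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
	run_ctx.prog_item = item;	/* helpers reach this through
					 * container_of(current->bpf_ctx, ...) */
	ret = BPF_PROG_RUN(prog, ctx);
	bpf_reset_run_ctx(old_run_ctx);	/* restore any outer (nested) context */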
This patch cleans up cgroup local storage handling as a first application. The design itself is generic, though, with bpf_run_ctx being an empty struct that is supposed to be embedded into a specific struct for a given BPF program type (bpf_cg_run_ctx in this case). Follow up patches are planned that will expand this mechanism for other uses within tracing BPF programs.
To verify that this change doesn't revert the fix to the original cgroup storage issue, I ran the same repro as in the original report ([0]) and didn't get any problems. Replacing bpf_reset_run_ctx(old_run_ctx) with bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so repro does work).
[0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/
Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org

Conflicts:
	include/linux/bpf.h
	include/linux/sched.h
	kernel/fork.c
	net/bpf/test_run.c

Signed-off-by: Pu Lehui <pulehui@huawei.com>
Reviewed-by: Kuohai Xu <xukuohai@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 include/linux/bpf-cgroup.h | 54 ---------------------
 include/linux/bpf.h        | 75 ++++++++++++++----------------
 include/linux/sched.h      |  5 +++
 kernel/bpf/helpers.c       | 16 +++----
 kernel/bpf/local_storage.c |  3 --
 kernel/fork.c              |  3 ++
 net/bpf/test_run.c         | 22 +++++------
 7 files changed, 56 insertions(+), 122 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 91b966978541..5120ca319ff4 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -27,19 +27,6 @@ struct task_struct;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
-#define BPF_CGROUP_STORAGE_NEST_MAX	8
-
-struct bpf_cgroup_storage_info {
-	struct task_struct *task;
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
-};
-
-/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks
- * to use bpf cgroup storage simultaneously.
- */
-DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
-		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
-
 #define for_each_cgroup_storage_type(stype) \
 	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
 
@@ -167,44 +154,6 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	return BPF_CGROUP_STORAGE_SHARED;
 }
 
-static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage
-					 *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
-{
-	enum bpf_cgroup_storage_type stype;
-	int i, err = 0;
-
-	preempt_disable();
-	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
-		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL))
-			continue;
-
-		this_cpu_write(bpf_cgroup_storage_info[i].task, current);
-		for_each_cgroup_storage_type(stype)
-			this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
-				       storage[stype]);
-		goto out;
-	}
-	err = -EBUSY;
-	WARN_ON_ONCE(1);
-
-out:
-	preempt_enable();
-	return err;
-}
-
-static inline void bpf_cgroup_storage_unset(void)
-{
-	int i;
-
-	for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
-		if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
-			continue;
-
-		this_cpu_write(bpf_cgroup_storage_info[i].task, NULL);
-		return;
-	}
-}
-
 struct bpf_cgroup_storage *
 cgroup_storage_lookup(struct bpf_cgroup_storage_map *map,
 		      void *key, bool locked);
@@ -450,9 +399,6 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
-static inline int bpf_cgroup_storage_set(
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; }
-static inline void bpf_cgroup_storage_unset(void) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
 					    struct bpf_map *map) { return 0; }
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f931fffa317b..fc83187082ed 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1078,43 +1078,20 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *include_prog,
 			struct bpf_prog_array **new_array);
 
-/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY,
- * if bpf_cgroup_storage_set() failed, the rest of programs
- * will not execute. This should be a really rare scenario
- * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of
- * preemptions all between bpf_cgroup_storage_set() and
- * bpf_cgroup_storage_unset() on the same cpu.
- */
-#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags)		\
-	({								\
-		struct bpf_prog_array_item *_item;			\
-		struct bpf_prog *_prog;					\
-		struct bpf_prog_array *_array;				\
-		u32 _ret = 1;						\
-		u32 func_ret;						\
-		migrate_disable();					\
-		rcu_read_lock();					\
-		_array = rcu_dereference(array);			\
-		_item = &_array->items[0];				\
-		while ((_prog = READ_ONCE(_item->prog))) {		\
-			if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \
-				break;					\
-			func_ret = func(_prog, ctx);			\
-			_ret &= (func_ret & 1);				\
-			*(ret_flags) |= (func_ret >> 1);		\
-			bpf_cgroup_storage_unset();			\
-			_item++;					\
-		}							\
-		rcu_read_unlock();					\
-		migrate_enable();					\
-		_ret;							\
-	})
+struct bpf_run_ctx {};
+
+struct bpf_cg_run_ctx {
+	struct bpf_run_ctx run_ctx;
+	struct bpf_prog_array_item *prog_item;
+};
 
 #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage)	\
 	({								\
 		struct bpf_prog_array_item *_item;			\
 		struct bpf_prog *_prog;					\
 		struct bpf_prog_array *_array;				\
+		struct bpf_run_ctx *old_run_ctx;			\
+		struct bpf_cg_run_ctx run_ctx;				\
 		u32 _ret = 1;						\
 		migrate_disable();					\
 		rcu_read_lock();					\
@@ -1122,17 +1099,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 		if (unlikely(check_non_null && !_array))\
 			goto _out;			\
 		_item = &_array->items[0];		\
-		while ((_prog = READ_ONCE(_item->prog))) {		\
-			if (!set_cg_storage) {			\
-				_ret &= func(_prog, ctx);	\
-			} else {				\
-				if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
-					break;			\
-				_ret &= func(_prog, ctx);	\
-				bpf_cgroup_storage_unset();	\
-			}					\
+		old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\
+		while ((_prog = READ_ONCE(_item->prog))) {	\
+			run_ctx.prog_item = _item;	\
+			_ret &= func(_prog, ctx);	\
 			_item++;			\
 		}					\
+		bpf_reset_run_ctx(old_run_ctx);		\
 _out:							\
 		rcu_read_unlock();			\
 		migrate_enable();			\
@@ -1166,6 +1139,8 @@ _out:							\
 		struct bpf_prog_array_item *_item;	\
 		struct bpf_prog *_prog;			\
 		struct bpf_prog_array *_array;		\
+		struct bpf_run_ctx *old_run_ctx;	\
+		struct bpf_cg_run_ctx run_ctx;		\
 		u32 ret;				\
 		u32 _ret = 1;				\
 		u32 _cn = 0;				\
@@ -1173,15 +1148,15 @@ _out:							\
 		rcu_read_lock();			\
 		_array = rcu_dereference(array);	\
 		_item = &_array->items[0];		\
+		old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);	\
 		while ((_prog = READ_ONCE(_item->prog))) {	\
-			if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
-				break;			\
+			run_ctx.prog_item = _item;	\
 			ret = func(_prog, ctx);		\
-			bpf_cgroup_storage_unset();	\
 			_ret &= (ret & 1);		\
 			_cn |= (ret & 2);		\
 			_item++;			\
 		}					\
+		bpf_reset_run_ctx(old_run_ctx);		\
 		rcu_read_unlock();			\
 		migrate_enable();			\
 		if (_ret)				\
@@ -1231,6 +1206,20 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
+static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
+{
+	struct bpf_run_ctx *old_ctx;
+
+	old_ctx = current->bpf_ctx;
+	current->bpf_ctx = new_ctx;
+	return old_ctx;
+}
+
+static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
+{
+	current->bpf_ctx = old_ctx;
+}
+
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2fd7b89c50ef..9e9c0bd4197d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -41,6 +41,7 @@ struct audit_context;
 struct backing_dev_info;
 struct bio_list;
 struct blk_plug;
+struct bpf_run_ctx;
 struct capture_control;
 struct cfs_rq;
 struct fs_struct;
@@ -1341,6 +1342,10 @@ struct task_struct {
 	/* Used by LSM modules for access restriction: */
 	void				*security;
 #endif
+#ifdef CONFIG_BPF_SYSCALL
+	/* Used for BPF run context */
+	struct bpf_run_ctx		*bpf_ctx;
+#endif
 
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 	unsigned long			lowest_stack;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 0efe7c7bfe5e..bb4350de9f11 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -372,8 +372,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
 };
 
 #ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
-		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
@@ -382,17 +380,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 * verifier checks that its value is correct.
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
-	struct bpf_cgroup_storage *storage = NULL;
+	struct bpf_cgroup_storage *storage;
+	struct bpf_cg_run_ctx *ctx;
 	void *ptr;
-	int i;
 
-	for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
-		if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
-			continue;
-
-		storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
-		break;
-	}
+	/* get current cgroup storage from BPF run context */
+	ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+	storage = ctx->prog_item->cgroup_storage[stype];
 
 	if (stype == BPF_CGROUP_STORAGE_SHARED)
 		ptr = &READ_ONCE(storage->buf)->data[0];
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index b139247d2dd3..49ca8771d470 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -11,9 +11,6 @@
 
 #ifdef CONFIG_CGROUP_BPF
 
-DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
-	       bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
-
 #include "../cgroup/cgroup-internal.h"
 
 #define LOCAL_STORAGE_CREATE_FLAG_MASK \
diff --git a/kernel/fork.c b/kernel/fork.c
index 1abdc1f6eeca..1c95cd59a635 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2099,6 +2099,9 @@ static __latent_entropy struct task_struct *copy_process(
 	p->sequential_io	= 0;
 	p->sequential_io_avg	= 0;
 #endif
+#ifdef CONFIG_BPF_SYSCALL
+	p->bpf_ctx = NULL;
+#endif
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index eb684f31fd69..99712d35e535 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -19,18 +19,20 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 			u32 *retval, u32 *time, bool xdp)
 {
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL };
+	struct bpf_prog_array_item item = {.prog = prog};
+	struct bpf_run_ctx *old_ctx;
+	struct bpf_cg_run_ctx run_ctx;
 	enum bpf_cgroup_storage_type stype;
 	u64 time_start, time_spent = 0;
 	int ret = 0;
 	u32 i;
 
 	for_each_cgroup_storage_type(stype) {
-		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
-		if (IS_ERR(storage[stype])) {
-			storage[stype] = NULL;
+		item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(item.cgroup_storage[stype])) {
+			item.cgroup_storage[stype] = NULL;
 			for_each_cgroup_storage_type(stype)
-				bpf_cgroup_storage_free(storage[stype]);
+				bpf_cgroup_storage_free(item.cgroup_storage[stype]);
 			return -ENOMEM;
 		}
 	}
@@ -41,18 +43,15 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 	rcu_read_lock();
 	migrate_disable();
 	time_start = ktime_get_ns();
+	old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	for (i = 0; i < repeat; i++) {
-		ret = bpf_cgroup_storage_set(storage);
-		if (ret)
-			break;
+		run_ctx.prog_item = &item;
 
 		if (xdp)
 			*retval = bpf_prog_run_xdp(prog, ctx);
 		else
 			*retval = BPF_PROG_RUN(prog, ctx);
 
-		bpf_cgroup_storage_unset();
-
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
@@ -70,6 +69,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 			time_start = ktime_get_ns();
 		}
 	}
+	bpf_reset_run_ctx(old_ctx);
 	time_spent += ktime_get_ns() - time_start;
 	migrate_enable();
 	rcu_read_unlock();
@@ -78,7 +78,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 	*time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
 
 	for_each_cgroup_storage_type(stype)
-		bpf_cgroup_storage_free(storage[stype]);
+		bpf_cgroup_storage_free(item.cgroup_storage[stype]);
 
 	return ret;
 }
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 182920 https://gitee.com/openeuler/kernel/issues/I4DDEL
---------------------------
When the user passes 0x100000 as the index, nbd ends up creating the sysfs dir "/sys/block/43:0":
nbd_dev_add
  disk->first_minor = index << part_shift
  -> default part_shift is 5, 0x100000 << 5 = 0x2000000
  device_add_disk
    blk_alloc_devt
      MKDEV(disk->major, disk->first_minor + part->partno)
      -> (0x2b << 20) | (0x2000000) = 0x2b00000
    register_disk
      device_add
        device_create_sys_dev_entry
          format_dev_t
            MAJOR(devt) -> 0x2b00000 >> 20 = 0x2b
            MINOR(devt) -> 0x2b00000 & 0xfffff = 0
          sysfs_create_link -> /sys/block/43:0
If nbd has already created a device with index 0, sysfs will complain about the duplicate creation.

On the other hand, a similar duplicate creation will happen if "index << part_shift" overflows to a value that is less than MINORMASK.

Thus, fix the problem by adding a sanity check for first_minor.
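For reference, the arithmetic from the trace above, spelled out with the kernel's dev_t encoding (MINORBITS is 20, so MINORMASK is 0xfffff; part_shift is the default 5):

	first_minor = 0x100000 << 5;		/* 0x2000000, already > MINORMASK */
	devt = MKDEV(NBD_MAJOR, first_minor);	/* (0x2b << 20) | 0x2000000 = 0x2b00000 */
	/* MAJOR(devt) = 0x2b00000 >> 20     = 0x2b (43) */
	/* MINOR(devt) = 0x2b00000 & 0xfffff = 0         */
	/* -> sysfs name "43:0", colliding with the entry for nbd0 */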
Fixes: b0d9111a2d53 ("nbd: use an idr to keep track of nbd devices")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 drivers/block/nbd.c | 11 +++++++++++
 1 file changed, 11 insertions(+)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index bb50af20b284..cc9770936c67 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1821,7 +1821,18 @@ static int nbd_dev_add(int index)
 	refcount_set(&nbd->refs, 1);
 	INIT_LIST_HEAD(&nbd->list);
 	disk->major = NBD_MAJOR;
+
+	/*
+	 * Too big index can cause duplicate creation of sysfs files/links,
+	 * because MKDEV() expect that the max first minor is MINORMASK, or
+	 * index << part_shift can overflow.
+	 */
 	disk->first_minor = index << part_shift;
+	if (disk->first_minor < index || disk->first_minor > MINORMASK) {
+		err = -EINVAL;
+		goto out_free_tags;
+	}
+
 	disk->fops = &nbd_fops;
 	disk->private_data = nbd;
 	sprintf(disk->disk_name, "nbd%d", index);
From: Pavel Begunkov <asml.silence@gmail.com>

mainline inclusion
from mainline-5.12-rc1
commit eab30c4d20dc761d463445e5130421863ff81505
category: bugfix
bugzilla: 182869 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
When io_req_task_work_add() fails, the request is cancelled by enqueueing it via the task_works of io-wq. Extract a helper function for that.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	fs/io_uring.c
[ 355fb9e2b78e ("io_uring: remove 'twa_signal_ok' deadlock work-around") is not applied. ]
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 fs/io_uring.c | 46 +++++++++++++++++-----------------------------
 1 file changed, 17 insertions(+), 29 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0736487165da..353604296e7d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2056,6 +2056,16 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
 	return ret;
 }
 
+static void io_req_task_work_add_fallback(struct io_kiocb *req,
+					  void (*cb)(struct callback_head *))
+{
+	struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
+
+	init_task_work(&req->task_work, cb);
+	task_work_add(tsk, &req->task_work, TWA_NONE);
+	wake_up_process(tsk);
+}
+
 static void __io_req_task_cancel(struct io_kiocb *req, int error)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -2111,14 +2121,8 @@ static void io_req_task_queue(struct io_kiocb *req)
 	percpu_ref_get(&req->ctx->refs);
 
 	ret = io_req_task_work_add(req, true);
-	if (unlikely(ret)) {
-		struct task_struct *tsk;
-
-		init_task_work(&req->task_work, io_req_task_cancel);
-		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, TWA_NONE);
-		wake_up_process(tsk);
-	}
+	if (unlikely(ret))
+		io_req_task_work_add_fallback(req, io_req_task_cancel);
 }
 
 static void io_queue_next(struct io_kiocb *req)
@@ -2237,13 +2241,8 @@ static void io_free_req_deferred(struct io_kiocb *req)
 
 	init_task_work(&req->task_work, io_put_req_deferred_cb);
 	ret = io_req_task_work_add(req, true);
-	if (unlikely(ret)) {
-		struct task_struct *tsk;
-
-		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, TWA_NONE);
-		wake_up_process(tsk);
-	}
+	if (unlikely(ret))
+		io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
 }
 
 static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
@@ -3343,15 +3342,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 	/* submit ref gets dropped, acquire a new one */
 	refcount_inc(&req->refs);
 	ret = io_req_task_work_add(req, true);
-	if (unlikely(ret)) {
-		struct task_struct *tsk;
-
-		/* queue just for cancelation */
-		init_task_work(&req->task_work, io_req_task_cancel);
-		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, TWA_NONE);
-		wake_up_process(tsk);
-	}
+	if (unlikely(ret))
+		io_req_task_work_add_fallback(req, io_req_task_cancel);
 	return 1;
 }
 
@@ -4961,12 +4953,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	 */
 	ret = io_req_task_work_add(req, twa_signal_ok);
 	if (unlikely(ret)) {
-		struct task_struct *tsk;
-
 		WRITE_ONCE(poll->canceled, true);
-		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, TWA_NONE);
-		wake_up_process(tsk);
+		io_req_task_work_add_fallback(req, func);
 	}
 	return 1;
 }
From: Pavel Begunkov <asml.silence@gmail.com>

mainline inclusion
from mainline-5.12-rc1
commit 792bb6eb862333658bf1bd2260133f0507e2da8d
category: bugfix
bugzilla: 182869 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
[   97.866748] a.out/2890 is trying to acquire lock:
[   97.867829] ffff8881046763e8 (&ctx->uring_lock){+.+.}-{3:3}, at: io_wq_submit_work+0x155/0x240
[   97.869735]
[   97.869735] but task is already holding lock:
[   97.871033] ffff88810dfe0be8 (&ctx->uring_lock){+.+.}-{3:3}, at: __x64_sys_io_uring_enter+0x3f0/0x5b0
[   97.873074]
[   97.873074] other info that might help us debug this:
[   97.874520]  Possible unsafe locking scenario:
[   97.874520]
[   97.875845]        CPU0
[   97.876440]        ----
[   97.877048]   lock(&ctx->uring_lock);
[   97.877961]   lock(&ctx->uring_lock);
[   97.878881]
[   97.878881]  *** DEADLOCK ***
[   97.878881]
[   97.880341]  May be due to missing lock nesting notation
[   97.880341]
[   97.881952] 1 lock held by a.out/2890:
[   97.882873]  #0: ffff88810dfe0be8 (&ctx->uring_lock){+.+.}-{3:3}, at: __x64_sys_io_uring_enter+0x3f0/0x5b0
[   97.885108]
[   97.885108] stack backtrace:
[   97.890457] Call Trace:
[   97.891121]  dump_stack+0xac/0xe3
[   97.891972]  __lock_acquire+0xab6/0x13a0
[   97.892940]  lock_acquire+0x2c3/0x390
[   97.894894]  __mutex_lock+0xae/0x9f0
[   97.901101]  io_wq_submit_work+0x155/0x240
[   97.902112]  io_wq_cancel_cb+0x162/0x490
[   97.904126]  io_async_find_and_cancel+0x3b/0x140
[   97.905247]  io_issue_sqe+0x86d/0x13e0
[   97.909122]  __io_queue_sqe+0x10b/0x550
[   97.913971]  io_queue_sqe+0x235/0x470
[   97.914894]  io_submit_sqes+0xcce/0xf10
[   97.917872]  __x64_sys_io_uring_enter+0x3fb/0x5b0
[   97.921424]  do_syscall_64+0x2d/0x40
[   97.922329]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
While holding uring_lock, e.g. from inline execution, async cancel request may attempt cancellations through io_wq_submit_work, which may try to grab a lock. Delay it to task_work, so we do it from a clean context and don't have to worry about locking.
Cc: stable@vger.kernel.org # 5.5+
Fixes: c07e6719511e ("io_uring: hold uring_lock while completing failed polled io in io_wq_submit_work()")
Reported-by: Abaci <abaci@linux.alibaba.com>
Reported-by: Hao Xu <haoxu@linux.alibaba.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
[ 5280f7e530f7 ("io_uring/io-wq: return 2-step work swap scheme") is not applied. ]
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 fs/io_uring.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 353604296e7d..5453d1842e2b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2085,7 +2085,9 @@ static void io_req_task_cancel(struct callback_head *cb)
 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
 	struct io_ring_ctx *ctx = req->ctx;
 
+	mutex_lock(&ctx->uring_lock);
 	__io_req_task_cancel(req, -ECANCELED);
+	mutex_unlock(&ctx->uring_lock);
 	percpu_ref_put(&ctx->refs);
 }
 
@@ -6133,7 +6135,11 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 	/* if NO_CANCEL is set, we must still run the work */
 	if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
 	    IO_WQ_WORK_CANCEL) {
-		ret = -ECANCELED;
+		/* io-wq is going to take down one */
+		refcount_inc(&req->refs);
+		percpu_ref_get(&req->ctx->refs);
+		io_req_task_work_add_fallback(req, io_req_task_cancel);
+		return io_steal_work(req);
 	}
 
 	if (!ret) {
From: Wang Wensheng <wangwensheng4@huawei.com>

maillist inclusion
category: bugfix
bugzilla: 185665 https://gitee.com/openeuler/kernel/issues/I4DDEL
Reference: https://lore.kernel.org/all/3b02dd76-d952-e38e-bc0c-c8a121919720@huawei.com/...
-------------------------------------------------
When a timer instance has been added to the ack_list but is not currently being processed, the user can stop it via snd_timer_stop1() without it being deleted from the ack_list. The user can then free the timer instance, and when it is eventually processed a UAF occurs.
This issue can be reproduced with the snd_timer01 testcase in LTP by running several instances of that testcase at the same time.
What I actually observed was that the timer's ack_list became corrupted and the kernel went into an endless loop with IRQs off. That can be detected by the hardlockup detector on a board; when running under qemu, gdb can be used to dump the ack_list once the console stops responding.
To fix this issue, we delete the timer instance from ack_list and active_list unconditionally in snd_timer_stop1().
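Moving the removal ahead of the flags check is safe because list_del_init() is idempotent: on an entry that is not (or no longer) linked it merely re-points the node at itself. A minimal illustration:

	LIST_HEAD(head);
	struct list_head node;

	INIT_LIST_HEAD(&node);
	list_del_init(&node);	/* no-op on an unlinked node */
	list_add(&node, &head);
	list_del_init(&node);	/* unlinks and re-initializes */
	list_del_init(&node);	/* still safe to call again */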
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Suggested-by: Takashi Iwai <tiwai@suse.de>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 sound/core/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sound/core/timer.c b/sound/core/timer.c
index c15c8314671b..963ddd9e7715 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -624,13 +624,13 @@ static int snd_timer_stop1(struct snd_timer_instance *timeri, bool stop)
 	if (!timer)
 		return -EINVAL;
 	spin_lock_irqsave(&timer->lock, flags);
+	list_del_init(&timeri->ack_list);
+	list_del_init(&timeri->active_list);
 	if (!(timeri->flags & (SNDRV_TIMER_IFLG_RUNNING |
 			       SNDRV_TIMER_IFLG_START))) {
 		result = -EBUSY;
 		goto unlock;
 	}
-	list_del_init(&timeri->ack_list);
-	list_del_init(&timeri->active_list);
 	if (timer->card && timer->card->shutdown)
 		goto unlock;
 	if (stop) {