From: Ma Wupeng <mawupeng1(a)huawei.com>
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6RKHX
CVE: NA
--------------------------------
After the fork operation, it is erroneous for the child process to have a
reliable page size twice that of its parent process.
Upon examining the mm_struct structure, it was discovered that
reliable_nr_page should be initialized to 0, similar to how RSS is
initialized during mm_init(). This particular problem that arises during
forking is merely one such example.
To resolve this issue, it is recommended to set reliable_nr_page to 0
during the mm_init() operation.
Fixes: 094eaabb3fe8 ("proc: Count reliable memory usage of reliable tasks")
Signed-off-by: Ma Wupeng <mawupeng1(a)huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
include/linux/mem_reliable.h | 8 ++++++++
kernel/fork.c | 1 +
2 files changed, 9 insertions(+)
diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h
index 6d57c36fb676..aa3fe77c8a72 100644
--- a/include/linux/mem_reliable.h
+++ b/include/linux/mem_reliable.h
@@ -123,6 +123,13 @@ static inline bool mem_reliable_shmem_limit_check(void)
shmem_reliable_nr_page;
}
+static inline void reliable_clear_page_counter(struct mm_struct *mm)
+{
+ if (!mem_reliable_is_enabled())
+ return;
+
+ atomic_long_set(&mm->reliable_nr_page, 0);
+}
#else
#define reliable_enabled 0
#define reliable_allow_fb_enabled() false
@@ -171,6 +178,7 @@ static inline void reliable_lru_add_batch(int zid, enum lru_list lru,
int val) {}
static inline bool mem_reliable_counter_initialized(void) { return false; }
+static inline void reliable_clear_page_counter(struct mm_struct *mm) {}
#endif
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index b5453a26655e..c256525d4ce5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1007,6 +1007,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
atomic_long_set(&mm->locked_vm, 0);
mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
+ reliable_clear_page_counter(mm);
spin_lock_init(&mm->page_table_lock);
spin_lock_init(&mm->arg_lock);
mm_init_cpumask(mm);
--
2.25.1
From: Zhong Jinghua <zhongjinghua(a)huawei.com>
hulk inclusion
category: bugfix
bugzilla: 188586, https://gitee.com/openeuler/kernel/issues/I6TFPJ
CVE: NA
----------------------------------------
We found that a kernel panic can easily be triggered through loop_control_ioctl:
1. syscall(__NR_ioctl, r[1], 0x4c80, 0x80000200000ul);
Create a loop device 0x80000200000ul.
In the code, this value is used as the first_minor number, and the
resulting first_minor number is 0.
So the created loop device number is 7:0.
2. syscall(__NR_ioctl, r[2], 0x4c80, 0ul);
Create a loop device 0x0ul.
Since the 7:0 device has already been created in step 1, add_disk will fail
because the major and first_minor numbers are identical to the existing ones.
3. syscall(__NR_ioctl, r[5], 0x4c81, 0ul);
Delete the device whose creation failed; the kernel panics.
The panic looks like this:
BUG: KASAN: null-ptr-deref in device_del+0xb3/0x840 drivers/base/core.c:3107
Call Trace:
kill_device drivers/base/core.c:3079 [inline]
device_del+0xb3/0x840 drivers/base/core.c:3107
del_gendisk+0x463/0x5f0 block/genhd.c:971
loop_remove drivers/block/loop.c:2190 [inline]
loop_control_ioctl drivers/block/loop.c:2289 [inline]
The call stacks are as follows:
Create loop device:
loop_control_ioctl
loop_add
add_disk
device_add_disk
bdi_register
bdi_register_va
device_create
device_create_groups_vargs
device_add
kfree(dev->p);
dev->p = NULL;
Remove loop device:
loop_control_ioctl
loop_remove
del_gendisk
device_del
kill_device
if (dev->p->dead) // p is null
Fix it by adding a range check on the device index parameter.
Fixes: 770fe30a46a1 ("loop: add management interface for on-demand device allocation")
Signed-off-by: Zhong Jinghua <zhongjinghua(a)huawei.com>
Reviewed-by: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Hou Tao <houtao1(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
drivers/block/loop.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 108a4ff27bcd..826633aa328c 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1972,6 +1972,17 @@ static int loop_add(struct loop_device **l, int i)
struct gendisk *disk;
int err;
+ /*
+ * i << part_shift is actually used as the first_minor.
+ * So here should avoid i << part_shift overflow.
+ * And, MKDEV() expect that the max bits of
+ * first_minor is 20.
+ */
+ if (i > 0 && i > MINORMASK >> part_shift) {
+ err = -EINVAL;
+ goto out;
+ }
+
err = -ENOMEM;
lo = kzalloc(sizeof(*lo), GFP_KERNEL);
if (!lo)
@@ -1985,7 +1996,8 @@ static int loop_add(struct loop_device **l, int i)
if (err == -ENOSPC)
err = -EEXIST;
} else {
- err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
+ err = idr_alloc(&loop_index_idr, lo, 0,
+ (MINORMASK >> part_shift) + 1, GFP_KERNEL);
}
if (err < 0)
goto out_free_dev;
--
2.25.1
From: Al Viro <viro(a)zeniv.linux.org.uk>
stable inclusion
from stable-v4.19.245
commit 6ca70982c646cc32e458150ee7f2530a24369b8c
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6T1EY
CVE: CVE-2023-1838
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=6ca70982c646cc32e458150ee7f2530a24369b8c
--------------------------------
commit fb4554c2232e44d595920f4d5c66cf8f7d13f9bc upstream.
Descriptor table is a shared resource; two fget() on the same descriptor
may return different struct file references. get_tap_ptr_ring() is
called after we'd found (and pinned) the socket we'll be using and it
tries to find the private tun/tap data structures associated with it.
Redoing the lookup by the same file descriptor we'd used to get the
socket is racy - we need to use the same struct file.
Thanks to Jason for spotting a braino in the original variant of patch -
I'd missed the use of fd == -1 for disabling backend, and in that case
we can end up with sock == NULL and sock != oldsock.
Cc: stable(a)kernel.org
Acked-by: Michael S. Tsirkin <mst(a)redhat.com>
Signed-off-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Zhengchao Shao <shaozhengchao(a)huawei.com>
Reviewed-by: Yue Haibing <yuehaibing(a)huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
drivers/vhost/net.c | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1d99f5c443ee..4b9151474a24 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1211,13 +1211,9 @@ static struct socket *get_raw_socket(int fd)
return ERR_PTR(r);
}
-static struct ptr_ring *get_tap_ptr_ring(int fd)
+static struct ptr_ring *get_tap_ptr_ring(struct file *file)
{
struct ptr_ring *ring;
- struct file *file = fget(fd);
-
- if (!file)
- return NULL;
ring = tun_get_tx_ring(file);
if (!IS_ERR(ring))
goto out;
@@ -1226,7 +1222,6 @@ static struct ptr_ring *get_tap_ptr_ring(int fd)
goto out;
ring = NULL;
out:
- fput(file);
return ring;
}
@@ -1313,8 +1308,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
r = vhost_net_enable_vq(n, vq);
if (r)
goto err_used;
- if (index == VHOST_NET_VQ_RX)
- nvq->rx_ring = get_tap_ptr_ring(fd);
+ if (index == VHOST_NET_VQ_RX) {
+ if (sock)
+ nvq->rx_ring = get_tap_ptr_ring(sock->file);
+ else
+ nvq->rx_ring = NULL;
+ }
oldubufs = nvq->ubufs;
nvq->ubufs = ubufs;
--
2.25.1
当前例会议题:
议题一:进展update --- 张伽琳 & 郑增凯
议题二:openEuler-22.03-LTS-SP2需求评审 ---张伽琳
欢迎大家继续申报~
-----原始约会-----
发件人: openEuler conference <public(a)openeuler.org>
发送时间: 2023年4月7日 9:30
收件人: dev@openeuler.org,kernel-discuss@openeuler.org,kernel@openeuler.org
主题: openEuler Kernel SIG双周例会
时间: 2023年4月7日 星期五 14:00-15:30(UTC+08:00) 北京,重庆,香港特别行政区,乌鲁木齐。
地点:
您好!
Kernel SIG 邀请您参加 2023-04-07 14:00 召开的Zoom会议(自动录制)
会议主题:openEuler Kernel SIG双周例会
会议内容:
1.进展update
2.议题征集中
欢迎大家积极申报议题(新增议题可以直接回复邮件,或录入会议看板)
会议链接:https://us06web.zoom.us/j/88353599877?pwd=aFZtMjJwUHl1UmNTZFV4eUJQM2xVdz09
会议纪要:https://etherpad.openeuler.org/p/Kernel-meetings
温馨提醒:建议接入会议后修改参会人的姓名,也可以使用您在gitee.com的ID
更多资讯尽在:https://openeuler.org/zh/
Hello!
openEuler Kernel SIG invites you to attend the Zoom conference (auto-recorded), which will be held at 2023-04-07 14:00,
The subject of the conference is openEuler Kernel SIG双周例会,
Summary:
1.进展update
2.议题征集中
欢迎大家积极申报议题(新增议题可以直接回复邮件,或录入会议看板)
You can join the meeting at https://us06web.zoom.us/j/88353599877?pwd=aFZtMjJwUHl1UmNTZFV4eUJQM2xVdz09.
Add topics at https://etherpad.openeuler.org/p/Kernel-meetings.
Note: You are advised to change the participant name after joining the conference or use your ID at gitee.com.
More information: https://openeuler.org/en/
From: Baokun Li <libaokun1(a)huawei.com>
hulk inclusion
category: bugfix
bugzilla: 188500, https://gitee.com/openeuler/kernel/issues/I6RJ0V
CVE: NA
--------------------------------
We got a WARNING in ext4_add_complete_io:
==================================================================
WARNING: at fs/ext4/page-io.c:231 ext4_put_io_end_defer+0x182/0x250
CPU: 10 PID: 77 Comm: ksoftirqd/10 Tainted: 6.3.0-rc2 #85
RIP: 0010:ext4_put_io_end_defer+0x182/0x250 [ext4]
[...]
Call Trace:
<TASK>
ext4_end_bio+0xa8/0x240 [ext4]
bio_endio+0x195/0x310
blk_update_request+0x184/0x770
scsi_end_request+0x2f/0x240
scsi_io_completion+0x75/0x450
scsi_finish_command+0xef/0x160
scsi_complete+0xa3/0x180
blk_complete_reqs+0x60/0x80
blk_done_softirq+0x25/0x40
__do_softirq+0x119/0x4c8
run_ksoftirqd+0x42/0x70
smpboot_thread_fn+0x136/0x3c0
kthread+0x140/0x1a0
ret_from_fork+0x2c/0x50
==================================================================
Above issue may happen as follows:
cpu1 cpu2
----------------------------|----------------------------
mount -o dioread_lock
ext4_writepages
ext4_do_writepages
*if (ext4_should_dioread_nolock(inode))*
// rsv_blocks is not assigned here
mount -o remount,dioread_nolock
ext4_journal_start_with_reserve
__ext4_journal_start
__ext4_journal_start_sb
jbd2__journal_start
*if (rsv_blocks)*
// h_rsv_handle is not initialized here
mpage_map_and_submit_extent
mpage_map_one_extent
dioread_nolock = ext4_should_dioread_nolock(inode)
if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN))
mpd->io_submit.io_end->handle = handle->h_rsv_handle
ext4_set_io_unwritten_flag
io_end->flag |= EXT4_IO_END_UNWRITTEN
// now io_end->handle is NULL but has EXT4_IO_END_UNWRITTEN flag
scsi_finish_command
scsi_io_completion
scsi_io_completion_action
scsi_end_request
blk_update_request
req_bio_endio
bio_endio
bio->bi_end_io > ext4_end_bio
ext4_put_io_end_defer
ext4_add_complete_io
// trigger WARN_ON(!io_end->handle && sbi->s_journal);
The immediate cause of this problem is that ext4_should_dioread_nolock()
function returns inconsistent values in the ext4_do_writepages() and
mpage_map_one_extent(). There are four conditions in this function that
can be changed at mount time to cause this problem. These four conditions
can be divided into two categories:
(1) journal_data and EXT4_EXTENTS_FL, which can be changed by ioctl
(2) DELALLOC and DIOREAD_NOLOCK, which can be changed by remount
The two in the first category have been fixed by commit c8585c6fcaf2
("ext4: fix races between changing inode journal mode and ext4_writepages")
and commit cb85f4d23f79 ("ext4: fix race between writepages and enabling
EXT4_EXTENTS_FL") respectively.
The two cases in the second category have not yet been fixed, and the above
issue is caused by one of them. Following the fix for the first category,
we grab s_writepages_rwsem when applying options during remount, so that
remount cannot race with writepages ops and trigger this problem.
Fixes: 6b523df4fb5a ("ext4: use transaction reservation for extent conversion in ext4_end_io")
Cc: stable(a)vger.kernel.org
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
fs/ext4/ext4.h | 3 ++-
fs/ext4/super.c | 13 +++++++++++++
2 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4c88e75180a2..6df919b154b4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1530,7 +1530,8 @@ struct ext4_sb_info {
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
- * or EXTENTS flag.
+ * or EXTENTS flag or between writepages ops and changing DIOREAD_NOLOCK
+ * mount option on remount.
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8029a6f6471c..df07222f1cc5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5605,10 +5605,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
vfs_flags = SB_LAZYTIME | SB_I_VERSION;
sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags);
+ /*
+ * Changing the DIOREAD_NOLOCK mount option may cause two calls to
+ * ext4_should_dioread_nolock() to return inconsistent values,
+ * triggering WARN_ON in ext4_add_complete_io(). we grab here
+ * s_writepages_rwsem to avoid race between writepages ops and
+ * remount.
+ */
+ percpu_down_write(&sbi->s_writepages_rwsem);
if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
err = -EINVAL;
+ percpu_up_write(&sbi->s_writepages_rwsem);
goto restore_opts;
}
+ percpu_up_write(&sbi->s_writepages_rwsem);
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
@@ -5833,6 +5843,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
return 0;
restore_opts:
+ percpu_down_write(&sbi->s_writepages_rwsem);
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -5841,6 +5852,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
+ percpu_up_write(&sbi->s_writepages_rwsem);
+
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
--
2.25.1