Pull new CVEs: CVE-2023-1829 CVE-2022-36280 CVE-2022-1015 CVE-2023-1989 CVE-2023-30456 CVE-2023-1990
xfs bugfixes from Long Li and yangerkun
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.13-rc1 commit 8e9800f9f2b89e1efe2a5993361fae4d618a6c26 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
While running generic/050 with an external log, I observed this warning in dmesg:
Trying to write to read-only block-device sda4 (partno 4) WARNING: CPU: 2 PID: 215677 at block/blk-core.c:704 submit_bio_checks+0x256/0x510 Call Trace: submit_bio_noacct+0x2c/0x430 _xfs_buf_ioapply+0x283/0x3c0 [xfs] __xfs_buf_submit+0x6a/0x210 [xfs] xfs_buf_delwri_submit_buffers+0xf8/0x270 [xfs] xfsaild+0x2db/0xc50 [xfs] kthread+0x14b/0x170
I think this happened because we tried to cover the log after a readonly mount, and the AIL tried to write the primary superblock to the data device. The test marks the data device readonly, but it doesn't do the same to the external log device. Therefore, XFS thinks that the log is writable, even though AIL writes whine to dmesg because the data device is read only.
Fix this by amending xfs_log_writable to prevent writes when the AIL can't possible write anything into the filesystem.
Note: As for the external log or the rt devices being readonly-- xfs_blkdev_get will complain about that if we aren't doing a norecovery mount.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: yangerkun yangerkun@huaweicloud.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_log.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e154c0d44f9c..7bf94b1d3dbd 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -349,13 +349,15 @@ xfs_log_writable( struct xfs_mount *mp) { /* - * Never write to the log on norecovery mounts, if the block device is - * read-only, or if the filesystem is shutdown. Read-only mounts still - * allow internal writes for log recovery and unmount purposes, so don't - * restrict that case here. + * Do not write to the log on norecovery mounts, if the data or log + * devices are read-only, or if the filesystem is shutdown. Read-only + * mounts allow internal writes for log recovery and unmount purposes, + * so don't restrict that case. */ if (mp->m_flags & XFS_MOUNT_NORECOVERY) return false; + if (xfs_readonly_buftarg(mp->m_ddev_targp)) + return false; if (xfs_readonly_buftarg(mp->m_log->l_targ)) return false; if (XFS_FORCED_SHUTDOWN(mp))
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.14-rc1 commit 10be350b8c6c426b82d4df937f25b37eabdc3d67 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
It's currently unlikely that we will ever end up with more than 4 billion inodes waiting for reclamation, but the fs object code uses long int for object counts and we're certainly capable of generating that many. Instead of truncating the internal counters, widen them and report the object counts correctly.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: yangerkun yangerkun@huaweicloud.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_icache.c | 8 ++++---- fs/xfs/xfs_icache.h | 6 +++--- fs/xfs/xfs_trace.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index ab220c5dd20c..4f09e6991e4e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -991,11 +991,11 @@ xfs_reclaim_inodes( long xfs_reclaim_inodes_nr( struct xfs_mount *mp, - int nr_to_scan) + unsigned long nr_to_scan) { struct xfs_icwalk icw = { .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, - .icw_scan_limit = nr_to_scan, + .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan), };
if (xfs_want_reclaim_sick(mp)) @@ -1013,13 +1013,13 @@ xfs_reclaim_inodes_nr( * Return the number of reclaimable inodes in the filesystem for * the shrinker to determine how much to reclaim. */ -int +long xfs_reclaim_inodes_count( struct xfs_mount *mp) { struct xfs_perag *pag; xfs_agnumber_t ag = 0; - int reclaimable = 0; + long reclaimable = 0;
while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { ag = pag->pag_agno + 1; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index fd4e79722251..2e4cfddf8b8e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -15,7 +15,7 @@ struct xfs_icwalk { kgid_t icw_gid; prid_t icw_prid; __u64 icw_min_file_size; - int icw_scan_limit; + long icw_scan_limit; };
/* Flags that reflect xfs_fs_eofblocks functionality. */ @@ -49,8 +49,8 @@ void xfs_inode_free(struct xfs_inode *ip); void xfs_reclaim_worker(struct work_struct *work);
void xfs_reclaim_inodes(struct xfs_mount *mp); -int xfs_reclaim_inodes_count(struct xfs_mount *mp); -long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); +long xfs_reclaim_inodes_count(struct xfs_mount *mp); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, unsigned long nr_to_scan);
void xfs_inode_mark_reclaimable(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 68fea812d190..ae96aaa3bb31 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3962,7 +3962,7 @@ DECLARE_EVENT_CLASS(xfs_icwalk_class, __field(uint32_t, gid) __field(prid_t, prid) __field(__u64, min_file_size) - __field(int, scan_limit) + __field(long, scan_limit) __field(unsigned long, caller_ip) ), TP_fast_assign( @@ -3977,7 +3977,7 @@ DECLARE_EVENT_CLASS(xfs_icwalk_class, __entry->scan_limit = icw ? icw->icw_scan_limit : 0; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %d caller %pS", + TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %ld caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags, __entry->uid,
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.19-rc1 commit 2723234923b3294dbcf6019c288c87465e927ed4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Move the code that allocates and frees the buffer cancellation tables used by log recovery into the file that actually uses the tables. This is a precursor to some cleanups and a memory leak fix.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Dave Chinner david@fromorbit.com
Conflicts: fs/xfs/libxfs/xfs_log_recover.h fs/xfs/xfs_log_priv.h
Signed-off-by: yangerkun yangerkun@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/libxfs/xfs_log_recover.h | 14 +++++----- fs/xfs/xfs_buf_item_recover.c | 47 +++++++++++++++++++++++++++++++++ fs/xfs/xfs_log_priv.h | 3 --- fs/xfs/xfs_log_recover.c | 32 +++++++--------------- 4 files changed, 64 insertions(+), 32 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 3cca2bfe714c..7d53e34930ef 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -108,12 +108,6 @@ struct xlog_recover {
#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
-/* - * This is the number of entries in the l_buf_cancel_table used during - * recovery. - */ -#define XLOG_BC_TABLE_SIZE 64 - #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 @@ -124,5 +118,13 @@ bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); +void xlog_alloc_buf_cancel_table(struct xlog *log); +void xlog_free_buf_cancel_table(struct xlog *log); + +#ifdef DEBUG +void xlog_check_buf_cancel_table(struct xlog *log); +#else +#define xlog_check_buf_cancel_table(log) do { } while (0) +#endif
#endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index aa4d45701de5..809c61e1d0be 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -23,6 +23,15 @@ #include "xfs_dir2.h" #include "xfs_quota.h"
+/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + /* * This structure is used during recovery to record the buf log items which * have been canceled and should not be replayed. @@ -993,3 +1002,41 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = { .commit_pass1 = xlog_recover_buf_commit_pass1, .commit_pass2 = xlog_recover_buf_commit_pass2, }; + +#ifdef DEBUG +void +xlog_check_buf_cancel_table( + struct xlog *log) +{ + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); +} +#endif + +void +xlog_alloc_buf_cancel_table( + struct xlog *log) +{ + int i; + + ASSERT(log->l_buf_cancel_table == NULL); + + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), + 0); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); +} + +void +xlog_free_buf_cancel_table( + struct xlog *log) +{ + if (!log->l_buf_cancel_table) + return; + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b01fb03e60d4..ca19a3f04bee 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -444,9 +444,6 @@ struct xlog { uint32_t l_iclog_roundoff;/* padding roundoff */ };
-#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) - /* * Bits for operational state */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 88b48aed446a..b7a6e3f459b1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3229,7 +3229,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error, i; + int error;
ASSERT(head_blk != tail_blk);
@@ -3237,37 +3237,23 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + xlog_alloc_buf_cancel_table(log);
error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL); - if (error != 0) { - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - return error; - } + if (error != 0) + goto out_cancel; + /* * Then do a second pass to actually recover the items in the log. * When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS2, NULL); -#ifdef DEBUG - if (!error) { - int i; - - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(list_empty(&log->l_buf_cancel_table[i])); - } -#endif /* DEBUG */ - - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - + if (!error) + xlog_check_buf_cancel_table(log); +out_cancel: + xlog_free_buf_cancel_table(log); return error; }
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.19-rc1 commit 8db074bd84df5ccc88bff3f8f900f66f4b8349fa category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If log recovery fails, we free the memory used by the buffer cancellation buckets, but we don't actually traverse each bucket list to free the individual xfs_buf_cancel objects. This leads to a memory leak, as reported by kmemleak in xfs/051:
unreferenced object 0xffff888103629560 (size 32): comm "mount", pid 687045, jiffies 4296935916 (age 10.752s) hex dump (first 32 bytes): 08 d3 0a 01 00 00 00 00 08 00 00 00 01 00 00 00 ................ d0 f5 0b 92 81 88 ff ff 80 64 64 25 81 88 ff ff .........dd%.... backtrace: [<ffffffffa0317c83>] kmem_alloc+0x73/0x140 [xfs] [<ffffffffa03234a9>] xlog_recover_buf_commit_pass1+0x139/0x200 [xfs] [<ffffffffa032dc27>] xlog_recover_commit_trans+0x307/0x350 [xfs] [<ffffffffa032df15>] xlog_recovery_process_trans+0xa5/0xe0 [xfs] [<ffffffffa032e12d>] xlog_recover_process_data+0x8d/0x140 [xfs] [<ffffffffa032e49d>] xlog_do_recovery_pass+0x19d/0x740 [xfs] [<ffffffffa032f22d>] xlog_do_log_recovery+0x6d/0x150 [xfs] [<ffffffffa032f343>] xlog_do_recover+0x33/0x1d0 [xfs] [<ffffffffa032faba>] xlog_recover+0xda/0x190 [xfs] [<ffffffffa03194bc>] xfs_log_mount+0x14c/0x360 [xfs] [<ffffffffa030bfed>] xfs_mountfs+0x50d/0xa60 [xfs] [<ffffffffa03124b5>] xfs_fs_fill_super+0x6a5/0x950 [xfs] [<ffffffff812b92a5>] get_tree_bdev+0x175/0x280 [<ffffffff812b7c3a>] vfs_get_tree+0x1a/0x80 [<ffffffff812e366f>] path_mount+0x6ff/0xaa0 [<ffffffff812e3b13>] __x64_sys_mount+0x103/0x140
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Dave Chinner david@fromorbit.com Signed-off-by: yangerkun yangerkun@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_buf_item_recover.c | 13 +++++++++++++ 1 file changed, 13 insertions(+)
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 809c61e1d0be..7464c43560c8 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1034,9 +1034,22 @@ void xlog_free_buf_cancel_table( struct xlog *log) { + int i; + if (!log->l_buf_cancel_table) return;
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { + struct xfs_buf_cancel *bc; + + while ((bc = list_first_entry_or_null( + &log->l_buf_cancel_table[i], + struct xfs_buf_cancel, bc_list))) { + list_del(&bc->bc_list); + kmem_free(bc); + } + } + kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; }
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v5.19-rc1 commit 910bbdf2f4d7df46781bc9b723048f5ebed3d0d7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
While we're messing around with how recovery allocates and frees the buffer cancellation table, convert the allocation to use kmalloc_array instead of the old kmem_alloc APIs, and make it handle a null return, even though that's not likely.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Dave Chinner david@fromorbit.com
Conflicts: fs/xfs/libxfs/xfs_log_recover.h
Signed-off-by: yangerkun yangerkun@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/libxfs/xfs_log_recover.h | 2 +- fs/xfs/xfs_buf_item_recover.c | 14 ++++++++++---- fs/xfs/xfs_log_recover.c | 4 +++- 3 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 7d53e34930ef..32a8ab3da183 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -118,7 +118,7 @@ bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); -void xlog_alloc_buf_cancel_table(struct xlog *log); +int xlog_alloc_buf_cancel_table(struct xlog *log); void xlog_free_buf_cancel_table(struct xlog *log);
#ifdef DEBUG diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 7464c43560c8..4b2ad8dce690 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1015,19 +1015,25 @@ xlog_check_buf_cancel_table( } #endif
-void +int xlog_alloc_buf_cancel_table( struct xlog *log) { + void *p; int i;
ASSERT(log->l_buf_cancel_table == NULL);
- log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); + p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), + GFP_KERNEL); + if (!p) + return -ENOMEM; + + log->l_buf_cancel_table = p; for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + return 0; }
void diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b7a6e3f459b1..dc40b356df38 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3237,7 +3237,9 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - xlog_alloc_buf_cancel_table(log); + error = xlog_alloc_buf_cancel_table(log); + if (error) + return error;
error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL);
From: Eric Sandeen sandeen@redhat.com
mainline inclusion from mainline-v6.0-rc1 commit 70b589a37e1aba892c1e5d41957b0042f9eb031b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We got a report that "renameat2() with flags=RENAME_WHITEOUT doesn't apply an SELinux label on xfs" as it does on other filesystems (for example, ext4 and tmpfs.) While I'm not quite sure how labels may interact w/ whiteout files, leaving them as unlabeled seems inconsistent at best. Now that xfs_init_security is not static, rename it to xfs_inode_init_security per dchinner's suggestion.
Signed-off-by: Eric Sandeen sandeen@redhat.com Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org
Conflicts: fs/xfs/xfs_inode.c fs/xfs/xfs_iops.h
Signed-off-by: yangerkun yangerkun@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_inode.c | 13 ++++++++++++- fs/xfs/xfs_iops.c | 11 +++++------ fs/xfs/xfs_iops.h | 3 +++ 3 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c2e9498601a8..bbf6def9429e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3181,16 +3181,27 @@ xfs_cross_rename( */ static int xfs_rename_alloc_whiteout( + struct xfs_name *src_name, struct xfs_inode *dp, struct xfs_inode **wip) { struct xfs_inode *tmpfile; + struct qstr name; int error;
error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); if (error) return error;
+ name.name = src_name->name; + name.len = src_name->len; + error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name); + if (error) { + xfs_finish_inode_setup(tmpfile); + xfs_irele(tmpfile); + return error; + } + /* * Prepare the tmpfile inode as if it were created through the VFS. * Complete the inode setup and flag it as linkable. nlink is already @@ -3239,7 +3250,7 @@ xfs_rename( * appropriately. */ if (flags & RENAME_WHITEOUT) { - error = xfs_rename_alloc_whiteout(target_dp, &wip); + error = xfs_rename_alloc_whiteout(src_name, target_dp, &wip); if (error) return error;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 88e814bb2476..8026f788261d 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -71,9 +71,8 @@ xfs_initxattrs( * these attrs can be journalled at inode creation time (along with the * inode, of course, such that log replay can't cause these to be lost). */ - -STATIC int -xfs_init_security( +int +xfs_inode_init_security( struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -118,7 +117,7 @@ xfs_cleanup_inode(
/* Oh, the horror. * If we can't add the ACL or we fail in - * xfs_init_security we must back out. + * xfs_inode_init_security we must back out. * ENOSPC can hit here, among other things. */ xfs_dentry_to_name(&teardown, dentry); @@ -170,7 +169,7 @@ xfs_generic_create(
inode = VFS_I(ip);
- error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode;
@@ -385,7 +384,7 @@ xfs_vn_symlink(
inode = VFS_I(cip);
- error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode;
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 4d24ff309f59..fb3604b68208 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -24,4 +24,7 @@ extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, extern int xfs_vn_setattr_nonsize(struct dentry *dentry, struct iattr *vap); extern int xfs_vn_setattr_size(struct dentry *dentry, struct iattr *vap);
+int xfs_inode_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); + #endif /* __XFS_IOPS_H__ */
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v6.2-rc1 commit 032e160305f6872e590c77f11896fb28365c6d6c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Every now and then I see fstests failures on aarch64 (64k pages) that trigger on the following sequence:
mkfs.xfs $dev mount $dev $mnt touch $mnt/a umount $mnt xfs_db -c 'path /a' -c 'print' $dev
99% of the time this succeeds, but every now and then xfs_db cannot find /a and fails. This turns out to be a race involving udev/blkid, the page cache for the block device, and the xfs_db process.
udev is triggered whenever anyone closes a block device or unmounts it. The default udev rules invoke blkid to read the fs super and create symlinks to the bdev under /dev/disk. For this, it uses buffered reads through the page cache.
xfs_db also uses buffered reads to examine metadata. There is no coordination between xfs_db and udev, which means that they can run concurrently. Note there is no coordination between the kernel and blkid either.
On a system with 64k pages, the page cache can cache the superblock and the root inode (and hence the root dir) with the same 64k page. If udev spawns blkid after the mkfs and the system is busy enough that it is still running when xfs_db starts up, they'll both read from the same page in the pagecache.
The unmount writes updated inode metadata to disk directly. The XFS buffer cache does not use the bdev pagecache, nor does it invalidate the pagecache on umount. If the above scenario occurs, the pagecache no longer reflects what's on disk, xfs_db reads the stale metadata, and fails to find /a. Most of the time this succeeds because closing a bdev invalidates the page cache, but when processes race, everyone loses.
Fix the problem by invalidating the bdev pagecache after flushing the bdev, so that xfs_db will see up to date metadata.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Gao Xiang hsiangkao@linux.alibaba.com Reviewed-by: Dave Chinner dchinner@redhat.com
Conflicts: fs/xfs/xfs_buf.c
Signed-off-by: yangerkun yangerkun@huaweicloud.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_buf.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 90a88bfc1f61..4cf07ffd9edf 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1947,6 +1947,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru);
xfs_blkdev_issue_flush(btp); + invalidate_bdev(btp->bt_bdev);
kmem_free(btp); }
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v6.2-rc1 commit 4c6dbfd2756bd83a0085ed804e2bb7be9cc16bc5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
I've been running near-continuous integration testing of online fsck, and I've noticed that once a day, one of the ARM VMs will fail the test with out of order records in the data fork.
xfs/804 races fsstress with online scrub (aka scan but do not change anything), so I think this might be a bug in the core xfs code. This also only seems to trigger if one runs the test for more than ~6 minutes via TIME_FACTOR=13 or something. https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/tree...
I added a debugging patch to the kernel to check the data fork extents after taking the ILOCK, before dropping ILOCK, and before and after each bmapping operation. So far I've narrowed it down to the delalloc code inserting a record in the wrong place in the iext tree:
xfs_bmap_add_extent_hole_delay, near line 2691:
case 0: /* * New allocation is not contiguous with another * delayed allocation. * Insert a new entry. */ oldlen = newlen = 0; xfs_iunlock_check_datafork(ip); <-- ok here xfs_iext_insert(ip, icur, new, state); xfs_iunlock_check_datafork(ip); <-- bad here break; }
I recorded the state of the data fork mappings and iext cursor state when a corrupt data fork is detected immediately after the xfs_bmap_add_extent_hole_delay call in xfs_bmapi_reserve_delalloc:
ino 0x140bb3 func xfs_bmapi_reserve_delalloc line 4164 data fork: ino 0x140bb3 nr 0x0 nr_real 0x0 offset 0xb9 blockcount 0x1f startblock 0x935de2 state 1 ino 0x140bb3 nr 0x1 nr_real 0x1 offset 0xe6 blockcount 0xa startblock 0xffffffffe0007 state 0 ino 0x140bb3 nr 0x2 nr_real 0x1 offset 0xd8 blockcount 0xe startblock 0x935e01 state 0
Here we see that a delalloc extent was inserted into the wrong position in the iext leaf, same as all the other times. The extra trace data I collected are as follows:
ino 0x140bb3 fork 0 oldoff 0xe6 oldlen 0x4 oldprealloc 0x6 isize 0xe6000 ino 0x140bb3 oldgotoff 0xea oldgotstart 0xfffffffffffffffe oldgotcount 0x0 oldgotstate 0 ino 0x140bb3 crapgotoff 0x0 crapgotstart 0x0 crapgotcount 0x0 crapgotstate 0 ino 0x140bb3 freshgotoff 0xd8 freshgotstart 0x935e01 freshgotcount 0xe freshgotstate 0 ino 0x140bb3 nowgotoff 0xe6 nowgotstart 0xffffffffe0007 nowgotcount 0xa nowgotstate 0 ino 0x140bb3 oldicurpos 1 oldleafnr 2 oldleaf 0xfffffc00f0609a00 ino 0x140bb3 crapicurpos 2 crapleafnr 2 crapleaf 0xfffffc00f0609a00 ino 0x140bb3 freshicurpos 1 freshleafnr 2 freshleaf 0xfffffc00f0609a00 ino 0x140bb3 newicurpos 1 newleafnr 3 newleaf 0xfffffc00f0609a00
The first line shows that xfs_bmapi_reserve_delalloc was called with whichfork=XFS_DATA_FORK, off=0xe6, len=0x4, prealloc=6.
The second line ("oldgot") shows the contents of @got at the beginning of the call, which are the results of the first iext lookup in xfs_buffered_write_iomap_begin.
Line 3 ("crapgot") is the result of duplicating the cursor at the start of the body of xfs_bmapi_reserve_delalloc and performing a fresh lookup at @off.
Line 4 ("freshgot") is the result of a new xfs_iext_get_extent right before the call to xfs_bmap_add_extent_hole_delay. Totally garbage.
Line 5 ("nowgot") is contents of @got after the xfs_bmap_add_extent_hole_delay call.
Line 6 is the contents of @icur at the beginning fo the call. Lines 7-9 are the contents of the iext cursors at the point where the block mappings were sampled.
I think @oldgot is a HOLESTARTBLOCK extent because the first lookup didn't find anything, so we filled in imap with "fake hole until the end". At the time of the first lookup, I suspect that there's only one 32-block unwritten extent in the mapping (hence oldicurpos==1) but by the time we get to recording crapgot, crapicurpos==2.
Dave then added:
Ok, that's much simpler to reason about, and implies the smoke is coming from xfs_buffered_write_iomap_begin() or xfs_bmapi_reserve_delalloc(). I suspect the former - it does a lot of stuff with the ILOCK_EXCL held.....
.... including calling xfs_qm_dqattach_locked().
xfs_buffered_write_iomap_begin ILOCK_EXCL look up icur xfs_qm_dqattach_locked xfs_qm_dqattach_one xfs_qm_dqget_inode dquot cache miss xfs_iunlock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); xfs_ilock(ip, XFS_ILOCK_EXCL); .... xfs_bmapi_reserve_delalloc(icur)
Yup, that's what is letting the magic smoke out - xfs_qm_dqattach_locked() can cycle the ILOCK. If that happens, we can pass a stale icur to xfs_bmapi_reserve_delalloc() and it all goes downhill from there.
Back to Darrick now:
So. Fix this by moving the dqattach_locked call up before we take the ILOCK, like all the other callers in that file.
Fixes: a526c85c2236 ("xfs: move xfs_file_iomap_begin_delay around") # goes further back than this Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com
Conflicts: fs/xfs/xfs_iomap.c
Signed-off-by: yangerkun yangerkun@huaweicloud.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_iomap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 564d26f9c325..394b47faec6e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -871,6 +871,10 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
+ error = xfs_qm_dqattach(ip); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL);
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || @@ -974,10 +978,6 @@ xfs_buffered_write_iomap_begin( allocfork = XFS_COW_FORK; }
- error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_unlock; - if (eof && offset + count > XFS_ISIZE(ip)) { /* * Determine the initial size of the preallocation.
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v6.2-rc1 commit ddfdd530e43fcb3f7a0a69966e5f6c33497b4ae3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
While investigating test failures in xfs/17[1-3] in alwayscow mode, I noticed through code inspection that xfs_bmap_alloc_userdata isn't setting XFS_ALLOC_USERDATA when allocating extents for a file's CoW fork. COW staging extents should be flagged as USERDATA, since user data are persisted to these blocks before being remapped into a file.
This mis-classification has a few impacts on the behavior of the system. First, the filestreams allocator is supposed to keep allocating from a chosen AG until it runs out of space in that AG. However, it only does that for USERDATA allocations, which means that COW allocations aren't tied to the filestreams AG. Fortunately, few people use filestreams, so nobody's noticed.
A more serious problem is that xfs_alloc_ag_vextent_small looks for a buffer to invalidate *if* the USERDATA flag is set and the AG is so full that the allocation had to come from the AGFL because the cntbt is empty. The consequences of not invalidating the buffer are severe -- if the AIL incorrectly checkpoints a buffer that is now being used to store user data, that action will clobber the user's written data.
Fix filestreams and yet another data corruption vector by flagging COW allocations as USERDATA.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: yangerkun yangerkun@huaweicloud.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f7a6c212de7d..02ce08745347 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4053,7 +4053,7 @@ xfs_bmap_alloc_userdata( * the busy list. */ bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK) { + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { bma->datatype |= XFS_ALLOC_USERDATA; if (bma->offset == 0) bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
From: Hironori Shiina shiina.hironori@gmail.com
mainline inclusion from mainline-v6.2-rc4 commit 817644fa4525258992f17fecf4f1d6cdd2e1b731 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The root inode number should be set to `breq->startino` for getting stat information of the root when XFS_BULK_IREQ_SPECIAL_ROOT is used. Otherwise, the inode search is started from 1 (XFS_BULK_IREQ_SPECIAL_ROOT) and the inode with the lowest number in a filesystem is returned.
Fixes: bf3cb3944792 ("xfs: allow single bulkstat of special inodes") Signed-off-by: Hironori Shiina shiina.hironori@fujitsu.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: yangerkun yangerkun@huaweicloud.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 79cf806f4e3e..3a2f52786047 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -830,7 +830,7 @@ xfs_bulkstat_fmt( static int xfs_bulk_ireq_setup( struct xfs_mount *mp, - struct xfs_bulk_ireq *hdr, + const struct xfs_bulk_ireq *hdr, struct xfs_ibulk *breq, void __user *ubuffer) { @@ -856,7 +856,7 @@ xfs_bulk_ireq_setup(
switch (hdr->ino) { case XFS_BULK_IREQ_SPECIAL_ROOT: - hdr->ino = mp->m_sb.sb_rootino; + breq->startino = mp->m_sb.sb_rootino; break; default: return -EINVAL;
From: Wengang Wang wen.gang.wang@oracle.com
mainline inclusion from mainline-v6.2-rc4 commit 601a27ea09a317d0fe2895df7d875381fb393041 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
In xfs_extent_busy_update_extent() case 6 and 7, whenever bno is modified on extent busy, the relavent length has to be modified accordingly.
Signed-off-by: Wengang Wang wen.gang.wang@oracle.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: yangerkun yangerkun@huaweicloud.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_extent_busy.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index a4075685d9eb..26680444969c 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -244,6 +244,7 @@ xfs_extent_busy_update_extent( * */ busyp->bno = fend; + busyp->length = bend - fend; } else if (bbno < fbno) { /* * Case 8:
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v6.3-rc1 commit c85007e2e3942da1f9361e4b5a9388ea3a8dcc5b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When we split a BMBT due to record insertion, we offload it to a worker thread because we can be deep in the stack when we try to allocate a new block for the BMBT. Allocation can use several kilobytes of stack (full memory reclaim, swap and/or IO path can end up on the stack during allocation) and we can already be several kilobytes deep in the stack when we need to split the BMBT.
A recent workload demonstrated a deadlock in this BMBT split offload. It requires several things to happen at once:
1. two inodes need a BMBT split at the same time, one must be unwritten extent conversion from IO completion, the other must be from extent allocation.
2. there must be a no available xfs_alloc_wq worker threads available in the worker pool.
3. There must be sustained severe memory shortages such that new kworker threads cannot be allocated to the xfs_alloc_wq pool for both threads that need split work to be run
4. The split work from the unwritten extent conversion must run first.
5. when the BMBT block allocation runs from the split work, it must loop over all AGs and not be able to either trylock an AGF successfully, or each AGF is is able to lock has no space available for a single block allocation.
6. The BMBT allocation must then attempt to lock the AGF that the second task queued to the rescuer thread already has locked before it finds an AGF it can allocate from.
At this point, we have an ABBA deadlock between tasks queued on the xfs_alloc_wq rescuer thread and a locked AGF. i.e. The queued task holding the AGF lock can't be run by the rescuer thread until the task the rescuer thread is runing gets the AGF lock....
This is a highly improbably series of events, but there it is.
There's a couple of ways to fix this, but the easiest way to ensure that we only punt tasks with a locked AGF that holds enough space for the BMBT block allocations to the worker thread.
This works for unwritten extent conversion in IO completion (which doesn't have a locked AGF and space reservations) because we have tight control over the IO completion stack. It is typically only 6 functions deep when xfs_btree_split() is called because we've already offloaded the IO completion work to a worker thread and hence we don't need to worry about stack overruns here.
The other place we can be called for a BMBT split without a preceeding allocation is __xfs_bunmapi() when punching out the center of an existing extent. We don't remove extents in the IO path, so these operations don't tend to be called with a lot of stack consumed. Hence we don't really need to ship the split off to a worker thread in these cases, either.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: yangerkun yangerkun@huaweicloud.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/libxfs/xfs_btree.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ea713f2e57b2..7c4d3fd47950 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2840,9 +2840,22 @@ xfs_btree_split_worker( }
/* - * BMBT split requests often come in with little stack to work on. Push + * BMBT split requests often come in with little stack to work on so we push * them off to a worker thread so there is lots of stack to use. For the other * btree types, just call directly to avoid the context switch overhead here. + * + * Care must be taken here - the work queue rescuer thread introduces potential + * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new + * AGFs to allocate blocks. A task being run by the rescuer could attempt to + * lock an AGF that is already locked by a task queued to run by the rescuer, + * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to + * release it until the current thread it is running gains the lock. + * + * To avoid this issue, we only ever queue BMBT splits that don't have an AGF + * already locked to allocate from. The only place that doesn't hold an AGF + * locked is unwritten extent conversion at IO completion, but that has already + * been offloaded to a worker thread and hence has no stack consumption issues + * we have to worry about. */ STATIC int /* error */ xfs_btree_split( @@ -2856,7 +2869,8 @@ xfs_btree_split( struct xfs_btree_split_args args; DECLARE_COMPLETION_ONSTACK(done);
- if (cur->bc_btnum != XFS_BTNUM_BMAP) + if (cur->bc_btnum != XFS_BTNUM_BMAP || + cur->bc_tp->t_firstblock == NULLFSBLOCK) return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
args.cur = cur;
From: Donald Douwsma ddouwsma@redhat.com
mainline inclusion from mainline-v6.3-rc1 commit 167ce4cbfa370114fee61ad5b58e401d95e2d5cd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
xfs will not allow combining other panic masks with XFS_PTAG_VERIFIER_ERROR.
# sysctl fs.xfs.panic_mask=511 sysctl: setting key "fs.xfs.panic_mask": Invalid argument fs.xfs.panic_mask = 511
Update to the maximum value that can be set to allow the full range of masks. Do this using a mask of possible values to prevent this happening again as suggested by Darrick.
Fixes: d519da41e2b7 ("xfs: Introduce XFS_PTAG_VERIFIER_ERROR panic mask") Signed-off-by: Donald Douwsma ddouwsma@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com
Conflicts: fs/xfs/xfs_error.h
Signed-off-by: yangerkun yangerkun@huaweicloud.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- Documentation/admin-guide/xfs.rst | 2 +- fs/xfs/xfs_error.h | 12 +++++++++++- fs/xfs/xfs_globals.c | 3 ++- 3 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 82cc349e3aac..ef803f8ae689 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -296,7 +296,7 @@ The following sysctls are available for the XFS filesystem: XFS_ERRLEVEL_LOW: 1 XFS_ERRLEVEL_HIGH: 5
- fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) + fs.xfs.panic_mask (Min: 0 Default: 0 Max: 511) Causes certain error conditions to call BUG(). Value is a bitmask; OR together the tags which represent errors which should cause panics:
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 1717b7508356..dd25b666d9f5 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -62,7 +62,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
/* * XFS panic tags -- allow a call to xfs_alert_tag() be turned into - * a panic by setting xfs_panic_mask in a sysctl. + * a panic by setting fs.xfs.panic_mask in a sysctl. */ #define XFS_NO_PTAG 0 #define XFS_PTAG_IFLUSH 0x00000001 @@ -75,4 +75,14 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 #define XFS_PTAG_VERIFIER_ERROR 0x00000100
+#define XFS_PTAG_MASK (XFS_PTAG_IFLUSH | \ + XFS_PTAG_LOGRES | \ + XFS_PTAG_AILDELETE | \ + XFS_PTAG_ERROR_REPORT | \ + XFS_PTAG_SHUTDOWN_CORRUPT | \ + XFS_PTAG_SHUTDOWN_IOERROR | \ + XFS_PTAG_SHUTDOWN_LOGERROR | \ + XFS_PTAG_FSBLOCK_ZERO | \ + XFS_PTAG_VERIFIER_ERROR) + #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f62fa652c2fd..4a71b945f0a9 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -4,6 +4,7 @@ * All Rights Reserved. */ #include "xfs.h" +#include "xfs_error.h"
/* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, @@ -15,7 +16,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 256 }, + .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 },
From: Chandan Babu R chandanrlinux@gmail.com
mainline inclusion from mainline-v5.12-rc4 commit b6785e279d53ca5c4fa6be1146e85000870d73ef category: bugfix bugzilla: 188220, https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The incore data fork of an inode stores the bmap btree root node as 'struct xfs_btree_block'. However, the ondisk version of the inode stores the bmap btree root node as a 'struct xfs_bmdr_block'.
xfs_bmap_add_attrfork_btree() checks if the btree root node fits inside the data fork of the inode. However, it incorrectly uses 'struct xfs_btree_block' to compute the size of the bmap btree root node. Since size of 'struct xfs_btree_block' is larger than that of 'struct xfs_bmdr_block', xfs_bmap_add_attrfork_btree() could end up unnecessarily demoting the current root node as the child of newly allocated root node.
This commit optimizes space usage by modifying xfs_bmap_add_attrfork_btree() to use 'struct xfs_bmdr_block' to check if the bmap btree root node fits inside the data fork of the inode.
Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Chandan Babu R chandanrlinux@gmail.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org
Conflicts: fs/xfs/libxfs/xfs_bmap.c
Signed-off-by: Long Li leo.lilong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/libxfs/xfs_bmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 02ce08745347..ea5a9a93948c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -926,13 +926,15 @@ xfs_bmap_add_attrfork_btree( xfs_inode_t *ip, /* incore inode pointer */ int *flags) /* inode logging flags */ { + struct xfs_btree_block *block = ip->i_df.if_broot; xfs_btree_cur_t *cur; /* btree cursor */ int error; /* error return value */ xfs_mount_t *mp; /* file system mount struct */ int stat; /* newroot status */
mp = ip->i_mount; - if (ip->i_df.if_broot_bytes <= xfs_inode_data_fork_size(ip)) + + if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v6.1-rc1 commit a38ebce1da271f480e47c3def4f810c6106b74a1 category: bugfix bugzilla: 188220, https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Starting in 6.1, CONFIG_FORTIFY_SOURCE checks the length parameter of memcpy. Unfortunately, it doesn't handle flex arrays correctly:
------------[ cut here ]------------ memcpy: detected field-spanning write (size 48) of single field "dst_bui_fmt" at fs/xfs/xfs_bmap_item.c:628 (size 16)
Fix this by refactoring the xfs_bui_copy_format function to handle the copying of the head and the flex array members separately. While we're at it, fix a minor validation deficiency in the recovery function.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Reviewed-by: Dave Chinner dchinner@redhat.com
conflicts: fs/xfs/xfs_ondisk.h
Signed-off-by: Long Li leo.lilong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_bmap_item.c | 46 ++++++++++++++++++++---------------------- fs/xfs/xfs_ondisk.h | 4 ++++ 2 files changed, 26 insertions(+), 24 deletions(-)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e6de8081451f..c9dae4b98e13 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -590,28 +590,18 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_relog = xfs_bui_item_relog, };
-/* - * Copy an BUI format buffer from the given buf, and into the destination - * BUI format structure. The BUI/BUD items were designed not to need any - * special alignment handling. - */ -static int +static inline void xfs_bui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_bui_log_format *dst_bui_fmt) + struct xfs_bui_log_format *dst, + const struct xfs_bui_log_format *src) { - struct xfs_bui_log_format *src_bui_fmt; - uint len; + unsigned int i;
- src_bui_fmt = buf->i_addr; - len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + memcpy(dst, src, offsetof(struct xfs_bui_log_format, bui_extents));
- if (buf->i_len == len) { - memcpy(dst_bui_fmt, src_bui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->bui_nextents; i++) + memcpy(&dst->bui_extents[i], &src->bui_extents[i], + sizeof(struct xfs_map_extent)); }
/* @@ -628,23 +618,31 @@ xlog_recover_bui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_bui_log_item *buip; struct xfs_bui_log_format *bui_formatp; + size_t len;
bui_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; } - buip = xfs_bui_init(mp); - error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); - if (error) { - xfs_bui_item_free(buip); - return error; + + len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; } + + buip = xfs_bui_init(mp); + xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); /* * Insert the intent into the AIL directly and drop one reference so diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0aa87c210104..0b85f90912cf 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -132,7 +132,11 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32);
+ XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); /* * The v5 superblock format extended several v4 header structures with * additional data. While new fields are only accessible on v5
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v6.1-rc1 commit a38935c03c7914a6ab22eefb750b259868ed5a4b category: bugfix bugzilla: 188220, https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Starting in 6.1, CONFIG_FORTIFY_SOURCE checks the length parameter of memcpy. Since we're already fixing problems with BUI item copying, we should fix it everything else.
Refactor the xfs_cui_copy_format function to handle the copying of the head and the flex array members separately. While we're at it, fix a minor validation deficiency in the recovery function.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Reviewed-by: Dave Chinner dchinner@redhat.com
conflicts: fs/xfs/xfs_ondisk.h
Signed-off-by: Long Li leo.lilong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_ondisk.h | 5 +++++ fs/xfs/xfs_refcount_item.c | 45 ++++++++++++++++++-------------------- 2 files changed, 26 insertions(+), 24 deletions(-)
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0b85f90912cf..ce1ee34fef03 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -134,9 +134,14 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); + /* * The v5 superblock format extended several v4 header structures with * additional data. While new fields are only accessible on v5 diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 9f4ff45c7a93..4840c194fa24 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -598,28 +598,18 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_relog = xfs_cui_item_relog, };
-/* - * Copy an CUI format buffer from the given buf, and into the destination - * CUI format structure. The CUI/CUD items were designed not to need any - * special alignment handling. - */ -static int +static inline void xfs_cui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_cui_log_format *dst_cui_fmt) + struct xfs_cui_log_format *dst, + const struct xfs_cui_log_format *src) { - struct xfs_cui_log_format *src_cui_fmt; - uint len; + unsigned int i;
- src_cui_fmt = buf->i_addr; - len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + memcpy(dst, src, offsetof(struct xfs_cui_log_format, cui_extents));
- if (buf->i_len == len) { - memcpy(dst_cui_fmt, src_cui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->cui_nextents; i++) + memcpy(&dst->cui_extents[i], &src->cui_extents[i], + sizeof(struct xfs_phys_extent)); }
/* @@ -636,19 +626,26 @@ xlog_recover_cui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_cui_log_item *cuip; struct xfs_cui_log_format *cui_formatp; + size_t len;
cui_formatp = item->ri_buf[0].i_addr;
- cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); - error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); - if (error) { - xfs_cui_item_free(cuip); - return error; + if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; } + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); /* * Insert the intent into the AIL directly and drop one reference so
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v6.1-rc1 commit b45ca961e94673df83ab1900802afe82776966e6 category: bugfix bugzilla: 188220, https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Starting in 6.1, CONFIG_FORTIFY_SOURCE checks the length parameter of memcpy. Since we're already fixing problems with BUI item copying, we should fix it everything else.
Refactor the xfs_rui_copy_format function to handle the copying of the head and the flex array members separately. While we're at it, fix a minor validation deficiency in the recovery function.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Long Li leo.lilong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_ondisk.h | 3 +++ fs/xfs/xfs_rmap_item.c | 58 ++++++++++++++++++++---------------------- 2 files changed, 30 insertions(+), 31 deletions(-)
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index ce1ee34fef03..6ae655f769eb 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -136,11 +136,14 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16);
/* * The v5 superblock format extended several v4 header structures with diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index b5447ac7cb9b..4226895851b0 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -154,31 +154,6 @@ xfs_rui_init( return ruip; }
-/* - * Copy an RUI format buffer from the given buf, and into the destination - * RUI format structure. The RUI/RUD items were designed not to need any - * special alignment handling. - */ -STATIC int -xfs_rui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_rui_log_format *dst_rui_fmt) -{ - struct xfs_rui_log_format *src_rui_fmt; - uint len; - - src_rui_fmt = buf->i_addr; - len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); - - if (buf->i_len != len) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; - } - - memcpy(dst_rui_fmt, src_rui_fmt, len); - return 0; -} - static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rud_log_item, rud_item); @@ -621,6 +596,20 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_relog = xfs_rui_item_relog, };
+static inline void +xfs_rui_copy_format( + struct xfs_rui_log_format *dst, + const struct xfs_rui_log_format *src) +{ + unsigned int i; + + memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents)); + + for (i = 0; i < src->rui_nextents; i++) + memcpy(&dst->rui_extents[i], &src->rui_extents[i], + sizeof(struct xfs_map_extent)); +} + /* * This routine is called to create an in-core extent rmap update * item from the rui format structure which was logged on disk. @@ -635,19 +624,26 @@ xlog_recover_rui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_rui_log_item *ruip; struct xfs_rui_log_format *rui_formatp; + size_t len;
rui_formatp = item->ri_buf[0].i_addr;
- ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); - error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); - if (error) { - xfs_rui_item_free(ruip); - return error; + if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; } + + len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); /* * Insert the intent into the AIL directly and drop one reference so
From: "Darrick J. Wong" djwong@kernel.org
mainline inclusion from mainline-v6.1-rc1 commit 03a7485cd701e1c08baadcf39d9592d83715e224 category: bugfix bugzilla: 188220, https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Starting in 6.1, CONFIG_FORTIFY_SOURCE checks the length parameter of memcpy. Since we're already fixing problems with BUI item copying, we should fix it everything else.
An extra difficulty here is that the ef[id]_extents arrays are declared as single-element arrays. This is not the convention for flex arrays in the modern kernel, and it causes all manner of problems with static checking tools, since they often cannot tell the difference between a single element array and a flex array.
So for starters, change those array[1] declarations to array[] declarations to signal that they are proper flex arrays and adjust all the "size-1" expressions to fit the new declaration style.
Next, refactor the xfs_efi_copy_format function to handle the copying of the head and the flex array members separately. While we're at it, fix a minor validation deficiency in the recovery function.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Kees Cook keescook@chromium.org Reviewed-by: Allison Henderson allison.henderson@oracle.com Reviewed-by: Dave Chinner dchinner@redhat.com
conflicts: fs/xfs/xfs_extfree_item.c fs/xfs/xfs_super.c
Signed-off-by: Long Li leo.lilong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/libxfs/xfs_log_format.h | 12 ++++++------ fs/xfs/xfs_extfree_item.c | 31 +++++++++++++++++++++---------- fs/xfs/xfs_ondisk.h | 11 +++++++---- fs/xfs/xfs_super.c | 4 ++-- 4 files changed, 36 insertions(+), 22 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 954f137d10b7..9e6e2de04287 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -580,7 +580,7 @@ typedef struct xfs_efi_log_format { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[1]; /* array of extents to free */ + xfs_extent_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_t;
typedef struct xfs_efi_log_format_32 { @@ -588,7 +588,7 @@ typedef struct xfs_efi_log_format_32 { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[1]; /* array of extents to free */ + xfs_extent_32_t efi_extents[]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t;
typedef struct xfs_efi_log_format_64 { @@ -596,7 +596,7 @@ typedef struct xfs_efi_log_format_64 { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[1]; /* array of extents to free */ + xfs_extent_64_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_64_t;
/* @@ -609,7 +609,7 @@ typedef struct xfs_efd_log_format { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[1]; /* array of extents freed */ + xfs_extent_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_t;
typedef struct xfs_efd_log_format_32 { @@ -617,7 +617,7 @@ typedef struct xfs_efd_log_format_32 { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[1]; /* array of extents freed */ + xfs_extent_32_t efd_extents[]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t;
typedef struct xfs_efd_log_format_64 { @@ -625,7 +625,7 @@ typedef struct xfs_efd_log_format_64 { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[1]; /* array of extents freed */ + xfs_extent_64_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_64_t;
/* diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 11474770d630..993605079682 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -74,7 +74,7 @@ xfs_efi_item_sizeof( struct xfs_efi_log_item *efip) { return sizeof(struct xfs_efi_log_format) + - (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); + efip->efi_format.efi_nextents * sizeof(xfs_extent_t); }
STATIC void @@ -158,7 +158,7 @@ xfs_efi_init( ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { size = (uint)(sizeof(struct xfs_efi_log_item) + - ((nextents - 1) * sizeof(xfs_extent_t))); + (nextents * sizeof(xfs_extent_t))); efip = kmem_zalloc(size, 0); } else { efip = kmem_cache_zalloc(xfs_efi_zone, @@ -187,14 +187,19 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; uint len = sizeof(xfs_efi_log_format_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); + src_efi_fmt->efi_nextents * sizeof(xfs_extent_t); uint len32 = sizeof(xfs_efi_log_format_32_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); + src_efi_fmt->efi_nextents * sizeof(xfs_extent_32_t); uint len64 = sizeof(xfs_efi_log_format_64_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + src_efi_fmt->efi_nextents * sizeof(xfs_extent_64_t);
if (buf->i_len == len) { - memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); + memcpy(dst_efi_fmt, src_efi_fmt, + offsetof(struct xfs_efi_log_format, efi_extents)); + for (i = 0; i < src_efi_fmt->efi_nextents; i++) + memcpy(&dst_efi_fmt->efi_extents[i], + &src_efi_fmt->efi_extents[i], + sizeof(struct xfs_extent)); return 0; } else if (buf->i_len == len32) { xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; @@ -254,7 +259,7 @@ xfs_efd_item_sizeof( struct xfs_efd_log_item *efdp) { return sizeof(xfs_efd_log_format_t) + - (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); + efdp->efd_format.efd_nextents * sizeof(xfs_extent_t); }
STATIC void @@ -330,7 +335,7 @@ xfs_trans_get_efd(
if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + - (nextents - 1) * sizeof(struct xfs_extent), + nextents * sizeof(struct xfs_extent), 0); } else { efdp = kmem_cache_zalloc(xfs_efd_zone, @@ -701,6 +706,12 @@ xlog_recover_efi_commit_pass2(
efi_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len < + offsetof(struct xfs_efi_log_format, efi_extents)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); if (error) { @@ -740,9 +751,9 @@ xlog_recover_efd_commit_pass2(
efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || + (efd_formatp->efd_nextents * sizeof(xfs_extent_32_t)))) || (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + (efd_formatp->efd_nextents * sizeof(xfs_extent_64_t)))));
xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id); return 0; diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 6ae655f769eb..e0738944e1be 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -118,10 +118,10 @@ xfs_check_ondisk_structs(void) /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); - XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); @@ -144,6 +144,9 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16);
/* * The v5 superblock format extended several v4 header structures with diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9b4ff8247de2..04110183f3ed 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2023,7 +2023,7 @@ xfs_init_zones(void)
xfs_efd_zone = kmem_cache_create("xfs_efd_item", (sizeof(struct xfs_efd_log_item) + - (XFS_EFD_MAX_FAST_EXTENTS - 1) * + XFS_EFD_MAX_FAST_EXTENTS * sizeof(struct xfs_extent)), 0, 0, NULL); if (!xfs_efd_zone) @@ -2031,7 +2031,7 @@ xfs_init_zones(void)
xfs_efi_zone = kmem_cache_create("xfs_efi_item", (sizeof(struct xfs_efi_log_item) + - (XFS_EFI_MAX_FAST_EXTENTS - 1) * + XFS_EFI_MAX_FAST_EXTENTS * sizeof(struct xfs_extent)), 0, 0, NULL); if (!xfs_efi_zone)
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.12-rc4 commit 5825bea05265d2938c4c20a1c0f8b7d7ab59523d category: bugfix bugzilla: 188483, https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
- 21.92% __xfs_trans_commit - 21.62% xfs_log_commit_cil - 11.69% xfs_trans_unreserve_and_mod_sb - 11.58% __percpu_counter_compare - 11.45% __percpu_counter_sum - 10.29% _raw_spin_lock_irqsave - 10.28% do_raw_spin_lock __pv_queued_spin_lock_slowpath
We debated just getting rid of it last time this came up and there was no real objection to removing it. Now it's the biggest scalability limitation for debug kernels even on smallish machines, so let's just get rid of it.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Long Li leo.lilong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_trans.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8836bb02d82d..a4275322720a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -608,19 +608,12 @@ xfs_trans_unreserve_and_mod_sb( ASSERT(!error); }
- if (idelta) { + if (idelta) percpu_counter_add_batch(&mp->m_icount, idelta, XFS_ICOUNT_BATCH); - if (idelta < 0) - ASSERT(__percpu_counter_compare(&mp->m_icount, 0, - XFS_ICOUNT_BATCH) >= 0); - }
- if (ifreedelta) { + if (ifreedelta) percpu_counter_add(&mp->m_ifree, ifreedelta); - if (ifreedelta < 0) - ASSERT(percpu_counter_compare(&mp->m_ifree, 0) >= 0); - }
if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) return;
From: Zheng Wang zyytlz.wz@163.com
mainline inclusion from mainline-v6.3-rc3 commit 5000fe6c27827a61d8250a7e4a1d26c3298ef4f6 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6UW64 CVE: CVE-2023-1990
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id...
--------------------------------
This bug influences both st_nci_i2c_remove and st_nci_spi_remove. Take st_nci_i2c_remove as an example.
In st_nci_i2c_probe, it called ndlc_probe and bound &ndlc->sm_work with llt_ndlc_sm_work.
When it calls ndlc_recv or timeout handler, it will finally call schedule_work to start the work.
When we call st_nci_i2c_remove to remove the driver, there may be a sequence as follows:
Fix it by finishing the work before cleanup in ndlc_remove
CPU0 CPU1
|llt_ndlc_sm_work st_nci_i2c_remove | ndlc_remove | st_nci_remove | nci_free_device| kfree(ndev) | //free ndlc->ndev | |llt_ndlc_rcv_queue |nci_recv_frame |//use ndlc->ndev
Fixes: 35630df68d60 ("NFC: st21nfcb: Add driver for STMicroelectronics ST21NFCB NFC chip") Signed-off-by: Zheng Wang zyytlz.wz@163.com Reviewed-by: Krzysztof Kozlowski krzysztof.kozlowski@linaro.org Link: https://lore.kernel.org/r/20230312160837.2040857-1-zyytlz.wz@163.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Wei Li liwei391@huawei.com Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/nfc/st-nci/ndlc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c index 5d74c674368a..8ccf5a86ad1b 100644 --- a/drivers/nfc/st-nci/ndlc.c +++ b/drivers/nfc/st-nci/ndlc.c @@ -286,13 +286,15 @@ EXPORT_SYMBOL(ndlc_probe);
void ndlc_remove(struct llt_ndlc *ndlc) { - st_nci_remove(ndlc->ndev); - /* cancel timers */ del_timer_sync(&ndlc->t1_timer); del_timer_sync(&ndlc->t2_timer); ndlc->t2_active = false; ndlc->t1_active = false; + /* cancel work */ + cancel_work_sync(&ndlc->sm_work); + + st_nci_remove(ndlc->ndev);
skb_queue_purge(&ndlc->rcv_q); skb_queue_purge(&ndlc->send_q);
From: Paolo Bonzini pbonzini@redhat.com
stable inclusion from stable-v5.10.176 commit c54974ccaff73525462e278602dfe4069877cfaa category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6U7AN CVE: CVE-2023-30456
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 112e66017bff7f2837030f34c2bc19501e9212d5 upstream.
The effective values of the guest CR0 and CR4 registers may differ from those included in the VMCS12. In particular, disabling EPT forces CR4.PAE=1 and disabling unrestricted guest mode forces CR0.PG=CR0.PE=1.
Therefore, checks on these bits cannot be delegated to the processor and must be performed by KVM.
Reported-by: Reima ISHII ishiir@g.ecc.u-tokyo.ac.jp Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Lin Yujun linyujun809@huawei.com Reviewed-by: Zhang Jianhua chris.zjh@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- arch/x86/kvm/vmx/nested.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 42d9345b10c3..3e46baff2a7a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2986,7 +2986,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enum vm_entry_failure_code *entry_failure_code) { - bool ia32e; + bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
*entry_failure_code = ENTRY_FAIL_DEFAULT;
@@ -3012,6 +3012,13 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, vmcs12->guest_ia32_perf_global_ctrl))) return -EINVAL;
+ if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) + return -EINVAL; + + if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || + CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) + return -EINVAL; + /* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: @@ -3023,7 +3030,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, */ if (to_vmx(vcpu)->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { - ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
From: Zheng Wang zyytlz.wz@163.com
mainline inclusion from mainline-v6.3-rc7 commit 73f7b171b7c09139eb3c6a5677c200dc1be5f318 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6UW68 CVE: CVE-2023-1989
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
In btsdio_probe, the data->work is bound with btsdio_work. It will be started in btsdio_send_frame.
If the btsdio_remove runs with a unfinished work, there may be a race condition that hdev is freed but used in btsdio_work. Fix it by canceling the work before do cleanup in btsdio_remove.
Signed-off-by: Zheng Wang zyytlz.wz@163.com Signed-off-by: Luiz Augusto von Dentz luiz.von.dentz@intel.com Signed-off-by: Liu Jian liujian56@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/bluetooth/btsdio.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c index 199e8f7d426d..2e4ac39dd975 100644 --- a/drivers/bluetooth/btsdio.c +++ b/drivers/bluetooth/btsdio.c @@ -355,6 +355,7 @@ static void btsdio_remove(struct sdio_func *func) if (!data) return;
+ cancel_work_sync(&data->work); hdev = data->hdev;
sdio_set_drvdata(func, NULL);
From: Pablo Neira Ayuso pablo@netfilter.org
mainline inclusion from mainline-v5.18-rc1 commit 6e1acfa387b9ff82cfc7db8cc3b6959221a95851 category: bugfix bugzilla: 186582, https://gitee.com/src-openeuler/kernel/issues/I50WB5 CVE: CVE-2022-1015
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Bail out in case userspace uses unsupported registers.
Fixes: 49499c3e6e18 ("netfilter: nf_tables: switch registers to 32 bit addressing") Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org
conflict: net/netfilter/nf_tables_api.c
Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Liu Jian liujian56@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e66fee99ed3e..48b85b8deabd 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -205,7 +205,7 @@ static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) }
int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); -unsigned int nft_parse_register(const struct nlattr *attr); +unsigned int nft_parse_register(const struct nlattr *attr, u32 *preg); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg);
int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1b039476e4d6..8870cb0fb0e5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8479,17 +8479,23 @@ EXPORT_SYMBOL_GPL(nft_parse_u32_check); * Registers used to be 128 bit wide, these register numbers will be * mapped to the corresponding 32 bit register numbers. */ -unsigned int nft_parse_register(const struct nlattr *attr) +unsigned int nft_parse_register(const struct nlattr *attr, u32 *preg) { unsigned int reg;
reg = ntohl(nla_get_be32(attr)); switch (reg) { case NFT_REG_VERDICT...NFT_REG_4: - return reg * NFT_REG_SIZE / NFT_REG32_SIZE; + *preg = reg * NFT_REG_SIZE / NFT_REG32_SIZE; + break; + case NFT_REG32_00...NFT_REG32_15: + *preg = reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; + break; default: - return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; + return -ERANGE; } + + return 0; } EXPORT_SYMBOL_GPL(nft_parse_register);
@@ -8541,7 +8547,10 @@ int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len) u32 reg; int err;
- reg = nft_parse_register(attr); + err = nft_parse_register(attr, ®); + if (err < 0) + return err; + err = nft_validate_register_load(reg, len); if (err < 0) return err; @@ -8610,7 +8619,10 @@ int nft_parse_register_store(const struct nft_ctx *ctx, int err; u32 reg;
- reg = nft_parse_register(attr); + err = nft_parse_register(attr, ®); + if (err < 0) + return err; + err = nft_validate_register_store(ctx, reg, data, type, len); if (err < 0) return err;
From: Antoine Tenart atenart@kernel.org
mainline inclusion from mainline-v5.18-rc3 commit 6c6f9f31ecd47dce1d0dafca4bec8805f9bc97cd category: bugfix bugzilla: 186582, https://gitee.com/src-openeuler/kernel/issues/I50WB5 CVE: CVE-2022-1015
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Since commit 6e1acfa387b9 ("netfilter: nf_tables: validate registers coming from userspace.") nft_parse_register can return a negative value, but the function prototype is still returning an unsigned int.
Fixes: 6e1acfa387b9 ("netfilter: nf_tables: validate registers coming from userspace.") Signed-off-by: Antoine Tenart atenart@kernel.org Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org
conflict: net/netfilter/nf_tables_api.c
Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Liu Jian liujian56@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 48b85b8deabd..5b6803cd3299 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -205,7 +205,7 @@ static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) }
int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); -unsigned int nft_parse_register(const struct nlattr *attr, u32 *preg); +int nft_parse_register(const struct nlattr *attr, u32 *preg); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg);
int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8870cb0fb0e5..db54c64bd9b3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8479,7 +8479,7 @@ EXPORT_SYMBOL_GPL(nft_parse_u32_check); * Registers used to be 128 bit wide, these register numbers will be * mapped to the corresponding 32 bit register numbers. */ -unsigned int nft_parse_register(const struct nlattr *attr, u32 *preg) +int nft_parse_register(const struct nlattr *attr, u32 *preg) { unsigned int reg;
From: Zack Rusin zackr@vmware.com
stable inclusion from stable-v5.10.163 commit 439cbbc1519547f9a7b483f0de33b556ebfec901 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5QLC4 CVE:CVE-2022-36280
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 4cf949c7fafe21e085a4ee386bb2dade9067316e upstream.
Invalid userspace dma surface copies could potentially overflow the memcpy from the surface to the snooped image leading to crashes. To fix it the dimensions of the copybox have to be validated against the expected size of the snooped cursor.
Signed-off-by: Zack Rusin zackr@vmware.com Fixes: 2ac863719e51 ("vmwgfx: Snoop DMA transfers with non-covering sizes") Cc: stable@vger.kernel.org # v3.2+ Reviewed-by: Michael Banack banackm@vmware.com Reviewed-by: Martin Krastev krastevm@vmware.com Link: https://patchwork.freedesktop.org/patch/msgid/20221026031936.1004280-1-zack@... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yuyao Lin linyuyao1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index e58112997c88..0e963fd7db17 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -182,7 +182,8 @@ void vmw_kms_cursor_snoop(struct vmw_surface *srf, if (cmd->dma.guest.ptr.offset % PAGE_SIZE || box->x != 0 || box->y != 0 || box->z != 0 || box->srcx != 0 || box->srcy != 0 || box->srcz != 0 || - box->d != 1 || box_count != 1) { + box->d != 1 || box_count != 1 || + box->w > 64 || box->h > 64) { /* TODO handle none page aligned offsets */ /* TODO handle more dst & src != 0 */ /* TODO handle more then one copy */
From: Jamal Hadi Salim jhs@mojatatu.com
stable inclusion from stable-v5.10.173 commit 18c3fa7a7fdbb4d21dafc8a7710ae2c1680930f6 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6UYBU CVE: CVE-2023-1829
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 8c710f75256bb3cf05ac7b1672c82b92c43f3d28 upstream.
The tcindex classifier has served us well for about a quarter of a century but has not been getting much TLC due to lack of known users. Most recently it has become easy prey to syzkaller. For this reason, we are retiring it.
Signed-off-by: Jamal Hadi Salim jhs@mojatatu.com Acked-by: Jiri Pirko jiri@nvidia.com Signed-off-by: Paolo Abeni pabeni@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
Conflicts: net/sched/cls_tcindex.c
Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- net/sched/Kconfig | 11 - net/sched/Makefile | 1 - net/sched/cls_tcindex.c | 764 ---------------------------------------- 3 files changed, 776 deletions(-) delete mode 100644 net/sched/cls_tcindex.c
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index d762e89ab74f..8f25d35dae7b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -503,17 +503,6 @@ config NET_CLS_BASIC To compile this code as a module, choose M here: the module will be called cls_basic.
-config NET_CLS_TCINDEX - tristate "Traffic-Control Index (TCINDEX)" - select NET_CLS - help - Say Y here if you want to be able to classify packets based on - traffic control indices. You will want this feature if you want - to implement Differentiated Services together with DSMARK. - - To compile this code as a module, choose M here: the - module will be called cls_tcindex. - config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" depends on INET diff --git a/net/sched/Makefile b/net/sched/Makefile index 66bbf9a98f9e..4311fdb21119 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -69,7 +69,6 @@ obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o -obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c deleted file mode 100644 index 56612caf7833..000000000000 --- a/net/sched/cls_tcindex.c +++ /dev/null @@ -1,764 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/sched/cls_tcindex.c Packet classifier for skb->tc_index - * - * Written 1998,1999 by Werner Almesberger, EPFL ICA - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/refcount.h> -#include <linux/rcupdate.h> -#include <net/act_api.h> -#include <net/netlink.h> -#include <net/pkt_cls.h> -#include <net/sch_generic.h> - -/* - * Passing parameters to the root seems to be done more awkwardly than really - * necessary. At least, u32 doesn't seem to use such dirty hacks. To be - * verified. FIXME. - */ - -#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ -#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ - - -struct tcindex_data; - -struct tcindex_filter_result { - struct tcf_exts exts; - struct tcf_result res; - struct tcindex_data *p; - struct rcu_work rwork; -}; - -struct tcindex_filter { - u16 key; - struct tcindex_filter_result result; - struct tcindex_filter __rcu *next; - struct rcu_work rwork; -}; - - -struct tcindex_data { - struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ - struct tcindex_filter __rcu **h; /* imperfect hash; */ - struct tcf_proto *tp; - u16 mask; /* AND key with mask */ - u32 shift; /* shift ANDed key to the right */ - u32 hash; /* hash table size; 0 if undefined */ - u32 alloc_hash; /* allocated size */ - u32 fall_through; /* 0: only classify if explicit match */ - refcount_t refcnt; /* a temporary refcnt for perfect hash */ - struct rcu_work rwork; -}; - -static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) -{ - return tcf_exts_has_actions(&r->exts) || r->res.classid; -} - -static void tcindex_data_get(struct tcindex_data *p) -{ - refcount_inc(&p->refcnt); -} - -static void tcindex_data_put(struct tcindex_data *p) -{ - if (refcount_dec_and_test(&p->refcnt)) { - kfree(p->perfect); - kfree(p->h); - kfree(p); - } -} - -static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, - u16 key) -{ - if (p->perfect) { - struct tcindex_filter_result *f = p->perfect + key; - - return tcindex_filter_is_set(f) ? f : NULL; - } else if (p->h) { - struct tcindex_filter __rcu **fp; - struct tcindex_filter *f; - - fp = &p->h[key % p->hash]; - for (f = rcu_dereference_bh_rtnl(*fp); - f; - fp = &f->next, f = rcu_dereference_bh_rtnl(*fp)) - if (f->key == key) - return &f->result; - } - - return NULL; -} - - -static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - struct tcindex_data *p = rcu_dereference_bh(tp->root); - struct tcindex_filter_result *f; - int key = (skb->tc_index & p->mask) >> p->shift; - - pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n", - skb, tp, res, p); - - f = tcindex_lookup(p, key); - if (!f) { - struct Qdisc *q = tcf_block_q(tp->chain->block); - - if (!p->fall_through) - return -1; - res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key); - res->class = 0; - pr_debug("alg 0x%x\n", res->classid); - return 0; - } - *res = f->res; - pr_debug("map 0x%x\n", res->classid); - - return tcf_exts_exec(skb, &f->exts, res); -} - - -static void *tcindex_get(struct tcf_proto *tp, u32 handle) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r; - - pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); - if (p->perfect && handle >= p->alloc_hash) - return NULL; - r = tcindex_lookup(p, handle); - return r && tcindex_filter_is_set(r) ? r : NULL; -} - -static int tcindex_init(struct tcf_proto *tp) -{ - struct tcindex_data *p; - - pr_debug("tcindex_init(tp %p)\n", tp); - p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL); - if (!p) - return -ENOMEM; - - p->mask = 0xffff; - p->hash = DEFAULT_HASH_SIZE; - p->fall_through = 1; - refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */ - - rcu_assign_pointer(tp->root, p); - return 0; -} - -static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) -{ - tcf_exts_destroy(&r->exts); - tcf_exts_put_net(&r->exts); - tcindex_data_put(r->p); -} - -static void tcindex_destroy_rexts_work(struct work_struct *work) -{ - struct tcindex_filter_result *r; - - r = container_of(to_rcu_work(work), - struct tcindex_filter_result, - rwork); - rtnl_lock(); - __tcindex_destroy_rexts(r); - rtnl_unlock(); -} - -static void __tcindex_destroy_fexts(struct tcindex_filter *f) -{ - tcf_exts_destroy(&f->result.exts); - tcf_exts_put_net(&f->result.exts); - kfree(f); -} - -static void tcindex_destroy_fexts_work(struct work_struct *work) -{ - struct tcindex_filter *f = container_of(to_rcu_work(work), - struct tcindex_filter, - rwork); - - rtnl_lock(); - __tcindex_destroy_fexts(f); - rtnl_unlock(); -} - -static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, - bool rtnl_held, struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = arg; - struct tcindex_filter __rcu **walk; - struct tcindex_filter *f = NULL; - - pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p); - if (p->perfect) { - if (!r->res.class) - return -ENOENT; - } else { - int i; - - for (i = 0; i < p->hash; i++) { - walk = p->h + i; - for (f = rtnl_dereference(*walk); f; - walk = &f->next, f = rtnl_dereference(*walk)) { - if (&f->result == r) - goto found; - } - } - return -ENOENT; - -found: - rcu_assign_pointer(*walk, rtnl_dereference(f->next)); - } - tcf_unbind_filter(tp, &r->res); - /* all classifiers are required to call tcf_exts_destroy() after rcu - * grace period, since converted-to-rcu actions are relying on that - * in cleanup() callback - */ - if (f) { - if (tcf_exts_get_net(&f->result.exts)) - tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work); - else - __tcindex_destroy_fexts(f); - } else { - tcindex_data_get(p); - - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - - *last = false; - return 0; -} - -static void tcindex_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - tcindex_data_put(p); -} - -static inline int -valid_perfect_hash(struct tcindex_data *p) -{ - return p->hash > (p->mask >> p->shift); -} - -static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { - [TCA_TCINDEX_HASH] = { .type = NLA_U32 }, - [TCA_TCINDEX_MASK] = { .type = NLA_U16 }, - [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 }, - [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 }, - [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, -}; - -static int tcindex_filter_result_init(struct tcindex_filter_result *r, - struct tcindex_data *p, - struct net *net) -{ - memset(r, 0, sizeof(*r)); - r->p = p; - return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, - TCA_TCINDEX_POLICE); -} - -static void tcindex_free_perfect_hash(struct tcindex_data *cp); - -static void tcindex_partial_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - rtnl_lock(); - if (p->perfect) - tcindex_free_perfect_hash(p); - kfree(p); - rtnl_unlock(); -} - -static void tcindex_free_perfect_hash(struct tcindex_data *cp) -{ - int i; - - for (i = 0; i < cp->hash; i++) - tcf_exts_destroy(&cp->perfect[i].exts); - kfree(cp->perfect); -} - -static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) -{ - int i, err = 0; - - cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), - GFP_KERNEL | __GFP_NOWARN); - if (!cp->perfect) - return -ENOMEM; - - for (i = 0; i < cp->hash; i++) { - err = tcf_exts_init(&cp->perfect[i].exts, net, - TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - goto errout; - cp->perfect[i].p = cp; - } - - return 0; - -errout: - tcindex_free_perfect_hash(cp); - return err; -} - -static int -tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, - u32 handle, struct tcindex_data *p, - struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) -{ - struct tcindex_filter_result new_filter_result, *old_r = r; - struct tcindex_data *cp = NULL, *oldp; - struct tcindex_filter *f = NULL; /* make gcc behave */ - struct tcf_result cr = {}; - int err, balloc = 0; - struct tcf_exts e; - bool update_h = false; - - err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - return err; - err = tcf_exts_validate(net, tp, tb, est, &e, ovr, true, extack); - if (err < 0) - goto errout; - - err = -ENOMEM; - /* tcindex_data attributes must look atomic to classifier/lookup so - * allocate new tcindex data and RCU assign it onto root. Keeping - * perfect hash and hash pointers from old data. - */ - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) - goto errout; - - cp->mask = p->mask; - cp->shift = p->shift; - cp->hash = p->hash; - cp->alloc_hash = p->alloc_hash; - cp->fall_through = p->fall_through; - cp->tp = tp; - refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */ - - if (tb[TCA_TCINDEX_HASH]) - cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - - if (tb[TCA_TCINDEX_MASK]) - cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - - if (tb[TCA_TCINDEX_SHIFT]) { - cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - if (cp->shift > 16) { - err = -EINVAL; - goto errout; - } - } - if (!cp->hash) { - /* Hash not specified, use perfect hash if the upper limit - * of the hashing index is below the threshold. - */ - if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) - cp->hash = (cp->mask >> cp->shift) + 1; - else - cp->hash = DEFAULT_HASH_SIZE; - } - - if (p->perfect) { - int i; - - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout; - cp->alloc_hash = cp->hash; - for (i = 0; i < min(cp->hash, p->hash); i++) - cp->perfect[i].res = p->perfect[i].res; - balloc = 1; - } - cp->h = p->h; - - err = tcindex_filter_result_init(&new_filter_result, cp, net); - if (err < 0) - goto errout_alloc; - if (old_r) - cr = r->res; - - err = -EBUSY; - - /* Hash already allocated, make sure that we still meet the - * requirements for the allocated hash. - */ - if (cp->perfect) { - if (!valid_perfect_hash(cp) || - cp->hash > cp->alloc_hash) - goto errout_alloc; - } else if (cp->h && cp->hash != cp->alloc_hash) { - goto errout_alloc; - } - - err = -EINVAL; - if (tb[TCA_TCINDEX_FALL_THROUGH]) - cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - - if (!cp->perfect && !cp->h) - cp->alloc_hash = cp->hash; - - /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) - * but then, we'd fail handles that may become valid after some future - * mask change. While this is extremely unlikely to ever matter, - * the check below is safer (and also more backwards-compatible). - */ - if (cp->perfect || valid_perfect_hash(cp)) - if (handle >= cp->alloc_hash) - goto errout_alloc; - - - err = -ENOMEM; - if (!cp->perfect && !cp->h) { - if (valid_perfect_hash(cp)) { - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout_alloc; - balloc = 1; - } else { - struct tcindex_filter __rcu **hash; - - hash = kcalloc(cp->hash, - sizeof(struct tcindex_filter *), - GFP_KERNEL); - - if (!hash) - goto errout_alloc; - - cp->h = hash; - balloc = 2; - } - } - - if (cp->perfect) { - r = cp->perfect + handle; - } else { - /* imperfect area is updated in-place using rcu */ - update_h = !!tcindex_lookup(cp, handle); - r = &new_filter_result; - } - - if (r == &new_filter_result) { - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (!f) - goto errout_alloc; - f->key = handle; - f->next = NULL; - err = tcindex_filter_result_init(&f->result, cp, net); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - - if (tb[TCA_TCINDEX_CLASSID]) { - cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); - tcf_bind_filter(tp, &cr, base); - } - - if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r, cp, net); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - - oldp = p; - r->res = cr; - tcf_exts_change(&r->exts, &e); - - rcu_assign_pointer(tp->root, cp); - - if (update_h) { - struct tcindex_filter __rcu **fp; - struct tcindex_filter *cf; - - f->result.res = r->res; - tcf_exts_change(&f->result.exts, &r->exts); - - /* imperfect area bucket */ - fp = cp->h + (handle % cp->hash); - - /* lookup the filter, guaranteed to exist */ - for (cf = rcu_dereference_bh_rtnl(*fp); cf; - fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp)) - if (cf->key == (u16)handle) - break; - - f->next = cf->next; - - cf = rcu_replace_pointer(*fp, f, 1); - tcf_exts_get_net(&cf->result.exts); - tcf_queue_work(&cf->rwork, tcindex_destroy_fexts_work); - } else if (r == &new_filter_result) { - struct tcindex_filter *nfp; - struct tcindex_filter __rcu **fp; - - f->result.res = r->res; - tcf_exts_change(&f->result.exts, &r->exts); - - fp = cp->h + (handle % cp->hash); - for (nfp = rtnl_dereference(*fp); - nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) - ; /* nothing */ - - rcu_assign_pointer(*fp, f); - } else { - tcf_exts_destroy(&new_filter_result.exts); - } - - if (oldp) - tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work); - return 0; - -errout_alloc: - if (balloc == 1) - tcindex_free_perfect_hash(cp); - else if (balloc == 2) - kfree(cp->h); - tcf_exts_destroy(&new_filter_result.exts); -errout: - kfree(cp); - tcf_exts_destroy(&e); - return err; -} - -static int -tcindex_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) -{ - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = *arg; - int err; - - pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," - "p %p,r %p,*arg %p\n", - tp, handle, tca, arg, opt, p, r, *arg); - - if (!opt) - return 0; - - err = nla_parse_nested_deprecated(tb, TCA_TCINDEX_MAX, opt, - tcindex_policy, NULL); - if (err < 0) - return err; - - return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], ovr, extack); -} - -static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, - bool rtnl_held) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter *f, *next; - int i; - - pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p); - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - if (!p->perfect[i].res.class) - continue; - if (walker->count >= walker->skip) { - if (walker->fn(tp, p->perfect + i, walker) < 0) { - walker->stop = 1; - return; - } - } - walker->count++; - } - } - if (!p->h) - return; - for (i = 0; i < p->hash; i++) { - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - if (walker->count >= walker->skip) { - if (walker->fn(tp, &f->result, walker) < 0) { - walker->stop = 1; - return; - } - } - walker->count++; - } - } -} - -static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - int i; - - pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); - - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - struct tcindex_filter_result *r = p->perfect + i; - - /* tcf_queue_work() does not guarantee the ordering we - * want, so we have to take this refcnt temporarily to - * ensure 'p' is freed after all tcindex_filter_result - * here. Imperfect hash does not need this, because it - * uses linked lists rather than an array. - */ - tcindex_data_get(p); - - tcf_unbind_filter(tp, &r->res); - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, - tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - } - - for (i = 0; p->h && i < p->hash; i++) { - struct tcindex_filter *f, *next; - bool last; - - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - tcindex_delete(tp, &f->result, &last, rtnl_held, NULL); - } - } - - tcf_queue_work(&p->rwork, tcindex_destroy_work); -} - - -static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = fh; - struct nlattr *nest; - - pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n", - tp, fh, skb, t, p, r); - pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (!fh) { - t->tcm_handle = ~0; /* whatever ... */ - if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || - nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || - nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || - nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) - goto nla_put_failure; - nla_nest_end(skb, nest); - } else { - if (p->perfect) { - t->tcm_handle = r - p->perfect; - } else { - struct tcindex_filter *f; - struct tcindex_filter __rcu **fp; - int i; - - t->tcm_handle = 0; - for (i = 0; !t->tcm_handle && i < p->hash; i++) { - fp = &p->h[i]; - for (f = rtnl_dereference(*fp); - !t->tcm_handle && f; - fp = &f->next, f = rtnl_dereference(*fp)) { - if (&f->result == r) - t->tcm_handle = f->key; - } - } - } - pr_debug("handle = %d\n", t->tcm_handle); - if (r->res.class && - nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) - goto nla_put_failure; - - if (tcf_exts_dump(skb, &r->exts) < 0) - goto nla_put_failure; - nla_nest_end(skb, nest); - - if (tcf_exts_dump_stats(skb, &r->exts) < 0) - goto nla_put_failure; - } - - return skb->len; - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl, - void *q, unsigned long base) -{ - struct tcindex_filter_result *r = fh; - - if (r && r->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &r->res, base); - else - __tcf_unbind_filter(q, &r->res); - } -} - -static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { - .kind = "tcindex", - .classify = tcindex_classify, - .init = tcindex_init, - .destroy = tcindex_destroy, - .get = tcindex_get, - .change = tcindex_change, - .delete = tcindex_delete, - .walk = tcindex_walk, - .dump = tcindex_dump, - .bind_class = tcindex_bind_class, - .owner = THIS_MODULE, -}; - -static int __init init_tcindex(void) -{ - return register_tcf_proto_ops(&cls_tcindex_ops); -} - -static void __exit exit_tcindex(void) -{ - unregister_tcf_proto_ops(&cls_tcindex_ops); -} - -module_init(init_tcindex) -module_exit(exit_tcindex) -MODULE_LICENSE("GPL");