Chuck Lever (5): libfs: Re-arrange locking in offset_iterate_dir() libfs: Define a minimum directory offset libfs: Add simple_offset_empty() maple_tree: Add mtree_alloc_cyclic() libfs: Convert simple directory offsets to use a Maple Tree
Liam R. Howlett (1): test_maple_tree: testing the cyclic allocation
Yu Kuai (1): fs: fix kabi kroken in struct offset_ctx
yangerkun (1): libfs: fix infinite directory reads for offset dir
fs/libfs.c | 129 +++++++++++++++++++++++++------------ include/linux/fs.h | 15 ++++- include/linux/maple_tree.h | 7 ++ lib/maple_tree.c | 93 ++++++++++++++++++++++++++ lib/test_maple_tree.c | 44 +++++++++++++ mm/shmem.c | 4 +- 6 files changed, 247 insertions(+), 45 deletions(-)
From: Chuck Lever chuck.lever@oracle.com
mainline inclusion from mainline-v6.9-rc1 commit 3f6d810665dfde0d33785420618ceb03fba0619d category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAR5GW CVE: CVE-2024-46701
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Liam and Matthew say that once the RCU read lock is released, xa_state is not safe to re-use for the next xas_find() call. But the RCU read lock must be released on each loop iteration so that dput(), which might_sleep(), can be called safely.
Thus we are forced to walk the offset tree with fresh state for each directory entry. xa_find() can do this for us, though it might be a little less efficient than maintaining xa_state locally.
We believe that in the current code base, inode->i_rwsem provides protection for the xa_state maintained in offset_iterate_dir(). However, there is no guarantee that will continue to be the case in the future.
Since offset_iterate_dir() doesn't build xa_state locally any more, there's no longer a strong need for offset_find_next(). Clean up by rolling these two helpers together.
Suggested-by: Liam R. Howlett Liam.Howlett@Oracle.com Message-ID: 170785993027.11135.8830043889278631735.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Chuck Lever chuck.lever@oracle.com Link: https://lore.kernel.org/r/170820142021.6328.15047865406275957018.stgit@91.11... Reviewed-by: Jan Kara jack@suse.cz Signed-off-by: Christian Brauner brauner@kernel.org Signed-off-by: Yu Kuai yukuai3@huawei.com --- fs/libfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/fs/libfs.c b/fs/libfs.c index dc0f7519045f..430f7c95336c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -401,12 +401,13 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, offset, U32_MAX); }
-static struct dentry *offset_find_next(struct xa_state *xas) +static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) { struct dentry *child, *found = NULL; + XA_STATE(xas, &octx->xa, offset);
rcu_read_lock(); - child = xas_next_entry(xas, U32_MAX); + child = xas_next_entry(&xas, U32_MAX); if (!child) goto out; spin_lock(&child->d_lock); @@ -429,12 +430,11 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { - struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); - XA_STATE(xas, &so_ctx->xa, ctx->pos); + struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry;
while (true) { - dentry = offset_find_next(&xas); + dentry = offset_find_next(octx, ctx->pos); if (!dentry) return ERR_PTR(-ENOENT);
@@ -443,8 +443,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) break; }
+ ctx->pos = dentry2offset(dentry) + 1; dput(dentry); - ctx->pos = xas.xa_index + 1; } return NULL; }
From: Chuck Lever chuck.lever@oracle.com
mainline inclusion from mainline-v6.9-rc1 commit 7beea725a8ca412c6190090ce7c3a13b169592a1 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAR5GW CVE: CVE-2024-46701
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
This value is used in several places, so make it a symbolic constant.
Reviewed-by: Jan Kara jack@suse.cz Signed-off-by: Chuck Lever chuck.lever@oracle.com Link: https://lore.kernel.org/r/170820142741.6328.12428356024575347885.stgit@91.11... Signed-off-by: Christian Brauner brauner@kernel.org Signed-off-by: Yu Kuai yukuai3@huawei.com --- fs/libfs.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/fs/libfs.c b/fs/libfs.c index 430f7c95336c..c3dc58e776f9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -239,6 +239,11 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations);
+/* 0 is '.', 1 is '..', so always start with offset 2 or more */ +enum { + DIR_OFFSET_MIN = 2, +}; + static void offset_set(struct dentry *dentry, u32 offset) { dentry->d_fsdata = (void *)((uintptr_t)(offset)); @@ -260,9 +265,7 @@ void simple_offset_init(struct offset_ctx *octx) { xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); - - /* 0 is '.', 1 is '..', so always start with offset 2 */ - octx->next_offset = 2; + octx->next_offset = DIR_OFFSET_MIN; }
/** @@ -275,7 +278,7 @@ void simple_offset_init(struct offset_ctx *octx) */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { - static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); + static const struct xa_limit limit = XA_LIMIT(DIR_OFFSET_MIN, U32_MAX); u32 offset; int ret;
@@ -480,7 +483,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) return 0;
/* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == 2) + if (ctx->pos == DIR_OFFSET_MIN) file->private_data = NULL; else if (file->private_data == ERR_PTR(-ENOENT)) return 0;
From: Chuck Lever chuck.lever@oracle.com
mainline inclusion from mainline-v6.9-rc1 commit ecba88a3b32d733d41e27973e25b2bc580f64281 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAR5GW CVE: CVE-2024-46701
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
For simple filesystems that use directory offset mapping, rely strictly on the directory offset map to tell when a directory has no children.
After this patch is applied, the emptiness test holds only the RCU read lock when the directory being tested has no children.
In addition, this adds another layer of confirmation that simple_offset_add/remove() are working as expected.
Reviewed-by: Jan Kara jack@suse.cz Signed-off-by: Chuck Lever chuck.lever@oracle.com Link: https://lore.kernel.org/r/170820143463.6328.7872919188371286951.stgit@91.116... Signed-off-by: Christian Brauner brauner@kernel.org Signed-off-by: Yu Kuai yukuai3@huawei.com --- fs/libfs.c | 32 ++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + mm/shmem.c | 4 ++-- 3 files changed, 35 insertions(+), 2 deletions(-)
diff --git a/fs/libfs.c b/fs/libfs.c index c3dc58e776f9..d7b901cb9af4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -312,6 +312,38 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) offset_set(dentry, 0); }
+/** + * simple_offset_empty - Check if a dentry can be unlinked + * @dentry: dentry to be tested + * + * Returns 0 if @dentry is a non-empty directory; otherwise returns 1. + */ +int simple_offset_empty(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct offset_ctx *octx; + struct dentry *child; + unsigned long index; + int ret = 1; + + if (!inode || !S_ISDIR(inode->i_mode)) + return ret; + + index = DIR_OFFSET_MIN; + octx = inode->i_op->get_offset_ctx(inode); + xa_for_each(&octx->xa, index, child) { + spin_lock(&child->d_lock); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); + ret = 0; + break; + } + spin_unlock(&child->d_lock); + } + + return ret; +} + /** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved diff --git a/include/linux/fs.h b/include/linux/fs.h index 5e37afb1b844..cd582f571498 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3262,6 +3262,7 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); +int simple_offset_empty(struct dentry *dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, diff --git a/mm/shmem.c b/mm/shmem.c index 72e0ec87219e..bda116d1bcb5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3543,7 +3543,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { - if (!simple_empty(dentry)) + if (!simple_offset_empty(dentry)) return -ENOTEMPTY;
drop_nlink(d_inode(dentry)); @@ -3600,7 +3600,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
- if (!simple_empty(new_dentry)) + if (!simple_offset_empty(new_dentry)) return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
From: Chuck Lever chuck.lever@oracle.com
mainline inclusion from mainline-v6.9-rc1 commit 9b6713cc75229f25552c643083cbdbfb771e5bca category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAR5GW CVE: CVE-2024-46701
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
I need a cyclic allocator for the simple_offset implementation in fs/libfs.c.
Signed-off-by: Chuck Lever chuck.lever@oracle.com Link: https://lore.kernel.org/r/170820144179.6328.12838600511394432325.stgit@91.11... Signed-off-by: Christian Brauner brauner@kernel.org Signed-off-by: Yu Kuai yukuai3@huawei.com --- include/linux/maple_tree.h | 7 +++ lib/maple_tree.c | 93 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+)
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index b3d63123b945..a53ad4dabd7e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -171,6 +171,7 @@ enum maple_type { #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 +#define MT_FLAGS_ALLOC_WRAPPED 0x0800
#define MAPLE_HEIGHT_MAX 31
@@ -319,6 +320,9 @@ int mtree_insert_range(struct maple_tree *mt, unsigned long first, int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); +int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); @@ -499,6 +503,9 @@ void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp);
bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 1af83414877a..5328e08723d7 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4337,6 +4337,56 @@ static inline void *mas_insert(struct ma_state *mas, void *entry)
}
+/** + * mas_alloc_cyclic() - Internal call to find somewhere to store an entry + * @mas: The maple state. + * @startp: Pointer to ID. + * @range_lo: Lower bound of range to search. + * @range_hi: Upper bound of range to search. + * @entry: The entry to store. + * @next: Pointer to next ID to allocate. + * @gfp: The GFP_FLAGS to use for allocations. + * + * Return: 0 if the allocation succeeded without wrapping, 1 if the + * allocation succeeded after wrapping, or -EBUSY if there are no + * free entries. + */ +int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp) +{ + unsigned long min = range_lo; + int ret = 0; + + range_lo = max(min, *next); + ret = mas_empty_area(mas, range_lo, range_hi, 1); + if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { + mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; + ret = 1; + } + if (ret < 0 && range_lo > min) { + ret = mas_empty_area(mas, min, range_hi, 1); + if (ret == 0) + ret = 1; + } + if (ret < 0) + return ret; + + do { + mas_insert(mas, entry); + } while (mas_nomem(mas, gfp)); + if (mas_is_err(mas)) + return xa_err(mas->node); + + *startp = mas->index; + *next = *startp + 1; + if (*next == 0) + mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; + + return ret; +} +EXPORT_SYMBOL(mas_alloc_cyclic); + static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: @@ -6490,6 +6540,49 @@ int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, } EXPORT_SYMBOL(mtree_alloc_range);
+/** + * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. + * @mt: The maple tree. + * @startp: Pointer to ID. + * @range_lo: Lower bound of range to search. + * @range_hi: Upper bound of range to search. + * @entry: The entry to store. + * @next: Pointer to next ID to allocate. + * @gfp: The GFP_FLAGS to use for allocations. + * + * Finds an empty entry in @mt after @next, stores the new index into + * the @id pointer, stores the entry at that index, then updates @next. + * + * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. + * + * Context: Any context. Takes and releases the mt.lock. May sleep if + * the @gfp flags permit. + * + * Return: 0 if the allocation succeeded without wrapping, 1 if the + * allocation succeeded after wrapping, -ENOMEM if memory could not be + * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no + * free entries. + */ +int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp) +{ + int ret; + + MA_STATE(mas, mt, 0, 0); + + if (!mt_is_alloc(mt)) + return -EINVAL; + if (WARN_ON_ONCE(mt_is_reserved(entry))) + return -EINVAL; + mtree_lock(mt); + ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, + next, gfp); + mtree_unlock(mt); + return ret; +} +EXPORT_SYMBOL(mtree_alloc_cyclic); + int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp)
From: "Liam R. Howlett" Liam.Howlett@oracle.com
mainline inclusion from mainline-v6.9-rc1 commit f92e1a829d64dd66fa173c6934f03817d9e68d43 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAR5GW CVE: CVE-2024-46701
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
This tests the interactions of the cyclic allocations, the maple state index and last, and overflow.
Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com Signed-off-by: Chuck Lever chuck.lever@oracle.com Link: https://lore.kernel.org/r/170820144894.6328.13052830860966450674.stgit@91.11... Signed-off-by: Christian Brauner brauner@kernel.org Signed-off-by: Yu Kuai yukuai3@huawei.com --- lib/test_maple_tree.c | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+)
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 29185ac5c727..399380db449c 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -3599,6 +3599,45 @@ static noinline void __init check_state_handling(struct maple_tree *mt) mas_unlock(&mas); }
+static noinline void __init alloc_cyclic_testing(struct maple_tree *mt) +{ + unsigned long location; + unsigned long next; + int ret = 0; + MA_STATE(mas, mt, 0, 0); + + next = 0; + mtree_lock(mt); + for (int i = 0; i < 100; i++) { + mas_alloc_cyclic(&mas, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MAS_BUG_ON(&mas, i != location - 2); + MAS_BUG_ON(&mas, mas.index != location); + MAS_BUG_ON(&mas, mas.last != location); + MAS_BUG_ON(&mas, i != next - 3); + } + + mtree_unlock(mt); + mtree_destroy(mt); + next = 0; + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (int i = 0; i < 100; i++) { + mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MT_BUG_ON(mt, i != location - 2); + MT_BUG_ON(mt, i != next - 3); + MT_BUG_ON(mt, mtree_load(mt, location) != mt); + } + + mtree_destroy(mt); + /* Overflow test */ + next = ULONG_MAX - 1; + ret = mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MT_BUG_ON(mt, ret != 0); + ret = mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MT_BUG_ON(mt, ret != 0); + ret = mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MT_BUG_ON(mt, ret != 1); +} + static DEFINE_MTREE(tree); static int __init maple_tree_seed(void) { @@ -3880,6 +3919,11 @@ static int __init maple_tree_seed(void) check_state_handling(&tree); mtree_destroy(&tree);
+ mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + alloc_cyclic_testing(&tree); + mtree_destroy(&tree); + + #if defined(BENCH) skip: #endif
From: Chuck Lever chuck.lever@oracle.com
mainline inclusion from mainline-v6.9-rc1 commit 0e4a862174f2a8d1653a8a9cf0815020e1d3af24 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAR5GW CVE: CVE-2024-46701
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Test robot reports:
kernel test robot noticed a -19.0% regression of aim9.disk_src.ops_per_sec on:
commit: a2e459555c5f9da3e619b7e47a63f98574dc75f1 ("shmem: stable directory offsets") https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
Feng Tang further clarifies that:
... the new simple_offset_add() called by shmem_mknod() brings extra cost related with slab, specifically the 'radix_tree_node', which cause the regression.
Willy's analysis is that, over time, the test workload causes xa_alloc_cyclic() to fragment the underlying SLAB cache.
This patch replaces the offset_ctx's xarray with a Maple Tree in the hope that Maple Tree's dense node mode will handle this scenario more scalably.
In addition, we can widen the simple directory offset maximum to signed long (as loff_t is also signed).
Suggested-by: Matthew Wilcox willy@infradead.org Reported-by: kernel test robot oliver.sang@intel.com Closes: https://lore.kernel.org/oe-lkp/202309081306.3ecb3734-oliver.sang@intel.com Signed-off-by: Chuck Lever chuck.lever@oracle.com Link: https://lore.kernel.org/r/170820145616.6328.12620992971699079156.stgit@91.11... Reviewed-by: Jan Kara jack@suse.cz Signed-off-by: Christian Brauner brauner@kernel.org Conflicts: include/linux/fs.h [commit 1119ec49ac4a ("mm, fs: Add BPF_READAHEAD build option for bpf readhead") changes context.] Signed-off-by: Yu Kuai yukuai3@huawei.com --- fs/libfs.c | 47 +++++++++++++++++++++++----------------------- include/linux/fs.h | 5 +++-- 2 files changed, 26 insertions(+), 26 deletions(-)
diff --git a/fs/libfs.c b/fs/libfs.c index d7b901cb9af4..98731178a3c1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -244,17 +244,17 @@ enum { DIR_OFFSET_MIN = 2, };
-static void offset_set(struct dentry *dentry, u32 offset) +static void offset_set(struct dentry *dentry, long offset) { - dentry->d_fsdata = (void *)((uintptr_t)(offset)); + dentry->d_fsdata = (void *)offset; }
-static u32 dentry2offset(struct dentry *dentry) +static long dentry2offset(struct dentry *dentry) { - return (u32)((uintptr_t)(dentry->d_fsdata)); + return (long)dentry->d_fsdata; }
-static struct lock_class_key simple_offset_xa_lock; +static struct lock_class_key simple_offset_lock_class;
/** * simple_offset_init - initialize an offset_ctx @@ -263,8 +263,8 @@ static struct lock_class_key simple_offset_xa_lock; */ void simple_offset_init(struct offset_ctx *octx) { - xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); - lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); + mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); + lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); octx->next_offset = DIR_OFFSET_MIN; }
@@ -273,20 +273,19 @@ void simple_offset_init(struct offset_ctx *octx) * @octx: directory offset ctx to be updated * @dentry: new dentry being added * - * Returns zero on success. @so_ctx and the dentry offset are updated. + * Returns zero on success. @octx and the dentry's offset are updated. * Otherwise, a negative errno value is returned. */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { - static const struct xa_limit limit = XA_LIMIT(DIR_OFFSET_MIN, U32_MAX); - u32 offset; + unsigned long offset; int ret;
if (dentry2offset(dentry) != 0) return -EBUSY;
- ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, - &octx->next_offset, GFP_KERNEL); + ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, + LONG_MAX, &octx->next_offset, GFP_KERNEL); if (ret < 0) return ret;
@@ -302,13 +301,13 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { - u32 offset; + long offset;
offset = dentry2offset(dentry); if (offset == 0) return;
- xa_erase(&octx->xa, offset); + mtree_erase(&octx->mt, offset); offset_set(dentry, 0); }
@@ -331,7 +330,7 @@ int simple_offset_empty(struct dentry *dentry)
index = DIR_OFFSET_MIN; octx = inode->i_op->get_offset_ctx(inode); - xa_for_each(&octx->xa, index, child) { + mt_for_each(&octx->mt, child, index, LONG_MAX) { spin_lock(&child->d_lock); if (simple_positive(child)) { spin_unlock(&child->d_lock); @@ -361,8 +360,8 @@ int simple_offset_rename_exchange(struct inode *old_dir, { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); - u32 old_index = dentry2offset(old_dentry); - u32 new_index = dentry2offset(new_dentry); + long old_index = dentry2offset(old_dentry); + long new_index = dentry2offset(new_dentry); int ret;
simple_offset_remove(old_ctx, old_dentry); @@ -388,9 +387,9 @@ int simple_offset_rename_exchange(struct inode *old_dir,
out_restore: offset_set(old_dentry, old_index); - xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); + mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL); offset_set(new_dentry, new_index); - xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); + mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL); return ret; }
@@ -403,7 +402,7 @@ int simple_offset_rename_exchange(struct inode *old_dir, */ void simple_offset_destroy(struct offset_ctx *octx) { - xa_destroy(&octx->xa); + mtree_destroy(&octx->mt); }
/** @@ -433,16 +432,16 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
/* In this case, ->private_data is protected by f_pos_lock */ file->private_data = NULL; - return vfs_setpos(file, offset, U32_MAX); + return vfs_setpos(file, offset, LONG_MAX); }
static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) { + MA_STATE(mas, &octx->mt, offset, offset); struct dentry *child, *found = NULL; - XA_STATE(xas, &octx->xa, offset);
rcu_read_lock(); - child = xas_next_entry(&xas, U32_MAX); + child = mas_find(&mas, LONG_MAX); if (!child) goto out; spin_lock(&child->d_lock); @@ -456,8 +455,8 @@ static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { - u32 offset = dentry2offset(dentry); struct inode *inode = d_inode(dentry); + long offset = dentry2offset(dentry);
return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); diff --git a/include/linux/fs.h b/include/linux/fs.h index cd582f571498..4b3993b957e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> +#include <linux/maple_tree.h> #ifdef CONFIG_BPF_READAHEAD #include <linux/tracepoint-defs.h> #endif @@ -3255,8 +3256,8 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count);
struct offset_ctx { - struct xarray xa; - u32 next_offset; + struct maple_tree mt; + unsigned long next_offset; };
void simple_offset_init(struct offset_ctx *octx);
hulk inclusion category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAR5GW CVE: CVE-2024-46701
--------------------------------
The filed "struct xarray xa" is changed to "struct maple_tree mt", and the field 'next_offset' is changed from u32 to unsigned long.
Fixes: 0e4a862174f2 ("libfs: Convert simple directory offsets to use a Maple Tree") Signed-off-by: Yu Kuai yukuai3@huawei.com --- include/linux/fs.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b3993b957e2..3cdb256a0d4c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3256,8 +3256,17 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count);
struct offset_ctx { - struct maple_tree mt; - unsigned long next_offset; + KABI_REPLACE(struct xarray xa, struct maple_tree mt) +#if BITS_PER_LONG == 32 + KABI_REPLACE(u32 next_offset, unsigned long next_offset) +#else +#ifdef __GENKSYMS__ + u32 next_offset; + /* 4 bytes hole */ +#else + unsigned long next_offset; +#endif +#endif };
void simple_offset_init(struct offset_ctx *octx);
From: yangerkun yangerkun@huawei.com
mainline inclusion from mainline-v6.11-rc4 commit 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/IAR5GW CVE: CVE-2024-46701
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
After we switch tmpfs dir operations from simple_dir_operations to simple_offset_dir_operations, every rename happened will fill new dentry to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free key starting with octx->newx_offset, and then set newx_offset equals to free key + 1. This will lead to infinite readdir combine with rename happened at the same time, which fail generic/736 in xfstests(detail show as below).
1. create 5000 files(1 2 3...) under one dir 2. call readdir(man 3 readdir) once, and get one entry 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry) 4. loop 2~3, until readdir return nothing or we loop too many times(tmpfs break test with the second condition)
We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite directory reads") to fix it, record the last_index when we open dir, and do not emit the entry which index >= last_index. The file->private_data now used in offset dir can use directly to do this, and we also update the last_index when we llseek the dir file.
Fixes: a2e459555c5f ("shmem: stable directory offsets") Signed-off-by: yangerkun yangerkun@huawei.com Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com Reviewed-by: Chuck Lever chuck.lever@oracle.com [brauner: only update last_index after seek when offset is zero like Jan suggested] Signed-off-by: Christian Brauner brauner@kernel.org Signed-off-by: Yu Kuai yukuai3@huawei.com --- fs/libfs.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-)
diff --git a/fs/libfs.c b/fs/libfs.c index 98731178a3c1..fd5d30c798de 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -405,6 +405,14 @@ void simple_offset_destroy(struct offset_ctx *octx) mtree_destroy(&octx->mt); }
+static int offset_dir_open(struct inode *inode, struct file *file) +{ + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + + file->private_data = (void *)ctx->next_offset; + return 0; +} + /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated @@ -418,6 +426,9 @@ void simple_offset_destroy(struct offset_ctx *octx) */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { + struct inode *inode = file->f_inode; + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -431,7 +442,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) }
/* In this case, ->private_data is protected by f_pos_lock */ - file->private_data = NULL; + if (!offset) + file->private_data = (void *)ctx->next_offset; return vfs_setpos(file, offset, LONG_MAX); }
@@ -462,7 +474,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); }
-static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index) { struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; @@ -470,17 +482,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) while (true) { dentry = offset_find_next(octx, ctx->pos); if (!dentry) - return ERR_PTR(-ENOENT); + return; + + if (dentry2offset(dentry) >= last_index) { + dput(dentry); + return; + }
if (!offset_dir_emit(ctx, dentry)) { dput(dentry); - break; + return; }
ctx->pos = dentry2offset(dentry) + 1; dput(dentry); } - return NULL; }
/** @@ -507,22 +523,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; + long last_index = (long)file->private_data;
lockdep_assert_held(&d_inode(dir)->i_rwsem);
if (!dir_emit_dots(file, ctx)) return 0;
- /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == DIR_OFFSET_MIN) - file->private_data = NULL; - else if (file->private_data == ERR_PTR(-ENOENT)) - return 0; - file->private_data = offset_iterate_dir(d_inode(dir), ctx); + offset_iterate_dir(d_inode(dir), ctx, last_index); return 0; }
const struct file_operations simple_offset_dir_operations = { + .open = offset_dir_open, .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir,
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/11874 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/S...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/11874 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/S...