
mainline inclusion
from mainline-v6.17-rc1
commit 8f2c3b74865cac9b3bd87fb15633475b46069ca8
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICN2MB
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...

--------------------------------

When allocating data blocks, if the first try (goal allocation) fails
and stream allocation is on, it tries a global goal starting from the
last group we used (s_mb_last_group). This helps cluster large files
together to reduce free space fragmentation, and the data block
contiguity also accelerates write-back to disk.

However, when multiple processes allocate blocks, having just one
global goal means they all fight over the same group. This drastically
lowers the chances of extents merging and leads to much worse file
fragmentation.

To mitigate this multi-process contention, we now employ multiple
global goals, with the number of goals being the minimum between the
number of possible CPUs and one-quarter of the filesystem's total
block group count.

To ensure a consistent goal for each inode, we select the
corresponding goal by taking the inode number modulo the total number
of goals.

Performance test data follows:

Test: Running will-it-scale/fallocate2 on CPU-bound containers.
Observation: Average fallocate operations per container per second.

|CPU: Kunpeng 920   |          P80           |            P1           |
|Memory: 512GB      |------------------------|-------------------------|
|960GB SSD (0.5GB/s)| base  |    patched     |  base  |    patched     |
|-------------------|-------|----------------|--------|----------------|
|mb_optimize_scan=0 | 9636  | 19628 (+103%)  | 337597 | 320885 (-4.9%) |
|mb_optimize_scan=1 | 4834  | 7129 (+47.4%)  | 341440 | 321275 (-5.9%) |

|CPU: AMD 9654 * 2  |          P96           |            P1           |
|Memory: 1536GB     |------------------------|-------------------------|
|960GB SSD (1GB/s)  | base  |    patched     |  base  |    patched     |
|-------------------|-------|----------------|--------|----------------|
|mb_optimize_scan=0 | 22341 | 53760 (+140%)  | 219707 | 213145 (-2.9%) |
|mb_optimize_scan=1 | 9177  | 12716 (+38.5%) | 215732 | 215262 (+0.2%) |

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://patch.msgid.link/20250714130327.1830534-6-libaokun1@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

Conflicts:
	fs/ext4/mballoc.c
[Context differences because there is no commit 908177175a2a ("ext4:
remove unused return value of ext4_mb_release").]
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 fs/ext4/ext4.h    |  6 ++++--
 fs/ext4/mballoc.c | 27 +++++++++++++++++++++++----
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d8382d116211..4b6896198101 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1611,12 +1611,14 @@ struct ext4_sb_info {
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
 	unsigned int s_max_dir_size_kb;
-	/* where last allocation was done - for stream allocation */
-	ext4_group_t s_mb_last_group;
 	unsigned int s_mb_prefetch;
 	unsigned int s_mb_prefetch_limit;
 	unsigned int s_mb_best_avail_max_trim_order;
 
+	/* where last allocation was done - for stream allocation */
+	ext4_group_t *s_mb_last_groups;
+	unsigned int s_mb_nr_global_goals;
+
 	/* stats for buddy allocator */
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
 	atomic_t s_bal_success;	/* we found long enough chunks */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d6c76fb297e3..c39f23c21a5c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2169,8 +2169,12 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	ac->ac_buddy_page = e4b->bd_buddy_page;
 	get_page(ac->ac_buddy_page);
 	/* store last allocated for subsequent stream allocation */
-	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC)
-		WRITE_ONCE(sbi->s_mb_last_group, ac->ac_f_ex.fe_group);
+	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
+		int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+		WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group);
+	}
+
 	/*
 	 * As we've just preallocated more space than
 	 * user requested originally, we store allocated
@@ -2843,7 +2847,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 
 	/* if stream allocation is enabled, use global goal */
 	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
-		ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_group);
+		int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+		ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
 		ac->ac_g_ex.fe_start = -1;
 		ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
 	}
@@ -3716,10 +3722,19 @@ int ext4_mb_init(struct super_block *sb)
 			sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
 	}
 
+	sbi->s_mb_nr_global_goals = umin(num_possible_cpus(),
+					 DIV_ROUND_UP(sbi->s_groups_count, 4));
+	sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals,
+					sizeof(ext4_group_t), GFP_KERNEL);
+	if (sbi->s_mb_last_groups == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
 	if (sbi->s_locality_groups == NULL) {
 		ret = -ENOMEM;
-		goto out;
+		goto out_free_last_groups;
 	}
 	for_each_possible_cpu(i) {
 		struct ext4_locality_group *lg;
@@ -3744,6 +3759,9 @@ int ext4_mb_init(struct super_block *sb)
 out_free_locality_groups:
 	free_percpu(sbi->s_locality_groups);
 	sbi->s_locality_groups = NULL;
+out_free_last_groups:
+	kfree(sbi->s_mb_last_groups);
+	sbi->s_mb_last_groups = NULL;
 out:
 	kfree(sbi->s_mb_avg_fragment_size);
 	kfree(sbi->s_mb_avg_fragment_size_locks);
@@ -3848,6 +3866,7 @@ int ext4_mb_release(struct super_block *sb)
 	}
 
 	free_percpu(sbi->s_locality_groups);
+	kfree(sbi->s_mb_last_groups);
 
 	return 0;
 }
-- 
2.46.1
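
[Editor's note, not part of the patch: the goal-selection arithmetic the
commit message describes (goal count = min(possible CPUs, group count / 4),
slot chosen by inode number modulo goal count) can be illustrated with the
minimal standalone userspace C sketch below. The names goals_init and
pick_goal_slot are hypothetical, and the stored group value is just a
placeholder for "the group the allocation actually landed in".]

/*
 * Illustrative sketch only: mirrors the shape of s_mb_nr_global_goals
 * and s_mb_last_groups[] from the patch, outside the kernel.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned int nr_goals;      /* stands in for s_mb_nr_global_goals */
static unsigned int *last_groups;  /* stands in for s_mb_last_groups[]   */

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* nr_cpus and groups_count stand in for num_possible_cpus() and
 * sbi->s_groups_count; (x + 3) / 4 is DIV_ROUND_UP(x, 4). */
static int goals_init(unsigned int nr_cpus, unsigned int groups_count)
{
	nr_goals = min_u(nr_cpus, (groups_count + 3) / 4);
	last_groups = calloc(nr_goals, sizeof(*last_groups));
	return last_groups ? 0 : -1;
}

/* Same inode always maps to the same slot, so one writer stream keeps
 * reusing (and updating) one goal instead of fighting over a single
 * global one. */
static unsigned int pick_goal_slot(unsigned long ino)
{
	return ino % nr_goals;
}

int main(void)
{
	unsigned long inodes[] = { 12, 13, 14, 12 };

	if (goals_init(8, 60))	/* e.g. 8 CPUs, 60 groups -> min(8, 15) = 8 goals */
		return 1;

	for (size_t i = 0; i < sizeof(inodes) / sizeof(inodes[0]); i++) {
		unsigned int slot = pick_goal_slot(inodes[i]);

		/* stream allocation: start scanning from last_groups[slot],
		 * then record the group used back into the same slot */
		printf("ino %lu -> goal slot %u (last group %u)\n",
		       inodes[i], slot, last_groups[slot]);
		last_groups[slot] = (unsigned int)(inodes[i] % 60); /* placeholder value */
	}

	free(last_groups);
	return 0;
}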