From: Qu Wenruo wqu@suse.com
mainline inclusion from mainline-v5.13-rc5 commit 6d4572a9d71d5fc2affee0258d8582d39859188c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I39MZM CVE: NA
------------------------------------------------------
[BUG] When the data space is exhausted, even if the inode has NOCOW attribute, we will still refuse to truncate unaligned range due to ENOSPC.
The following script can reproduce it pretty easily: #!/bin/bash
dev=/dev/test/test mnt=/mnt/btrfs
umount $dev &> /dev/null umount $mnt &> /dev/null
mkfs.btrfs -f $dev -b 1G mount -o nospace_cache $dev $mnt touch $mnt/foobar chattr +C $mnt/foobar
xfs_io -f -c "pwrite -b 4k 0 4k" $mnt/foobar > /dev/null xfs_io -f -c "pwrite -b 4k 0 1G" $mnt/padding &> /dev/null sync
xfs_io -c "fpunch 0 2k" $mnt/foobar umount $mnt
Currently this will fail at the fpunch part.
[CAUSE] Because btrfs_truncate_block() always reserves space without checking the NOCOW attribute.
Since the writeback path follows NOCOW bit, we only need to bother the space reservation code in btrfs_truncate_block().
[FIX] Make btrfs_truncate_block() follow btrfs_buffered_write() to try to reserve data space first, and fall back to NOCOW check only when we don't have enough space.
Such always-try-reserve is an optimization introduced in btrfs_buffered_write(), to avoid expensive btrfs_check_can_nocow() call.
This patch will export check_can_nocow() as btrfs_check_can_nocow(), and use it in btrfs_truncate_block() to fix the problem.
Reference: https://patchwork.kernel.org/project/linux-btrfs/patch/20200130052822.11765-... Reported-by: Martin Doucha martin.doucha@suse.com Reviewed-by: Filipe Manana fdmanana@suse.com Reviewed-by: Anand Jain anand.jain@oracle.com Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Conflicts: fs/btrfs/file.c fs/btrfs/inode.c Signed-off-by: Gou Hao gouhao@uniontech.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Jiao Fenfang jiaofenfang@uniontech.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/file.c | 8 ++++---- fs/btrfs/inode.c | 39 +++++++++++++++++++++++++++++++++------ 3 files changed, 39 insertions(+), 11 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4d1c12faada89..4f5c58d40a79f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3271,7 +3271,8 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); - +int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 41ad37f8062a9..3cd05edca30ce 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1536,8 +1536,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; }
-static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) +int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1647,7 +1647,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && - check_can_nocow(BTRFS_I(inode), pos, + btrfs_check_can_nocow(BTRFS_I(inode), pos, &write_bytes) > 0) { /* * For nodata cow case, no need to reserve @@ -1925,7 +1925,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) || - check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + btrfs_check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { inode_unlock(inode); return -EAGAIN; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf0e0e3e09c5d..9e3e003e4488e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4958,11 +4958,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; char *kaddr; + bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); + size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end; @@ -4974,10 +4976,26 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - block_start, blocksize); - if (ret) + ret = btrfs_check_data_free_space(inode, &data_reserved, + block_start, blocksize); + if (ret < 0) { + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + btrfs_check_can_nocow(BTRFS_I(inode), block_start, + &write_bytes) > 0) { + /* For nocow case, no need to reserve data space */ + only_release_metadata = true; + } else { + goto out; + } + } + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, data_reserved, + block_start, blocksize); goto out; + }
again: page = find_or_create_page(mapping, index, mask); @@ -5048,10 +5066,19 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, set_page_dirty(page); unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
+ if (only_release_metadata) + set_extent_bit(&BTRFS_I(inode)->io_tree, block_start, + block_end, EXTENT_NORESERVE, NULL, NULL, + GFP_NOFS); out_unlock: - if (ret) - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (ret) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(BTRFS_I(inode), + blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); + } btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page);
Hi,苟浩您好。
首先非常感谢您参与 openEuler kernel 开发。
您的 PATCH 已经合入 openEuler-20.03 LTS/SP1/SP2,对应 commit 号如下:
a486c868c4b9 btrfs: allow btrfs_truncate_block() to fallback to nocow for data space reservation
该问题对应 issue
https://gitee.com/openeuler/kernel/issues/I39MZM
如果您有什么信息要同步的,可以在 issue 里面更新, 或者联系 @成坚(gatieme)
最后
再次感谢您参与 openEuler,社区有您更精彩。
On 2021/7/9 16:29, Yang Yingliang wrote:
From: Qu Wenruo wqu@suse.com
mainline inclusion from mainline-v5.13-rc5 commit 6d4572a9d71d5fc2affee0258d8582d39859188c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I39MZM CVE: NA
[BUG] When the data space is exhausted, even if the inode has NOCOW attribute, we will still refuse to truncate unaligned range due to ENOSPC.
The following script can reproduce it pretty easily: #!/bin/bash
dev=/dev/test/test mnt=/mnt/btrfs
umount $dev &> /dev/null umount $mnt &> /dev/null
mkfs.btrfs -f $dev -b 1G mount -o nospace_cache $dev $mnt touch $mnt/foobar chattr +C $mnt/foobar
xfs_io -f -c "pwrite -b 4k 0 4k" $mnt/foobar > /dev/null xfs_io -f -c "pwrite -b 4k 0 1G" $mnt/padding &> /dev/null sync
xfs_io -c "fpunch 0 2k" $mnt/foobar umount $mnt
Currently this will fail at the fpunch part.
[CAUSE] Because btrfs_truncate_block() always reserves space without checking the NOCOW attribute.
Since the writeback path follows NOCOW bit, we only need to bother the space reservation code in btrfs_truncate_block().
[FIX] Make btrfs_truncate_block() follow btrfs_buffered_write() to try to reserve data space first, and fall back to NOCOW check only when we don't have enough space.
Such always-try-reserve is an optimization introduced in btrfs_buffered_write(), to avoid expensive btrfs_check_can_nocow() call.
This patch will export check_can_nocow() as btrfs_check_can_nocow(), and use it in btrfs_truncate_block() to fix the problem.
Reference: https://patchwork.kernel.org/project/linux-btrfs/patch/20200130052822.11765-... Reported-by: Martin Doucha martin.doucha@suse.com Reviewed-by: Filipe Manana fdmanana@suse.com Reviewed-by: Anand Jain anand.jain@oracle.com Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Conflicts: fs/btrfs/file.c fs/btrfs/inode.c Signed-off-by: Gou Hao gouhao@uniontech.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Jiao Fenfang jiaofenfang@uniontech.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com
fs/btrfs/ctree.h | 3 ++- fs/btrfs/file.c | 8 ++++---- fs/btrfs/inode.c | 39 +++++++++++++++++++++++++++++++++------ 3 files changed, 39 insertions(+), 11 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4d1c12faada89..4f5c58d40a79f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3271,7 +3271,8 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len);
+int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos,
/* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root);size_t *write_bytes);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 41ad37f8062a9..3cd05edca30ce 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1536,8 +1536,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; }
-static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes)
+int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos,
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root;size_t *write_bytes)
@@ -1647,7 +1647,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) &&
check_can_nocow(BTRFS_I(inode), pos,
btrfs_check_can_nocow(BTRFS_I(inode), pos, &write_bytes) > 0) { /* * For nodata cow case, no need to reserve
@@ -1925,7 +1925,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) ||
check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
}btrfs_check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { inode_unlock(inode); return -EAGAIN;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf0e0e3e09c5d..9e3e003e4488e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4958,11 +4958,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; char *kaddr;
- bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping);
- size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end;
@@ -4974,10 +4976,26 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
block_start, blocksize);
- if (ret)
ret = btrfs_check_data_free_space(inode, &data_reserved,
block_start, blocksize);
if (ret < 0) {
if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) &&
btrfs_check_can_nocow(BTRFS_I(inode), block_start,
&write_bytes) > 0) {
/* For nocow case, no need to reserve data space */
only_release_metadata = true;
} else {
goto out;
}
}
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
if (ret < 0) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
block_start, blocksize);
goto out;
}
again: page = find_or_create_page(mapping, index, mask);
@@ -5048,10 +5066,19 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, set_page_dirty(page); unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
- if (only_release_metadata)
set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
block_end, EXTENT_NORESERVE, NULL, NULL,
out_unlock:GFP_NOFS);
- if (ret)
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize, true);
- if (ret) {
if (only_release_metadata)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
blocksize, true);
else
btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
- } btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page);