Junxiao Bi (3): ocfs2: fix data corruption by fallocate ocfs2: fix zero out valid data ocfs2: issue zeroout to EOF blocks
fs/ocfs2/file.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-)
From: Junxiao Bi junxiao.bi@oracle.com
stable inclusion from stable-v5.10.43 commit c8d5faee46242c3f33b8a71a4d7d52214785bfcc category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9909V CVE: CVE-2021-47114
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
commit 6bba4471f0cc1296fe3c2089b9e52442d3074b2e upstream.
When fallocate punches holes out of inode size, if original isize is in the middle of last cluster, then the part from isize to the end of the cluster will be zeroed with buffer write, at that time isize is not yet updated to match the new size, if writeback is kicked in, it will invoke ocfs2_writepage()->block_write_full_page() where the pages out of inode size will be dropped. That will cause file corruption. Fix this by zero out eof blocks when extending the inode size.
Running the following command with qemu-image 4.2.1 can get a corrupted coverted image file easily.
qemu-img convert -p -t none -T none -f qcow2 $qcow_image \ -O qcow2 -o compat=1.1 $qcow_image.conv
The usage of fallocate in qemu is like this, it first punches holes out of inode size, then extend the inode size.
fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2276196352, 65536) = 0 fallocate(11, 0, 2276196352, 65536) = 0
v1: https://www.spinics.net/lists/linux-fsdevel/msg193999.html v2: https://lore.kernel.org/linux-fsdevel/20210525093034.GB4112@quack2.suse.cz/T...
Link: https://lkml.kernel.org/r/20210528210648.9124-1-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi junxiao.bi@oracle.com Reviewed-by: Joseph Qi joseph.qi@linux.alibaba.com Cc: Jan Kara jack@suse.cz Cc: Mark Fasheh mark@fasheh.com Cc: Joel Becker jlbec@evilplan.org Cc: Changwei Ge gechangwei@live.cn Cc: Gang He ghe@suse.com Cc: Jun Piao piaojun@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Yifan Qiao qiaoyifan4@huawei.com --- fs/ocfs2/file.c | 55 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a3e077fcfeb9..3e699c4a2e37 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1859,6 +1859,45 @@ int ocfs2_remove_inode_range(struct inode *inode, return ret; }
+/* + * zero out partial blocks of one cluster. + * + * start: file offset where zero starts, will be made upper block aligned. + * len: it will be trimmed to the end of current cluster if "start + len" + * is bigger than it. + */ +static int ocfs2_zeroout_partial_cluster(struct inode *inode, + u64 start, u64 len) +{ + int ret; + u64 start_block, end_block, nr_blocks; + u64 p_block, offset; + u32 cluster, p_cluster, nr_clusters; + struct super_block *sb = inode->i_sb; + u64 end = ocfs2_align_bytes_to_clusters(sb, start); + + if (start + len < end) + end = start + len; + + start_block = ocfs2_blocks_for_bytes(sb, start); + end_block = ocfs2_blocks_for_bytes(sb, end); + nr_blocks = end_block - start_block; + if (!nr_blocks) + return 0; + + cluster = ocfs2_bytes_to_clusters(sb, start); + ret = ocfs2_get_clusters(inode, cluster, &p_cluster, + &nr_clusters, NULL); + if (ret) + return ret; + if (!p_cluster) + return 0; + + offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); + p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; + return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); +} + /* * Parts of this function taken from xfs_change_file_space() */ @@ -1869,7 +1908,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, { int ret; s64 llen; - loff_t size; + loff_t size, orig_isize; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *di_bh = NULL; handle_t *handle; @@ -1900,6 +1939,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; }
+ orig_isize = i_size_read(inode); switch (sr->l_whence) { case 0: /*SEEK_SET*/ break; @@ -1907,7 +1947,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, sr->l_start += f_pos; break; case 2: /*SEEK_END*/ - sr->l_start += i_size_read(inode); + sr->l_start += orig_isize; break; default: ret = -EINVAL; @@ -1961,6 +2001,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, default: ret = -EINVAL; } + + /* zeroout eof blocks in the cluster. */ + if (!ret && change_size && orig_isize < size) { + ret = ocfs2_zeroout_partial_cluster(inode, orig_isize, + size - orig_isize); + if (!ret) + i_size_write(inode, size); + } up_write(&OCFS2_I(inode)->ip_alloc_sem); if (ret) { mlog_errno(ret); @@ -1977,9 +2025,6 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; }
- if (change_size && i_size_read(inode) < size) - i_size_write(inode, size); - inode->i_ctime = inode->i_mtime = current_time(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0)
From: Junxiao Bi junxiao.bi@oracle.com
stable inclusion from stable-v5.10.56 commit 94301459306115e007966bd607638232423d15d9 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9909V CVE: CVE-2021-47114
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit f267aeb6dea5e468793e5b8eb6a9c72c0020d418 upstream.
If append-dio feature is enabled, direct-io write and fallocate could run in parallel to extend file size, fallocate used "orig_isize" to record i_size before taking "ip_alloc_sem", when ocfs2_zeroout_partial_cluster() zeroout EOF blocks, i_size maybe already extended by ocfs2_dio_end_io_write(), that will cause valid data zeroed out.
Link: https://lkml.kernel.org/r/20210722054923.24389-1-junxiao.bi@oracle.com Fixes: 6bba4471f0cc ("ocfs2: fix data corruption by fallocate") Signed-off-by: Junxiao Bi junxiao.bi@oracle.com Reviewed-by: Joseph Qi joseph.qi@linux.alibaba.com Cc: Changwei Ge gechangwei@live.cn Cc: Gang He ghe@suse.com Cc: Joel Becker jlbec@evilplan.org Cc: Jun Piao piaojun@huawei.com Cc: Mark Fasheh mark@fasheh.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yifan Qiao qiaoyifan4@huawei.com --- fs/ocfs2/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3e699c4a2e37..c2ab9d2819ca 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1939,7 +1939,6 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; }
- orig_isize = i_size_read(inode); switch (sr->l_whence) { case 0: /*SEEK_SET*/ break; @@ -1947,7 +1946,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, sr->l_start += f_pos; break; case 2: /*SEEK_END*/ - sr->l_start += orig_isize; + sr->l_start += i_size_read(inode); break; default: ret = -EINVAL; @@ -2002,6 +2001,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, ret = -EINVAL; }
+ orig_isize = i_size_read(inode); /* zeroout eof blocks in the cluster. */ if (!ret && change_size && orig_isize < size) { ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
From: Junxiao Bi junxiao.bi@oracle.com
stable inclusion from stable-v5.10.56 commit da4f4916dab2b55cd38eec2f9a33800a7abc6bb9 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9909V CVE: CVE-2021-47114
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 9449ad33be8480f538b11a593e2dda2fb33ca06d upstream.
For punch holes in EOF blocks, fallocate used buffer write to zero the EOF blocks in last cluster. But since ->writepage will ignore EOF pages, those zeros will not be flushed.
This "looks" ok as commit 6bba4471f0cc ("ocfs2: fix data corruption by fallocate") will zero the EOF blocks when extend the file size, but it isn't. The problem happened on those EOF pages, before writeback, those pages had DIRTY flag set and all buffer_head in them also had DIRTY flag set, when writeback run by write_cache_pages(), DIRTY flag on the page was cleared, but DIRTY flag on the buffer_head not.
When next write happened to those EOF pages, since buffer_head already had DIRTY flag set, it would not mark page DIRTY again. That made writeback ignore them forever. That will cause data corruption. Even directio write can't work because it will fail when trying to drop pages caches before direct io, as it found the buffer_head for those pages still had DIRTY flag set, then it will fall back to buffer io mode.
To make a summary of the issue, as writeback ingores EOF pages, once any EOF page is generated, any write to it will only go to the page cache, it will never be flushed to disk even file size extends and that page is not EOF page any more. The fix is to avoid zero EOF blocks with buffer write.
The following code snippet from qemu-img could trigger the corruption.
656 open("6b3711ae-3306-4bdd-823c-cf1c0060a095.conv.2", O_RDWR|O_DIRECT|O_CLOEXEC) = 11 ... 660 fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2275868672, 327680 <unfinished ...> 660 fallocate(11, 0, 2275868672, 327680) = 0 658 pwrite64(11, "
Link: https://lkml.kernel.org/r/20210722054923.24389-2-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi junxiao.bi@oracle.com Reviewed-by: Joseph Qi joseph.qi@linux.alibaba.com Cc: Mark Fasheh mark@fasheh.com Cc: Joel Becker jlbec@evilplan.org Cc: Changwei Ge gechangwei@live.cn Cc: Gang He ghe@suse.com Cc: Jun Piao piaojun@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Chen Jun chenjun102@huawei.com Acked-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Chen Jun chenjun102@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Signed-off-by: Yifan Qiao qiaoyifan4@huawei.com --- fs/ocfs2/file.c | 99 ++++++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 39 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c2ab9d2819ca..fd49fd2ad86f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1533,6 +1533,45 @@ static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, } }
+/* + * zero out partial blocks of one cluster. + * + * start: file offset where zero starts, will be made upper block aligned. + * len: it will be trimmed to the end of current cluster if "start + len" + * is bigger than it. + */ +static int ocfs2_zeroout_partial_cluster(struct inode *inode, + u64 start, u64 len) +{ + int ret; + u64 start_block, end_block, nr_blocks; + u64 p_block, offset; + u32 cluster, p_cluster, nr_clusters; + struct super_block *sb = inode->i_sb; + u64 end = ocfs2_align_bytes_to_clusters(sb, start); + + if (start + len < end) + end = start + len; + + start_block = ocfs2_blocks_for_bytes(sb, start); + end_block = ocfs2_blocks_for_bytes(sb, end); + nr_blocks = end_block - start_block; + if (!nr_blocks) + return 0; + + cluster = ocfs2_bytes_to_clusters(sb, start); + ret = ocfs2_get_clusters(inode, cluster, &p_cluster, + &nr_clusters, NULL); + if (ret) + return ret; + if (!p_cluster) + return 0; + + offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); + p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; + return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); +} + static int ocfs2_zero_partial_clusters(struct inode *inode, u64 start, u64 len) { @@ -1542,6 +1581,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int csize = osb->s_clustersize; handle_t *handle; + loff_t isize = i_size_read(inode);
/* * The "start" and "end" values are NOT necessarily part of @@ -1562,6 +1602,26 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) goto out;
+ /* No page cache for EOF blocks, issue zero out to disk. */ + if (end > isize) { + /* + * zeroout eof blocks in last cluster starting from + * "isize" even "start" > "isize" because it is + * complicated to zeroout just at "start" as "start" + * may be not aligned with block size, buffer write + * would be required to do that, but out of eof buffer + * write is not supported. + */ + ret = ocfs2_zeroout_partial_cluster(inode, isize, + end - isize); + if (ret) { + mlog_errno(ret); + goto out; + } + if (start >= isize) + goto out; + end = isize; + } handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1859,45 +1919,6 @@ int ocfs2_remove_inode_range(struct inode *inode, return ret; }
-/* - * zero out partial blocks of one cluster. - * - * start: file offset where zero starts, will be made upper block aligned. - * len: it will be trimmed to the end of current cluster if "start + len" - * is bigger than it. - */ -static int ocfs2_zeroout_partial_cluster(struct inode *inode, - u64 start, u64 len) -{ - int ret; - u64 start_block, end_block, nr_blocks; - u64 p_block, offset; - u32 cluster, p_cluster, nr_clusters; - struct super_block *sb = inode->i_sb; - u64 end = ocfs2_align_bytes_to_clusters(sb, start); - - if (start + len < end) - end = start + len; - - start_block = ocfs2_blocks_for_bytes(sb, start); - end_block = ocfs2_blocks_for_bytes(sb, end); - nr_blocks = end_block - start_block; - if (!nr_blocks) - return 0; - - cluster = ocfs2_bytes_to_clusters(sb, start); - ret = ocfs2_get_clusters(inode, cluster, &p_cluster, - &nr_clusters, NULL); - if (ret) - return ret; - if (!p_cluster) - return 0; - - offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); - p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; - return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); -} - /* * Parts of this function taken from xfs_change_file_space() */
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/5443 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/C...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/5443 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/C...