mailweb.openeuler.org

Kernel

kernel@openeuler.org

  • 28 participants
  • 18565 discussions
[PATCH kernel-4.19] vhost_net: avoid tx queue stuck when sendmsg fails
by Yang Yingliang 28 Jun '21
From: Yunjian Wang <wangyunjian(a)huawei.com>

mainline inclusion
from mainline-5.12-rc1
commit dc9c9e72ff3ba01ae63e6263ac26234ba1869cd7
category: bugfix
bugzilla: NA
CVE: NA

-------------------------------------------------

Currently the driver doesn't drop a packet which can't be sent by tun
(e.g. a bad packet). In this case, the driver will always process the
same packet, leading to a stuck tx queue.

To fix this issue:
1. in the case of persistent failure (e.g. bad packet), the driver
   can skip this descriptor by ignoring the error.
2. in the case of transient failure (e.g. -ENOBUFS, -EAGAIN and
   -ENOMEM), the driver schedules the worker to try again.

Signed-off-by: Yunjian Wang <wangyunjian(a)huawei.com>
Acked-by: Jason Wang <jasowang(a)redhat.com>
Acked-by: Willem de Bruijn <willemb(a)google.com>
Acked-by: Michael S. Tsirkin <mst(a)redhat.com>
Link: https://lore.kernel.org/r/1610685980-38608-1-git-send-email-wangyunjian@hua…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: Yue Haibing <yuehaibing(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
 drivers/vhost/net.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 0c7bbc92b22a9..1d99f5c443eeb 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -580,14 +580,15 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 		else
 			msg.msg_flags &= ~MSG_MORE;
 
-		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq, 1);
-			vhost_net_enable_vq(net, vq);
-			break;
-		}
-		if (err != len)
+			if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
+				vhost_discard_vq_desc(vq, 1);
+				vhost_net_enable_vq(net, vq);
+				break;
+			}
+			pr_debug("Fail to send packet: err %d", err);
+		} else if (unlikely(err != len))
 			pr_debug("Truncated TX packet: len %d != %zd\n",
 				 err, len);
 		if (++nvq->done_idx >= VHOST_NET_BATCH)
@@ -670,7 +671,6 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			msg.msg_flags &= ~MSG_MORE;
 		}
 
-		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(sock, &msg, len);
 		if (unlikely(err < 0)) {
 			if (zcopy_used) {
@@ -679,11 +679,13 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
 					% UIO_MAXIOV;
 			}
-			vhost_discard_vq_desc(vq, 1);
-			vhost_net_enable_vq(net, vq);
-			break;
-		}
-		if (err != len)
+			if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
+				vhost_discard_vq_desc(vq, 1);
+				vhost_net_enable_vq(net, vq);
+				break;
+			}
+			pr_debug("Fail to send packet: err %d", err);
+		} else if (unlikely(err != len))
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
 		if (!zcopy_used)
--
2.25.1
[PATCH hulk-4.19-next] itrace: add missing itrace_hardirqs_ignore() calls
by Bixuan Cui 28 Jun '21
From: Bixuan Cui <c00283067(a)huawei.com>

ascend inclusion
category: feature
bugzilla: NA
DTS: #601
CVE: NA

---------------------------------------------

There is an anomalous statistic in itrace irqsoff:

  max_time:3945(us) caller:__do_softirq+0x8c/0x310
  max_time:7996(us) caller:__do_softirq+0x8c/0x310

Use the function tracer to trace:

  stress-ng-cpu-3723 [011] d... 2817.510377: rcu_nmi_exit <-rcu_irq_exit
  stress-ng-cpu-3723 [011] d... 2817.510377: rcu_dynticks_curr_cpu_in_eqs <-rcu_nmi_exit
  stress-ng-cpu-3723 [011] d... 2817.514356: gic_handle_irq <-el0_irq_naked
  stress-ng-cpu-3723 [011] d... 2817.514356: __handle_domain_irq <-gic_handle_irq
  ...

The per-cpu interrupt is enabled after irq_handler in el0_irq_naked()
is executed. However, itrace does not add irqsoff monitoring to
el0_irq_naked. The same omission appears in the following files:

  arch/arm64/include/asm/daifflags.h
  arch/arm64/kernel/entry.S
  arch/arm64/kernel/syscall.c
  arch/arm64/mm/fault.c

Add the missing itrace_hardirqs_ignore() calls to them.

Signed-off-by: Bixuan Cui <cuibixuan(a)huawei.com>
---
 arch/arm64/include/asm/daifflags.h | 3 +++
 arch/arm64/kernel/entry.S          | 9 +++++++++
 arch/arm64/kernel/syscall.c        | 3 +++
 arch/arm64/mm/fault.c              | 6 +++++-
 4 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h
index 1230923b032d..1d832814eae8 100644
--- a/arch/arm64/include/asm/daifflags.h
+++ b/arch/arm64/include/asm/daifflags.h
@@ -72,6 +72,9 @@ static inline void local_daif_restore(unsigned long flags)
 
 	if (!irq_disabled) {
 		trace_hardirqs_on();
+#ifdef CONFIG_ITRACE_IRQSOFF
+		itrace_hardirqs_ignore();
+#endif
 
 		if (system_uses_irq_prio_masking()) {
 			gic_write_pmr(GIC_PRIO_IRQON);
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2eb1b657de2f..2c9b8d94d367 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -718,6 +718,9 @@ alternative_else_nop_endif
 #endif
 	bl	trace_hardirqs_on
 1:
+#endif
+#ifdef CONFIG_ITRACE_IRQSOFF
+	bl	itrace_hardirqs_ignore
 #endif
 
 	kernel_exit 1
@@ -954,6 +957,9 @@ el0_irq_naked:
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 	bl	trace_hardirqs_on
+#endif
+#ifdef CONFIG_ITRACE_IRQSOFF
+	bl	itrace_hardirqs_ignore
 #endif
 	b	ret_to_user
 ENDPROC(el0_irq)
@@ -989,6 +995,9 @@ work_pending:
 	bl	do_notify_resume
 #ifdef CONFIG_TRACE_IRQFLAGS
 	bl	trace_hardirqs_on		// enabled while in userspace
+#endif
+#ifdef CONFIG_ITRACE_IRQSOFF
+	bl	itrace_hardirqs_ignore
 #endif
 	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for single-step
 	b	finish_ret_to_user
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index f2d2dbbbfca2..964df44d7429 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -131,6 +131,9 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
 		 * the SPSR.
 		 */
 		trace_hardirqs_on();
+#ifdef CONFIG_ITRACE_IRQSOFF
+		itrace_hardirqs_ignore();
+#endif
 		return;
 	}
 	local_daif_restore(DAIF_PROCCTX);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 6cd448d9835c..9f30be41bb87 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -934,8 +934,12 @@ asmlinkage int __exception do_debug_exception(unsigned long addr_if_watchpoint,
 		rv = 0;
 	}
 
-	if (interrupts_enabled(regs))
+	if (interrupts_enabled(regs)) {
 		trace_hardirqs_on();
+#ifdef CONFIG_ITRACE_IRQSOFF
+		itrace_hardirqs_ignore();
+#endif
+	}
 
 	return rv;
 }
--
2.17.1
[PATCH kernel-4.19 1/2] mm: vmalloc: prevent use after free in _vm_unmap_aliases
by Yang Yingliang 28 Jun '21
From: Vijayanand Jitta <vjitta(a)codeaurora.org>

mainline inclusion
from mainline-5.13-rc1
commit ad216c0316ad6391d90f4de0a7f59396b2925a06
category: feature
bugzilla: NA
CVE: NA

---------------------------

A potential use after free can occur in _vm_unmap_aliases where an
already freed vmap_area could be accessed. Consider the following
scenario:

  Process 1                               Process 2

  __vm_unmap_aliases                      __vm_unmap_aliases
    purge_fragmented_blocks_allcpus         rcu_read_lock()
      rcu_read_lock()
      list_del_rcu(&vb->free_list)
                                            list_for_each_entry_rcu(vb .. )
    __purge_vmap_area_lazy
      kmem_cache_free(va)
                                              va_start = vb->va->va_start

Here Process 1 is in the purge path: it does list_del_rcu on the
vmap_block and later frees the vmap_area. Since Process 2 was holding
the RCU lock at this time, the vmap_block is still visible to it, so
Process 2 accesses it and thereby the vmap_area of that vmap_block,
which was already freed by Process 1; this results in a use after free.

Fix this by adding a check for vb->dirty before accessing the vmap_area
structure: vb->dirty will be set to VMAP_BBMAP_BITS in the purge path,
so checking for this prevents the use after free.

Link: https://lkml.kernel.org/r/1616062105-23263-1-git-send-email-vjitta@codeauro…
Signed-off-by: Vijayanand Jitta <vjitta(a)codeaurora.org>
Reviewed-by: Uladzislau Rezki (Sony) <urezki(a)gmail.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Rui Xiang <rui.xiang(a)huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong(a)huawei.com>
Reviewed-by: Weilong Chen <chenweilong(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
 mm/vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8c70131e0b078..011a84ebec04d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1953,7 +1953,7 @@ void vm_unmap_aliases(void)
 		rcu_read_lock();
 		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
 			spin_lock(&vb->lock);
-			if (vb->dirty) {
+			if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
 				unsigned long va_start = vb->va->va_start;
 				unsigned long s, e;
--
2.25.1
[PATCH kernel-4.19 1/4] iommu/vt-d: Add support for ACPI devices whose physical node is a PCI device to establish identity mapping
by Yang Yingliang 28 Jun '21
From: LeoLiu-oc <LeoLiu-oc(a)zhaoxin.com>

zhaoxin inclusion
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=19
CVE: NA

----------------------------------------------------------------

When using an ACPI device to create an identity mapping, if the
physical node of the ACPI device is a PCI device, domain_1 will be
created for these two devices. But if that PCI device and other PCI
devices belong to another domain_2, a conflict occurs and the other
domain is destroyed, e.g. for PCI devices under a PCIe-to-PCI bridge.

Therefore, when the physical node of the ACPI device is a PCI device,
this patch uses the PCI device to create the domain. In this way,
there is only one domain.

Signed-off-by: LeoLiu-oc <LeoLiu-oc(a)zhaoxin.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2(a)huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
 drivers/iommu/intel-iommu.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 18e0be8e05a53..3e5e1791abbb3 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2756,6 +2756,27 @@ static int domain_prepare_identity_map(struct device *dev,
 	return iommu_domain_identity_map(domain, start, end);
 }
 
+static struct device *acpi_dev_find_pci_dev(struct device *dev)
+{
+	struct acpi_device_physical_node *pn;
+	struct acpi_device *adev;
+
+	if (dev->bus == &acpi_bus_type) {
+		adev = to_acpi_device(dev);
+
+		mutex_lock(&adev->physical_node_lock);
+		list_for_each_entry(pn, &adev->physical_node_list, node) {
+			if (dev_is_pci(pn->dev)) {
+				mutex_unlock(&adev->physical_node_lock);
+				return pn->dev;
+			}
+		}
+		mutex_unlock(&adev->physical_node_lock);
+	}
+
+	return dev;
+}
+
 static int iommu_prepare_identity_map(struct device *dev,
 				      unsigned long long start,
 				      unsigned long long end)
@@ -2763,6 +2784,8 @@ static int iommu_prepare_identity_map(struct device *dev,
 	struct dmar_domain *domain;
 	int ret;
 
+	dev = acpi_dev_find_pci_dev(dev);
+
 	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 	if (!domain)
 		return -ENOMEM;
--
2.25.1
Re: SMMU Cmdq Related Patch Unable to Reproduce Performance Improvement
by Leizhen (ThunderTown) 28 Jun '21
On 2021/6/26 22:33, 李泽斌 wrote:
> Hi all,
>
> I am recently curious about a patch mentioned in an openEuler Kernel
> Meeting about SMMU. The patch is "iommu/arm-smmu-v3: Reduce contention
> during command-queue insertion", and it has been merged; the commit
> link is here:
> https://github.com/torvalds/linux/commit/587e6c10a7ce89a5924fdbeff2ec5…
>
> But I find it difficult to reproduce the huge performance improvement,
> and I notice that there is an evaluation result figure in the meeting.
> So I turn to you for help. I wonder if you can provide more evaluation
> details to reproduce the improvement. TIA

First, it only works in strict mode on ARM64, so please make sure
neither "iommu.passthrough=1" nor "iommu.strict=0" is set.

Second, only some high-end peripherals hit this performance bottleneck.
For example:
1) 100 Gb/s or higher NIC interconnection (direct connection); use the
   'iperf' command to test it.
2) More than 12 SSDs or NVMe drives attached to a PCI expander; use the
   'fio' command to test it.

For most low-speed devices, performance in strict mode is close to
iommu.passthrough=1, so there is no room for further optimization.

> Best Regards,
> Zebin Li
[PATCH kernel-4.19] block: dio: ensure the memory order between bi_private and bi_css
by Yang Yingliang 26 Jun '21
From: Hou Tao <houtao1(a)huawei.com>

hulk inclusion
category: bugfix
bugzilla: 167067
CVE: NA

--------------------------------

In __blkdev_direct_IO_simple(), when bi_private is NULL, it assumes
bi_css must be NULL as shown below:

  CPU 1:                                CPU 2:
  __blkdev_direct_IO_simple
    submit_bio
                                        bio_endio
                                          bio_uninit(bio)
                                            css_put(bi_css)
                                            bi_css = NULL
    set_current_state(TASK_UNINTERRUPTIBLE)
                                          bio->bi_end_io
                                            blkdev_bio_end_io_simple
                                              bio->bi_private = NULL
    // bi_private is NULL
    READ_ONCE(bio->bi_private)
                                              wake_up_process
    smp_mb__after_spinlock
    bio_uninit(bio)
      // read bi_css as non-NULL
      css_put(bi_css)

Because there is no memory barrier between the reading and the writing
of these two variables, the assumption is wrong on a weak-memory-model
machine (e.g. arm64). bi_css will be put twice, which leads to the
following warning:

  percpu_ref_switch_to_atomic_rcu: percpu ref (css_release) <= 0 (-3)
  after switching to atomic

There is a similar problem in __blkdev_direct_IO(), which occurs
between dio->waiter and bio.bi_status. Fix it by adding an smp_rmb()
between the reads of the two variables, and a corresponding smp_wmb()
between the writes.

Signed-off-by: Hou Tao <houtao1(a)huawei.com>
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Jason Yan <yanaijie(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
 fs/block_dev.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 786d105692e85..30dd7b19bd2e3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -195,6 +195,11 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
 {
 	struct task_struct *waiter = bio->bi_private;
 
+	/*
+	 * Paired with smp_rmb() after reading bio->bi_private
+	 * in __blkdev_direct_IO_simple()
+	 */
+	smp_wmb();
 	WRITE_ONCE(bio->bi_private, NULL);
 	wake_up_process(waiter);
 }
@@ -251,8 +256,14 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	qc = submit_bio(&bio);
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!READ_ONCE(bio.bi_private))
+		if (!READ_ONCE(bio.bi_private)) {
+			/*
+			 * Paired with smp_wmb() in
+			 * blkdev_bio_end_io_simple()
+			 */
+			smp_rmb();
 			break;
+		}
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
 		    !blk_poll(bdev_get_queue(bdev), qc))
 			io_schedule();
@@ -317,6 +328,12 @@ static void blkdev_bio_end_io(struct bio *bio)
 	} else {
 		struct task_struct *waiter = dio->waiter;
 
+		if (!dio->multi_bio)
+			/*
+			 * Paired with smp_rmb() after reading
+			 * dio->waiter in __blkdev_direct_IO()
+			 */
+			smp_wmb();
 		WRITE_ONCE(dio->waiter, NULL);
 		wake_up_process(waiter);
 	}
@@ -417,8 +434,11 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!READ_ONCE(dio->waiter))
+		if (!READ_ONCE(dio->waiter)) {
+			/* Paired with smp_wmb() in blkdev_bio_end_io() */
+			smp_rmb();
 			break;
+		}
 
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
 		    !blk_poll(bdev_get_queue(bdev), qc))
--
2.25.1
[PATCH openEuler-1.0-LTS] btrfs: allow btrfs_truncate_block() to fallback to nocow for data space reservation
by Cheng Jian 25 Jun '21
From: Qu Wenruo <wqu(a)suse.com>

mainline inclusion
from mainline-5.13.0-rc5
commit 6d4572a9d71d5fc2affee0258d8582d39859188c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I39MZM
CVE: NA

------------------------------------------------------

From: Gou Hao <gouhao(a)uniontech.com>

btrfs: make btrfs_truncate_block check NOCOW attribute

Compared with the mainline commit, two hunks are not merged here.
Mainline later adds a nowait parameter to btrfs_check_can_nocow; this
backport only renames check_can_nocow. Mainline also calls
btrfs_drew_write_unlock() at the end of btrfs_truncate_block(), but the
4.19 kernel does not have this function. The same two hunks are also
absent from the patch at the Reference address.

------------------------------------------------------

btrfs: allow btrfs_truncate_block() to fallback to nocow for data space
reservation

[BUG]
When data space is exhausted, even if the inode has the NOCOW
attribute, we still refuse to truncate an unaligned range due to
ENOSPC. The following script can reproduce it pretty easily:

  #!/bin/bash
  dev=/dev/test/test
  mnt=/mnt/btrfs
  umount $dev &> /dev/null
  umount $mnt &> /dev/null
  mkfs.btrfs -f $dev -b 1G
  mount -o nospace_cache $dev $mnt
  touch $mnt/foobar
  chattr +C $mnt/foobar
  xfs_io -f -c "pwrite -b 4k 0 4k" $mnt/foobar > /dev/null
  xfs_io -f -c "pwrite -b 4k 0 1G" $mnt/padding &> /dev/null
  sync
  xfs_io -c "fpunch 0 2k" $mnt/foobar
  umount $mnt

Currently this will fail at the fpunch part.

[CAUSE]
btrfs_truncate_block() always reserves space without checking the NOCOW
attribute. Since the writeback path follows the NOCOW bit, we only need
to bother the space reservation code in btrfs_truncate_block().

[FIX]
Make btrfs_truncate_block() follow btrfs_buffered_write(): try to
reserve data space first, and fall back to the NOCOW check only when we
don't have enough space.

Such always-try-reserve is an optimization introduced in
btrfs_buffered_write(), to avoid the expensive btrfs_check_can_nocow()
call. This patch exports check_can_nocow() as btrfs_check_can_nocow(),
and uses it in btrfs_truncate_block() to fix the problem.

Reference: https://patchwork.kernel.org/project/linux-btrfs/patch/20200130052822.11765…
Reported-by: Martin Doucha <martin.doucha(a)suse.com>
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: Anand Jain <anand.jain(a)oracle.com>
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: Gou Hao <gouhao(a)uniontech.com>
Signed-off-by: Cheng Jian <cj.chengjian(a)huawei.com>
---
 fs/btrfs/ctree.h |  3 ++-
 fs/btrfs/file.c  |  8 ++++----
 fs/btrfs/inode.c | 39 +++++++++++++++++++++++++++++++++------
 3 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 49ef2e48a8c6..3d8c699e44ea 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3273,7 +3273,8 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
 			   struct file *file_out, loff_t pos_out, u64 len);
-
+int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+			  size_t *write_bytes);
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d9d90f0b66d2..f0600b1c6d90 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1536,8 +1536,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
 	return ret;
 }
 
-static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
-				    size_t *write_bytes)
+int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+			  size_t *write_bytes)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_root *root = inode->root;
@@ -1647,7 +1647,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 		if (ret < 0) {
 			if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
 						      BTRFS_INODE_PREALLOC)) &&
-			    check_can_nocow(BTRFS_I(inode), pos,
+			    btrfs_check_can_nocow(BTRFS_I(inode), pos,
 					    &write_bytes) > 0) {
 				/*
 				 * For nodata cow case, no need to reserve
@@ -1923,7 +1923,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 		 */
 		if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
 					       BTRFS_INODE_PREALLOC)) ||
-		    check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+		    btrfs_check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
 			inode_unlock(inode);
 			return -EAGAIN;
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 69e376d27bcc..52da573741ed 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4901,11 +4901,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
 	struct extent_state *cached_state = NULL;
 	struct extent_changeset *data_reserved = NULL;
 	char *kaddr;
+	bool only_release_metadata = false;
 	u32 blocksize = fs_info->sectorsize;
 	pgoff_t index = from >> PAGE_SHIFT;
 	unsigned offset = from & (blocksize - 1);
 	struct page *page;
 	gfp_t mask = btrfs_alloc_write_mask(mapping);
+	size_t write_bytes = blocksize;
 	int ret = 0;
 	u64 block_start;
 	u64 block_end;
@@ -4917,10 +4919,26 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
 	block_start = round_down(from, blocksize);
 	block_end = block_start + blocksize - 1;
 
-	ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
-					   block_start, blocksize);
-	if (ret)
+	ret = btrfs_check_data_free_space(inode, &data_reserved,
+					  block_start, blocksize);
+	if (ret < 0) {
+		if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					      BTRFS_INODE_PREALLOC)) &&
+		    btrfs_check_can_nocow(BTRFS_I(inode), block_start,
+					  &write_bytes) > 0) {
+			/* For nocow case, no need to reserve data space */
+			only_release_metadata = true;
+		} else {
+			goto out;
+		}
+	}
+	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
+	if (ret < 0) {
+		if (!only_release_metadata)
+			btrfs_free_reserved_data_space(inode, data_reserved,
+						       block_start, blocksize);
 		goto out;
+	}
 again:
 	page = find_or_create_page(mapping, index, mask);
@@ -4991,10 +5009,19 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
 	set_page_dirty(page);
 	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
 
+	if (only_release_metadata)
+		set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
+			       block_end, EXTENT_NORESERVE, NULL, NULL,
+			       GFP_NOFS);
 out_unlock:
-	if (ret)
-		btrfs_delalloc_release_space(inode, data_reserved, block_start,
-					     blocksize, true);
+	if (ret) {
+		if (only_release_metadata)
+			btrfs_delalloc_release_metadata(BTRFS_I(inode),
+							blocksize, true);
+		else
+			btrfs_delalloc_release_space(inode, data_reserved,
+						     block_start, blocksize,
+						     true);
+	}
 	btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
 	unlock_page(page);
 	put_page(page);
--
2.25.1
[Meeting Notice] openEuler kernel tech-sharing session #7 & biweekly meeting, Time: 2021-06-25 14:00-18:00
by Meeting Book 25 Jun '21
[PATCH kernel-4.19] ext4: fix memory leak in ext4_fill_super
by Yang Yingliang 24 Jun '21
From: Alexey Makhalov <amakhalov(a)vmware.com>

mainline inclusion
from mainline-v5.13-rc5
commit afd09b617db3786b6ef3dc43e28fe728cfea84df
category: bugfix
bugzilla: 148443
CVE: NA

-----------------------------------------------

Buffer head references must be released before calling kill_bdev();
otherwise the buffer head (and its page referenced by b_data) will not
be freed by kill_bdev, and subsequently that bh will be leaked.

If blocksizes differ, sb_set_blocksize() will kill current buffers and
page cache by using kill_bdev(). And then the super block will be
reread again, but using the correct blocksize this time.
sb_set_blocksize() didn't fully free the superblock page and buffer
head, and being busy, they were not freed and instead leaked.

This can easily be reproduced by calling an infinite loop of:

  systemctl start <ext4_on_lvm>.mount, and
  systemctl stop <ext4_on_lvm>.mount

... since systemd creates a cgroup for each slice which it mounts, and
the bh leak gets amplified by a dying memory cgroup that also never
gets freed, so memory consumption is much more easily noticed.

Fixes: ce40733ce93d ("ext4: Check for return value from sb_set_blocksize")
Fixes: ac27a0ec112a ("ext4: initial copy of files from ext3")
Link: https://lore.kernel.org/r/20210521075533.95732-1-amakhalov@vmware.com
Signed-off-by: Alexey Makhalov <amakhalov(a)vmware.com>
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Cc: stable(a)kernel.org

conflicts:
  fs/ext4/super.c

Signed-off-by: Ye Bin <yebin10(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
 fs/ext4/super.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a964ed63601c6..f66bbe73d1a94 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4179,14 +4179,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (sb->s_blocksize != blocksize) {
+		/*
+		 * bh must be released before kill_bdev(), otherwise
+		 * it won't be freed and its page also. kill_bdev()
+		 * is called by sb_set_blocksize().
+		 */
+		brelse(bh);
 		/* Validate the filesystem blocksize */
 		if (!sb_set_blocksize(sb, blocksize)) {
 			ext4_msg(sb, KERN_ERR, "bad block size %d",
 					blocksize);
+			bh = NULL;
 			goto failed_mount;
 		}
 
-		brelse(bh);
 		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
 		offset = do_div(logical_sb_block, blocksize);
 		bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
@@ -4861,8 +4867,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
 #endif
-	ext4_blkdev_remove(sbi);
+	/* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
 	brelse(bh);
+	ext4_blkdev_remove(sbi);
 out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
--
2.25.1
[Meeting Notice] openEuler kernel tech-sharing session #7 & biweekly meeting, Time: 2021-06-25 14:00-18:00
by Meeting Book 24 Jun '21

Powered by HyperKitty