[PATCH kernel-4.19 1/4] iommu/vt-d: Add support for ACPI device using physical node as pci device to establish identity mapping

From: LeoLiu-oc <LeoLiu-oc@zhaoxin.com> zhaoxin inclusion category: feature bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=19 CVE: NA ---------------------------------------------------------------- When using ACPI device to create identity mapping, if the physical node of the ACPI device is a pci device, domain_1 will be created for these two devices. But if the pci device and other pci devices belong to another domain_2, a conflict will occur and the other domain will be destroyed. An example is PCI devices under the PCIE-to-PCI bridge. Therefore, when the physical node of the ACPI device is a PCI device, this patch uses the PCI device to create the domain. In this way, there is only one domain. Signed-off-by: LeoLiu-oc <LeoLiu-oc@zhaoxin.com> Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com> Signed-off-by: Cheng Jian <cj.chengjian@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/iommu/intel-iommu.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 18e0be8e05a53..3e5e1791abbb3 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2756,6 +2756,27 @@ static int domain_prepare_identity_map(struct device *dev, return iommu_domain_identity_map(domain, start, end); } +static struct device *acpi_dev_find_pci_dev(struct device *dev) +{ + struct acpi_device_physical_node *pn; + struct acpi_device *adev; + + if (dev->bus == &acpi_bus_type) { + adev = to_acpi_device(dev); + + mutex_lock(&adev->physical_node_lock); + list_for_each_entry(pn, &adev->physical_node_list, node) { + if (dev_is_pci(pn->dev)) { + mutex_unlock(&adev->physical_node_lock); + return pn->dev; + } + } + mutex_unlock(&adev->physical_node_lock); + } + + return dev; +} + static int iommu_prepare_identity_map(struct device *dev, unsigned long long start, unsigned long long end) @@ -2763,6 +2784,8 @@ static int iommu_prepare_identity_map(struct device *dev, struct 
dmar_domain *domain; int ret; + dev = acpi_dev_find_pci_dev(dev); + domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH); if (!domain) return -ENOMEM; -- 2.25.1

From: Ding Tianhong <dingtianhong@huawei.com> ascend inclusion category: bugfix bugzilla: NA CVE: NA --------------------------------------------------- When mmput is called concurrently, the judgment of "mm_users == 2" in sp_group_exit is not atomic with atomic_dec_and_test in mmput. The judgment of "mm_users == 2" may never be valid. As a result, mm leakage occurs. For example, in a typical scenario, a process has two threads, with the mmget is performed in sp_group_add_task. In this case, mm_users is 3. When two threads exit at the same time, the judgment of "mm_users == 2" fail. Therefore, the judgment and atomic_dec_and_test are put in the spg rw_lock to ensure the serialization of the whole process. Signed-off-by: Ding Tianhong <dingtianhong@huawei.com> Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com> Reviewed-by: Weilong Chen <chenweilong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- include/linux/share_pool.h | 5 +++-- kernel/fork.c | 3 ++- mm/share_pool.c | 25 ++++++++++++++++++++----- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index b0b2750e7bbe1..c03b83beaf63c 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -156,7 +156,7 @@ static inline void sp_init_mm(struct mm_struct *mm) } extern int sp_group_add_task(int pid, int spg_id); -extern void sp_group_exit(struct mm_struct *mm); +extern int sp_group_exit(struct mm_struct *mm); extern void sp_group_post_exit(struct mm_struct *mm); extern int sp_group_id_by_pid(int pid); extern int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)); @@ -299,8 +299,9 @@ static inline int sp_group_add_task(int pid, int spg_id) return -EPERM; } -static inline void sp_group_exit(struct mm_struct *mm) +static inline int sp_group_exit(struct mm_struct *mm) { + return 0; } static inline void sp_group_post_exit(struct mm_struct *mm) diff --git a/kernel/fork.c 
b/kernel/fork.c index da79ba9c83ac3..e306f8925008b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1082,7 +1082,8 @@ void mmput(struct mm_struct *mm) { might_sleep(); - sp_group_exit(mm); + if (sp_group_exit(mm)) + return; if (atomic_dec_and_test(&mm->mm_users)) __mmput(mm); diff --git a/mm/share_pool.c b/mm/share_pool.c index 90930e4a8dfe4..61bbbd772c847 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -3110,14 +3110,20 @@ EXPORT_SYMBOL(sharepool_no_page); #define MM_WOULD_FREE 2 -void sp_group_exit(struct mm_struct *mm) +int sp_group_exit(struct mm_struct *mm) { struct sp_group *spg = mm->sp_group; bool is_alive = true; if (!spg || !enable_ascend_share_pool) - return; + return 0; + /* + * The judgment of mm->mm_users == MM_WOULD_FREE and atomic_dec_and_test + * must be atomic. Otherwise, mm->mm_users == MM_WOULD_FREE may never be + * true due to the gap in the middle. + */ + down_write(&spg->rw_lock); /* * Recall we add mm->users by 1 deliberately in sp_group_add_task(). * If the mm_users is 2, it means that the mm is ready to be freed @@ -3125,21 +3131,30 @@ void sp_group_exit(struct mm_struct *mm) * do_exit() -> exit_mm() -> mmput() -> THIS function. 
*/ if (atomic_read(&mm->mm_users) == MM_WOULD_FREE) { - down_write(&spg->rw_lock); /* a dead group should NOT be reactive again */ if (spg_valid(spg) && list_is_singular(&spg->procs)) is_alive = spg->is_alive = false; if (mm->sp_group) /* concurrency handle of sp_group_add_task */ list_del(&mm->sp_node); /* affect spg->procs */ + /* match with get_task_mm() in sp_group_add_task() */ + atomic_dec(&mm->mm_users); up_write(&spg->rw_lock); if (!is_alive) blocking_notifier_call_chain(&sp_notifier_chain, 0, mm->sp_group); - /* match with get_task_mm() in sp_group_add_task() */ - atomic_dec(&mm->mm_users); + return 0; } + + if (atomic_dec_and_test(&mm->mm_users)) { + up_write(&spg->rw_lock); + WARN(1, "Invalid user counting\n"); + return 0; + } + + up_write(&spg->rw_lock); + return 1; } void sp_group_post_exit(struct mm_struct *mm) -- 2.25.1

From: Alexey Makhalov <amakhalov@vmware.com> mainline inclusion from mainline-v5.13-rc5 commit afd09b617db3786b6ef3dc43e28fe728cfea84df category: bugfix bugzilla: 148443 CVE: NA ----------------------------------------------- Buffer head references must be released before calling kill_bdev(); otherwise the buffer head (and its page referenced by b_data) will not be freed by kill_bdev, and subsequently that bh will be leaked. If blocksizes differ, sb_set_blocksize() will kill current buffers and page cache by using kill_bdev(). And then super block will be reread again but using correct blocksize this time. sb_set_blocksize() didn't fully free superblock page and buffer head, and being busy, they were not freed and instead leaked. This can easily be reproduced by calling an infinite loop of: systemctl start <ext4_on_lvm>.mount, and systemctl stop <ext4_on_lvm>.mount ... since systemd creates a cgroup for each slice which it mounts, and the bh leak get amplified by a dying memory cgroup that also never gets freed, and memory consumption is much more easily noticed. 
Fixes: ce40733ce93d ("ext4: Check for return value from sb_set_blocksize") Fixes: ac27a0ec112a ("ext4: initial copy of files from ext3") Link: https://lore.kernel.org/r/20210521075533.95732-1-amakhalov@vmware.com Signed-off-by: Alexey Makhalov <amakhalov@vmware.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org conflicts: fs/ext4/super.c Signed-off-by: Ye Bin <yebin10@huawei.com> Reviewed-by: Zhang Yi <yi.zhang@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- fs/ext4/super.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a964ed63601c6..f66bbe73d1a94 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4179,14 +4179,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sb->s_blocksize != blocksize) { + /* + * bh must be released before kill_bdev(), otherwise + * it won't be freed and its page also. kill_bdev() + * is called by sb_set_blocksize(). + */ + brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); + bh = NULL; goto failed_mount; } - brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); @@ -4861,8 +4867,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif - ext4_blkdev_remove(sbi); + /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ brelse(bh); + ext4_blkdev_remove(sbi); out_fail: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); -- 2.25.1

From: Wang Wensheng <wangwensheng4@huawei.com> ascend inclusion category: bugfix bugzilla: NA CVE: NA ------------------- Check whether the topological structure of the DDR/HBM breaks our assumption or not. If it got broken we just return the input nid, or an invalid nid could be returned and it may break the kernel. Fixes: aabbfd385ab2 ("numa: Move the management structures for cdm nodes to ddr") Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com> Reviewed-by: Weilong Chen <chenweilong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- arch/arm64/mm/numa.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index a194bad6fdfcf..82d53927554d8 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -85,7 +85,11 @@ int __init cdm_node_to_ddr_node(int nid) nodes_xor(ddr_mask, cdmmask, numa_nodes_parsed); nr_ddr = nodes_weight(ddr_mask); - cdm_per_part = nr_cdm / nr_ddr ? : 1; + cdm_per_part = nr_cdm / nr_ddr; + + if (cdm_per_part == 0 || nid < nr_ddr) + /* our assumption has been broken, just return the original nid. */ + return nid; fake_nid = (nid - nr_ddr) / cdm_per_part; fake_nid = !node_isset(fake_nid, cdmmask) ? fake_nid : nid; -- 2.25.1
participants (1)
-
Yang Yingliang