From: LeoLiu-oc <LeoLiu-oc@zhaoxin.com>
zhaoxin inclusion
category: feature
bugzilla: https://bugzilla.openeuler.org/show_bug.cgi?id=19
CVE: NA
----------------------------------------------------------------
When an ACPI device is used to create an identity mapping and the physical node of the ACPI device is a PCI device, a domain (domain_1) is created for these two devices. But if that PCI device already belongs to another domain (domain_2) together with other PCI devices, such as PCI devices under a PCIe-to-PCI bridge, the two domains conflict and domain_2 is destroyed. Therefore, when the physical node of the ACPI device is a PCI device, this patch uses the PCI device to create the domain, so that there is only one domain.
Signed-off-by: LeoLiu-oc <LeoLiu-oc@zhaoxin.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/iommu/intel-iommu.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 18e0be8e05a53..3e5e1791abbb3 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2756,6 +2756,27 @@ static int domain_prepare_identity_map(struct device *dev,
 	return iommu_domain_identity_map(domain, start, end);
 }
 
+static struct device *acpi_dev_find_pci_dev(struct device *dev)
+{
+	struct acpi_device_physical_node *pn;
+	struct acpi_device *adev;
+
+	if (dev->bus == &acpi_bus_type) {
+		adev = to_acpi_device(dev);
+
+		mutex_lock(&adev->physical_node_lock);
+		list_for_each_entry(pn, &adev->physical_node_list, node) {
+			if (dev_is_pci(pn->dev)) {
+				mutex_unlock(&adev->physical_node_lock);
+				return pn->dev;
+			}
+		}
+		mutex_unlock(&adev->physical_node_lock);
+	}
+
+	return dev;
+}
+
 static int iommu_prepare_identity_map(struct device *dev,
 				      unsigned long long start,
 				      unsigned long long end)
@@ -2763,6 +2784,8 @@ static int iommu_prepare_identity_map(struct device *dev,
 	struct dmar_domain *domain;
 	int ret;
 
+	dev = acpi_dev_find_pci_dev(dev);
+
 	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 	if (!domain)
 		return -ENOMEM;
From: Ding Tianhong <dingtianhong@huawei.com>
ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
---------------------------------------------------
When mmput() is called concurrently, the check of "mm_users == 2" in sp_group_exit() is not atomic with the atomic_dec_and_test() in mmput(). The check of "mm_users == 2" may therefore never be true, and as a result the mm leaks.
For example, in a typical scenario a process has two threads, and an mmget() was performed in sp_group_add_task(), so mm_users is 3. When the two threads exit at the same time, both may read mm_users before either has decremented it, so the check of "mm_users == 2" fails in both of them and the extra reference is never dropped.
Therefore, the check and the atomic_dec_and_test() are both placed under the spg rw_lock to serialize the whole process.
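
Reduced to a standalone sketch, the serialization looks as follows. This is only an illustration of the locking pattern, not the kernel code: C11 atomics and a pthread rwlock stand in for mm_users and spg->rw_lock, and all names (group_exit, put_user_ref, WOULD_FREE) are made up.

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdbool.h>

  #define WOULD_FREE 2			/* stands in for MM_WOULD_FREE */

  static atomic_int users = 3;		/* two exiting threads + the group's extra ref */
  static pthread_rwlock_t grp_lock = PTHREAD_RWLOCK_INITIALIZER;

  /* Stands in for sp_group_exit(): returns true if it already dropped the caller's ref. */
  static bool group_exit(void)
  {
  	bool handled;

  	/*
  	 * The check and the decrement sit under one write lock, so two exiting
  	 * threads cannot both read the counter before either decrements it and
  	 * both miss the WOULD_FREE value.
  	 */
  	pthread_rwlock_wrlock(&grp_lock);
  	if (atomic_load(&users) == WOULD_FREE) {
  		atomic_fetch_sub(&users, 1);	/* drop the group's extra reference */
  		handled = false;		/* caller still drops its own reference */
  	} else {
  		atomic_fetch_sub(&users, 1);	/* drop the caller's reference here */
  		handled = true;			/* caller must not decrement again */
  	}
  	pthread_rwlock_unlock(&grp_lock);
  	return handled;
  }

  /* Stands in for mmput(). */
  static void put_user_ref(void)
  {
  	if (group_exit())
  		return;
  	if (atomic_fetch_sub(&users, 1) == 1) {
  		/* last reference gone: the object would be freed here */
  	}
  }

Holding the write lock across both the read and the decrement closes the window between them, which is exactly what the patch does with spg->rw_lock.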
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/share_pool.h |  5 +++--
 kernel/fork.c              |  3 ++-
 mm/share_pool.c            | 25 ++++++++++++++++++++-----
 3 files changed, 25 insertions(+), 8 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index b0b2750e7bbe1..c03b83beaf63c 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -156,7 +156,7 @@ static inline void sp_init_mm(struct mm_struct *mm)
 }
 
 extern int sp_group_add_task(int pid, int spg_id);
-extern void sp_group_exit(struct mm_struct *mm);
+extern int sp_group_exit(struct mm_struct *mm);
 extern void sp_group_post_exit(struct mm_struct *mm);
 extern int sp_group_id_by_pid(int pid);
 extern int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *));
@@ -299,8 +299,9 @@ static inline int sp_group_add_task(int pid, int spg_id)
 	return -EPERM;
 }
 
-static inline void sp_group_exit(struct mm_struct *mm)
+static inline int sp_group_exit(struct mm_struct *mm)
 {
+	return 0;
 }
 
 static inline void sp_group_post_exit(struct mm_struct *mm)
diff --git a/kernel/fork.c b/kernel/fork.c
index da79ba9c83ac3..e306f8925008b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1082,7 +1082,8 @@ void mmput(struct mm_struct *mm)
 {
 	might_sleep();
 
-	sp_group_exit(mm);
+	if (sp_group_exit(mm))
+		return;
 
 	if (atomic_dec_and_test(&mm->mm_users))
 		__mmput(mm);
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 90930e4a8dfe4..61bbbd772c847 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -3110,14 +3110,20 @@ EXPORT_SYMBOL(sharepool_no_page);
 
 #define MM_WOULD_FREE 2
 
-void sp_group_exit(struct mm_struct *mm)
+int sp_group_exit(struct mm_struct *mm)
 {
 	struct sp_group *spg = mm->sp_group;
 	bool is_alive = true;
 
 	if (!spg || !enable_ascend_share_pool)
-		return;
+		return 0;
 
+	/*
+	 * The judgment of mm->mm_users == MM_WOULD_FREE and atomic_dec_and_test
+	 * must be atomic. Otherwise, mm->mm_users == MM_WOULD_FREE may never be
+	 * true due to the gap in the middle.
+	 */
+	down_write(&spg->rw_lock);
 	/*
 	 * Recall we add mm->users by 1 deliberately in sp_group_add_task().
 	 * If the mm_users is 2, it means that the mm is ready to be freed
@@ -3125,21 +3131,30 @@ void sp_group_exit(struct mm_struct *mm)
 	 * do_exit() -> exit_mm() -> mmput() -> THIS function.
 	 */
 	if (atomic_read(&mm->mm_users) == MM_WOULD_FREE) {
-		down_write(&spg->rw_lock);
 		/* a dead group should NOT be reactive again */
 		if (spg_valid(spg) && list_is_singular(&spg->procs))
 			is_alive = spg->is_alive = false;
 		if (mm->sp_group)	/* concurrency handle of sp_group_add_task */
 			list_del(&mm->sp_node);	/* affect spg->procs */
+		/* match with get_task_mm() in sp_group_add_task() */
+		atomic_dec(&mm->mm_users);
 		up_write(&spg->rw_lock);
 
 		if (!is_alive)
 			blocking_notifier_call_chain(&sp_notifier_chain, 0,
 						     mm->sp_group);
 
-		/* match with get_task_mm() in sp_group_add_task() */
-		atomic_dec(&mm->mm_users);
+		return 0;
 	}
+
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		up_write(&spg->rw_lock);
+		WARN(1, "Invalid user counting\n");
+		return 0;
+	}
+
+	up_write(&spg->rw_lock);
+	return 1;
 }
 
 void sp_group_post_exit(struct mm_struct *mm)
From: Alexey Makhalov <amakhalov@vmware.com>
mainline inclusion
from mainline-v5.13-rc5
commit afd09b617db3786b6ef3dc43e28fe728cfea84df
category: bugfix
bugzilla: 148443
CVE: NA
-----------------------------------------------
Buffer head references must be released before calling kill_bdev(); otherwise the buffer head (and its page referenced by b_data) will not be freed by kill_bdev, and subsequently that bh will be leaked.
If the blocksizes differ, sb_set_blocksize() kills the current buffers and page cache by using kill_bdev(), and the superblock is then read again using the correct blocksize. But because the superblock buffer head was still referenced (busy), sb_set_blocksize() could not fully free the superblock page and buffer head, and they were leaked instead.
This can easily be reproduced by calling an infinite loop of:
systemctl start <ext4_on_lvm>.mount, and systemctl stop <ext4_on_lvm>.mount
... since systemd creates a cgroup for each slice it mounts, the bh leak gets amplified by a dying memory cgroup that also never gets freed, and the memory consumption is much more easily noticed.
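
The ordering problem can be modelled in a few lines of ordinary userspace C. This is a made-up toy (struct buf and kill_cache are not ext4 or block-layer names); it only shows why a reference held across the cache-killing call turns into a leak:

  #include <stdio.h>
  #include <stdlib.h>

  /* Toy refcounted cache entry; 'freed' records whether the kill pass got it. */
  struct buf {
  	int refcount;
  	void *data;
  	int freed;
  };

  /* Plays the role of kill_bdev(): it can only free entries nobody references. */
  static void kill_cache(struct buf *b)
  {
  	if (b->refcount == 0) {
  		free(b->data);
  		b->data = NULL;
  		b->freed = 1;
  	}
  	/* A busy entry is skipped here, and nothing will come back for it later. */
  }

  int main(void)
  {
  	struct buf sb_bh = { .refcount = 1, .data = malloc(4096), .freed = 0 };
  	struct buf sb_bh2 = { .refcount = 1, .data = malloc(4096), .freed = 0 };

  	/* Wrong order (the bug): the reference is still held, so the entry survives. */
  	kill_cache(&sb_bh);
  	sb_bh.refcount--;				/* too late, the kill pass already ran */
  	printf("freed after wrong order: %d\n", sb_bh.freed);	/* 0 -> leaked */

  	/* Right order (the fix): drop the reference first, then kill the cache. */
  	sb_bh2.refcount--;
  	kill_cache(&sb_bh2);
  	printf("freed after right order: %d\n", sb_bh2.freed);	/* 1 -> released */

  	return 0;
  }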
Fixes: ce40733ce93d ("ext4: Check for return value from sb_set_blocksize")
Fixes: ac27a0ec112a ("ext4: initial copy of files from ext3")
Link: https://lore.kernel.org/r/20210521075533.95732-1-amakhalov@vmware.com
Signed-off-by: Alexey Makhalov <amakhalov@vmware.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
conflicts:
	fs/ext4/super.c
Signed-off-by: Ye Bin <yebin10@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/ext4/super.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a964ed63601c6..f66bbe73d1a94 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4179,14 +4179,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (sb->s_blocksize != blocksize) {
+		/*
+		 * bh must be released before kill_bdev(), otherwise
+		 * it won't be freed and its page also. kill_bdev()
+		 * is called by sb_set_blocksize().
+		 */
+		brelse(bh);
 		/* Validate the filesystem blocksize */
 		if (!sb_set_blocksize(sb, blocksize)) {
 			ext4_msg(sb, KERN_ERR, "bad block size %d",
 					blocksize);
+			bh = NULL;
 			goto failed_mount;
 		}
 
-		brelse(bh);
 		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
 		offset = do_div(logical_sb_block, blocksize);
 		bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
@@ -4861,8 +4867,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
 #endif
-	ext4_blkdev_remove(sbi);
+	/* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
 	brelse(bh);
+	ext4_blkdev_remove(sbi);
 out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
From: Wang Wensheng <wangwensheng4@huawei.com>
ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
-------------------
Check whether the topological structure of the DDR/HBM nodes breaks our assumption. If it is broken, just return the input nid; otherwise an invalid nid could be returned, which may break the kernel.
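
A standalone sketch of the mapping and the added guard. The function cdm_to_ddr and the sample node counts are made up for illustration; it assumes the convention that DDR nodes occupy ids 0..nr_ddr-1 with CDM (HBM) nodes following them, and it omits the node-mask validation of the real cdm_node_to_ddr_node():

  #include <stdio.h>

  /* Map a CDM (HBM) node id to the DDR node it is attached to. */
  static int cdm_to_ddr(int nid, int nr_ddr, int nr_cdm)
  {
  	int cdm_per_part = nr_cdm / nr_ddr;

  	/*
  	 * Assumption broken: fewer CDM nodes than DDR nodes (cdm_per_part
  	 * would be 0) or a DDR node id was passed in.  Return the input nid
  	 * instead of computing a bogus fake node id.
  	 */
  	if (cdm_per_part == 0 || nid < nr_ddr)
  		return nid;

  	return (nid - nr_ddr) / cdm_per_part;
  }

  int main(void)
  {
  	/* 4 DDR nodes (0-3) and 4 CDM nodes (4-7): CDM node 6 maps to DDR node 2. */
  	printf("%d\n", cdm_to_ddr(6, 4, 4));
  	/* 4 DDR nodes but only 2 CDM nodes: the guard returns the input id 5. */
  	printf("%d\n", cdm_to_ddr(5, 4, 2));
  	return 0;
  }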
Fixes: aabbfd385ab2 ("numa: Move the management structures for cdm nodes to ddr")
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/mm/numa.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index a194bad6fdfcf..82d53927554d8 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -85,7 +85,11 @@ int __init cdm_node_to_ddr_node(int nid)
 
 	nodes_xor(ddr_mask, cdmmask, numa_nodes_parsed);
 	nr_ddr = nodes_weight(ddr_mask);
-	cdm_per_part = nr_cdm / nr_ddr ? : 1;
+	cdm_per_part = nr_cdm / nr_ddr;
+
+	if (cdm_per_part == 0 || nid < nr_ddr)
+		/* our assumption has broken, just return the original nid. */
+		return nid;
 
 	fake_nid = (nid - nr_ddr) / cdm_per_part;
 	fake_nid = !node_isset(fake_nid, cdmmask) ? fake_nid : nid;