From: Jason Gunthorpe jgg@nvidia.com
For SMMUv3 a IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting as the parent and a user provided STE fragment that defines the CD table and related data with addresses translated by the S2 iommu_domain.
The kernel only permits userspace to control certain allowed bits of the STE that are safe for user control.
IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2 translation, but there is no way of knowing which S1 entries refer to a range of S2. There is no option but to discard the all the S1 entries from the IOTLB and thus flush the entire ATC.
Signed-off-by: Jason Gunthorpe jgg@nvidia.com Signed-off-by: Kunkun Jiang jiangkunkun@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 151 +++++++++++++++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 22 ++- include/uapi/linux/iommufd.h | 18 +++ 3 files changed, 187 insertions(+), 4 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 12e18ebc277a..79292a57804c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1829,6 +1829,29 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, STRTAB_STE_3_S2TTB_MASK); }
+static void arm_smmu_make_nested_domain_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + /* + * Userspace can request a non-valid STE through the nesting interface. + * We relay that into a non-valid physical STE with the intention that + * C_BAD_STE for this SID can be delivered to userspace. + */ + if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) { + memset(target, 0, sizeof(*target)); + return; + } + + arm_smmu_make_s2_domain_ste(target, master, nested_domain->s2_parent, + ats_enabled); + + target->data[0] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_0_CFG, + STRTAB_STE_0_CFG_NESTED)) | + nested_domain->ste[0]; + target->data[1] |= nested_domain->ste[1]; +} + /* * This can safely directly manipulate the STE memory without a sync sequence * because the STE table has not been installed in the SMMU yet. @@ -2257,7 +2280,16 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, if (!master->ats_enabled) continue;
- arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd); + if (master_domain->nested_parent) { + /* + * If a S2 used as a nesting parent is changed we have + * no option but to completely flush the ATC. + */ + arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); + } else { + arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, + &cmd); + }
for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; @@ -2399,7 +2431,8 @@ static void arm_smmu_tlb_inv_range_s2(struct arm_smmu_domain *smmu_domain, }, };
- if (arm_smmu_inv_range_too_big(smmu_domain->smmu, size, granule)) { + if (smmu_domain->nesting_parent || + arm_smmu_inv_range_too_big(smmu_domain->smmu, size, granule)) { cmd.opcode = CMDQ_OP_TLBI_S12_VMALL; arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd); } else { @@ -2832,6 +2865,9 @@ to_smmu_domain_devices(struct iommu_domain *domain) if ((domain->type & __IOMMU_DOMAIN_PAGING) || domain->type == IOMMU_DOMAIN_SVA) return to_smmu_domain(domain); + if (domain->type == IOMMU_DOMAIN_NESTED) + return container_of(domain, struct arm_smmu_nested_domain, + domain)->s2_parent; return NULL; }
@@ -2893,6 +2929,8 @@ static int arm_smmu_attach_prepare(struct arm_smmu_master *master, return -ENOMEM; master_domain->master = master; master_domain->ssid = state->ssid; + master_domain->nested_parent = domain->type == + IOMMU_DOMAIN_NESTED;
/* * During prepare we want the current smmu_domain and new @@ -3250,6 +3288,108 @@ static struct iommu_domain arm_smmu_blocked_domain = { .ops = &arm_smmu_blocked_ops, };
+static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_nested_domain *nested_domain = + container_of(domain, struct arm_smmu_nested_domain, domain); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct attach_state state = {.ssid = IOMMU_NO_PASID}; + struct arm_smmu_ste ste; + int ret; + + if (arm_smmu_ssids_in_use(&master->cd_table) || + nested_domain->s2_parent->smmu != master->smmu) + return -EINVAL; + + mutex_lock(&master->smmu->asid_lock); + /* + * The VM has to control the actual ATS state at the PCI device because + * we forward the invalidations directly from the VM. If the VM doesn't + * think ATS is on it will not generate ATC flushes and the ATC will + * become incoherent. Since we can't access the actual virtual PCI ATS + * config bit here base this off the EATS value in the STE. If the EATS + * is set then the VM must generate ATC flushes. + */ + state.disable_ats = !nested_domain->enable_ats; + ret = arm_smmu_attach_prepare(master, domain, &state); + if (ret) { + mutex_unlock(&master->smmu->asid_lock); + return ret; + } + arm_smmu_make_nested_domain_ste(&ste, master, nested_domain, + state.want_ats); + arm_smmu_install_ste_for_dev(master, &ste); + arm_smmu_attach_commit(master, &state); + mutex_unlock(&master->smmu->asid_lock); + return 0; +} + +static void arm_smmu_domain_nested_free(struct iommu_domain *domain) +{ + kfree(container_of(domain, struct arm_smmu_nested_domain, domain)); +} + +static const struct iommu_domain_ops arm_smmu_nested_ops = { + .attach_dev = arm_smmu_attach_dev_nested, + .free = arm_smmu_domain_nested_free, +}; + +static struct iommu_domain * +arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_nested_domain *nested_domain; + struct arm_smmu_domain *smmu_parent; + struct iommu_hwpt_arm_smmuv3 arg; + unsigned int eats; + int ret; + + if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) + return ERR_PTR(-EOPNOTSUPP); + + ret = iommu_copy_struct_from_user(&arg, user_data, + IOMMU_HWPT_DATA_ARM_SMMUV3, ste); + if (ret) + return ERR_PTR(ret); + + if (flags || !(master->smmu->features & ARM_SMMU_FEAT_TRANS_S1)) + return ERR_PTR(-EOPNOTSUPP); + + if (!(parent->type & __IOMMU_DOMAIN_PAGING)) + return ERR_PTR(-EINVAL); + + smmu_parent = to_smmu_domain(parent); + if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 || + smmu_parent->smmu != master->smmu) + return ERR_PTR(-EINVAL); + + /* EIO is reserved for invalid STE data. */ + if ((arg.ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) || + (arg.ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED)) + return ERR_PTR(-EIO); + + /* Only Full ATS or ATS UR is supported */ + eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg.ste[1])); + if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS) + return ERR_PTR(-EIO); + + nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT); + if (!nested_domain) + return ERR_PTR(-ENOMEM); + + nested_domain->domain.type = IOMMU_DOMAIN_NESTED; + nested_domain->domain.ops = &arm_smmu_nested_ops; + nested_domain->s2_parent = smmu_parent; + nested_domain->enable_ats = eats == STRTAB_STE_1_EATS_TRANS; + nested_domain->ste[0] = arg.ste[0]; + nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS); + + return &nested_domain->domain; +} + static struct iommu_domain * arm_smmu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, @@ -3260,7 +3400,11 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, struct arm_smmu_domain *smmu_domain; int ret;
- if (parent || (flags & ~PAGING_FLAGS)) + if (parent) + return arm_smmu_domain_alloc_nesting(dev, flags, parent, + user_data); + + if (flags & ~PAGING_FLAGS) return ERR_PTR(-EOPNOTSUPP); if (user_data) return ERR_PTR(-EINVAL); @@ -3275,6 +3419,7 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, goto err_free; } smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + smmu_domain->nesting_parent = true; }
smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 393e8466b66c..9e6e4a47b2d2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -247,6 +247,7 @@ struct arm_smmu_ste { #define STRTAB_STE_0_CFG_BYPASS 4 #define STRTAB_STE_0_CFG_S1_TRANS 5 #define STRTAB_STE_0_CFG_S2_TRANS 6 +#define STRTAB_STE_0_CFG_NESTED 7
#define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4) #define STRTAB_STE_0_S1FMT_LINEAR 0 @@ -297,6 +298,15 @@ struct arm_smmu_ste {
#define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4)
+/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */ +#define STRTAB_STE_0_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_S1FMT | \ + STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX) +#define STRTAB_STE_1_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \ + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \ + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS) + /* * Context descriptors. * @@ -810,13 +820,23 @@ struct arm_smmu_domain { spinlock_t devices_lock;
struct mmu_notifier mmu_notifier; - bool btm_invalidation; + bool btm_invalidation : 1; + bool nesting_parent : 1; +}; + +struct arm_smmu_nested_domain { + struct iommu_domain domain; + struct arm_smmu_domain *s2_parent; + u8 enable_ats : 1; + + __le64 ste[2]; };
struct arm_smmu_master_domain { struct list_head devices_elm; struct arm_smmu_master *master; u16 ssid; + u8 nested_parent; };
static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index be16f6f36474..79b7d1558e11 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -394,14 +394,32 @@ struct iommu_hwpt_vtd_s1 { __u32 __reserved; };
+/** + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * + * @ste: The first two double words of the user space Stream Table Entry for + * a user stage-1 Context Descriptor Table. Must be little-endian. + * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) + * - word-0: V, S1Fmt, S1ContextPtr, S1CDMax + * - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * + * -EIO will be returned if @ste is not legal or contains any non-allowed field. + */ +struct iommu_hwpt_arm_smmuv3 { + __aligned_le64 ste[2]; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE, IOMMU_HWPT_DATA_VTD_S1, + IOMMU_HWPT_DATA_ARM_SMMUV3, };
/**