From: Xiang Chen chenxiang66@hisilicon.com
When rmmod the driver of the last device in the group, cached iovas are not used, and it is better to free them to save memories. And also export function free_rcache_cached_iovas() and iommu_domain_to_iova().
Xiang Chen (4): iommu/iova: add a function to free all rcached iovas and export it iommu/iova: use function free_rcache_cached_iovas() to free all rcached iovas dma-iommu: add a interface to get iova_domain from iommu domain iommu: free cached iovas when rmmod the driver of the last device in the group
drivers/iommu/dma-iommu.c | 7 +++++++ drivers/iommu/iommu.c | 7 +++++++ drivers/iommu/iova.c | 17 ++++++++++++----- include/linux/dma-iommu.h | 6 ++++++ include/linux/iova.h | 5 +++++ 5 files changed, 37 insertions(+), 5 deletions(-)
From: Xiang Chen chenxiang66@hisilicon.com
Add a function to free all rcached iovas including cpu rcache iovas and global rcache iovas, and export it.
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com --- drivers/iommu/iova.c | 11 +++++++++++ include/linux/iova.h | 5 +++++ 2 files changed, 16 insertions(+)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index b7ecd5b..f595867 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -1077,5 +1077,16 @@ static void free_global_cached_iovas(struct iova_domain *iovad) spin_unlock_irqrestore(&rcache->lock, flags); } } + +void free_rcache_cached_iovas(struct iova_domain *iovad) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) + free_cpu_cached_iovas(cpu, iovad); + free_global_cached_iovas(iovad); +} +EXPORT_SYMBOL_GPL(free_rcache_cached_iovas); + MODULE_AUTHOR("Anil S Keshavamurthy anil.s.keshavamurthy@intel.com"); MODULE_LICENSE("GPL"); diff --git a/include/linux/iova.h b/include/linux/iova.h index 71d8a2d..7006d9f 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -157,6 +157,7 @@ int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); +void free_rcache_cached_iovas(struct iova_domain *iovad); #else static inline int iova_cache_get(void) { @@ -233,6 +234,10 @@ static inline void put_iova_domain(struct iova_domain *iovad) { }
+static inline void free_rcache_cached_iovas(struct iova_domain *iovad) +{ +} + #endif
#endif
From: Xiang Chen chenxiang66@hisilicon.com
Use function free_rcached_iovas() to free all rcached iovas instead.
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com --- drivers/iommu/iova.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index f595867..59926d5 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -503,16 +503,12 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size, retry: new_iova = alloc_iova(iovad, size, limit_pfn, true); if (!new_iova) { - unsigned int cpu; - if (!flush_rcache) return 0;
/* Try replenishing IOVAs by flushing rcache. */ flush_rcache = false; - for_each_online_cpu(cpu) - free_cpu_cached_iovas(cpu, iovad); - free_global_cached_iovas(iovad); + free_rcache_cached_iovas(iovad); goto retry; }
From: Xiang Chen chenxiang66@hisilicon.com
Add a function iommu_domain_to_iova() to get iova_domain from iommu domain.
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com --- drivers/iommu/dma-iommu.c | 7 +++++++ include/linux/dma-iommu.h | 6 ++++++ 2 files changed, 13 insertions(+)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 7bcdd12..a1431a9 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -54,6 +54,13 @@ struct iommu_dma_cookie { static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled); bool iommu_dma_forcedac __read_mostly;
+struct iova_domain *iommu_domain_to_iova(struct iommu_domain *domain) +{ + struct iommu_dma_cookie *cookie = domain->iova_cookie; + + return &cookie->iovad; +} + static int __init iommu_dma_forcedac_setup(char *str) { int ret = kstrtobool(str, &iommu_dma_forcedac); diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 6e75a2d..8577122 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -42,6 +42,8 @@ void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
extern bool iommu_dma_forcedac;
+struct iova_domain *iommu_domain_to_iova(struct iommu_domain *domain); + #else /* CONFIG_IOMMU_DMA */
struct iommu_domain; @@ -83,5 +85,9 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he { }
+struct iova_domain *iommu_domain_to_iova(struct iommu_domain *domain) +{ +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */
From: Xiang Chen chenxiang66@hisilicon.com
When rmmod the driver of the last device in the domain, cached iovas are not used and it is better to free them to save memories.
Signed-off-by: Xiang Chen chenxiang66@hisilicon.com --- drivers/iommu/iommu.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 808ab70..d318d80 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -9,6 +9,7 @@ #include <linux/device.h> #include <linux/kernel.h> #include <linux/bug.h> +#include <linux/dma-iommu.h> #include <linux/types.h> #include <linux/init.h> #include <linux/export.h> @@ -16,6 +17,7 @@ #include <linux/errno.h> #include <linux/iommu.h> #include <linux/idr.h> +#include <linux/iova.h> #include <linux/notifier.h> #include <linux/err.h> #include <linux/pci.h> @@ -1667,6 +1669,11 @@ static int iommu_bus_notifier(struct notifier_block *nb, group_action = IOMMU_GROUP_NOTIFY_UNBIND_DRIVER; break; case BUS_NOTIFY_UNBOUND_DRIVER: + if (iommu_group_device_count(group) == 1) { + struct iommu_domain *domain = group->domain; + + free_rcache_cached_iovas(iommu_domain_to_iova(domain)); + } group_action = IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER; break; }
On 2021-06-07 03:42, chenxiang wrote:
From: Xiang Chen chenxiang66@hisilicon.com
When rmmod the driver of the last device in the group, cached iovas are not used, and it is better to free them to save memories. And also export function free_rcache_cached_iovas() and iommu_domain_to_iova().
How common is it to use a device a significant amount, then unbind its driver and not use it for long enough to care about? Off-hand I can't think of a particularly realistic use-case which looks like that - the closest I can imagine is unbinding a default kernel driver to replace it with VFIO, but I would expect the set of devices intended for assignment to be distinct from the set of devices used by the host itself, and thus the host driver wouldn't have actually done much to generate a lot of DMA mappings in that initial period. Is my expectation there wrong?
If there is such a case, how much memory does this actually save in practice? The theoretical worst-case should be roughly 4 * 128 * 6 * sizeof(struct iova) bytes per CPU, which is around 192KB assuming 64-byte kmem cache alignment. However it seems rather unlikely in practice to have every possible cache entry of every size used, so if saving smaller amounts of memory is a concern wouldn't you also want to explicitly drain the flush queues (16KB per CPU) and maybe look at trying to reclaim the unused pagetable pages from the domain itself - that ~192KB worth of cached IOVAs represents ~32K pages worth of IOVA space, which for an implementation like io-pgtable-arm with the 4KB granule means ~256KB worth of non-leaf pagetables left behind.
I'm not against the idea of having a mechanism to "compact" an idle DMA domain if there are convincing numbers to back it up, but the actual implementation would need to be better than this as well - having the IOMMU core code poking directly into the internals of the iommu-dma abstraction is not the way to go, and exporting anything to modules, which the IOMU core is not, smells suspicious.
Robin.
Xiang Chen (4): iommu/iova: add a function to free all rcached iovas and export it iommu/iova: use function free_rcache_cached_iovas() to free all rcached iovas dma-iommu: add a interface to get iova_domain from iommu domain iommu: free cached iovas when rmmod the driver of the last device in the group
drivers/iommu/dma-iommu.c | 7 +++++++ drivers/iommu/iommu.c | 7 +++++++ drivers/iommu/iova.c | 17 ++++++++++++----- include/linux/dma-iommu.h | 6 ++++++ include/linux/iova.h | 5 +++++ 5 files changed, 37 insertions(+), 5 deletions(-)
Hi Robin,
在 2021/6/7 19:23, Robin Murphy 写道:
On 2021-06-07 03:42, chenxiang wrote:
From: Xiang Chen chenxiang66@hisilicon.com
When rmmod the driver of the last device in the group, cached iovas are not used, and it is better to free them to save memories. And also export function free_rcache_cached_iovas() and iommu_domain_to_iova().
How common is it to use a device a significant amount, then unbind its driver and not use it for long enough to care about? Off-hand I can't think of a particularly realistic use-case which looks like that - the closest I can imagine is unbinding a default kernel driver to replace it with VFIO, but I would expect the set of devices intended for assignment to be distinct from the set of devices used by the host itself, and thus the host driver wouldn't have actually done much to generate a lot of DMA mappings in that initial period. Is my expectation there wrong?
It is possible that user uses the driver for a while and then rmmod it (for example, SAS card is the driver we use always, but sometimes we need to compare the performance with raid card, so we will insmod the raid driver, and rmmod the driver after the test). At this situation, the rcache iovas of raid card are always still even if rmmod the driver (user always rmmod the driver rather than removing the device which will release the group and release all resources).
If there is such a case, how much memory does this actually save in practice? The theoretical worst-case should be roughly 4 * 128 * 6 * sizeof(struct iova) bytes per CPU, which is around 192KB assuming 64-byte kmem cache alignment.
There are 128 CPUs in our ARM64 kunpeng920 board, and i add a debugfs interface drop_rcache of every group in local code which calls function free_rcache_cached_iovas(), and we can see that it saves 1~2M memory after freeing cached iovas.
estuary:/$ free total used free shared buffers cached Mem: 526776720 1336216 525440504 177664 0 177664 -/+ buffers/cache: 1158552 525618168 Swap: 0 0 0 estuary:/sys/kernel/debug/iommu/iovad/iommu_domain29$ echo 1 > drop_rcache estuary:/sys/kernel/debug/iommu/iovad/iommu_domain29$ free total used free shared buffers cached Mem: 526776720 1334672 525442048 177664 0 177664 -/+ buffers/cache: 1157008 525619712 Swap: 0 0 0
The other reason i want to free rcache iovas in suitable place is the IOVA longterm issue[0] (which is unrecoverable), if freeing cached iovas when rmmod the driver or adding a debugfs to do that, i think we can recover it by rmmod the driver and then insmod it or calling the debugfs drop_rcache though it doesn't solve the issue.
[0] https://lore.kernel.org/linux-iommu/1607538189-237944-4-git-send-email-john....
However it seems rather unlikely in practice to have every possible cache entry of every size used, so if saving smaller amounts of memory is a concern wouldn't you also want to explicitly drain the flush queues (16KB per CPU) and maybe look at trying to reclaim the unused pagetable pages from the domain itself - that ~192KB worth of cached IOVAs represents ~32K pages worth of IOVA space, which for an implementation like io-pgtable-arm with the 4KB granule means ~256KB worth of non-leaf pagetables left behind.
Ok, we may consider about those.
I'm not against the idea of having a mechanism to "compact" an idle DMA domain if there are convincing numbers to back it up, but the actual implementation would need to be better than this as well - having the IOMMU core code poking directly into the internals of the iommu-dma abstraction is not the way to go, and exporting anything to modules, which the IOMU core is not, smells suspicious.
Robin.
Xiang Chen (4): iommu/iova: add a function to free all rcached iovas and export it iommu/iova: use function free_rcache_cached_iovas() to free all rcached iovas dma-iommu: add a interface to get iova_domain from iommu domain iommu: free cached iovas when rmmod the driver of the last device in the group
drivers/iommu/dma-iommu.c | 7 +++++++ drivers/iommu/iommu.c | 7 +++++++ drivers/iommu/iova.c | 17 ++++++++++++----- include/linux/dma-iommu.h | 6 ++++++ include/linux/iova.h | 5 +++++ 5 files changed, 37 insertions(+), 5 deletions(-)
Linuxarm mailing list -- linuxarm@openeuler.org To unsubscribe send an email to linuxarm-leave@openeuler.org
On 2021-06-08 08:50, chenxiang (M) wrote:
Hi Robin,
在 2021/6/7 19:23, Robin Murphy 写道:
On 2021-06-07 03:42, chenxiang wrote:
From: Xiang Chen chenxiang66@hisilicon.com
When rmmod the driver of the last device in the group, cached iovas are not used, and it is better to free them to save memories. And also export function free_rcache_cached_iovas() and iommu_domain_to_iova().
How common is it to use a device a significant amount, then unbind its driver and not use it for long enough to care about? Off-hand I can't think of a particularly realistic use-case which looks like that - the closest I can imagine is unbinding a default kernel driver to replace it with VFIO, but I would expect the set of devices intended for assignment to be distinct from the set of devices used by the host itself, and thus the host driver wouldn't have actually done much to generate a lot of DMA mappings in that initial period. Is my expectation there wrong?
It is possible that user uses the driver for a while and then rmmod it (for example, SAS card is the driver we use always, but sometimes we need to compare the performance with raid card, so we will insmod the raid driver, and rmmod the driver after the test). At this situation, the rcache iovas of raid card are always still even if rmmod the driver (user always rmmod the driver rather than removing the device which will release the group and release all resources).
OK, so my question is how many end users are doing that on production systems with distro kernels, such that this feature deserves being supported and maintained in mainline?
If there is such a case, how much memory does this actually save in practice? The theoretical worst-case should be roughly 4 * 128 * 6 * sizeof(struct iova) bytes per CPU, which is around 192KB assuming 64-byte kmem cache alignment.
There are 128 CPUs in our ARM64 kunpeng920 board, and i add a debugfs interface drop_rcache of every group in local code which calls function free_rcache_cached_iovas(), and we can see that it saves 1~2M memory after freeing cached iovas.
estuary:/$ free total used free shared buffers cached Mem: 526776720 1336216 525440504 177664 0 177664 -/+ buffers/cache: 1158552 525618168 Swap: 0 0 0 estuary:/sys/kernel/debug/iommu/iovad/iommu_domain29$ echo 1 > drop_rcache estuary:/sys/kernel/debug/iommu/iovad/iommu_domain29$ free total used free shared buffers cached Mem: 526776720 1334672 525442048 177664 0 177664 -/+ buffers/cache: 1157008 525619712 Swap: 0 0 0
Erm, isn't 12KB from 4GB (per CPU) basically just noise? :/
Sure, it might start to look a bit more significant on a quad-core TV box with 1GB of RAM total, but then something like that isn't ever going to be driving a big multi-queue SAS adapter to churn that many DMA mappings in the first place.
Consider that a "memory saving" feature which only has some minimal benefit for niche use-cases on very large systems, but costs a constant amount of memory in terms of code and symbol bloat in the kernel image, even on actually-memory-constrained systems, is rather counterproductive overall. As I said before, I do like the general idea *if* there's a significant saving to be had from it, but what you're showing us here suggests that there isn't :(
Note also that on arm64 systems, commit 65688d2a05de ("arm64: cache: Lower ARCH_DMA_MINALIGN to 64 (L1_CACHE_BYTES)") queued in linux-next is already likely to cut everything in half here.
The other reason i want to free rcache iovas in suitable place is the IOVA longterm issue[0] (which is unrecoverable), if freeing cached iovas when rmmod the driver or adding a debugfs to do that, i think we can recover it by rmmod the driver and then insmod it or calling the debugfs drop_rcache though it doesn't solve the issue.
Or perhaps we could, y'know, actually solve that problem, rather than invent hacks to bodge around it. I've lost track of where we got to with that thread (sorry!), but I think it still needs someone to look into the relative timing of all the flush queue interactions if we're to really understand what's going on, and if a simple obvious fix doesn't fall out of that then it probably makes sense to convert the rcache depot to a garbage-collected list (per the original concept).
Much as I think that purging the rcaches automatically to paper over that issue isn't a great idea, making the user do it manually is certainly even worse.
Robin.
[0] https://lore.kernel.org/linux-iommu/1607538189-237944-4-git-send-email-john....
However it seems rather unlikely in practice to have every possible cache entry of every size used, so if saving smaller amounts of memory is a concern wouldn't you also want to explicitly drain the flush queues (16KB per CPU) and maybe look at trying to reclaim the unused pagetable pages from the domain itself - that ~192KB worth of cached IOVAs represents ~32K pages worth of IOVA space, which for an implementation like io-pgtable-arm with the 4KB granule means ~256KB worth of non-leaf pagetables left behind.
Ok, we may consider about those.
I'm not against the idea of having a mechanism to "compact" an idle DMA domain if there are convincing numbers to back it up, but the actual implementation would need to be better than this as well - having the IOMMU core code poking directly into the internals of the iommu-dma abstraction is not the way to go, and exporting anything to modules, which the IOMU core is not, smells suspicious.
Robin.
Xiang Chen (4): iommu/iova: add a function to free all rcached iovas and export it iommu/iova: use function free_rcache_cached_iovas() to free all rcached iovas dma-iommu: add a interface to get iova_domain from iommu domain iommu: free cached iovas when rmmod the driver of the last device in the group
drivers/iommu/dma-iommu.c | 7 +++++++ drivers/iommu/iommu.c | 7 +++++++ drivers/iommu/iova.c | 17 ++++++++++++----- include/linux/dma-iommu.h | 6 ++++++ include/linux/iova.h | 5 +++++ 5 files changed, 37 insertions(+), 5 deletions(-)
Linuxarm mailing list -- linuxarm@openeuler.org To unsubscribe send an email to linuxarm-leave@openeuler.org