Juergen Gross (4): xen/events: add a new "late EOI" evtchn framework xen/events: switch user event channels to lateeoi model xen/events: use a common cpu hotplug hook for event channels xen/events: defer eoi in case of excessive number of events
.../admin-guide/kernel-parameters.txt | 8 + drivers/xen/events/events_2l.c | 7 +- drivers/xen/events/events_base.c | 363 +++++++++++++++++- drivers/xen/events/events_fifo.c | 70 ++-- drivers/xen/events/events_internal.h | 17 +- drivers/xen/evtchn.c | 7 +- include/xen/events.h | 21 + 7 files changed, 420 insertions(+), 73 deletions(-)
From: Juergen Gross jgross@suse.com
mainline inclusion from mainline-v5.10 commit 54c9de89895e0a36047fcc4ae754ea5b8655fb9d category: bugfix bugzilla: NA CVE: CVE-2020-27673
Prepare for fixing CVE-2020-27673 --------------------------------
In order to avoid tight event channel related IRQ loops add a new framework of "late EOI" handling: the IRQ the event channel is bound to will be masked until the event has been handled and the related driver is capable to handle another event. The driver is responsible for unmasking the event channel via the new function xen_irq_lateeoi().
This is similar to binding an event channel to a threaded IRQ, but without having to structure the driver accordingly.
In order to support a future special handling in case a rogue guest is sending lots of unsolicited events, add a flag to xen_irq_lateeoi() which can be set by the caller to indicate the event was a spurious one.
This is part of XSA-332.
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Wei Liu wl@xen.org Conflicts: drivers/xen/events/events_base.c include/xen/events.h [yyl: replace evtchn_port_t with usigned int] Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/xen/events/events_base.c | 151 +++++++++++++++++++++++++++---- include/xen/events.h | 21 +++++ 2 files changed, 155 insertions(+), 17 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 75eade7d2017..b199f946f508 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -109,6 +109,7 @@ static bool (*pirq_needs_eoi)(unsigned irq); #define VALID_EVTCHN(chn) ((chn) != 0)
static struct irq_chip xen_dynamic_chip; +static struct irq_chip xen_lateeoi_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); @@ -382,6 +383,33 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+static void xen_irq_lateeoi_locked(struct irq_info *info) +{ + unsigned int evtchn; + + evtchn = info->evtchn; + if (!VALID_EVTCHN(evtchn)) + return; + + unmask_evtchn(evtchn); +} + +void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) +{ + struct irq_info *info; + unsigned long flags; + + read_lock_irqsave(&evtchn_rwlock, flags); + + info = info_for_irq(irq); + + if (info) + xen_irq_lateeoi_locked(info); + + read_unlock_irqrestore(&evtchn_rwlock, flags); +} +EXPORT_SYMBOL_GPL(xen_irq_lateeoi); + static void xen_irq_init(unsigned irq) { struct irq_info *info; @@ -853,7 +881,7 @@ int xen_pirq_from_irq(unsigned irq) } EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
-int bind_evtchn_to_irq(unsigned int evtchn) +static int bind_evtchn_to_irq_chip(unsigned int evtchn, struct irq_chip *chip) { int irq; int ret; @@ -870,7 +898,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) if (irq < 0) goto out;
- irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "event");
ret = xen_irq_info_evtchn_setup(irq, evtchn); @@ -891,8 +919,19 @@ int bind_evtchn_to_irq(unsigned int evtchn)
return irq; } + +int bind_evtchn_to_irq(unsigned int evtchn) +{ + return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip); +} EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+int bind_evtchn_to_irq_lateeoi(unsigned int evtchn) +{ + return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi); + static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; @@ -934,8 +973,9 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) return irq; }
-int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, - unsigned int remote_port) +static int bind_interdomain_evtchn_to_irq_chip(unsigned int remote_domain, + unsigned int remote_port, + struct irq_chip *chip) { struct evtchn_bind_interdomain bind_interdomain; int err; @@ -946,10 +986,26 @@ int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &bind_interdomain);
- return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); + return err ? : bind_evtchn_to_irq_chip(bind_interdomain.local_port, + chip); +} + +int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port) +{ + return bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port, + &xen_dynamic_chip); } EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq);
+int bind_interdomain_evtchn_to_irq_lateeoi(unsigned int remote_domain, + unsigned int remote_port) +{ + return bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port, + &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi); + static int find_virq(unsigned int virq, unsigned int cpu) { struct evtchn_status status; @@ -1045,14 +1101,15 @@ static void unbind_from_irq(unsigned int irq) mutex_unlock(&irq_mapping_update_lock); }
-int bind_evtchn_to_irqhandler(unsigned int evtchn, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, void *dev_id) +static int bind_evtchn_to_irqhandler_chip(unsigned int evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id, + struct irq_chip *chip) { int irq, retval;
- irq = bind_evtchn_to_irq(evtchn); + irq = bind_evtchn_to_irq_chip(evtchn, chip); if (irq < 0) return irq; retval = request_irq(irq, handler, irqflags, devname, dev_id); @@ -1063,18 +1120,38 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
return irq; } + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) +{ + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, + &xen_dynamic_chip); +} EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, - unsigned int remote_port, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) +int bind_evtchn_to_irqhandler_lateeoi(unsigned int evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) +{ + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, + &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi); + +static int bind_interdomain_evtchn_to_irqhandler_chip( + unsigned int remote_domain, unsigned int remote_port, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, struct irq_chip *chip) { int irq, retval;
- irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); + irq = bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port, + chip); if (irq < 0) return irq;
@@ -1086,8 +1163,33 @@ int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
return irq; } + +int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, + unsigned int remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + return bind_interdomain_evtchn_to_irqhandler_chip(remote_domain, + remote_port, handler, irqflags, devname, + dev_id, &xen_dynamic_chip); +} EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+int bind_interdomain_evtchn_to_irqhandler_lateeoi(unsigned int remote_domain, + unsigned int remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + return bind_interdomain_evtchn_to_irqhandler_chip(remote_domain, + remote_port, handler, irqflags, devname, + dev_id, &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi); + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) @@ -1631,6 +1733,21 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .irq_retrigger = retrigger_dynirq, };
+static struct irq_chip xen_lateeoi_chip __read_mostly = { + /* The chip name needs to contain "xen-dyn" for irqbalance to work. */ + .name = "xen-dyn-lateeoi", + + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = mask_ack_dynirq, + .irq_mask_ack = mask_ack_dynirq, + + .irq_set_affinity = set_affinity_irq, + .irq_retrigger = retrigger_dynirq, +}; + static struct irq_chip xen_pirq_chip __read_mostly = { .name = "xen-pirq",
diff --git a/include/xen/events.h b/include/xen/events.h index 1650d39decae..80692331c451 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -15,10 +15,15 @@ unsigned xen_evtchn_nr_channels(void);
int bind_evtchn_to_irq(unsigned int evtchn); +int bind_evtchn_to_irq_lateeoi(unsigned int evtchn); int bind_evtchn_to_irqhandler(unsigned int evtchn, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); +int bind_evtchn_to_irqhandler_lateeoi(unsigned int evtchn, + irq_handler_t handler, + unsigned long irqflags, const char *devname, + void *dev_id); int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu); int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, irq_handler_t handler, @@ -32,12 +37,20 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, void *dev_id); int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, unsigned int remote_port); +int bind_interdomain_evtchn_to_irq_lateeoi(unsigned int remote_domain, + unsigned int remote_port); int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, unsigned int remote_port, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); +int bind_interdomain_evtchn_to_irqhandler_lateeoi(unsigned int remote_domain, + unsigned int remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id);
/* * Common unbind function for all event sources. Takes IRQ to unbind from. @@ -46,6 +59,14 @@ int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, */ void unbind_from_irqhandler(unsigned int irq, void *dev_id);
+/* + * Send late EOI for an IRQ bound to an event channel via one of the *_lateeoi + * functions above. + */ +void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags); +/* Signal an event was spurious, i.e. there was no action resulting from it. */ +#define XEN_EOI_FLAG_SPURIOUS 0x00000001 + #define XEN_IRQ_PRIORITY_MAX EVTCHN_FIFO_PRIORITY_MAX #define XEN_IRQ_PRIORITY_DEFAULT EVTCHN_FIFO_PRIORITY_DEFAULT #define XEN_IRQ_PRIORITY_MIN EVTCHN_FIFO_PRIORITY_MIN
From: Juergen Gross jgross@suse.com
mainline inclusion from mainline-v5.10 commit c44b849cee8c3ac587da3b0980e01f77500d158c category: bugfix bugzilla: NA CVE: NA
--------------------------------
Instead of disabling the irq when an event is received and enabling it again when handled by the user process use the lateeoi model.
This is part of XSA-332.
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Tested-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Wei Liu wl@xen.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/xen/evtchn.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 47c70b826a6a..4b11e60e37a3 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -166,7 +166,6 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) "Interrupt for port %d, but apparently not enabled; per-user %p\n", evtchn->port, u);
- disable_irq_nosync(irq); evtchn->enabled = false;
spin_lock(&u->ring_prod_lock); @@ -292,7 +291,7 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, evtchn = find_evtchn(u, port); if (evtchn && !evtchn->enabled) { evtchn->enabled = true; - enable_irq(irq_from_evtchn(port)); + xen_irq_lateeoi(irq_from_evtchn(port), 0); } }
@@ -392,8 +391,8 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) if (rc < 0) goto err;
- rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, - u->name, evtchn); + rc = bind_evtchn_to_irqhandler_lateeoi(port, evtchn_interrupt, 0, + u->name, evtchn); if (rc < 0) goto err;
From: Juergen Gross jgross@suse.com
mainline inclusion from mainline-v5.10 commit 7beb290caa2adb0a399e735a1e175db9aae0523a category: bugfix bugzilla: NA CVE: CVE-2020-27673
Prepare for fixing CVE-2020-27673 --------------------------------
Today only fifo event channels have a cpu hotplug callback. In order to prepare for more percpu (de)init work move that callback into events_base.c and add percpu_init() and percpu_deinit() hooks to struct evtchn_ops.
This is part of XSA-332.
Cc: stable@vger.kernel.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Wei Liu wl@xen.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/xen/events/events_base.c | 25 +++++++++++++++++ drivers/xen/events/events_fifo.c | 40 +++++++++++++--------------- drivers/xen/events/events_internal.h | 3 +++ 3 files changed, 47 insertions(+), 21 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b199f946f508..33ccb442ccb4 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -33,6 +33,7 @@ #include <linux/irqnr.h> #include <linux/pci.h> #include <linux/spinlock.h> +#include <linux/cpuhotplug.h>
#ifdef CONFIG_X86 #include <asm/desc.h> @@ -1820,6 +1821,26 @@ void xen_callback_vector(void) {} static bool fifo_events = true; module_param(fifo_events, bool, 0);
+static int xen_evtchn_cpu_prepare(unsigned int cpu) +{ + int ret = 0; + + if (evtchn_ops->percpu_init) + ret = evtchn_ops->percpu_init(cpu); + + return ret; +} + +static int xen_evtchn_cpu_dead(unsigned int cpu) +{ + int ret = 0; + + if (evtchn_ops->percpu_deinit) + ret = evtchn_ops->percpu_deinit(cpu); + + return ret; +} + void __init xen_init_IRQ(void) { int ret = -EINVAL; @@ -1830,6 +1851,10 @@ void __init xen_init_IRQ(void) if (ret < 0) xen_evtchn_2l_init();
+ cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE, + "xen/evtchn:prepare", + xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead); + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), sizeof(*evtchn_to_irq), GFP_KERNEL); BUG_ON(!evtchn_to_irq); diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index 76b318e88382..301fd64a61d3 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -380,21 +380,6 @@ static void evtchn_fifo_resume(void) event_array_pages = 0; }
-static const struct evtchn_ops evtchn_ops_fifo = { - .max_channels = evtchn_fifo_max_channels, - .nr_channels = evtchn_fifo_nr_channels, - .setup = evtchn_fifo_setup, - .bind_to_cpu = evtchn_fifo_bind_to_cpu, - .clear_pending = evtchn_fifo_clear_pending, - .set_pending = evtchn_fifo_set_pending, - .is_pending = evtchn_fifo_is_pending, - .test_and_set_mask = evtchn_fifo_test_and_set_mask, - .mask = evtchn_fifo_mask, - .unmask = evtchn_fifo_unmask, - .handle_events = evtchn_fifo_handle_events, - .resume = evtchn_fifo_resume, -}; - static int evtchn_fifo_alloc_control_block(unsigned cpu) { void *control_block = NULL; @@ -417,19 +402,36 @@ static int evtchn_fifo_alloc_control_block(unsigned cpu) return ret; }
-static int xen_evtchn_cpu_prepare(unsigned int cpu) +static int evtchn_fifo_percpu_init(unsigned int cpu) { if (!per_cpu(cpu_control_block, cpu)) return evtchn_fifo_alloc_control_block(cpu); return 0; }
-static int xen_evtchn_cpu_dead(unsigned int cpu) +static int evtchn_fifo_percpu_deinit(unsigned int cpu) { __evtchn_fifo_handle_events(cpu, true); return 0; }
+static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, + .percpu_init = evtchn_fifo_percpu_init, + .percpu_deinit = evtchn_fifo_percpu_deinit, +}; + int __init xen_evtchn_fifo_init(void) { int cpu = smp_processor_id(); @@ -443,9 +445,5 @@ int __init xen_evtchn_fifo_init(void)
evtchn_ops = &evtchn_ops_fifo;
- cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE, - "xen/evtchn:prepare", - xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead); - return ret; } diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 50c2050a1e32..980a56d51e4c 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -71,6 +71,9 @@ struct evtchn_ops {
void (*handle_events)(unsigned cpu); void (*resume)(void); + + int (*percpu_init)(unsigned int cpu); + int (*percpu_deinit)(unsigned int cpu); };
extern const struct evtchn_ops *evtchn_ops;
From: Juergen Gross jgross@suse.com
mainline inclusion from mainline-v5.10 commit e99502f76271d6bc4e374fe368c50c67a1fd3070 category: bugfix bugzilla: NA CVE: CVE-2020-27673
--------------------------------
In case rogue guests are sending events at high frequency it might happen that xen_evtchn_do_upcall() won't stop processing events in dom0. As this is done in irq handling a crash might be the result.
In order to avoid that, delay further inter-domain events after some time in xen_evtchn_do_upcall() by forcing eoi processing into a worker on the same cpu, thus inhibiting new events coming in.
The time after which eoi processing is to be delayed is configurable via a new module parameter "event_loop_timeout" which specifies the maximum event loop time in jiffies (default: 2, the value was chosen after some tests showing that a value of 2 was the lowest with an only slight drop of dom0 network throughput while multiple guests performed an event storm).
How long eoi processing will be delayed can be specified via another parameter "event_eoi_delay" (again in jiffies, default 10, again the value was chosen after testing with different delay values).
This is part of XSA-332.
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Wei Liu wl@xen.org Conflicts: Documentation/admin-guide/kernel-parameters.txt drivers/xen/events/events_base.c drivers/xen/events/events_fifo.c drivers/xen/events/events_internal.h [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- .../admin-guide/kernel-parameters.txt | 8 + drivers/xen/events/events_2l.c | 7 +- drivers/xen/events/events_base.c | 189 +++++++++++++++++- drivers/xen/events/events_fifo.c | 30 +-- drivers/xen/events/events_internal.h | 14 +- 5 files changed, 216 insertions(+), 32 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 514b884e8105..5b147d5dfe9e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5335,6 +5335,14 @@ Disables the ticketlock slowpath using Xen PV optimizations.
+ xen.event_eoi_delay= [XEN] + How long to delay EOI handling in case of event + storms (jiffies). Default is 10. + + xen.event_loop_timeout= [XEN] + After which time (jiffies) the event handling loop + should start to delay EOI handling. Default is 2. + xen_nopv [X86] Disables the PV optimizations forcing the HVM guest to run as generic HVM guest with no PV drivers. diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index 8edef51c92e5..5478c761cd41 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -159,7 +159,7 @@ static inline xen_ulong_t active_evtchns(unsigned int cpu, * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. */ -static void evtchn_2l_handle_events(unsigned cpu) +static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl) { int irq; xen_ulong_t pending_words; @@ -240,10 +240,7 @@ static void evtchn_2l_handle_events(unsigned cpu)
/* Process port. */ port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; - irq = get_evtchn_to_irq(port); - - if (irq != -1) - generic_handle_irq(irq); + handle_irq_for_port(port, ctrl);
bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD;
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 33ccb442ccb4..73df74c1bfc6 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -34,6 +34,8 @@ #include <linux/pci.h> #include <linux/spinlock.h> #include <linux/cpuhotplug.h> +#include <linux/atomic.h> +#include <linux/ktime.h>
#ifdef CONFIG_X86 #include <asm/desc.h> @@ -63,6 +65,15 @@
#include "events_internal.h"
+#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static uint __read_mostly event_loop_timeout = 2; +module_param(event_loop_timeout, uint, 0644); + +static uint __read_mostly event_eoi_delay = 10; +module_param(event_eoi_delay, uint, 0644); + const struct evtchn_ops *evtchn_ops;
/* @@ -86,6 +97,7 @@ static DEFINE_RWLOCK(evtchn_rwlock); * irq_mapping_update_lock * evtchn_rwlock * IRQ-desc lock + * percpu eoi_list_lock */
static LIST_HEAD(xen_irq_list_head); @@ -116,6 +128,8 @@ static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); static void disable_dynirq(struct irq_data *data);
+static DEFINE_PER_CPU(unsigned int, irq_epoch); + static void clear_evtchn_to_irq_row(unsigned row) { unsigned col; @@ -384,17 +398,120 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+struct lateeoi_work { + struct delayed_work delayed; + spinlock_t eoi_list_lock; + struct list_head eoi_list; +}; + +static DEFINE_PER_CPU(struct lateeoi_work, lateeoi); + +static void lateeoi_list_del(struct irq_info *info) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu); + unsigned long flags; + + spin_lock_irqsave(&eoi->eoi_list_lock, flags); + list_del_init(&info->eoi_list); + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); +} + +static void lateeoi_list_add(struct irq_info *info) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu); + struct irq_info *elem; + u64 now = get_jiffies_64(); + unsigned long delay; + unsigned long flags; + + if (now < info->eoi_time) + delay = info->eoi_time - now; + else + delay = 1; + + spin_lock_irqsave(&eoi->eoi_list_lock, flags); + + if (list_empty(&eoi->eoi_list)) { + list_add(&info->eoi_list, &eoi->eoi_list); + mod_delayed_work_on(info->eoi_cpu, system_wq, + &eoi->delayed, delay); + } else { + list_for_each_entry_reverse(elem, &eoi->eoi_list, eoi_list) { + if (elem->eoi_time <= info->eoi_time) + break; + } + list_add(&info->eoi_list, &elem->eoi_list); + } + + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); +} + static void xen_irq_lateeoi_locked(struct irq_info *info) { unsigned int evtchn; + unsigned int cpu;
evtchn = info->evtchn; - if (!VALID_EVTCHN(evtchn)) + if (!VALID_EVTCHN(evtchn) || !list_empty(&info->eoi_list)) return;
+ cpu = info->eoi_cpu; + if (info->eoi_time && info->irq_epoch == per_cpu(irq_epoch, cpu)) { + lateeoi_list_add(info); + return; + } + + info->eoi_time = 0; unmask_evtchn(evtchn); }
+static void xen_irq_lateeoi_worker(struct work_struct *work) +{ + struct lateeoi_work *eoi; + struct irq_info *info; + u64 now = get_jiffies_64(); + unsigned long flags; + + eoi = container_of(to_delayed_work(work), struct lateeoi_work, delayed); + + read_lock_irqsave(&evtchn_rwlock, flags); + + while (true) { + spin_lock(&eoi->eoi_list_lock); + + info = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, + eoi_list); + + if (info == NULL || now < info->eoi_time) { + spin_unlock(&eoi->eoi_list_lock); + break; + } + + list_del_init(&info->eoi_list); + + spin_unlock(&eoi->eoi_list_lock); + + info->eoi_time = 0; + + xen_irq_lateeoi_locked(info); + } + + if (info) + mod_delayed_work_on(info->eoi_cpu, system_wq, + &eoi->delayed, info->eoi_time - now); + + read_unlock_irqrestore(&evtchn_rwlock, flags); +} + +static void xen_cpu_init_eoi(unsigned int cpu) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, cpu); + + INIT_DELAYED_WORK(&eoi->delayed, xen_irq_lateeoi_worker); + spin_lock_init(&eoi->eoi_list_lock); + INIT_LIST_HEAD(&eoi->eoi_list); +} + void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) { struct irq_info *info; @@ -414,6 +531,7 @@ EXPORT_SYMBOL_GPL(xen_irq_lateeoi); static void xen_irq_init(unsigned irq) { struct irq_info *info; + #ifdef CONFIG_SMP /* By default all event channels notify CPU#0. */ cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0)); @@ -428,6 +546,7 @@ static void xen_irq_init(unsigned irq)
irq_set_chip_data(irq, info);
+ INIT_LIST_HEAD(&info->eoi_list); list_add_tail(&info->list, &xen_irq_list_head); }
@@ -483,6 +602,9 @@ static void xen_free_irq(unsigned irq)
write_lock_irqsave(&evtchn_rwlock, flags);
+ if (!list_empty(&info->eoi_list)) + lateeoi_list_del(info); + list_del(&info->list);
irq_set_chip_data(irq, NULL); @@ -1342,12 +1464,61 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) notify_remote_via_irq(irq); }
+struct evtchn_loop_ctrl { + ktime_t timeout; + unsigned count; + bool defer_eoi; +}; + +void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) +{ + int irq; + struct irq_info *info; + + irq = get_evtchn_to_irq(port); + if (irq == -1) + return; + + /* + * Check for timeout every 256 events. + * We are setting the timeout value only after the first 256 + * events in order to not hurt the common case of few loop + * iterations. The 256 is basically an arbitrary value. + * + * In case we are hitting the timeout we need to defer all further + * EOIs in order to ensure to leave the event handling loop rather + * sooner than later. + */ + if (!ctrl->defer_eoi && !(++ctrl->count & 0xff)) { + ktime_t kt = ktime_get(); + + if (!ctrl->timeout) { + kt = ktime_add_ms(kt, + jiffies_to_msecs(event_loop_timeout)); + ctrl->timeout = kt; + } else if (kt > ctrl->timeout) { + ctrl->defer_eoi = true; + } + } + + info = info_for_irq(irq); + + if (ctrl->defer_eoi) { + info->eoi_cpu = smp_processor_id(); + info->irq_epoch = __this_cpu_read(irq_epoch); + info->eoi_time = get_jiffies_64() + event_eoi_delay; + } + + generic_handle_irq(irq); +} + static DEFINE_PER_CPU(unsigned, xed_nesting_count);
static void __xen_evtchn_do_upcall(void) { struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); int cpu = get_cpu(); + struct evtchn_loop_ctrl ctrl = { 0 }; unsigned count;
read_lock(&evtchn_rwlock); @@ -1358,7 +1529,7 @@ static void __xen_evtchn_do_upcall(void) if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out;
- xen_evtchn_handle_events(cpu); + xen_evtchn_handle_events(cpu, &ctrl);
BUG_ON(!irqs_disabled());
@@ -1369,6 +1540,13 @@ static void __xen_evtchn_do_upcall(void) out:
read_unlock(&evtchn_rwlock); + + /* + * Increment irq_epoch only now to defer EOIs only for + * xen_irq_lateeoi() invocations occurring from inside the loop + * above. + */ + __this_cpu_inc(irq_epoch); put_cpu(); }
@@ -1815,9 +1993,6 @@ void xen_callback_vector(void) void xen_callback_vector(void) {} #endif
-#undef MODULE_PARAM_PREFIX -#define MODULE_PARAM_PREFIX "xen." - static bool fifo_events = true; module_param(fifo_events, bool, 0);
@@ -1825,6 +2000,8 @@ static int xen_evtchn_cpu_prepare(unsigned int cpu) { int ret = 0;
+ xen_cpu_init_eoi(cpu); + if (evtchn_ops->percpu_init) ret = evtchn_ops->percpu_init(cpu);
@@ -1851,6 +2028,8 @@ void __init xen_init_IRQ(void) if (ret < 0) xen_evtchn_2l_init();
+ xen_cpu_init_eoi(smp_processor_id()); + cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE, "xen/evtchn:prepare", xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead); diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index 301fd64a61d3..9e460ccf38d2 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -270,19 +270,9 @@ static uint32_t clear_linked(volatile event_word_t *word) return w & EVTCHN_FIFO_LINK_MASK; }
-static void handle_irq_for_port(unsigned port) -{ - int irq; - - irq = get_evtchn_to_irq(port); - if (irq != -1) - generic_handle_irq(irq); -} - -static void consume_one_event(unsigned cpu, +static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl, struct evtchn_fifo_control_block *control_block, - unsigned priority, unsigned long *ready, - bool drop) + unsigned priority, unsigned long *ready) { struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); uint32_t head; @@ -315,16 +305,17 @@ static void consume_one_event(unsigned cpu, clear_bit(priority, ready);
if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) { - if (unlikely(drop)) + if (unlikely(!ctrl)) pr_warn("Dropping pending event for port %u\n", port); else - handle_irq_for_port(port); + handle_irq_for_port(port, ctrl); }
q->head[priority] = head; }
-static void __evtchn_fifo_handle_events(unsigned cpu, bool drop) +static void __evtchn_fifo_handle_events(unsigned cpu, + struct evtchn_loop_ctrl *ctrl) { struct evtchn_fifo_control_block *control_block; unsigned long ready; @@ -336,14 +327,15 @@ static void __evtchn_fifo_handle_events(unsigned cpu, bool drop)
while (ready) { q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES); - consume_one_event(cpu, control_block, q, &ready, drop); + consume_one_event(cpu, ctrl, control_block, q, &ready); ready |= xchg(&control_block->ready, 0); } }
-static void evtchn_fifo_handle_events(unsigned cpu) +static void evtchn_fifo_handle_events(unsigned cpu, + struct evtchn_loop_ctrl *ctrl) { - __evtchn_fifo_handle_events(cpu, false); + __evtchn_fifo_handle_events(cpu, ctrl); }
static void evtchn_fifo_resume(void) @@ -411,7 +403,7 @@ static int evtchn_fifo_percpu_init(unsigned int cpu)
static int evtchn_fifo_percpu_deinit(unsigned int cpu) { - __evtchn_fifo_handle_events(cpu, true); + __evtchn_fifo_handle_events(cpu, NULL); return 0; }
diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 980a56d51e4c..2cb9c2d2c5c0 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -32,11 +32,15 @@ enum xen_irq_type { */ struct irq_info { struct list_head list; + struct list_head eoi_list; int refcnt; enum xen_irq_type type; /* type */ unsigned irq; unsigned int evtchn; /* event channel */ unsigned short cpu; /* cpu bound */ + unsigned short eoi_cpu; /* EOI must happen on this cpu */ + unsigned int irq_epoch; /* If eoi_cpu valid: irq_epoch of event */ + u64 eoi_time; /* Time in jiffies when to EOI. */
union { unsigned short virq; @@ -55,6 +59,8 @@ struct irq_info { #define PIRQ_SHAREABLE (1 << 1) #define PIRQ_MSI_GROUP (1 << 2)
+struct evtchn_loop_ctrl; + struct evtchn_ops { unsigned (*max_channels)(void); unsigned (*nr_channels)(void); @@ -69,7 +75,7 @@ struct evtchn_ops { void (*mask)(unsigned port); void (*unmask)(unsigned port);
- void (*handle_events)(unsigned cpu); + void (*handle_events)(unsigned cpu, struct evtchn_loop_ctrl *ctrl); void (*resume)(void);
int (*percpu_init)(unsigned int cpu); @@ -80,6 +86,7 @@ extern const struct evtchn_ops *evtchn_ops;
extern int **evtchn_to_irq; int get_evtchn_to_irq(unsigned int evtchn); +void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl);
struct irq_info *info_for_irq(unsigned irq); unsigned cpu_from_irq(unsigned irq); @@ -137,9 +144,10 @@ static inline void unmask_evtchn(unsigned port) return evtchn_ops->unmask(port); }
-static inline void xen_evtchn_handle_events(unsigned cpu) +static inline void xen_evtchn_handle_events(unsigned cpu, + struct evtchn_loop_ctrl *ctrl) { - return evtchn_ops->handle_events(cpu); + return evtchn_ops->handle_events(cpu, ctrl); }
static inline void xen_evtchn_resume(void)