
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/ID1OOE

--------------------------------

Inject a PMU interrupt when an online or offline task is selected in
pick_next_task_fair(), reusing the task labelling logic (qos_level) of
SMT expel.

When a CPU switches to a high-priority online task, it sends an IPI so
that its SMT sibling enables the cycle and instruction-retired counters
if that sibling is running an offline task; when a CPU switches to an
offline task while its sibling already runs a high-priority task, it
enables the counters locally. The counters are armed to overflow after
a configurable number of events, so the CPU running offline work
receives periodic PMU interrupts for as long as a high-priority task
runs on its sibling. A new pmu_qos_level file in the cpu cgroup
controller labels task groups, and the sample_interval_inst and
sample_interval_cycles sysctls tune the sampling intervals.

Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
 arch/arm64/include/asm/smt_qos.h |  93 +++++++++++++++
 arch/arm64/kernel/smt_qos.c      |  76 ++++++++++++
 drivers/perf/arm_pmu.c           |  11 ++
 kernel/sched/core.c              |  55 ++++++++-
 kernel/sched/fair.c              | 194 ++++++++++++++++++++++++++++++-
 kernel/sched/sched.h             |   6 +-
 kernel/sysctl.c                  |  18 +++
 7 files changed, 447 insertions(+), 6 deletions(-)
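
A note on the sampling mechanism: setup_pmu_counter() arms each 32-bit
event counter with 0xffffffff - sysctl_sample_interval_*, and
my_pmu_irq_handler() re-arms it the same way after every overflow
interrupt, so the interrupt rate is set directly by the two sysctls
(a value of 0 leaves the corresponding counter unprogrammed). Below is
a minimal sketch of that reload-value calculation, assuming
<linux/bits.h> for GENMASK_ULL(); the helper name is made up for
illustration and does not exist in this patch:

        /* Overflow (and raise the PMU IRQ) after about `interval` events. */
        static inline u64 pmu_reload_value(unsigned int interval)
        {
                /* Force bits [63:32] to 1, as write_pmevcntrn_el0() does. */
                return GENMASK_ULL(63, 32) | (0xffffffffUL - interval);
        }
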
diff --git a/arch/arm64/include/asm/smt_qos.h b/arch/arm64/include/asm/smt_qos.h
index d4558b681958..18786de17345 100644
--- a/arch/arm64/include/asm/smt_qos.h
+++ b/arch/arm64/include/asm/smt_qos.h
@@ -2,6 +2,99 @@
 #ifndef __ASM_SMT_QOS_H
 #define __ASM_SMT_QOS_H
 
+#include <linux/perf/arm_pmuv3.h>
+#include <asm/sysreg.h>
+
+#define INST_RETIRED_COUNTER    0
+#define CYCLE_COUNTER           1
+
+DECLARE_PER_CPU(bool, pmu_enable);
+
 extern unsigned int sysctl_delay_cycles;
+extern unsigned int sysctl_sample_interval_inst;
+extern unsigned int sysctl_sample_interval_cycles;
+
+// Enable Performance Monitors
+static inline void pmu_start(void)
+{
+        u64 reg_val;
+
+        reg_val = read_sysreg(pmcr_el0);
+        reg_val |= ARMV8_PMU_PMCR_E;    // Set bit 0 (E) to enable the PMU
+        write_sysreg(reg_val, pmcr_el0);
+        isb();
+}
+
+// Disable the PMU entirely
+static inline void pmu_stop(void)
+{
+        u64 reg_val;
+
+        reg_val = read_sysreg(pmcr_el0);
+        reg_val &= ~ARMV8_PMU_PMCR_E;   // Clear bit 0 (E) to disable the PMU
+        write_sysreg(reg_val, pmcr_el0);
+        isb();
+}
+
+static inline void write_pmevtypern_el0(int n, u64 val)
+{
+        u64 and = ARMV8_PMU_INCLUDE_EL2 | ARMV8_PMU_EXCLUDE_EL1;
+
+        switch (n) {
+        case 0:
+                write_sysreg(val | and, pmevtyper0_el0);
+                break;
+        case 1:
+                write_sysreg(val | and, pmevtyper1_el0);
+                break;
+        default:
+                break;
+        }
+}
+
+static inline void write_pmevcntrn_el0(int n, u64 val)
+{
+        val |= GENMASK_ULL(63, 32);
+
+        switch (n) {
+        case 0:
+                write_sysreg(val, pmevcntr0_el0);
+                break;
+        case 1:
+                write_sysreg(val, pmevcntr1_el0);
+                break;
+        default:
+                break;
+        }
+}
+
+static inline void write_pmintenset_el1(unsigned int counter)
+{
+        write_sysreg(BIT(counter), pmintenset_el1);
+}
+
+static inline void write_pmcntenset_el0(unsigned int counter)
+{
+        write_sysreg(BIT(counter), pmcntenset_el0);
+}
+
+static inline void write_pmcntenclr_el0(unsigned int counter)
+{
+        write_sysreg(BIT(counter), pmcntenclr_el0);
+}
+
+static inline void write_pmintenclr_el1(unsigned int counter)
+{
+        write_sysreg(BIT(counter), pmintenclr_el1);
+}
+
+static inline void write_pmovsclr_el0(unsigned int counter)
+{
+        write_sysreg(BIT(counter), pmovsclr_el0);
+}
+
+void setup_pmu_counter(void *info);
+void stop_pmu_counter(void *info);
+irqreturn_t my_pmu_irq_handler(int irq, void *dev_id);
 
 #endif /* __ASM_SMT_QOS_H */
diff --git a/arch/arm64/kernel/smt_qos.c b/arch/arm64/kernel/smt_qos.c
index 2fdc7707fac4..28cc576d6438 100644
--- a/arch/arm64/kernel/smt_qos.c
+++ b/arch/arm64/kernel/smt_qos.c
@@ -11,6 +11,8 @@
 #include <asm/smt_qos.h>
 
 unsigned int sysctl_delay_cycles = 10000000;
+unsigned int sysctl_sample_interval_inst = 100000000;
+unsigned int sysctl_sample_interval_cycles = 100000000;
 
 SYSCALL_DEFINE0(vdso_wfxt_return)
 {
@@ -22,3 +24,77 @@ SYSCALL_DEFINE0(vdso_wfxt_return)
 
         return regs->regs[0];
 }
+
+void setup_pmu_counter(void *info)
+{
+        if (unlikely(__this_cpu_read(pmu_enable)))
+                return;
+
+        trace_printk("Enable pmu on CPU %d.\n", smp_processor_id());
+
+        pmu_start();
+
+        if (sysctl_sample_interval_inst != 0) {
+                write_pmevtypern_el0(INST_RETIRED_COUNTER, ARMV8_PMUV3_PERFCTR_INST_RETIRED);
+                write_pmevcntrn_el0(INST_RETIRED_COUNTER, (0xffffffffUL - sysctl_sample_interval_inst));
+                write_pmintenset_el1(INST_RETIRED_COUNTER);
+                write_pmcntenset_el0(INST_RETIRED_COUNTER);
+        }
+
+        if (sysctl_sample_interval_cycles != 0) {
+                write_pmevtypern_el0(CYCLE_COUNTER, ARMV8_PMUV3_PERFCTR_CPU_CYCLES);
+                write_pmevcntrn_el0(CYCLE_COUNTER, (0xffffffffUL - sysctl_sample_interval_cycles));
+                write_pmintenset_el1(CYCLE_COUNTER);
+                write_pmcntenset_el0(CYCLE_COUNTER);
+        }
+        isb();
+
+        __this_cpu_write(pmu_enable, true);
+}
+
+void stop_pmu_counter(void *info)
+{
+        if (likely(!__this_cpu_read(pmu_enable)))
+                return;
+
+        trace_printk("Disable pmu on cpu%d\n", smp_processor_id());
+
+        if (sysctl_sample_interval_inst != 0) {
+                write_pmcntenclr_el0(INST_RETIRED_COUNTER);
+                write_pmintenclr_el1(INST_RETIRED_COUNTER);
+                write_pmovsclr_el0(INST_RETIRED_COUNTER);
+        }
+
+        if (sysctl_sample_interval_cycles != 0) {
+                write_pmcntenclr_el0(CYCLE_COUNTER);
+                write_pmintenclr_el1(CYCLE_COUNTER);
+                write_pmovsclr_el0(CYCLE_COUNTER);
+        }
+        isb();
+
+        __this_cpu_write(pmu_enable, false);
+}
+
+irqreturn_t my_pmu_irq_handler(int irq, void *dev_id)
+{
+        u64 pmovsclr;
+
+        pmovsclr = read_sysreg(pmovsclr_el0);
+        write_sysreg(pmovsclr, pmovsclr_el0);
+
+        // Check whether one of our counters caused the interrupt
+        if (!(pmovsclr & BIT(INST_RETIRED_COUNTER)) && !(pmovsclr & BIT(CYCLE_COUNTER)))
+                return IRQ_NONE;
+
+        pmu_stop();
+
+        if (pmovsclr & BIT(INST_RETIRED_COUNTER))
+                write_pmevcntrn_el0(INST_RETIRED_COUNTER, (0xffffffffUL - sysctl_sample_interval_inst));
+
+        if (pmovsclr & BIT(CYCLE_COUNTER))
+                write_pmevcntrn_el0(CYCLE_COUNTER, (0xffffffffUL - sysctl_sample_interval_cycles));
+
+        pmu_start();
+
+        return IRQ_HANDLED;
+}
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 9a97651b7afb..a0f7feb998c9 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -448,7 +448,11 @@ validate_group(struct perf_event *event)
         return 0;
 }
 
+#ifdef CONFIG_XCALL_SMT_QOS
+static __maybe_unused irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
+#else
 static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
+#endif
 {
         struct arm_pmu *armpmu;
         int ret;
@@ -658,10 +662,17 @@ void armpmu_free_irq(int irq, int cpu)
         per_cpu(cpu_irq_ops, cpu) = NULL;
 }
 
+#ifdef CONFIG_XCALL_SMT_QOS
+extern irqreturn_t my_pmu_irq_handler(int irq, void *dev_id);
+#endif
 int armpmu_request_irq(int irq, int cpu)
 {
         int err = 0;
+#ifndef CONFIG_XCALL_SMT_QOS
         const irq_handler_t handler = armpmu_dispatch_irq;
+#else
+        const irq_handler_t handler = my_pmu_irq_handler;
+#endif
         const struct pmu_irq_ops *irq_ops;
 
         if (!irq)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 47877f3b52f6..f393275fe674 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10480,7 +10480,7 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-#ifdef CONFIG_QOS_SCHED
+#if defined(CONFIG_QOS_SCHED) || defined(CONFIG_XCALL_SMT_QOS)
 static inline int alloc_qos_sched_group(struct task_group *tg,
                                         struct task_group *parent)
 {
@@ -10488,7 +10488,9 @@ static inline int alloc_qos_sched_group(struct task_group *tg,
 
         return 1;
 }
+#endif
 
+#ifdef CONFIG_QOS_SCHED
 static void sched_change_qos_group(struct task_struct *tsk, struct task_group *tg)
 {
         struct sched_attr attr = {0};
@@ -10598,7 +10600,7 @@ struct task_group *sched_create_group(struct task_group *parent)
         if (!alloc_fair_sched_group(tg, parent))
                 goto err;
 
-#ifdef CONFIG_QOS_SCHED
+#if defined(CONFIG_QOS_SCHED) || defined(CONFIG_XCALL_SMT_QOS)
         if (!alloc_qos_sched_group(tg, parent))
                 goto err;
 #endif
@@ -11706,6 +11708,47 @@ static inline s64 cpu_soft_quota_read(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_XCALL_SMT_QOS
+static int pmu_tg_change_scheduler(struct task_group *tg, void *data)
+{
+        s64 qos_level = *(s64 *)data;
+
+        tg->qos_level = qos_level;
+
+        return 0;
+}
+
+static int pmu_cpu_qos_write(struct cgroup_subsys_state *css,
+                             struct cftype *cftype, s64 qos_level)
+{
+        struct task_group *tg = css_tg(css);
+
+        if (!tg->se[0])
+                return -EINVAL;
+
+        if (qos_level > QOS_LEVEL_HIGH_EX || qos_level < QOS_LEVEL_OFFLINE_EX)
+                return -EINVAL;
+
+        if (tg->qos_level == qos_level)
+                goto done;
+
+        if (tg->qos_level != QOS_LEVEL_ONLINE)
+                return -EINVAL;
+
+        rcu_read_lock();
+        walk_tg_tree_from(tg, pmu_tg_change_scheduler, tg_nop, (void *)(&qos_level));
+        rcu_read_unlock();
+done:
+        return 0;
+}
+
+static inline s64 pmu_cpu_qos_read(struct cgroup_subsys_state *css,
+                                   struct cftype *cft)
+{
+        return css_tg(css)->qos_level;
+}
+#endif
+
 #ifdef CONFIG_BPF_SCHED
 void sched_settag(struct task_struct *tsk, s64 tag)
 {
@@ -11960,6 +12003,14 @@ static struct cftype cpu_legacy_files[] = {
                 .write_s64 = cpu_soft_quota_write,
         },
 #endif
+#ifdef CONFIG_XCALL_SMT_QOS
+        {
+                .name = "pmu_qos_level",
+                .flags = CFTYPE_NOT_ON_ROOT,
+                .read_s64 = pmu_cpu_qos_read,
+                .write_s64 = pmu_cpu_qos_write,
+        },
+#endif
 #ifdef CONFIG_BPF_SCHED
         {
                 .name = "tag",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b22f3c072d20..4dfb75304f73 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -79,6 +79,10 @@
 #include <asm/prefer_numa.h>
 #endif
 
+#ifdef CONFIG_XCALL_SMT_QOS
+#include <asm/smt_qos.h>
+#endif
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -169,6 +173,11 @@ static bool is_offline_task(struct task_struct *p);
 static DEFINE_PER_CPU(int, qos_smt_status);
 #endif
 
+#ifdef CONFIG_XCALL_SMT_QOS
+static DEFINE_PER_CPU(int, pmu_smt_status);
+DEFINE_PER_CPU(bool, pmu_enable);
+#endif
+
 #ifdef CONFIG_QOS_SCHED_PRIO_LB
 unsigned int sysctl_sched_prio_load_balance_enabled;
 #endif
@@ -9970,6 +9979,183 @@ static bool qos_smt_expelled(int this_cpu)
 
 #endif
 
+#ifdef CONFIG_XCALL_SMT_QOS
+static bool pmu_smt_update_status(struct task_struct *p)
+{
+        int status = QOS_LEVEL_ONLINE;
+
+        if (p != NULL && task_group(p)->qos_level < QOS_LEVEL_ONLINE)
+                status = QOS_LEVEL_OFFLINE;
+
+        if (p != NULL && task_group(p)->qos_level > QOS_LEVEL_ONLINE)
+                status = QOS_LEVEL_HIGH;
+
+        if (__this_cpu_read(pmu_smt_status) == status)
+                return false;
+
+        __this_cpu_write(pmu_smt_status, status);
+        if (status == QOS_LEVEL_OFFLINE)
+                trace_printk("update %s-%d to offline!\n", p->comm, p->pid);
+        else if (status == QOS_LEVEL_HIGH)
+                trace_printk("update %s-%d to high level online!\n", p->comm, p->pid);
+
+        return true;
+}
+
+static DEFINE_PER_CPU(call_single_data_t, pmu_setup_csd) =
+        CSD_INIT(setup_pmu_counter, NULL);
+
+static void send_pmu_setup_ipi(int cpu)
+{
+        call_single_data_t *csd;
+        int ret;
+
+        csd = &per_cpu(pmu_setup_csd, cpu);
+        ret = smp_call_function_single_async(cpu, csd);
+        if (ret)
+                trace_printk("Sending IPI failed to CPU %d\n", cpu);
+}
+
+static void pmu_smt_send_ipi_setup_pmu(int this_cpu)
+{
+        struct rq *rq = NULL;
+        int cpu;
+
+        /*
+         * If cfs.h_nr_running of the current CPU is 0 (which means the
+         * current CPU is idle), do not send an IPI to set up the PMU
+         * for the sibling CPU.
+         */
+        rq = cpu_rq(this_cpu);
+        if (rq->cfs.h_nr_running == 0)
+                return;
+
+        for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
+                if (cpu == this_cpu)
+                        continue;
+
+                rq = cpu_rq(cpu);
+
+                /*
+                 * There are two cases where the current CPU does not need
+                 * to send an IPI to set up the PMU:
+                 * a) The pmu_smt_status of the sibling CPU is online;
+                 * b) The cfs.h_nr_running of the sibling CPU is 0.
+                 */
+                if (per_cpu(pmu_smt_status, cpu) >= QOS_LEVEL_ONLINE ||
+                    rq->cfs.h_nr_running == 0)
+                        continue;
+
+                if (!per_cpu(pmu_enable, cpu)) {
+                        trace_printk("cpu%d send ipi to cpu%d to setup pmu\n", smp_processor_id(), cpu);
+                        send_pmu_setup_ipi(cpu);
+                }
+        }
+}
+
+static DEFINE_PER_CPU(call_single_data_t, pmu_stop_csd) =
+        CSD_INIT(stop_pmu_counter, NULL);
+
+static void send_pmu_stop_ipi(int cpu)
+{
+        call_single_data_t *csd;
+        int ret;
+
+        csd = &per_cpu(pmu_stop_csd, cpu);
+        ret = smp_call_function_single_async(cpu, csd);
+        if (ret)
+                trace_printk("Sending IPI failed to CPU %d\n", cpu);
+}
+
+static void pmu_smt_send_ipi_stop_pmu(int this_cpu)
+{
+        struct rq *rq = NULL;
+        int cpu;
+
+        for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
+                if (cpu == this_cpu)
+                        continue;
+
+                rq = cpu_rq(cpu);
+
+                trace_printk("cpu%d send ipi to cpu%d to stop pmu\n", smp_processor_id(), cpu);
+                send_pmu_stop_ipi(cpu);
+        }
+}
+
+/*
+ * If the current CPU runs an offline task, check whether the
+ * SMT sibling runs an online task; if so, enable the PMU
+ * counters on the current CPU.
+ */
+static void setup_pmu_counter_on_cpu(int this_cpu)
+{
+        struct rq *rq = NULL;
+        int cpu;
+
+        for_each_cpu(cpu, cpu_smt_mask(this_cpu)) {
+                if (cpu == this_cpu)
+                        continue;
+
+                rq = cpu_rq(cpu);
+
+                /*
+                 * There are two cases where the current CPU does not need
+                 * to enable the PMU counters:
+                 * a) The pmu_smt_status of the sibling CPU is offline;
+                 * b) The cfs.h_nr_running of the sibling CPU is 0.
+                 */
+                if (per_cpu(pmu_smt_status, cpu) <= QOS_LEVEL_ONLINE ||
+                    rq->cfs.h_nr_running == 0)
+                        continue;
+
+                setup_pmu_counter(NULL);
+        }
+}
+
+static void pmu_smt_qos_setup(int this_cpu, struct task_struct *p)
+{
+        int old_status = __this_cpu_read(pmu_smt_status);
+
+        pmu_smt_update_status(p);
+
+        /*
+         * The offline task has finished, so stop the PMU counters.
+         */
+        if (old_status < QOS_LEVEL_ONLINE && __this_cpu_read(pmu_smt_status) >= QOS_LEVEL_ONLINE)
+                stop_pmu_counter(NULL);
+
+        /*
+         * An online -> high or offline -> high switch needs to check whether
+         * the SMT sibling is already running an offline task.
+         *
+         * If the current CPU is about to run a high task and the SMT sibling
+         * runs an offline task, send an IPI to enable the PMU counters there.
+         */
+        if (__this_cpu_read(pmu_smt_status) > QOS_LEVEL_ONLINE) {
+                pmu_smt_send_ipi_setup_pmu(this_cpu);
+                return;
+        }
+
+        /*
+         * High -> online or high -> offline means the high task has
+         * finished on this CPU, so stop the PMU counters on the
+         * sibling CPU.
+         */
+        if (old_status > QOS_LEVEL_ONLINE && __this_cpu_read(pmu_smt_status) <= QOS_LEVEL_ONLINE)
+                pmu_smt_send_ipi_stop_pmu(this_cpu);
+
+        /*
+         * High -> offline or online -> offline:
+         * if the current CPU is about to run an offline task, check whether
+         * the SMT sibling runs a high task; if so, enable the PMU counters.
+         */
+        if (old_status >= QOS_LEVEL_ONLINE && __this_cpu_read(pmu_smt_status) < QOS_LEVEL_ONLINE)
+                setup_pmu_counter_on_cpu(this_cpu);
+}
+
+#endif
+
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
 DEFINE_STATIC_KEY_TRUE(qos_smt_expell_switch);
 
@@ -10175,7 +10361,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
         struct task_struct *p;
         int new_tasks;
         unsigned long time;
-#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
+#if defined(CONFIG_QOS_SCHED_SMT_EXPELLER) || defined(CONFIG_XCALL_SMT_QOS)
         int this_cpu = rq->cpu;
 #endif
 
@@ -10356,6 +10542,9 @@ done: __maybe_unused;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_smt_expel(this_cpu, p);
 #endif
+#ifdef CONFIG_XCALL_SMT_QOS
+        pmu_smt_qos_setup(this_cpu, p);
+#endif
 
         return p;
 
@@ -10409,6 +10598,9 @@ done: __maybe_unused;
 #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER
         qos_smt_expel(this_cpu, NULL);
 #endif
+#ifdef CONFIG_XCALL_SMT_QOS
+        pmu_smt_qos_setup(this_cpu, p);
+#endif
 
         return NULL;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0e21ad151ec9..bea584e76c50 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -458,7 +458,7 @@ struct task_group {
 
         struct cfs_bandwidth cfs_bandwidth;
 
-#ifdef CONFIG_QOS_SCHED
+#if defined(CONFIG_QOS_SCHED) || defined(CONFIG_XCALL_SMT_QOS)
         long qos_level;
 #endif
 
@@ -1575,8 +1575,8 @@ do { \
                 flags = _raw_spin_rq_lock_irqsave(rq); \
 } while (0)
 
-#ifdef CONFIG_QOS_SCHED
-#ifdef CONFIG_QOS_SCHED_MULTILEVEL
+#if defined(CONFIG_QOS_SCHED) || defined(CONFIG_XCALL_SMT_QOS)
+#if defined(CONFIG_QOS_SCHED_MULTILEVEL) || defined(CONFIG_XCALL_SMT_QOS)
 enum task_qos_level {
         QOS_LEVEL_OFFLINE_EX = -2,
         QOS_LEVEL_OFFLINE = -1,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e012d91b6002..dff8309d4956 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2059,6 +2059,24 @@ static struct ctl_table kern_table[] = {
                 .extra1 = SYSCTL_ONE,
                 .extra2 = SYSCTL_INT_MAX,
         },
+        {
+                .procname = "sample_interval_inst",
+                .data = &sysctl_sample_interval_inst,
+                .maxlen = sizeof(sysctl_sample_interval_inst),
+                .mode = 0644,
+                .proc_handler = proc_dointvec_minmax,
+                .extra1 = SYSCTL_ZERO,
+                .extra2 = SYSCTL_INT_MAX,
+        },
+        {
+                .procname = "sample_interval_cycles",
+                .data = &sysctl_sample_interval_cycles,
+                .maxlen = sizeof(sysctl_sample_interval_cycles),
+                .mode = 0644,
+                .proc_handler = proc_dointvec_minmax,
+                .extra1 = SYSCTL_ZERO,
+                .extra2 = SYSCTL_INT_MAX,
+        },
 #endif
         { }
 };
-- 
2.34.1
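
A hypothetical userspace helper (illustration only, not part of this
patch) for setting the two new sampling intervals; it assumes the
entries appear under /proc/sys/kernel/ because they are added to
kern_table. Task groups themselves are labelled through the new
pmu_qos_level file of the cpu cgroup controller, subject to the range
and transition checks in pmu_cpu_qos_write().

        #include <stdio.h>

        /* Write a single sysctl value; returns 0 on success. */
        static int write_knob(const char *path, const char *val)
        {
                FILE *f = fopen(path, "w");

                if (!f)
                        return -1;
                fputs(val, f);
                return fclose(f);
        }

        int main(void)
        {
                /* 0 disables the corresponding counter in setup_pmu_counter(). */
                write_knob("/proc/sys/kernel/sample_interval_inst", "100000000");
                write_knob("/proc/sys/kernel/sample_interval_cycles", "100000000");
                return 0;
        }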