hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW

-------------------------------
Introduce fine-grained CPU stall tracking (cpu cfs bandwidth or cpu qos
throttling) in pressure.stat. For the fine-grained CPU stall tracking, only
the "full" information is reported in pressure.stat.

For example:
/test # cat /tmp/cpuacct/test/pressure.stat
cgroup_memory_reclaim
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
global_memory_reclaim
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
compact
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
cgroup_async_memory_reclaim
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
swap
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
cpu_cfs_bandwidth
full avg10=21.76 avg60=4.58 avg300=0.98 total=3893827
cpu_qos
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
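For reference only (not part of this patch), a minimal userspace sketch of how
the new entry could be consumed; it assumes the cpuacct cgroup of interest is
mounted at /tmp/cpuacct/test as in the example above:

/*
 * Illustrative only: print the "full" record that follows the
 * cpu_cfs_bandwidth header in pressure.stat. The path below is an
 * assumption taken from the example output above.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *fp = fopen("/tmp/cpuacct/test/pressure.stat", "r");
	char line[256];
	int found = 0;

	if (!fp) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		if (found) {
			/* the line right after the header is the "full" record */
			printf("cpu_cfs_bandwidth: %s", line);
			break;
		}
		if (!strncmp(line, "cpu_cfs_bandwidth", strlen("cpu_cfs_bandwidth")))
			found = 1;
	}
	fclose(fp);
	return 0;
}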
Signed-off-by: Lu Jialin <lujialin4@huawei.com>
---
 include/linux/psi_types.h |  8 +++++
 kernel/sched/fair.c       |  6 ----
 kernel/sched/psi.c        | 75 ++++++++++++++++++++++++++++++++++++---
 kernel/sched/stats.h      |  8 +++++
 4 files changed, 86 insertions(+), 11 deletions(-)
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index d20a83184fd0..bd2a28224910 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -220,6 +220,10 @@ enum psi_stat_states {
 	PSI_ASYNC_MEMCG_RECLAIM_FULL,
 	PSI_SWAP_SOME,
 	PSI_SWAP_FULL,
+	PSI_CPU_CFS_BANDWIDTH_FULL,
+#ifdef CONFIG_QOS_SCHED
+	PSI_CPU_QOS_FULL,
+#endif
 	NR_PSI_STAT_STATES,
 };
 
@@ -237,6 +241,8 @@ enum psi_stat_task_count {
 	NR_PSI_STAT_TASK_COUNTS,
 };
 
+#define CPU_CFS_BANDWIDTH 1
+
 struct psi_group_stat_cpu {
 	u32 state_mask;
 	u32 times[NR_PSI_STAT_STATES];
@@ -244,6 +250,8 @@ struct psi_group_stat_cpu {
 	unsigned int tasks[NR_PSI_STAT_TASK_COUNTS];
 	u32 times_delta;
 	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STAT_STATES];
+	int prev_throttle;
+	int cur_throttle;
 };
 
 struct psi_group_ext {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0de55884f9da..9245d35be87d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -131,12 +131,6 @@ int __weak arch_asym_cpu_priority(int cpu)
 
 #ifdef CONFIG_QOS_SCHED
 
-/*
- * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled
- * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1).
- */
-#define QOS_THROTTLED 2
-
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq);
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer);
 static DEFINE_PER_CPU(int, qos_cpu_overload);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index c5cde57bf8de..6b232145742c 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -454,7 +454,7 @@ static void psi_group_stat_change(struct psi_group *group, int cpu,
 	for (t = 0; set; set &= ~(1 << t), t++)
 		if (set & (1 << t))
 			ext_groupc->tasks[t]++;
-	for (s = 0; s < NR_PSI_STAT_STATES; s++)
+	for (s = 0; s < PSI_CPU_CFS_BANDWIDTH_FULL; s++)
 		if (test_fine_grained_stat(ext_groupc->tasks,
 					   groupc->tasks[NR_RUNNING], s))
 			state_mask |= (1 << s);
@@ -544,6 +544,52 @@ static inline void update_stat_averages(struct psi_group_ext *psi_ext,
 					u64 period) {}
 #endif
 
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_CGROUP_CPUACCT) && \
+	defined(CONFIG_PSI_FINE_GRAINED)
+static void record_cpu_stat_times(struct psi_group *group, int cpu)
+{
+	struct psi_group_ext *psi_ext = to_psi_group_ext(group);
+	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+	u32 delta = ext_groupc->psi_delta;
+
+	if (groupc->state_mask & (1 << PSI_CPU_FULL)) {
+		if (ext_groupc->prev_throttle == CPU_CFS_BANDWIDTH)
+			ext_groupc->times[PSI_CPU_CFS_BANDWIDTH_FULL] += delta;
+#ifdef CONFIG_QOS_SCHED
+		else if (ext_groupc->prev_throttle == QOS_THROTTLED)
+			ext_groupc->times[PSI_CPU_QOS_FULL] += delta;
+#endif
+	}
+}
+
+static void update_throttle_type(struct task_struct *task, int cpu, bool next)
+{
+	struct cgroup *cpuacct_cgrp;
+	struct psi_group_ext *psi_ext;
+	struct psi_group_stat_cpu *groupc;
+	struct task_group *tsk_grp;
+
+	if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) {
+		rcu_read_lock();
+		cpuacct_cgrp = task_cgroup(task, cpuacct_cgrp_id);
+		if (cgroup_parent(cpuacct_cgrp)) {
+			psi_ext = to_psi_group_ext(cgroup_psi(cpuacct_cgrp));
+			groupc = per_cpu_ptr(psi_ext->pcpu, cpu);
+			tsk_grp = task_group(task);
+			if (next)
+				groupc->prev_throttle = groupc->cur_throttle;
+			groupc->cur_throttle = tsk_grp->cfs_rq[cpu]->throttled;
+		}
+		rcu_read_unlock();
+	}
+}
+#else
+static inline void record_cpu_stat_times(struct psi_group *group, int cpu) {}
+static inline void update_throttle_type(struct task_struct *task, int cpu,
+					bool next) {}
+#endif
+
 static void collect_percpu_times(struct psi_group *group,
 				 enum psi_aggregators aggregator,
 				 u32 *pchanged_states)
@@ -1072,8 +1118,10 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 * may have already incorporated the live state into times_prev;
 	 * avoid a delta sample underflow when PSI is later re-enabled.
 	 */
-	if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+	if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) {
 		record_times(groupc, now);
+		record_cpu_stat_times(group, cpu);
+	}
 
 	groupc->state_mask = state_mask;
 
@@ -1098,6 +1146,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		state_mask |= (1 << PSI_MEM_FULL);
 
 	record_times(groupc, now);
+	record_cpu_stat_times(group, cpu);
 
 	groupc->state_mask = state_mask;
 
@@ -1183,6 +1232,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	u64 now = cpu_clock(cpu);
 
 	if (next->pid) {
+		update_throttle_type(next, cpu, true);
 		psi_flags_change(next, 0, TSK_ONCPU);
 		/*
 		 * Set TSK_ONCPU on @next's cgroups. If @next shares any
@@ -1210,6 +1260,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		int stat_clear = 0;
 		bool memstall_type_change = false;
 
+		update_throttle_type(prev, cpu, false);
 		/*
 		 * When we're going to sleep, psi_dequeue() lets us
 		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
@@ -1297,6 +1348,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 		update_psi_stat_delta(group, cpu, now);
 		record_stat_times(to_psi_group_ext(group), cpu);
 		record_times(groupc, now);
+		record_cpu_stat_times(group, cpu);
 		groupc->times[PSI_IRQ_FULL] += delta;
 
 		write_seqcount_end(&groupc->seq);
@@ -1937,8 +1989,22 @@ static const char *const psi_stat_names[] = {
 	"compact",
 	"cgroup_async_memory_reclaim",
 	"swap",
+	"cpu_cfs_bandwidth",
+	"cpu_qos",
 };
 
+static void get_stat_names(struct seq_file *m, int i, bool is_full)
+{
+	if (i <= PSI_SWAP_FULL && !is_full)
+		return seq_printf(m, "%s\n", psi_stat_names[i / 2]);
+	else if (i == PSI_CPU_CFS_BANDWIDTH_FULL)
+		return seq_printf(m, "%s\n", "cpu_cfs_bandwidth");
+#ifdef CONFIG_QOS_SCHED
+	else if (i == PSI_CPU_QOS_FULL)
+		return seq_printf(m, "%s\n", "cpu_qos");
+#endif
+}
+
 int psi_stat_show(struct seq_file *m, struct psi_group *group)
 {
 	struct psi_group_ext *psi_ext;
@@ -1958,12 +2024,11 @@ int psi_stat_show(struct seq_file *m, struct psi_group *group)
 	group->avg_next_update = update_averages(group, now);
 	mutex_unlock(&group->avgs_lock);
 	for (i = 0; i < NR_PSI_STAT_STATES; i++) {
-		is_full = i % 2;
+		is_full = i % 2 || i > PSI_SWAP_FULL;
 		for (w = 0; w < 3; w++)
 			avg[w] = psi_ext->avg[i][w];
 		total = div_u64(psi_ext->total[PSI_AVGS][i], NSEC_PER_USEC);
-		if (!is_full)
-			seq_printf(m, "%s\n", psi_stat_names[i / 2]);
+		get_stat_names(m, i, is_full);
 		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
 			   is_full ? "full" : "some",
 			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 38f3698f5e5b..9546cbf02d55 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -106,6 +106,14 @@ __schedstats_from_se(struct sched_entity *se)
 	return &task_of(se)->stats;
 }
 
+#ifdef CONFIG_QOS_SCHED
+/*
+ * To distinguish cfs bw, use QOS_THROTTLED mark cfs_rq->throttled
+ * when qos throttled(and cfs bw throttle mark cfs_rq->throttled as 1).
+ */
+#define QOS_THROTTLED 2
+#endif
+
 #ifdef CONFIG_PSI
 void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,