hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8QUNW
-------------------------------
PSI tracks pressure stalls for memory, cpu, io and irq. However, several different kinds of stall can cause memory pressure, and memory.pressure cannot show which kind is responsible. The same applies to cpu.pressure. Introduce pressure.stat in psi, which monitors the specific reasons behind memory.pressure and cpu.pressure, such as global/cgroup memory reclaim, memory compaction, cpu cfs bandwidth and so on. Userland can then choose the right action to reduce the pressure depending on its specific cause. This patch introduces fine-grained memory stall time collection for cgroup reclaim.
Signed-off-by: Lu Jialin lujialin4@huawei.com --- include/linux/psi_types.h | 34 +++++++++ include/linux/sched.h | 1 + kernel/sched/psi.c | 150 +++++++++++++++++++++++++++++++++++++- mm/memcontrol.c | 9 +++ 4 files changed, 191 insertions(+), 3 deletions(-)
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index e8058b9ae609..5994c545d250 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -208,8 +208,29 @@ struct psi_group { };
#ifdef CONFIG_PSI_FINE_GRAINED + +enum psi_stat_states { + PSI_MEMCG_RECLAIM_SOME, + PSI_MEMCG_RECLAIM_FULL, + NR_PSI_STAT_STATES, +}; + +enum psi_stat_task_count { + NR_MEMCG_RECLAIM, + NR_MEMCG_RECLAIM_RUNNING, + NR_PSI_STAT_TASK_COUNTS, +}; + +struct psi_group_stat_cpu { + u32 state_mask; + u32 times[NR_PSI_STAT_STATES]; + u32 psi_delta; + unsigned int tasks[NR_PSI_STAT_TASK_COUNTS]; +}; + struct psi_group_ext { struct psi_group psi; + struct psi_group_stat_cpu __percpu *pcpu; }; #else struct psi_group_ext {}; @@ -223,4 +244,17 @@ struct psi_group { };
#endif /* CONFIG_PSI */
+/* + * one type should have two task stats: regular running and memstall + * threads. The reason is the same as NR_MEMSTALL_RUNNING. + * Because of the psi_memstall_type is start with 1, the correspondence + * between psi_memstall_type and psi_stat_task_count should be as below: + * + * memstall : psi_memstall_type * 2 - 2; + * running : psi_memstall_type * 2 - 1; + */ +enum psi_memstall_type { + PSI_MEMCG_RECLAIM = 1, +}; + #endif /* _LINUX_PSI_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3520e3fbaa91..b7014cd0122f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1537,6 +1537,7 @@ struct task_struct { struct user_event_mm *user_event_mm; #endif
+ int memstall_type; /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 89c0160531f6..0b526eab6d8f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -177,7 +177,10 @@ struct psi_group psi_system = {
#ifdef CONFIG_PSI_FINE_GRAINED /* System-level fine grained pressure and stall tracking */ -struct psi_group_ext psi_stat_system = { }; +static DEFINE_PER_CPU(struct psi_group_stat_cpu, system_stat_group_pcpu); +struct psi_group_ext psi_stat_system = { + .pcpu = &system_stat_group_pcpu, +};
struct psi_group_ext *to_psi_group_ext(struct psi_group *psi) { @@ -354,6 +357,109 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); }
+#ifdef CONFIG_PSI_FINE_GRAINED + +static void record_stat_times(struct psi_group_ext *psi_ext, int cpu) +{ + struct psi_group_stat_cpu *ext_grpc = per_cpu_ptr(psi_ext->pcpu, cpu); + + u32 delta = ext_grpc->psi_delta; + + if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_SOME)) { + ext_grpc->times[PSI_MEMCG_RECLAIM_SOME] += delta; + if (ext_grpc->state_mask & (1 << PSI_MEMCG_RECLAIM_FULL)) + ext_grpc->times[PSI_MEMCG_RECLAIM_FULL] += delta; + } +} + +static bool test_fine_grained_stat(unsigned int *stat_tasks, + unsigned int nr_running, + enum psi_stat_states state) +{ + switch (state) { + case PSI_MEMCG_RECLAIM_SOME: + return unlikely(stat_tasks[NR_MEMCG_RECLAIM]); + case PSI_MEMCG_RECLAIM_FULL: + return unlikely(stat_tasks[NR_MEMCG_RECLAIM] && + nr_running == stat_tasks[NR_MEMCG_RECLAIM_RUNNING]); + default: + return false; + } +} + +static void psi_group_stat_change(struct psi_group *group, int cpu, + int clear, int set) +{ + int t; + u32 state_mask = 0; + enum psi_stat_states s; + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + struct psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + record_stat_times(psi_ext, cpu); + + for (t = 0; clear; clear &= ~(1 << t), t++) + if (clear & (1 << t)) + ext_groupc->tasks[t]--; + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + ext_groupc->tasks[t]++; + for (s = 0; s < NR_PSI_STAT_STATES; s++) + if (test_fine_grained_stat(ext_groupc->tasks, + groupc->tasks[NR_RUNNING], s)) + state_mask |= (1 << s); + if (unlikely(groupc->state_mask & PSI_ONCPU) && + cpu_curr(cpu)->memstall_type) + state_mask |= (1 << (cpu_curr(cpu)->memstall_type * 2 - 1)); + + ext_groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); +} + +static void update_psi_stat_delta(struct psi_group *group, int cpu, u64 now) +{ + struct psi_group_ext *psi_ext = to_psi_group_ext(group); + struct 
psi_group_stat_cpu *ext_groupc = per_cpu_ptr(psi_ext->pcpu, cpu); + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + ext_groupc->psi_delta = now - groupc->state_start; +} + +static void psi_stat_flags_change(struct task_struct *task, int *stat_set, + int *stat_clear, int set, int clear) +{ + if (!task->memstall_type) + return; + + if (clear) { + if (clear & TSK_MEMSTALL) + *stat_clear |= 1 << (2 * task->memstall_type - 2); + if (clear & TSK_MEMSTALL_RUNNING) + *stat_clear |= 1 << (2 * task->memstall_type - 1); + } + if (set) { + if (set & TSK_MEMSTALL) + *stat_set |= 1 << (2 * task->memstall_type - 2); + if (set & TSK_MEMSTALL_RUNNING) + *stat_set |= 1 << (2 * task->memstall_type - 1); + } + if (!task->in_memstall) + task->memstall_type = 0; +} + +#else +static inline void psi_group_stat_change(struct psi_group *group, int cpu, + int clear, int set) {} +static inline void update_psi_stat_delta(struct psi_group *group, int cpu, + u64 now) {} +static inline void psi_stat_flags_change(struct task_struct *task, + int *stat_set, int *stat_clear, + int set, int clear) {} +static inline void record_stat_times(struct psi_group_ext *psi_ext, int cpu) {} +#endif + static void collect_percpu_times(struct psi_group *group, enum psi_aggregators aggregator, u32 *pchanged_states) @@ -948,17 +1054,22 @@ void psi_task_change(struct task_struct *task, int clear, int set) int cpu = task_cpu(task); struct psi_group *group; u64 now; + int stat_set = 0; + int stat_clear = 0;
if (!task->pid) return;
psi_flags_change(task, clear, set); + psi_stat_flags_change(task, &stat_set, &stat_clear, set, clear);
now = cpu_clock(cpu);
group = task_psi_group(task); do { + update_psi_stat_delta(group, cpu, now); psi_group_change(group, cpu, clear, set, now, true); + psi_group_stat_change(group, cpu, stat_clear, stat_set); } while ((group = group->parent)); }
@@ -984,13 +1095,18 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; }
+ update_psi_stat_delta(group, cpu, now); psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + psi_group_stat_change(group, cpu, 0, 0); } while ((group = group->parent)); }
if (prev->pid) { int clear = TSK_ONCPU, set = 0; bool wake_clock = true; + int stat_set = 0; + int stat_clear = 0; + bool memstall_type_change = false;
/* * When we're going to sleep, psi_dequeue() lets us @@ -1017,24 +1133,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, }
psi_flags_change(prev, clear, set); + psi_stat_flags_change(prev, &stat_set, &stat_clear, set, clear);
group = task_psi_group(prev); do { if (group == common) break; + update_psi_stat_delta(group, cpu, now); psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_stat_change(group, cpu, stat_clear, stat_set); } while ((group = group->parent));
+#ifdef CONFIG_PSI_FINE_GRAINED + if (next->memstall_type != prev->memstall_type) + memstall_type_change = true; +#endif + /* * TSK_ONCPU is handled up to the common ancestor. If there are * any other differences between the two tasks (e.g. prev goes * to sleep, or only one task is memstall), finish propagating * those differences all the way up to the root. */ - if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU || + memstall_type_change) { clear &= ~TSK_ONCPU; - for (; group; group = group->parent) + for (; group; group = group->parent) { + update_psi_stat_delta(group, cpu, now); psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_stat_change(group, cpu ,stat_clear, stat_set); + } } } } @@ -1064,6 +1192,8 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
write_seqcount_begin(&groupc->seq);
+ update_psi_stat_delta(group, cpu, now); + record_stat_times(to_psi_group_ext(group), cpu); record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta;
@@ -1086,6 +1216,9 @@ void psi_memstall_enter(unsigned long *flags) { struct rq_flags rf; struct rq *rq; +#ifdef CONFIG_PSI_FINE_GRAINED + unsigned long stat_flags = *flags; +#endif
if (static_branch_likely(&psi_disabled)) return; @@ -1103,6 +1236,10 @@ void psi_memstall_enter(unsigned long *flags) rq = this_rq_lock_irq(&rf);
current->in_memstall = 1; +#ifdef CONFIG_PSI_FINE_GRAINED + if (stat_flags) + current->memstall_type = stat_flags; +#endif psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf); @@ -1156,6 +1293,11 @@ int psi_cgroup_alloc(struct cgroup *cgroup) psi_ext = kzalloc(sizeof(struct psi_group_ext), GFP_KERNEL); if (!psi_ext) return -ENOMEM; + psi_ext->pcpu = alloc_percpu(struct psi_group_stat_cpu); + if (!psi_ext->pcpu) { + kfree(psi_ext); + return -ENOMEM; + } cgroup->psi = &psi_ext->psi; #else cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); @@ -1165,6 +1307,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu); if (!cgroup->psi->pcpu) { #ifdef CONFIG_PSI_FINE_GRAINED + free_percpu(psi_ext->pcpu); kfree(psi_ext); #else kfree(cgroup->psi); @@ -1186,6 +1329,7 @@ void psi_cgroup_free(struct cgroup *cgroup) /* All triggers must be removed by now */ WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n"); #ifdef CONFIG_PSI_FINE_GRAINED + free_percpu(to_psi_group_ext(cgroup->psi)->pcpu); kfree(to_psi_group_ext(cgroup->psi)); #else kfree(cgroup->psi); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d9a873e5522..a3c3a508a24d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2410,6 +2410,9 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
memcg_memory_event(memcg, MEMCG_HIGH);
+#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, @@ -2681,6 +2684,9 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) * schedule_timeout_killable sets TASK_KILLABLE). This means we don't * need to account for any ill-begotten jiffies to pay them off later. */ +#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); schedule_timeout_killable(penalty_jiffies); psi_memstall_leave(&pflags); @@ -2742,6 +2748,9 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, memcg_memory_event(mem_over_limit, MEMCG_MAX); raised_max_event = true;
+#ifdef CONFIG_PSI_FINE_GRAINED + pflags = PSI_MEMCG_RECLAIM; +#endif psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, reclaim_options);