From: Song Liu <songliubraving@fb.com>
stable inclusion
from stable-5.10.74
commit bdae2a08343613782be6c5be97b6c3a1ebccbfff
bugzilla: 182986 https://gitee.com/openeuler/kernel/issues/I4I3MG
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit f792565326825ed806626da50c6f9a928f1079c1 ]
Users of rdpmc rely on the mmapped user page to calculate accurate
time_enabled. Currently, userpage->time_enabled is only updated when
the event is added to the pmu. As a result, an inactive event (due to
counter multiplexing) does not have an accurate userpage->time_enabled.
This can be reproduced with something like:
   /* open 20 task perf_event "cycles", to create multiplexing */

   fd = perf_event_open();  /* open task perf_event "cycles" */
   userpage = mmap(fd);     /* use mmap and rdpmc */

   while (true) {
     time_enabled_mmap = xxx; /* use logic in perf_event_mmap_page */
     time_enabled_read = read(fd).time_enabled;
     if (time_enabled_mmap > time_enabled_read)
       BUG();
   }
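For reference, the "xxx" above stands for the seqlock-style read
sequence documented in the struct perf_event_mmap_page comments in
include/uapi/linux/perf_event.h. Below is a minimal, x86-only sketch of
that sequence; it is not part of this patch, mmap_time_enabled() and the
rdtsc() wrapper are illustrative names, and the rdpmc/time_running half
of the documented loop is omitted:

	#include <stdint.h>
	#include <linux/perf_event.h>

	/* Read the x86 time-stamp counter. */
	static inline uint64_t rdtsc(void)
	{
		uint32_t lo, hi;

		__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
		return ((uint64_t)hi << 32) | lo;
	}

	/*
	 * Compute an up-to-date time_enabled from the mmapped user
	 * page, retrying while the kernel is concurrently updating
	 * the page (pc->lock changes across the read).
	 */
	static uint64_t mmap_time_enabled(volatile struct perf_event_mmap_page *pc)
	{
		uint64_t enabled, delta;
		uint32_t seq;

		do {
			seq = pc->lock;
			__sync_synchronize();	/* barrier() */

			enabled = pc->time_enabled;
			delta = 0;

			if (pc->cap_user_time) {
				uint64_t cyc = rdtsc();
				uint64_t quot = cyc >> pc->time_shift;
				uint64_t rem = cyc &
					(((uint64_t)1 << pc->time_shift) - 1);

				/* ns since the kernel last updated the page */
				delta = pc->time_offset + quot * pc->time_mult +
					((rem * pc->time_mult) >> pc->time_shift);
			}

			__sync_synchronize();
		} while (pc->lock != seq);

		return enabled + delta;
	}

With the bug described above, the value computed this way for an
inactive event can run ahead of the time_enabled returned by read(),
because the page was last updated when the event was scheduled in.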
Fix this by updating userpage for inactive events in merge_sched_in.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-and-tested-by: Lucian Grijincu <lucian@fb.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210929194313.2398474-1-songliubraving@fb.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Acked-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 include/linux/perf_event.h |  4 +++-
 kernel/events/core.c       | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 072ac6c1ef2b..c095e713cf08 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -682,7 +682,9 @@ struct perf_event {
 	/*
 	 * timestamp shadows the actual context timing but it can
 	 * be safely used in NMI interrupt context. It reflects the
-	 * context time as it was when the event was last scheduled in.
+	 * context time as it was when the event was last scheduled in,
+	 * or when ctx_sched_in failed to schedule the event because we
+	 * run out of PMC.
 	 *
 	 * ctx_time already accounts for ctx->timestamp. Therefore to
 	 * compute ctx_time for a sample, simply add perf_clock().
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c677f934353a..c81151926171 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3695,6 +3695,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
 	return 0;
 }
 
+static inline bool event_update_userpage(struct perf_event *event)
+{
+	if (likely(!atomic_read(&event->mmap_count)))
+		return false;
+
+	perf_event_update_time(event);
+	perf_set_shadow_time(event, event->ctx);
+	perf_event_update_userpage(event);
+
+	return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+	struct perf_event *event;
+
+	if (!event_update_userpage(group_event))
+		return;
+
+	for_each_sibling_event(event, group_event)
+		event_update_userpage(event);
+}
+
 static int merge_sched_in(struct perf_event *event, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -3713,14 +3736,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	}
 
 	if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		*can_add_hw = 0;
 		if (event->attr.pinned) {
 			perf_cgroup_event_disable(event, ctx);
 			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+		} else {
+			ctx->rotate_necessary = 1;
+			perf_mux_hrtimer_restart(cpuctx);
+			group_update_userpage(event);
 		}
-
-		*can_add_hw = 0;
-		ctx->rotate_necessary = 1;
-		perf_mux_hrtimer_restart(cpuctx);
 	}
 
 	return 0;
@@ -6239,6 +6263,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 		ring_buffer_attach(event, rb);
 
+		perf_event_update_time(event);
+		perf_set_shadow_time(event, event->ctx);
 		perf_event_init_userpage(event);
 		perf_event_update_userpage(event);
 	} else {