In the clock_read_data structure, the epoch_ns and epoch_cyc members are frequently updated by update_sched_clock(), whereas sched_clock_mask, read_sched_clock, mult and shift never change again once they have been initialized in sched_clock_register(). Because all of these members share the same cache line, every update of the epoch_* members invalidates that line on the other CPUs, which causes frequent cache misses in multi-core scenarios.
Move the read-mostly members into struct clock_data to avoid false sharing with the epoch_* members, and place them on a separate cache line with padding.
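For illustration only (not part of the patch): a minimal userspace sketch of the layout idea, assuming a 64-byte cache line. All names below (example_data, CACHE_LINE_SIZE, cacheline_aligned, ...) are placeholders, and the sketch aligns the first read-mostly member instead of using the anonymous padding struct from the patch, purely to keep it short:

  #include <stdio.h>
  #include <stddef.h>
  #include <stdint.h>

  /* stand-in for the kernel's ____cacheline_aligned, assuming 64-byte lines */
  #define CACHE_LINE_SIZE 64
  #define cacheline_aligned __attribute__((aligned(CACHE_LINE_SIZE)))

  struct example_data {
          /* write-hot: rewritten on every clock update */
          uint64_t epoch_ns;
          uint64_t epoch_cyc;

          /* read-mostly: written once at registration, read on every clock
           * read; the alignment pushes them onto their own cache line, so
           * updates of the epoch_* members no longer invalidate it */
          uint64_t mask cacheline_aligned;
          uint64_t (*read_clock)(void);
          uint32_t mult;
          uint32_t shift;
  };

  int main(void)
  {
          /* expect epoch_* in the first 16 bytes and mask at offset 64 */
          printf("epoch_ns=%zu epoch_cyc=%zu mask=%zu mult=%zu size=%zu\n",
                 offsetof(struct example_data, epoch_ns),
                 offsetof(struct example_data, epoch_cyc),
                 offsetof(struct example_data, mask),
                 offsetof(struct example_data, mult),
                 sizeof(struct example_data));
          return 0;
  }

Compiled with gcc, this prints the epoch_* members inside the first cache line and the read-mostly members starting at byte 64, which is the same separation the patch enforces for struct clock_data.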
Here are the detailed unixbench test results:
./Run -i 3 -c 96 context1
Without Patch
------------------------------------------------------------------------
Pipe-based Context Switching                    7423119.7 lps   (10.0 s, 2 samples)

System Benchmarks Partial Index               BASELINE       RESULT    INDEX
Pipe-based Context Switching                    4000.0    7423119.7  18557.8
                                                                    ========
System Benchmarks Index Score (Partial Only)                        18557.8

With Patch
------------------------------------------------------------------------
Pipe-based Context Switching                   15869071.9 lps   (10.0 s, 2 samples)

System Benchmarks Partial Index               BASELINE       RESULT    INDEX
Pipe-based Context Switching                    4000.0   15869071.9  39672.7
                                                                    ========
System Benchmarks Index Score (Partial Only)                        39672.7
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
 arch/arm64/kernel/perf_event.c | 12 ++++----
 include/linux/sched_clock.h    | 42 ++++++++++++++++++++++----
 kernel/time/sched_clock.c      | 55 ++++++++++------------------------
 3 files changed, 59 insertions(+), 50 deletions(-)
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index cdb3d4549b3a..4894fb0668fa 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1255,6 +1255,8 @@ static int __init armv8_pmu_driver_init(void)
 }
 device_initcall(armv8_pmu_driver_init)
 
+extern struct clock_data cd;
+
 void arch_perf_update_userpage(struct perf_event *event,
 			       struct perf_event_mmap_page *userpg, u64 now)
 {
@@ -1269,21 +1271,21 @@ void arch_perf_update_userpage(struct perf_event *event,
 	do {
 		rd = sched_clock_read_begin(&seq);
 
-		if (rd->read_sched_clock != arch_timer_read_counter)
+		if (cd.read_sched_clock != arch_timer_read_counter)
 			return;
 
-		userpg->time_mult = rd->mult;
-		userpg->time_shift = rd->shift;
+		userpg->time_mult = cd.mult;
+		userpg->time_shift = cd.shift;
 		userpg->time_zero = rd->epoch_ns;
 		userpg->time_cycles = rd->epoch_cyc;
-		userpg->time_mask = rd->sched_clock_mask;
+		userpg->time_mask = cd.sched_clock_mask;
 
 		/*
 		 * Subtract the cycle base, such that software that
 		 * doesn't know about cap_user_time_short still 'works'
 		 * assuming no wraps.
 		 */
-		ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
+		ns = mul_u64_u32_shr(rd->epoch_cyc, cd.mult, cd.shift);
 		userpg->time_zero -= ns;
 
 	} while (sched_clock_read_retry(seq));
diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 835ee87ed792..579b6d02c80c 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -11,11 +11,6 @@
  *
  * @epoch_ns:		sched_clock() value at last update
  * @epoch_cyc:		Clock cycle value at last update.
- * @sched_clock_mask:	Bitmask for two's complement subtraction of non 64bit
- *			clocks.
- * @read_sched_clock:	Current clock source (or dummy source when suspended).
- * @mult:		Multiplier for scaled math conversion.
- * @shift:		Shift value for scaled math conversion.
  *
  * Care must be taken when updating this structure; it is read by
  * some very hot code paths. It occupies <=40 bytes and, when combined
@@ -25,10 +20,47 @@
 struct clock_read_data {
 	u64 epoch_ns;
 	u64 epoch_cyc;
+};
+
+/**
+ * struct clock_data - all data needed for sched_clock() (including
+ *			registration of a new clock source)
+ *
+ * @seq:		Sequence counter for protecting updates. The lowest
+ *			bit is the index for @read_data.
+ * @read_data:		Data required to read from sched_clock.
+ *
+ * @sched_clock_mask:	Bitmask for two's complement subtraction of non 64bit
+ *			clocks.
+ * @read_sched_clock:	Current clock source (or dummy source when suspended).
+ * @mult:		Multiplier for scaled math conversion.
+ * @shift:		Shift value for scaled math conversion.
+ *
+ * @wrap_kt:		Duration for which clock can run before wrapping.
+ * @rate:		Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function.
+ *
+ * The ordering of this structure has been chosen to optimize cache
+ * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
+ * into a single 64-byte cache line.
+ */
+struct clock_data {
+	seqcount_latch_t seq;
+	struct clock_read_data read_data[2];
+
+	struct {
+		char padding[0];
+	} ____cacheline_aligned;
+
 	u64 sched_clock_mask;
 	u64 (*read_sched_clock)(void);
 	u32 mult;
 	u32 shift;
+
+	ktime_t wrap_kt;
+	unsigned long rate;
+
+	u64 (*actual_read_sched_clock)(void);
 };
 
 extern struct clock_read_data *sched_clock_read_begin(unsigned int *seq);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index b1b9b12899f5..de50a321f835 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -19,30 +19,6 @@
#include "timekeeping.h"
-/**
- * struct clock_data - all data needed for sched_clock() (including
- *			registration of a new clock source)
- *
- * @seq:		Sequence counter for protecting updates. The lowest
- *			bit is the index for @read_data.
- * @read_data:		Data required to read from sched_clock.
- * @wrap_kt:		Duration for which clock can run before wrapping.
- * @rate:		Tick rate of the registered clock.
- * @actual_read_sched_clock: Registered hardware level clock read function.
- *
- * The ordering of this structure has been chosen to optimize cache
- * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
- * into a single 64-byte cache line.
- */
-struct clock_data {
-	seqcount_latch_t seq;
-	struct clock_read_data read_data[2];
-	ktime_t wrap_kt;
-	unsigned long rate;
-
-	u64 (*actual_read_sched_clock)(void);
-};
-
 static struct hrtimer sched_clock_timer;
 static int irqtime = -1;
 
@@ -57,9 +33,9 @@ static u64 notrace jiffy_sched_clock_read(void)
 	return (u64)(jiffies - INITIAL_JIFFIES);
 }
 
-static struct clock_data cd ____cacheline_aligned = {
-	.read_data[0] = { .mult = NSEC_PER_SEC / HZ,
-			  .read_sched_clock = jiffy_sched_clock_read, },
+struct clock_data cd ____cacheline_aligned = {
+	.mult = NSEC_PER_SEC / HZ,
+	.read_sched_clock = jiffy_sched_clock_read,
 	.actual_read_sched_clock = jiffy_sched_clock_read,
 };
 
@@ -88,9 +64,9 @@ unsigned long long notrace sched_clock(void)
 	do {
 		rd = sched_clock_read_begin(&seq);
 
-		cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
-		      rd->sched_clock_mask;
-		res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
+		cyc = (cd.read_sched_clock() - rd->epoch_cyc) &
+		      cd.sched_clock_mask;
+		res = rd->epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
 	} while (sched_clock_read_retry(seq));
 
 	return res;
@@ -133,7 +109,7 @@ static void update_sched_clock(void)
 	rd = cd.read_data[0];
 
 	cyc = cd.actual_read_sched_clock();
-	ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+	ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & cd.sched_clock_mask, cd.mult, cd.shift);
 
 	rd.epoch_ns = ns;
 	rd.epoch_cyc = cyc;
@@ -179,13 +155,14 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 	/* Update epoch for new counter and update 'epoch_ns' from old counter*/
 	new_epoch = read();
 	cyc = cd.actual_read_sched_clock();
-	ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+	ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & cd.sched_clock_mask, cd.mult, cd.shift);
+	cd.actual_read_sched_clock = read;
+	cd.read_sched_clock = read;
+	cd.sched_clock_mask = new_mask;
+	cd.mult = new_mult;
+	cd.shift = new_shift;
 
-	rd.read_sched_clock = read;
-	rd.sched_clock_mask = new_mask;
-	rd.mult = new_mult;
-	rd.shift = new_shift;
 	rd.epoch_cyc = new_epoch;
 	rd.epoch_ns = ns;
@@ -265,11 +242,9 @@ static u64 notrace suspended_sched_clock_read(void)
 int sched_clock_suspend(void)
 {
-	struct clock_read_data *rd = &cd.read_data[0];
-
 	update_sched_clock();
 	hrtimer_cancel(&sched_clock_timer);
-	rd->read_sched_clock = suspended_sched_clock_read;
+	cd.read_sched_clock = suspended_sched_clock_read;
 
 	return 0;
 }
@@ -280,7 +255,7 @@ void sched_clock_resume(void)
 	rd->epoch_cyc = cd.actual_read_sched_clock();
 	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
-	rd->read_sched_clock = cd.actual_read_sched_clock;
+	cd.read_sched_clock = cd.actual_read_sched_clock;
 }
static struct syscore_ops sched_clock_ops = {