In the clock_read_data structure, the epoch_ns and epoch_cyc members are
updated frequently by update_sched_clock(). By contrast, sched_clock_mask,
read_sched_clock, mult, and shift no longer change once they have been
initialized in sched_clock_register(). Because all of these members share
the same cache line, every update of the epoch_* members invalidates that
line on the other CPUs, which causes frequent cache misses in multi-core
scenarios.
Move these mostly read-only members into struct clock_data to avoid false
sharing with the epoch_* members, and place the two groups on separate
cache lines with padding.
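
Condensed from the diff below, the resulting layout keeps the frequently
written members and the read-mostly members on different cache lines (only
the relevant fields are shown here):

	struct clock_read_data {
		u64 epoch_ns;		/* written on every update_sched_clock() */
		u64 epoch_cyc;
	};

	struct clock_data {
		seqcount_latch_t seq;
		struct clock_read_data read_data[2];	/* hot: frequently written */

		struct {
			char padding[0];
		} ____cacheline_aligned;	/* next member starts a new cache line */

		u64 sched_clock_mask;		/* read-mostly after registration */
		u64 (*read_sched_clock)(void);
		u32 mult;
		u32 shift;
		/* wrap_kt, rate, actual_read_sched_clock omitted */
	};

Readers such as sched_clock() still go through sched_clock_read_begin() /
sched_clock_read_retry() for the epoch_* values, but now fetch mult, shift
and sched_clock_mask from a cache line that updates never dirty.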
Here are the detailed unixbench test results:
./Run -i 3 -c 96 context1
Without Patch
------------------------------------------------------------------------
Pipe-based Context Switching                 7423119.7 lps   (10.0 s, 2 samples)

System Benchmarks Partial Index               BASELINE       RESULT    INDEX
Pipe-based Context Switching                    4000.0    7423119.7  18557.8
                                                                    ========
System Benchmarks Index Score (Partial Only)                         18557.8
With Patch
------------------------------------------------------------------------
Pipe-based Context Switching                15869071.9 lps   (10.0 s, 2 samples)

System Benchmarks Partial Index               BASELINE       RESULT    INDEX
Pipe-based Context Switching                    4000.0   15869071.9  39672.7
                                                                    ========
System Benchmarks Index Score (Partial Only)                         39672.7
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
arch/arm64/kernel/perf_event.c | 12 ++++----
include/linux/sched_clock.h | 42 ++++++++++++++++++++++----
kernel/time/sched_clock.c | 55 ++++++++++------------------------
3 files changed, 59 insertions(+), 50 deletions(-)
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index cdb3d4549b3a..4894fb0668fa 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1255,6 +1255,8 @@ static int __init armv8_pmu_driver_init(void)
}
device_initcall(armv8_pmu_driver_init)
+extern struct clock_data cd;
+
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
@@ -1269,21 +1271,21 @@ void arch_perf_update_userpage(struct perf_event *event,
do {
rd = sched_clock_read_begin(&seq);
- if (rd->read_sched_clock != arch_timer_read_counter)
+ if (cd.read_sched_clock != arch_timer_read_counter)
return;
- userpg->time_mult = rd->mult;
- userpg->time_shift = rd->shift;
+ userpg->time_mult = cd.mult;
+ userpg->time_shift = cd.shift;
userpg->time_zero = rd->epoch_ns;
userpg->time_cycles = rd->epoch_cyc;
- userpg->time_mask = rd->sched_clock_mask;
+ userpg->time_mask = cd.sched_clock_mask;
/*
* Subtract the cycle base, such that software that
* doesn't know about cap_user_time_short still 'works'
* assuming no wraps.
*/
- ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
+ ns = mul_u64_u32_shr(rd->epoch_cyc, cd.mult, cd.shift);
userpg->time_zero -= ns;
} while (sched_clock_read_retry(seq));
diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 835ee87ed792..579b6d02c80c 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -11,11 +11,6 @@
*
* @epoch_ns: sched_clock() value at last update
* @epoch_cyc: Clock cycle value at last update.
- * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
- * clocks.
- * @read_sched_clock: Current clock source (or dummy source when suspended).
- * @mult: Multiplier for scaled math conversion.
- * @shift: Shift value for scaled math conversion.
*
* Care must be taken when updating this structure; it is read by
* some very hot code paths. It occupies <=40 bytes and, when combined
@@ -25,10 +20,47 @@
struct clock_read_data {
u64 epoch_ns;
u64 epoch_cyc;
+};
+
+/**
+ * struct clock_data - all data needed for sched_clock() (including
+ * registration of a new clock source)
+ *
+ * @seq: Sequence counter for protecting updates. The lowest
+ * bit is the index for @read_data.
+ * @read_data: Data required to read from sched_clock.
+ *
+ * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
+ * clocks.
+ * @read_sched_clock: Current clock source (or dummy source when suspended).
+ * @mult: Multiplier for scaled math conversion.
+ * @shift: Shift value for scaled math conversion.
+ *
+ * @wrap_kt: Duration for which clock can run before wrapping.
+ * @rate: Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function.
+ *
+ * The ordering of this structure has been chosen to optimize cache
+ * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
+ * into a single 64-byte cache line.
+ */
+struct clock_data {
+ seqcount_latch_t seq;
+ struct clock_read_data read_data[2];
+
+ struct {
+ char padding[0];
+ } ____cacheline_aligned;
+
u64 sched_clock_mask;
u64 (*read_sched_clock)(void);
u32 mult;
u32 shift;
+
+ ktime_t wrap_kt;
+ unsigned long rate;
+
+ u64 (*actual_read_sched_clock)(void);
};
extern struct clock_read_data *sched_clock_read_begin(unsigned int *seq);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index b1b9b12899f5..de50a321f835 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -19,30 +19,6 @@
#include "timekeeping.h"
-/**
- * struct clock_data - all data needed for sched_clock() (including
- * registration of a new clock source)
- *
- * @seq: Sequence counter for protecting updates. The lowest
- * bit is the index for @read_data.
- * @read_data: Data required to read from sched_clock.
- * @wrap_kt: Duration for which clock can run before wrapping.
- * @rate: Tick rate of the registered clock.
- * @actual_read_sched_clock: Registered hardware level clock read function.
- *
- * The ordering of this structure has been chosen to optimize cache
- * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
- * into a single 64-byte cache line.
- */
-struct clock_data {
- seqcount_latch_t seq;
- struct clock_read_data read_data[2];
- ktime_t wrap_kt;
- unsigned long rate;
-
- u64 (*actual_read_sched_clock)(void);
-};
-
static struct hrtimer sched_clock_timer;
static int irqtime = -1;
@@ -57,9 +33,9 @@ static u64 notrace jiffy_sched_clock_read(void)
return (u64)(jiffies - INITIAL_JIFFIES);
}
-static struct clock_data cd ____cacheline_aligned = {
- .read_data[0] = { .mult = NSEC_PER_SEC / HZ,
- .read_sched_clock = jiffy_sched_clock_read, },
+struct clock_data cd ____cacheline_aligned = {
+ .mult = NSEC_PER_SEC / HZ,
+ .read_sched_clock = jiffy_sched_clock_read,
.actual_read_sched_clock = jiffy_sched_clock_read,
};
@@ -88,9 +64,9 @@ unsigned long long notrace sched_clock(void)
do {
rd = sched_clock_read_begin(&seq);
- cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
- rd->sched_clock_mask;
- res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
+ cyc = (cd.read_sched_clock() - rd->epoch_cyc) &
+ cd.sched_clock_mask;
+ res = rd->epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
} while (sched_clock_read_retry(seq));
return res;
@@ -133,7 +109,7 @@ static void update_sched_clock(void)
rd = cd.read_data[0];
cyc = cd.actual_read_sched_clock();
- ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & cd.sched_clock_mask, cd.mult, cd.shift);
rd.epoch_ns = ns;
rd.epoch_cyc = cyc;
@@ -179,13 +155,14 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
/* Update epoch for new counter and update 'epoch_ns' from old counter*/
new_epoch = read();
cyc = cd.actual_read_sched_clock();
- ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & cd.sched_clock_mask, cd.mult, cd.shift);
+
cd.actual_read_sched_clock = read;
+ cd.read_sched_clock = read;
+ cd.sched_clock_mask = new_mask;
+ cd.mult = new_mult;
+ cd.shift = new_shift;
- rd.read_sched_clock = read;
- rd.sched_clock_mask = new_mask;
- rd.mult = new_mult;
- rd.shift = new_shift;
rd.epoch_cyc = new_epoch;
rd.epoch_ns = ns;
@@ -265,11 +242,9 @@ static u64 notrace suspended_sched_clock_read(void)
int sched_clock_suspend(void)
{
- struct clock_read_data *rd = &cd.read_data[0];
-
update_sched_clock();
hrtimer_cancel(&sched_clock_timer);
- rd->read_sched_clock = suspended_sched_clock_read;
+ cd.read_sched_clock = suspended_sched_clock_read;
return 0;
}
@@ -280,7 +255,7 @@ void sched_clock_resume(void)
rd->epoch_cyc = cd.actual_read_sched_clock();
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
- rd->read_sched_clock = cd.actual_read_sched_clock;
+ cd.read_sched_clock = cd.actual_read_sched_clock;
}
static struct syscore_ops sched_clock_ops = {
--
2.25.1