Kernel
Threads by month
- ----- 2025 -----
- June
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- 47 participants
- 18690 discussions
From: Xiongfeng Wang <wangxiongfeng2(a)huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40AXF
CVE: NA
--------------------------------------
Enable CONFIG_USERSWAP for openeuler_defconfig
Signed-off-by: Xiongfeng Wang <wangxiongfeng2(a)huawei.com>
Reviewed-by: tong tiangen <tongtiangen(a)huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai(a)huawei.com>
---
arch/x86/configs/openeuler_defconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index e2a7fda97fa3..89dc79bd375d 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -8510,3 +8510,4 @@ CONFIG_ARCH_HAS_KCOV=y
# end of Kernel hacking
CONFIG_ETMEM_SCAN=m
CONFIG_ETMEM_SWAP=m
+CONFIG_USERSWAP=y
--
2.20.1
1
0

[PATCH openEuler-1.0-LTS 1/4] bpf/verifier: per-register parent pointers
by Yang Yingliang 24 Aug '21
by Yang Yingliang 24 Aug '21
24 Aug '21
From: Edward Cree <ecree(a)solarflare.com>
mainline inclusion
from mainline-v4.20-rc1
commit 679c782de14bd48c19dd74cd1af20a2bc05dd936
category: feature
bugzilla: 43460
CVE: NA
---------------------------------------
By giving each register its own liveness chain, we elide the skip_callee()
logic. Instead, each register's parent is the state it inherits from;
both check_func_call() and prepare_func_exit() automatically connect
reg states to the correct chain since when they copy the reg state across
(r1-r5 into the callee as args, and r0 out as the return value) they also
copy the parent pointer.
Signed-off-by: Edward Cree <ecree(a)solarflare.com>
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
Conflicts:
kernel/bpf/verifier.c
[liuxin:solve the conflicts in verifier.c]
Signed-off-by: liuxin <liuxin264(a)huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
include/linux/bpf_verifier.h | 8 +-
kernel/bpf/verifier.c | 183 +++++++++--------------------------
2 files changed, 47 insertions(+), 144 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1c8517320ea64..daab0960c0544 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -41,6 +41,7 @@ enum bpf_reg_liveness {
};
struct bpf_reg_state {
+ /* Ordering of fields matters. See states_equal() */
enum bpf_reg_type type;
union {
/* valid when type == PTR_TO_PACKET */
@@ -62,7 +63,6 @@ struct bpf_reg_state {
* came from, when one is tested for != NULL.
*/
u32 id;
- /* Ordering of fields matters. See states_equal() */
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
* the actual value.
* For pointer types, this represents the variable part of the offset
@@ -79,15 +79,15 @@ struct bpf_reg_state {
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
+ /* parentage chain for liveness checking */
+ struct bpf_reg_state *parent;
/* Inside the callee two registers can be both PTR_TO_STACK like
* R1=fp-8 and R2=fp-8, but one of them points to this function stack
* while another to the caller's stack. To differentiate them 'frameno'
* is used which is an index in bpf_verifier_state->frame[] array
* pointing to bpf_func_state.
- * This field must be second to last, for states_equal() reasons.
*/
u32 frameno;
- /* This field must be last, for states_equal() reasons. */
enum bpf_reg_liveness live;
};
@@ -110,7 +110,6 @@ struct bpf_stack_state {
*/
struct bpf_func_state {
struct bpf_reg_state regs[MAX_BPF_REG];
- struct bpf_verifier_state *parent;
/* index of call instruction that called into this func */
int callsite;
/* stack frame number of this function state from pov of
@@ -132,7 +131,6 @@ struct bpf_func_state {
struct bpf_verifier_state {
/* call stack tracking */
struct bpf_func_state *frame[MAX_CALL_FRAMES];
- struct bpf_verifier_state *parent;
u32 curframe;
bool speculative;
};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 650cec781f5a8..0bb6664ced7e1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -381,9 +381,9 @@ static int copy_stack_state(struct bpf_func_state *dst,
/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
* make it consume minimal amount of memory. check_stack_write() access from
* the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which this function copies over. It points to previous bpf_verifier_state
- * which is never reallocated
+ * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state
+ * which this function copies over. It points to corresponding reg in previous
+ * bpf_verifier_state which is never reallocated
*/
static int realloc_func_state(struct bpf_func_state *state, int size,
bool copy_old)
@@ -468,7 +468,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
}
dst_state->speculative = src->speculative;
dst_state->curframe = src->curframe;
- dst_state->parent = src->parent;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -740,6 +739,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
for (i = 0; i < MAX_BPF_REG; i++) {
mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
+ regs[i].parent = NULL;
}
/* frame pointer */
@@ -884,74 +884,21 @@ static int check_subprogs(struct bpf_verifier_env *env)
return 0;
}
-static
-struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *state,
- struct bpf_verifier_state *parent,
- u32 regno)
-{
- struct bpf_verifier_state *tmp = NULL;
-
- /* 'parent' could be a state of caller and
- * 'state' could be a state of callee. In such case
- * parent->curframe < state->curframe
- * and it's ok for r1 - r5 registers
- *
- * 'parent' could be a callee's state after it bpf_exit-ed.
- * In such case parent->curframe > state->curframe
- * and it's ok for r0 only
- */
- if (parent->curframe == state->curframe ||
- (parent->curframe < state->curframe &&
- regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
- (parent->curframe > state->curframe &&
- regno == BPF_REG_0))
- return parent;
-
- if (parent->curframe > state->curframe &&
- regno >= BPF_REG_6) {
- /* for callee saved regs we have to skip the whole chain
- * of states that belong to callee and mark as LIVE_READ
- * the registers before the call
- */
- tmp = parent;
- while (tmp && tmp->curframe != state->curframe) {
- tmp = tmp->parent;
- }
- if (!tmp)
- goto bug;
- parent = tmp;
- } else {
- goto bug;
- }
- return parent;
-bug:
- verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
- verbose(env, "regno %d parent frame %d current frame %d\n",
- regno, parent->curframe, state->curframe);
- return NULL;
-}
-
+/* Parentage chain of this register (or stack slot) should take care of all
+ * issues like callee-saved registers, stack slot allocation time, etc.
+ */
static int mark_reg_read(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *state,
- struct bpf_verifier_state *parent,
- u32 regno)
+ const struct bpf_reg_state *state,
+ struct bpf_reg_state *parent)
{
bool writes = parent == state->parent; /* Observe write marks */
- if (regno == BPF_REG_FP)
- /* We don't need to worry about FP liveness because it's read-only */
- return 0;
-
while (parent) {
/* if read wasn't screened by an earlier write ... */
- if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
+ if (writes && state->live & REG_LIVE_WRITTEN)
break;
- parent = skip_callee(env, state, parent, regno);
- if (!parent)
- return -EFAULT;
/* ... then we depend on parent's value */
- parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
+ parent->live |= REG_LIVE_READ;
state = parent;
parent = state->parent;
writes = true;
@@ -977,7 +924,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
- return mark_reg_read(env, vstate, vstate->parent, regno);
+ /* We don't need to worry about FP liveness because it's read-only */
+ if (regno != BPF_REG_FP)
+ return mark_reg_read(env, ®s[regno],
+ regs[regno].parent);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
@@ -1088,8 +1038,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
} else {
u8 type = STACK_MISC;
- /* regular write of data into stack */
- state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
+ /* regular write of data into stack destroys any spilled ptr */
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
/* only mark the slot as written if all 8 bytes were written
* otherwise read propagation may incorrectly stop too soon
@@ -1114,61 +1064,6 @@ static int check_stack_write(struct bpf_verifier_env *env,
return 0;
}
-/* registers of every function are unique and mark_reg_read() propagates
- * the liveness in the following cases:
- * - from callee into caller for R1 - R5 that were used as arguments
- * - from caller into callee for R0 that used as result of the call
- * - from caller to the same caller skipping states of the callee for R6 - R9,
- * since R6 - R9 are callee saved by implicit function prologue and
- * caller's R6 != callee's R6, so when we propagate liveness up to
- * parent states we need to skip callee states for R6 - R9.
- *
- * stack slot marking is different, since stacks of caller and callee are
- * accessible in both (since caller can pass a pointer to caller's stack to
- * callee which can pass it to another function), hence mark_stack_slot_read()
- * has to propagate the stack liveness to all parent states at given frame number.
- * Consider code:
- * f1() {
- * ptr = fp - 8;
- * *ptr = ctx;
- * call f2 {
- * .. = *ptr;
- * }
- * .. = *ptr;
- * }
- * First *ptr is reading from f1's stack and mark_stack_slot_read() has
- * to mark liveness at the f1's frame and not f2's frame.
- * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
- * to propagate liveness to f2 states at f1's frame level and further into
- * f1 states at f1's frame level until write into that stack slot
- */
-static void mark_stack_slot_read(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *state,
- struct bpf_verifier_state *parent,
- int slot, int frameno)
-{
- bool writes = parent == state->parent; /* Observe write marks */
-
- while (parent) {
- if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
- /* since LIVE_WRITTEN mark is only done for full 8-byte
- * write the read marks are conservative and parent
- * state may not even have the stack allocated. In such case
- * end the propagation, since the loop reached beginning
- * of the function
- */
- break;
- /* if read wasn't screened by an earlier write ... */
- if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
- break;
- /* ... then we depend on parent's value */
- parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
- state = parent;
- parent = state->parent;
- writes = true;
- }
-}
-
static int check_stack_read(struct bpf_verifier_env *env,
struct bpf_func_state *reg_state /* func where register points to */,
int off, int size, int value_regno)
@@ -1206,8 +1101,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
*/
state->regs[value_regno].live |= REG_LIVE_WRITTEN;
}
- mark_stack_slot_read(env, vstate, vstate->parent, spi,
- reg_state->frameno);
+ mark_reg_read(env, ®_state->stack[spi].spilled_ptr,
+ reg_state->stack[spi].spilled_ptr.parent);
return 0;
} else {
int zeros = 0;
@@ -1223,8 +1118,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
off, i, size);
return -EACCES;
}
- mark_stack_slot_read(env, vstate, vstate->parent, spi,
- reg_state->frameno);
+ mark_reg_read(env, ®_state->stack[spi].spilled_ptr,
+ reg_state->stack[spi].spilled_ptr.parent);
if (value_regno >= 0) {
if (zeros == size) {
/* any size read into register is zero extended,
@@ -1958,8 +1853,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
/* reading any byte out of 8-byte 'spill_slot' will cause
* the whole slot to be marked as 'read'
*/
- mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
- spi, state->frameno);
+ mark_reg_read(env, &state->stack[spi].spilled_ptr,
+ state->stack[spi].spilled_ptr.parent);
}
return update_stack_depth(env, state, off);
}
@@ -2415,11 +2310,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
state->curframe + 1 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
- /* copy r1 - r5 args that callee can access */
+ /* copy r1 - r5 args that callee can access. The copy includes parent
+ * pointers, which connects us up to the liveness chain
+ */
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
callee->regs[i] = caller->regs[i];
- /* after the call regsiters r0 - r5 were scratched */
+ /* after the call registers r0 - r5 were scratched */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, caller->regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
@@ -5058,7 +4955,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
/* explored state didn't use this */
return true;
- equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
+ equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
if (rold->type == PTR_TO_STACK)
/* two stack pointers are equal only if they're pointing to
@@ -5297,7 +5194,7 @@ static bool states_equal(struct bpf_verifier_env *env,
* equivalent state (jump target or such) we didn't arrive by the straight-line
* code, so read marks in the state must propagate to the parent regardless
* of the state's write marks. That's what 'parent == state->parent' comparison
- * in mark_reg_read() and mark_stack_slot_read() is for.
+ * in mark_reg_read() is for.
*/
static int propagate_liveness(struct bpf_verifier_env *env,
const struct bpf_verifier_state *vstate,
@@ -5318,7 +5215,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
continue;
if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
- err = mark_reg_read(env, vstate, vparent, i);
+ err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
+ &vparent->frame[vstate->curframe]->regs[i]);
if (err)
return err;
}
@@ -5333,7 +5231,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
continue;
if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
- mark_stack_slot_read(env, vstate, vparent, i, frame);
+ mark_reg_read(env, &state->stack[i].spilled_ptr,
+ &parent->stack[i].spilled_ptr);
}
}
return err;
@@ -5343,7 +5242,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
- struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
sl = env->explored_states[insn_idx];
@@ -5389,16 +5288,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return -ENOMEM;
/* add new state to the head of linked list */
- err = copy_verifier_state(&new_sl->state, cur);
+ new = &new_sl->state;
+ err = copy_verifier_state(new, cur);
if (err) {
- free_verifier_state(&new_sl->state, false);
+ free_verifier_state(new, false);
kfree(new_sl);
return err;
}
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
/* connect new state to parentage chain */
- cur->parent = &new_sl->state;
+ for (i = 0; i < BPF_REG_FP; i++)
+ cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
@@ -5411,9 +5312,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
/* all stack frames are accessible from callee, clear them all */
for (j = 0; j <= cur->curframe; j++) {
struct bpf_func_state *frame = cur->frame[j];
+ struct bpf_func_state *newframe = new->frame[j];
- for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
+ for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
+ frame->stack[i].spilled_ptr.parent =
+ &newframe->stack[i].spilled_ptr;
+ }
}
return 0;
}
--
2.25.1
1
3

[PATCH openEuler-1.0-LTS] blk-mq: clear active_queues before clearing BLK_MQ_F_TAG_QUEUE_SHARED
by Yang Yingliang 24 Aug '21
by Yang Yingliang 24 Aug '21
24 Aug '21
From: Yu Kuai <yukuai3(a)huawei.com>
mainline inclusion
from mainline-5.14-rc6
commit 454bb6775202d94f0f489c4632efecdb62d3c904
category: bugfix
bugzilla: 175277
CVE: NA
-------------------------------------------------
We run a test that delete and recover devcies frequently(two devices on
the same host), and we found that 'active_queues' is super big after a
period of time.
If device a and device b share a tag set, and a is deleted, then
blk_mq_exit_queue() will clear BLK_MQ_F_TAG_QUEUE_SHARED because there
is only one queue that are using the tag set. However, if b is still
active, the active_queues of b might never be cleared even if b is
deleted.
Thus clear active_queues before BLK_MQ_F_TAG_QUEUE_SHARED is cleared.
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20210731062130.1533893-1-yukuai3@huawei.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Conflict: block/blk-mq.c
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Jason Yan <yanaijie(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
block/blk-mq.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cad7507d4f709..b37721774895b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2513,10 +2513,12 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (shared)
+ if (shared) {
hctx->flags |= BLK_MQ_F_TAG_SHARED;
- else
+ } else {
+ blk_mq_tag_idle(hctx);
hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+ }
}
}
--
2.25.1
1
0

[PATCH kernel-4.19] blk-mq: clear active_queues before clearing BLK_MQ_F_TAG_QUEUE_SHARED
by Yang Yingliang 24 Aug '21
by Yang Yingliang 24 Aug '21
24 Aug '21
From: Yu Kuai <yukuai3(a)huawei.com>
mainline inclusion
from mainline-5.14-rc6
commit 454bb6775202d94f0f489c4632efecdb62d3c904
category: bugfix
bugzilla: 175277
CVE: NA
-------------------------------------------------
We run a test that delete and recover devcies frequently(two devices on
the same host), and we found that 'active_queues' is super big after a
period of time.
If device a and device b share a tag set, and a is deleted, then
blk_mq_exit_queue() will clear BLK_MQ_F_TAG_QUEUE_SHARED because there
is only one queue that are using the tag set. However, if b is still
active, the active_queues of b might never be cleared even if b is
deleted.
Thus clear active_queues before BLK_MQ_F_TAG_QUEUE_SHARED is cleared.
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20210731062130.1533893-1-yukuai3@huawei.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Conflict: block/blk-mq.c
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Jason Yan <yanaijie(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
block/blk-mq.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b5b41b9a72294..f23500becbc80 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2516,10 +2516,12 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (shared)
+ if (shared) {
hctx->flags |= BLK_MQ_F_TAG_SHARED;
- else
+ } else {
+ blk_mq_tag_idle(hctx);
hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+ }
}
}
--
2.25.1
1
0
From: Yu Kuai <yukuai3(a)huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Enable eulerfs as module by default.
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Hou Tao <houtao1(a)huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai(a)huawei.com>
---
arch/arm64/configs/openeuler_defconfig | 1 +
arch/x86/configs/openeuler_defconfig | 1 +
2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index fd62a5d7d069..44a9b95abc74 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -6304,6 +6304,7 @@ CONFIG_CIFS_DFS_UPCALL=y
# CONFIG_CIFS_FSCACHE is not set
# CONFIG_CODA_FS is not set
# CONFIG_AFS_FS is not set
+CONFIG_EULER_FS=m
CONFIG_NLS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index e2a7fda97fa3..6c225fc3f106 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -7643,6 +7643,7 @@ CONFIG_CEPH_FS=m
# CONFIG_CEPH_FSCACHE is not set
CONFIG_CEPH_FS_POSIX_ACL=y
# CONFIG_CEPH_FS_SECURITY_LABEL is not set
+CONFIG_EULER_FS=m
CONFIG_CIFS=m
# CONFIG_CIFS_STATS2 is not set
CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y
--
2.20.1
1
0
EulerFS is NVDIMM filesystem. It uses soft updates and pointer-based
dual views to delay synchronous cache flushes and reduce latency
significantly in critical path.
We run eulerfs with xfstests, no WARNING or BUG message is found in kernel
log.
We test the performance with multiple benchmark, the results is much
greater than ext4dax.
We run the code coverage testing, line coverage is over 80%, functions
coverage is almost 100%.
Yu Kuai (17):
eulerfs: common definitions
eulerfs: add kmeme_cache definitions and interfaces
eulerfs: add memory allocation interfaces
eulerfs: add flush interfaces
eulerfs: add interfaces for inode lock transfer
eulerfs: add interfaces for page wear
eulerfs: add filename interfaces
eulerfs: add nv dict operations
eulerfs: add dependency operations
eulerfs: add inode related interfaces
eulerfs: add dax operations
eulerfs: add file operations and inode operations for regular file
eulerfs: add inode_operations for dir inode and special inode
eulerfs: add file_operations for dir inode
eulerfs: add inode_operations for symlink inode
eulerfs: add super_operations and module_init/exit
eulerfs: add Kconfig and Makefile
fs/Kconfig | 1 +
fs/Makefile | 1 +
fs/eulerfs/Kconfig | 10 +
fs/eulerfs/Makefile | 9 +
fs/eulerfs/alloc_interface.h | 113 +++
fs/eulerfs/const.h | 80 ++
fs/eulerfs/dax.c | 1696 ++++++++++++++++++++++++++++++++++
fs/eulerfs/dax.h | 101 ++
fs/eulerfs/dep.c | 791 ++++++++++++++++
fs/eulerfs/dep.h | 218 +++++
fs/eulerfs/dht.c | 312 +++++++
fs/eulerfs/dht.h | 156 ++++
fs/eulerfs/dir.c | 139 +++
fs/eulerfs/euler.h | 84 ++
fs/eulerfs/euler_common.h | 225 +++++
fs/eulerfs/euler_dbg.h | 36 +
fs/eulerfs/euler_def.h | 201 ++++
fs/eulerfs/file.c | 294 ++++++
fs/eulerfs/filename.h | 120 +++
fs/eulerfs/flush.h | 171 ++++
fs/eulerfs/inode.c | 602 ++++++++++++
fs/eulerfs/inode.h | 44 +
fs/eulerfs/kmem_cache.c | 107 +++
fs/eulerfs/kmem_cache.h | 37 +
fs/eulerfs/lock.h | 49 +
fs/eulerfs/namei.c | 872 +++++++++++++++++
fs/eulerfs/nvalloc.c | 1451 +++++++++++++++++++++++++++++
fs/eulerfs/nvalloc.h | 214 +++++
fs/eulerfs/nvm_struct.h | 297 ++++++
fs/eulerfs/pbatch.h | 314 +++++++
fs/eulerfs/super.c | 811 ++++++++++++++++
fs/eulerfs/symlink.c | 29 +
fs/eulerfs/wear.c | 48 +
fs/eulerfs/wear.h | 30 +
34 files changed, 9663 insertions(+)
create mode 100644 fs/eulerfs/Kconfig
create mode 100644 fs/eulerfs/Makefile
create mode 100644 fs/eulerfs/alloc_interface.h
create mode 100644 fs/eulerfs/const.h
create mode 100644 fs/eulerfs/dax.c
create mode 100644 fs/eulerfs/dax.h
create mode 100644 fs/eulerfs/dep.c
create mode 100644 fs/eulerfs/dep.h
create mode 100644 fs/eulerfs/dht.c
create mode 100644 fs/eulerfs/dht.h
create mode 100644 fs/eulerfs/dir.c
create mode 100644 fs/eulerfs/euler.h
create mode 100644 fs/eulerfs/euler_common.h
create mode 100644 fs/eulerfs/euler_dbg.h
create mode 100644 fs/eulerfs/euler_def.h
create mode 100644 fs/eulerfs/file.c
create mode 100644 fs/eulerfs/filename.h
create mode 100644 fs/eulerfs/flush.h
create mode 100644 fs/eulerfs/inode.c
create mode 100644 fs/eulerfs/inode.h
create mode 100644 fs/eulerfs/kmem_cache.c
create mode 100644 fs/eulerfs/kmem_cache.h
create mode 100644 fs/eulerfs/lock.h
create mode 100644 fs/eulerfs/namei.c
create mode 100644 fs/eulerfs/nvalloc.c
create mode 100644 fs/eulerfs/nvalloc.h
create mode 100644 fs/eulerfs/nvm_struct.h
create mode 100644 fs/eulerfs/pbatch.h
create mode 100644 fs/eulerfs/super.c
create mode 100644 fs/eulerfs/symlink.c
create mode 100644 fs/eulerfs/wear.c
create mode 100644 fs/eulerfs/wear.h
--
2.20.1
1
17
From: Marco Elver <elver(a)google.com>
Add KFENCE test suite, testing various error detection scenarios. Makes
use of KUnit for test organization. Since KFENCE's interface to obtain
error reports is via the console, the test verifies that KFENCE outputs
expected reports to the console.
[elver(a)google.com: fix typo in test]
Link: https://lkml.kernel.org/r/X9lHQExmHGvETxY4@elver.google.com
[elver(a)google.com: show access type in report]
Link: https://lkml.kernel.org/r/20210111091544.3287013-2-elver@google.com
Link: https://lkml.kernel.org/r/20201103175841.3495947-9-elver@google.com
Signed-off-by: Alexander Potapenko <glider(a)google.com>
Signed-off-by: Marco Elver <elver(a)google.com>
Reviewed-by: Dmitry Vyukov <dvyukov(a)google.com>
Co-developed-by: Alexander Potapenko <glider(a)google.com>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andrey Konovalov <andreyknvl(a)google.com>
Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christopher Lameter <cl(a)linux.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Joern Engel <joern(a)purestorage.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Paul E. McKenney <paulmck(a)kernel.org>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: SeongJae Park <sjpark(a)amazon.de>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Will Deacon <will(a)kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Yingjie Shang <1415317271(a)qq.com>
Reviewed-by: Bixuan Cui <cuibixuan(a)huawei.com>
---
Documentation/dev-tools/kfence.rst | 12 +-
arch/arm64/mm/fault.c | 2 +-
arch/x86/mm/fault.c | 3 +-
include/linux/kfence.h | 9 +-
lib/Kconfig.kfence | 13 +
mm/kfence/Makefile | 3 +
mm/kfence/core.c | 11 +-
mm/kfence/kfence.h | 2 +-
mm/kfence/kfence_test.c | 858 +++++++++++++++++++++++++++++
mm/kfence/report.c | 27 +-
10 files changed, 915 insertions(+), 25 deletions(-)
create mode 100644 mm/kfence/kfence_test.c
diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index 0e2fb6ef3016..58a0a5fa1ddc 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -65,9 +65,9 @@ Error reports
A typical out-of-bounds access looks like this::
==================================================================
- BUG: KFENCE: out-of-bounds in test_out_of_bounds_read+0xa3/0x22b
+ BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa3/0x22b
- Out-of-bounds access at 0xffffffffb672efff (1B left of kfence-#17):
+ Out-of-bounds read at 0xffffffffb672efff (1B left of kfence-#17):
test_out_of_bounds_read+0xa3/0x22b
kunit_try_run_case+0x51/0x85
kunit_generic_run_threadfn_adapter+0x16/0x30
@@ -94,9 +94,9 @@ its origin. Note that, real kernel addresses are only shown for
Use-after-free accesses are reported as::
==================================================================
- BUG: KFENCE: use-after-free in test_use_after_free_read+0xb3/0x143
+ BUG: KFENCE: use-after-free read in test_use_after_free_read+0xb3/0x143
- Use-after-free access at 0xffffffffb673dfe0 (in kfence-#24):
+ Use-after-free read at 0xffffffffb673dfe0 (in kfence-#24):
test_use_after_free_read+0xb3/0x143
kunit_try_run_case+0x51/0x85
kunit_generic_run_threadfn_adapter+0x16/0x30
@@ -193,9 +193,9 @@ where it was not possible to determine an associated object, e.g. if adjacent
object pages had not yet been allocated::
==================================================================
- BUG: KFENCE: invalid access in test_invalid_access+0x26/0xe0
+ BUG: KFENCE: invalid read in test_invalid_access+0x26/0xe0
- Invalid access at 0xffffffffb670b00a:
+ Invalid read at 0xffffffffb670b00a:
test_invalid_access+0x26/0xe0
kunit_try_run_case+0x51/0x85
kunit_generic_run_threadfn_adapter+0x16/0x30
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index a5fe9f7b423b..8f4ceab2fa35 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -323,7 +323,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
} else if (addr < PAGE_SIZE) {
msg = "NULL pointer dereference";
} else {
- if (kfence_handle_page_fault(addr, regs))
+ if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
return;
msg = "paging request";
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f5f359ccaeae..738985c131ea 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -734,7 +734,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
efi_recover_from_page_fault(address);
/* Only not-present faults should be handled by KFENCE. */
- if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, regs))
+ if (!(error_code & X86_PF_PROT) &&
+ kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
return;
oops:
/*
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 5a56bcf5606c..a70d1ea03532 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -186,6 +186,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
/**
* kfence_handle_page_fault() - perform page fault handling for KFENCE pages
* @addr: faulting address
+ * @is_write: is access a write
* @regs: current struct pt_regs (can be NULL, but shows full stack trace)
*
* Return:
@@ -197,7 +198,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
* cases KFENCE prints an error message and marks the offending page as
* present, so that the kernel can proceed.
*/
-bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs);
+bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs);
#else /* CONFIG_KFENCE */
@@ -210,7 +211,11 @@ static inline size_t kfence_ksize(const void *addr) { return 0; }
static inline void *kfence_object_start(const void *addr) { return NULL; }
static inline void __kfence_free(void *addr) { }
static inline bool __must_check kfence_free(void *addr) { return false; }
-static inline bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { return false; }
+static inline bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write,
+ struct pt_regs *regs)
+{
+ return false;
+}
#endif
diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence
index 605125ac2ae0..78f50ccb3b45 100644
--- a/lib/Kconfig.kfence
+++ b/lib/Kconfig.kfence
@@ -66,4 +66,17 @@ config KFENCE_STRESS_TEST_FAULTS
Only for KFENCE testing; set to 0 if you are not a KFENCE developer.
+config KFENCE_KUNIT_TEST
+ tristate "KFENCE integration test suite" if !KUNIT_ALL_TESTS
+ default KUNIT_ALL_TESTS
+ depends on TRACEPOINTS && KUNIT
+ help
+ Test suite for KFENCE, testing various error detection scenarios with
+ various allocation types, and checking that reports are correctly
+ output to console.
+
+ Say Y here if you want the test to be built into the kernel and run
+ during boot; say M if you want the test to build as a module; say N
+ if you are unsure.
+
endif # KFENCE
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
index d991e9a349f0..6872cd5e5390 100644
--- a/mm/kfence/Makefile
+++ b/mm/kfence/Makefile
@@ -1,3 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_KFENCE) := core.o report.o
+
+CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls
+obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7692af715fdb..cfe3d32ac5b7 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -216,7 +216,7 @@ static inline bool check_canary_byte(u8 *addr)
return true;
atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
- kfence_report_error((unsigned long)addr, NULL, addr_to_metadata((unsigned long)addr),
+ kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr),
KFENCE_ERROR_CORRUPTION);
return false;
}
@@ -355,7 +355,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
/* Invalid or double-free, bail out. */
atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
- kfence_report_error((unsigned long)addr, NULL, meta, KFENCE_ERROR_INVALID_FREE);
+ kfence_report_error((unsigned long)addr, false, NULL, meta,
+ KFENCE_ERROR_INVALID_FREE);
raw_spin_unlock_irqrestore(&meta->lock, flags);
return;
}
@@ -770,7 +771,7 @@ void __kfence_free(void *addr)
kfence_guarded_free(addr, meta, false);
}
-bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs)
+bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
{
const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
struct kfence_metadata *to_report = NULL;
@@ -833,11 +834,11 @@ bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs)
out:
if (to_report) {
- kfence_report_error(addr, regs, to_report, error_type);
+ kfence_report_error(addr, is_write, regs, to_report, error_type);
raw_spin_unlock_irqrestore(&to_report->lock, flags);
} else {
/* This may be a UAF or OOB access, but we can't be sure. */
- kfence_report_error(addr, regs, NULL, KFENCE_ERROR_INVALID);
+ kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
}
return kfence_unprotect(addr); /* Unprotect and let access proceed. */
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 0d83e628a97d..1accc840dbbe 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -105,7 +105,7 @@ enum kfence_error_type {
KFENCE_ERROR_INVALID_FREE, /* Invalid free. */
};
-void kfence_report_error(unsigned long address, struct pt_regs *regs,
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
const struct kfence_metadata *meta, enum kfence_error_type type);
void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
new file mode 100644
index 000000000000..db1bb596acaf
--- /dev/null
+++ b/mm/kfence/kfence_test.c
@@ -0,0 +1,858 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test cases for KFENCE memory safety error detector. Since the interface with
+ * which KFENCE's reports are obtained is via the console, this is the output we
+ * should verify. For each test case checks the presence (or absence) of
+ * generated reports. Relies on 'console' tracepoint to capture reports as they
+ * appear in the kernel log.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Alexander Potapenko <glider(a)google.com>
+ * Marco Elver <elver(a)google.com>
+ */
+
+#include <kunit/test.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kfence.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <trace/events/printk.h>
+
+#include "kfence.h"
+
+/* Report as observed from console. */
+static struct {
+ spinlock_t lock;
+ int nlines;
+ char lines[2][256];
+} observed = {
+ .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Probe for console output: obtains observed lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ unsigned long flags;
+ int nlines;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ nlines = observed.nlines;
+
+ if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) {
+ /*
+ * KFENCE report and related to the test.
+ *
+ * The provided @buf is not NUL-terminated; copy no more than
+ * @len bytes and let strscpy() add the missing NUL-terminator.
+ */
+ strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+ nlines = 1;
+ } else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) {
+ strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+ }
+
+ WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+static bool report_available(void)
+{
+ return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines);
+}
+
+/* Information we expect in a report. */
+struct expect_report {
+ enum kfence_error_type type; /* The type or error. */
+ void *fn; /* Function pointer to expected function where access occurred. */
+ char *addr; /* Address at which the bad access occurred. */
+ bool is_write; /* Is access a write. */
+};
+
+static const char *get_access_type(const struct expect_report *r)
+{
+ return r->is_write ? "write" : "read";
+}
+
+/* Check observed report matches information in @r. */
+static bool report_matches(const struct expect_report *r)
+{
+ bool ret = false;
+ unsigned long flags;
+ typeof(observed.lines) expect;
+ const char *end;
+ char *cur;
+
+ /* Doubled-checked locking. */
+ if (!report_available())
+ return false;
+
+ /* Generate expected report contents. */
+
+ /* Title */
+ cur = expect[0];
+ end = &expect[0][sizeof(expect[0]) - 1];
+ switch (r->type) {
+ case KFENCE_ERROR_OOB:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s",
+ get_access_type(r));
+ break;
+ case KFENCE_ERROR_UAF:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s",
+ get_access_type(r));
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption");
+ break;
+ case KFENCE_ERROR_INVALID:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s",
+ get_access_type(r));
+ break;
+ case KFENCE_ERROR_INVALID_FREE:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free");
+ break;
+ }
+
+ scnprintf(cur, end - cur, " in %pS", r->fn);
+ /* The exact offset won't match, remove it; also strip module name. */
+ cur = strchr(expect[0], '+');
+ if (cur)
+ *cur = '\0';
+
+ /* Access information */
+ cur = expect[1];
+ end = &expect[1][sizeof(expect[1]) - 1];
+
+ switch (r->type) {
+ case KFENCE_ERROR_OOB:
+ cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r));
+ break;
+ case KFENCE_ERROR_UAF:
+ cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r));
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ cur += scnprintf(cur, end - cur, "Corrupted memory at");
+ break;
+ case KFENCE_ERROR_INVALID:
+ cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r));
+ break;
+ case KFENCE_ERROR_INVALID_FREE:
+ cur += scnprintf(cur, end - cur, "Invalid free of");
+ break;
+ }
+
+ cur += scnprintf(cur, end - cur, " 0x" PTR_FMT, (void *)r->addr);
+
+ spin_lock_irqsave(&observed.lock, flags);
+ if (!report_available())
+ goto out; /* A new report is being captured. */
+
+ /* Finally match expected output to what we actually observed. */
+ ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]);
+out:
+ spin_unlock_irqrestore(&observed.lock, flags);
+ return ret;
+}
+
+/* ===== Test cases ===== */
+
+#define TEST_PRIV_WANT_MEMCACHE ((void *)1)
+
+/* Cache used by tests; if NULL, allocate from kmalloc instead. */
+static struct kmem_cache *test_cache;
+
+static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t flags,
+ void (*ctor)(void *))
+{
+ if (test->priv != TEST_PRIV_WANT_MEMCACHE)
+ return size;
+
+ kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor);
+
+ /*
+ * Use SLAB_NOLEAKTRACE to prevent merging with existing caches. Any
+ * other flag in SLAB_NEVER_MERGE also works. Use SLAB_ACCOUNT to
+ * allocate via memcg, if enabled.
+ */
+ flags |= SLAB_NOLEAKTRACE | SLAB_ACCOUNT;
+ test_cache = kmem_cache_create("test", size, 1, flags, ctor);
+ KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache");
+
+ return size;
+}
+
+static void test_cache_destroy(void)
+{
+ if (!test_cache)
+ return;
+
+ kmem_cache_destroy(test_cache);
+ test_cache = NULL;
+}
+
+static inline size_t kmalloc_cache_alignment(size_t size)
+{
+ return kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]->align;
+}
+
+/* Must always inline to match stack trace against caller. */
+static __always_inline void test_free(void *ptr)
+{
+ if (test_cache)
+ kmem_cache_free(test_cache, ptr);
+ else
+ kfree(ptr);
+}
+
+/*
+ * If this should be a KFENCE allocation, and on which side the allocation and
+ * the closest guard page should be.
+ */
+enum allocation_policy {
+ ALLOCATE_ANY, /* KFENCE, any side. */
+ ALLOCATE_LEFT, /* KFENCE, left side of page. */
+ ALLOCATE_RIGHT, /* KFENCE, right side of page. */
+ ALLOCATE_NONE, /* No KFENCE allocation. */
+};
+
+/*
+ * Try to get a guarded allocation from KFENCE. Uses either kmalloc() or the
+ * current test_cache if set up.
+ */
+static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocation_policy policy)
+{
+ void *alloc;
+ unsigned long timeout, resched_after;
+ const char *policy_name;
+
+ switch (policy) {
+ case ALLOCATE_ANY:
+ policy_name = "any";
+ break;
+ case ALLOCATE_LEFT:
+ policy_name = "left";
+ break;
+ case ALLOCATE_RIGHT:
+ policy_name = "right";
+ break;
+ case ALLOCATE_NONE:
+ policy_name = "none";
+ break;
+ }
+
+ kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp,
+ policy_name, !!test_cache);
+
+ /*
+ * 100x the sample interval should be more than enough to ensure we get
+ * a KFENCE allocation eventually.
+ */
+ timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+ /*
+ * Especially for non-preemption kernels, ensure the allocation-gate
+ * timer can catch up: after @resched_after, every failed allocation
+ * attempt yields, to ensure the allocation-gate timer is scheduled.
+ */
+ resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL);
+ do {
+ if (test_cache)
+ alloc = kmem_cache_alloc(test_cache, gfp);
+ else
+ alloc = kmalloc(size, gfp);
+
+ if (is_kfence_address(alloc)) {
+ struct page *page = virt_to_head_page(alloc);
+ struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)];
+
+ /*
+ * Verify that various helpers return the right values
+ * even for KFENCE objects; these are required so that
+ * memcg accounting works correctly.
+ */
+ KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U);
+ KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1);
+
+ if (policy == ALLOCATE_ANY)
+ return alloc;
+ if (policy == ALLOCATE_LEFT && IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+ return alloc;
+ if (policy == ALLOCATE_RIGHT &&
+ !IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+ return alloc;
+ } else if (policy == ALLOCATE_NONE)
+ return alloc;
+
+ test_free(alloc);
+
+ if (time_after(jiffies, resched_after))
+ cond_resched();
+ } while (time_before(jiffies, timeout));
+
+ KUNIT_ASSERT_TRUE_MSG(test, false, "failed to allocate from KFENCE");
+ return NULL; /* Unreachable. */
+}
+
+static void test_out_of_bounds_read(struct kunit *test)
+{
+ size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_OOB,
+ .fn = test_out_of_bounds_read,
+ .is_write = false,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+
+ /*
+ * If we don't have our own cache, adjust based on alignment, so that we
+ * actually access guard pages on either side.
+ */
+ if (!test_cache)
+ size = kmalloc_cache_alignment(size);
+
+ /* Test both sides. */
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+ expect.addr = buf - 1;
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ test_free(buf);
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+ expect.addr = buf + size;
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ test_free(buf);
+}
+
+static void test_out_of_bounds_write(struct kunit *test)
+{
+ size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_OOB,
+ .fn = test_out_of_bounds_write,
+ .is_write = true,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+ expect.addr = buf - 1;
+ WRITE_ONCE(*expect.addr, 42);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ test_free(buf);
+}
+
+static void test_use_after_free_read(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_use_after_free_read,
+ .is_write = false,
+ };
+
+ setup_test_cache(test, size, 0, NULL);
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ test_free(expect.addr);
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_double_free(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_INVALID_FREE,
+ .fn = test_double_free,
+ };
+
+ setup_test_cache(test, size, 0, NULL);
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ test_free(expect.addr);
+ test_free(expect.addr); /* Double-free. */
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_invalid_addr_free(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_INVALID_FREE,
+ .fn = test_invalid_addr_free,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ expect.addr = buf + 1; /* Free on invalid address. */
+ test_free(expect.addr); /* Invalid address free. */
+ test_free(buf); /* No error. */
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_corruption(struct kunit *test)
+{
+ size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_CORRUPTION,
+ .fn = test_corruption,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+
+ /* Test both sides. */
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+ expect.addr = buf + size;
+ WRITE_ONCE(*expect.addr, 42);
+ test_free(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+ expect.addr = buf - 1;
+ WRITE_ONCE(*expect.addr, 42);
+ test_free(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * KFENCE is unable to detect an OOB if the allocation's alignment requirements
+ * leave a gap between the object and the guard page. Specifically, an
+ * allocation of e.g. 73 bytes is aligned on 8 and 128 bytes for SLUB or SLAB
+ * respectively. Therefore it is impossible for the allocated object to
+ * contiguously line up with the right guard page.
+ *
+ * However, we test that an access to memory beyond the gap results in KFENCE
+ * detecting an OOB access.
+ */
+static void test_kmalloc_aligned_oob_read(struct kunit *test)
+{
+ const size_t size = 73;
+ const size_t align = kmalloc_cache_alignment(size);
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_OOB,
+ .fn = test_kmalloc_aligned_oob_read,
+ .is_write = false,
+ };
+ char *buf;
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+
+ /*
+ * The object is offset to the right, so there won't be an OOB to the
+ * left of it.
+ */
+ READ_ONCE(*(buf - 1));
+ KUNIT_EXPECT_FALSE(test, report_available());
+
+ /*
+ * @buf must be aligned on @align, therefore buf + size belongs to the
+ * same page -> no OOB.
+ */
+ READ_ONCE(*(buf + size));
+ KUNIT_EXPECT_FALSE(test, report_available());
+
+ /* Overflowing by @align bytes will result in an OOB. */
+ expect.addr = buf + size + align;
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+ test_free(buf);
+}
+
+static void test_kmalloc_aligned_oob_write(struct kunit *test)
+{
+ const size_t size = 73;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_CORRUPTION,
+ .fn = test_kmalloc_aligned_oob_write,
+ };
+ char *buf;
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+ /*
+ * The object is offset to the right, so we won't get a page
+ * fault immediately after it.
+ */
+ expect.addr = buf + size;
+ WRITE_ONCE(*expect.addr, READ_ONCE(*expect.addr) + 1);
+ KUNIT_EXPECT_FALSE(test, report_available());
+ test_free(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test cache shrinking and destroying with KFENCE. */
+static void test_shrink_memcache(struct kunit *test)
+{
+ const size_t size = 32;
+ void *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+ KUNIT_EXPECT_TRUE(test, test_cache);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ kmem_cache_shrink(test_cache);
+ test_free(buf);
+
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+static void ctor_set_x(void *obj)
+{
+ /* Every object has at least 8 bytes. */
+ memset(obj, 'x', 8);
+}
+
+/* Ensure that SL*B does not modify KFENCE objects on bulk free. */
+static void test_free_bulk(struct kunit *test)
+{
+ int iter;
+
+ for (iter = 0; iter < 5; iter++) {
+ const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0,
+ (iter & 1) ? ctor_set_x : NULL);
+ void *objects[] = {
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+ };
+
+ kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects);
+ KUNIT_ASSERT_FALSE(test, report_available());
+ test_cache_destroy();
+ }
+}
+
+/* Test init-on-free works. */
+static void test_init_on_free(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_init_on_free,
+ .is_write = false,
+ };
+ int i;
+
+ if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON))
+ return;
+ /* Assume it hasn't been disabled on command line. */
+
+ setup_test_cache(test, size, 0, NULL);
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ for (i = 0; i < size; i++)
+ expect.addr[i] = i + 1;
+ test_free(expect.addr);
+
+ for (i = 0; i < size; i++) {
+ /*
+ * This may fail if the page was recycled by KFENCE and then
+ * written to again -- this however, is near impossible with a
+ * default config.
+ */
+ KUNIT_EXPECT_EQ(test, expect.addr[i], (char)0);
+
+ if (!i) /* Only check first access to not fail test if page is ever re-protected. */
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ }
+}
+
+/* Ensure that constructors work properly. */
+static void test_memcache_ctor(struct kunit *test)
+{
+ const size_t size = 32;
+ char *buf;
+ int i;
+
+ setup_test_cache(test, size, 0, ctor_set_x);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+
+ for (i = 0; i < 8; i++)
+ KUNIT_EXPECT_EQ(test, buf[i], (char)'x');
+
+ test_free(buf);
+
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/* Test that memory is zeroed if requested. */
+static void test_gfpzero(struct kunit *test)
+{
+ const size_t size = PAGE_SIZE; /* PAGE_SIZE so we can use ALLOCATE_ANY. */
+ char *buf1, *buf2;
+ int i;
+
+ if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) {
+ kunit_warn(test, "skipping ... would take too long\n");
+ return;
+ }
+
+ setup_test_cache(test, size, 0, NULL);
+ buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ for (i = 0; i < size; i++)
+ buf1[i] = i + 1;
+ test_free(buf1);
+
+ /* Try to get same address again -- this can take a while. */
+ for (i = 0;; i++) {
+ buf2 = test_alloc(test, size, GFP_KERNEL | __GFP_ZERO, ALLOCATE_ANY);
+ if (buf1 == buf2)
+ break;
+ test_free(buf2);
+
+ if (i == CONFIG_KFENCE_NUM_OBJECTS) {
+ kunit_warn(test, "giving up ... cannot get same object back\n");
+ return;
+ }
+ }
+
+ for (i = 0; i < size; i++)
+ KUNIT_EXPECT_EQ(test, buf2[i], (char)0);
+
+ test_free(buf2);
+
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+static void test_invalid_access(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .type = KFENCE_ERROR_INVALID,
+ .fn = test_invalid_access,
+ .addr = &__kfence_pool[10],
+ .is_write = false,
+ };
+
+ READ_ONCE(__kfence_pool[10]);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test SLAB_TYPESAFE_BY_RCU works. */
+static void test_memcache_typesafe_by_rcu(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_memcache_typesafe_by_rcu,
+ .is_write = false,
+ };
+
+ setup_test_cache(test, size, SLAB_TYPESAFE_BY_RCU, NULL);
+ KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ *expect.addr = 42;
+
+ rcu_read_lock();
+ test_free(expect.addr);
+ KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+ /*
+ * Up to this point, memory should not have been freed yet, and
+ * therefore there should be no KFENCE report from the above access.
+ */
+ rcu_read_unlock();
+
+ /* Above access to @expect.addr should not have generated a report! */
+ KUNIT_EXPECT_FALSE(test, report_available());
+
+ /* Only after rcu_barrier() is the memory guaranteed to be freed. */
+ rcu_barrier();
+
+ /* Expect use-after-free. */
+ KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test krealloc(). */
+static void test_krealloc(struct kunit *test)
+{
+ const size_t size = 32;
+ const struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_krealloc,
+ .addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY),
+ .is_write = false,
+ };
+ char *buf = expect.addr;
+ int i;
+
+ KUNIT_EXPECT_FALSE(test, test_cache);
+ KUNIT_EXPECT_EQ(test, ksize(buf), size); /* Precise size match after KFENCE alloc. */
+ for (i = 0; i < size; i++)
+ buf[i] = i + 1;
+
+ /* Check that we successfully change the size. */
+ buf = krealloc(buf, size * 3, GFP_KERNEL); /* Grow. */
+ /* Note: Might no longer be a KFENCE alloc. */
+ KUNIT_EXPECT_GE(test, ksize(buf), size * 3);
+ for (i = 0; i < size; i++)
+ KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+ for (; i < size * 3; i++) /* Fill to extra bytes. */
+ buf[i] = i + 1;
+
+ buf = krealloc(buf, size * 2, GFP_KERNEL); /* Shrink. */
+ KUNIT_EXPECT_GE(test, ksize(buf), size * 2);
+ for (i = 0; i < size * 2; i++)
+ KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+
+ buf = krealloc(buf, 0, GFP_KERNEL); /* Free. */
+ KUNIT_EXPECT_EQ(test, (unsigned long)buf, (unsigned long)ZERO_SIZE_PTR);
+ KUNIT_ASSERT_FALSE(test, report_available()); /* No reports yet! */
+
+ READ_ONCE(*expect.addr); /* Ensure krealloc() actually freed earlier KFENCE object. */
+ KUNIT_ASSERT_TRUE(test, report_matches(&expect));
+}
+
+/* Test that some objects from a bulk allocation belong to KFENCE pool. */
+static void test_memcache_alloc_bulk(struct kunit *test)
+{
+ const size_t size = 32;
+ bool pass = false;
+ unsigned long timeout;
+
+ setup_test_cache(test, size, 0, NULL);
+ KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+ /*
+ * 100x the sample interval should be more than enough to ensure we get
+ * a KFENCE allocation eventually.
+ */
+ timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+ do {
+ void *objects[100];
+ int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
+ objects);
+ if (!num)
+ continue;
+ for (i = 0; i < ARRAY_SIZE(objects); i++) {
+ if (is_kfence_address(objects[i])) {
+ pass = true;
+ break;
+ }
+ }
+ kmem_cache_free_bulk(test_cache, num, objects);
+ /*
+ * kmem_cache_alloc_bulk() disables interrupts, and calling it
+ * in a tight loop may not give KFENCE a chance to switch the
+ * static branch. Call cond_resched() to let KFENCE chime in.
+ */
+ cond_resched();
+ } while (!pass && time_before(jiffies, timeout));
+
+ KUNIT_EXPECT_TRUE(test, pass);
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/*
+ * KUnit does not provide a way to provide arguments to tests, and we encode
+ * additional info in the name. Set up 2 tests per test case, one using the
+ * default allocator, and another using a custom memcache (suffix '-memcache').
+ */
+#define KFENCE_KUNIT_CASE(test_name) \
+ { .run_case = test_name, .name = #test_name }, \
+ { .run_case = test_name, .name = #test_name "-memcache" }
+
+static struct kunit_case kfence_test_cases[] = {
+ KFENCE_KUNIT_CASE(test_out_of_bounds_read),
+ KFENCE_KUNIT_CASE(test_out_of_bounds_write),
+ KFENCE_KUNIT_CASE(test_use_after_free_read),
+ KFENCE_KUNIT_CASE(test_double_free),
+ KFENCE_KUNIT_CASE(test_invalid_addr_free),
+ KFENCE_KUNIT_CASE(test_corruption),
+ KFENCE_KUNIT_CASE(test_free_bulk),
+ KFENCE_KUNIT_CASE(test_init_on_free),
+ KUNIT_CASE(test_kmalloc_aligned_oob_read),
+ KUNIT_CASE(test_kmalloc_aligned_oob_write),
+ KUNIT_CASE(test_shrink_memcache),
+ KUNIT_CASE(test_memcache_ctor),
+ KUNIT_CASE(test_invalid_access),
+ KUNIT_CASE(test_gfpzero),
+ KUNIT_CASE(test_memcache_typesafe_by_rcu),
+ KUNIT_CASE(test_krealloc),
+ KUNIT_CASE(test_memcache_alloc_bulk),
+ {},
+};
+
+/* ===== End test cases ===== */
+
+static int test_init(struct kunit *test)
+{
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ for (i = 0; i < ARRAY_SIZE(observed.lines); i++)
+ observed.lines[i][0] = '\0';
+ observed.nlines = 0;
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ /* Any test with 'memcache' in its name will want a memcache. */
+ if (strstr(test->name, "memcache"))
+ test->priv = TEST_PRIV_WANT_MEMCACHE;
+ else
+ test->priv = NULL;
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+ test_cache_destroy();
+}
+
+static struct kunit_suite kfence_test_suite = {
+ .name = "kfence",
+ .test_cases = kfence_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL };
+
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ check_trace_callback_type_console(probe_console);
+ if (!strcmp(tp->name, "console"))
+ WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ if (!strcmp(tp->name, "console"))
+ tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+/*
+ * We only want to do tracepoints setup and teardown once, therefore we have to
+ * customize the init and exit functions and cannot rely on kunit_test_suite().
+ */
+static int __init kfence_test_init(void)
+{
+ /*
+ * Because we want to be able to build the test as a module, we need to
+ * iterate through all known tracepoints, since the static registration
+ * won't work here.
+ */
+ for_each_kernel_tracepoint(register_tracepoints, NULL);
+ return __kunit_test_suites_init(kfence_test_suites);
+}
+
+static void kfence_test_exit(void)
+{
+ __kunit_test_suites_exit(kfence_test_suites);
+ for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ tracepoint_synchronize_unregister();
+}
+
+late_initcall(kfence_test_init);
+module_exit(kfence_test_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Alexander Potapenko <glider(a)google.com>, Marco Elver <elver(a)google.com>");
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 4dbfa9a382e4..901bd7ee83d8 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -156,7 +156,12 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
pr_cont(" ]");
}
-void kfence_report_error(unsigned long address, struct pt_regs *regs,
+static const char *get_access_type(bool is_write)
+{
+ return is_write ? "write" : "read";
+}
+
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
const struct kfence_metadata *meta, enum kfence_error_type type)
{
unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
@@ -194,17 +199,19 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs,
case KFENCE_ERROR_OOB: {
const bool left_of_object = address < meta->addr;
- pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]);
- pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n",
- (void *)address,
+ pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write),
+ (void *)stack_entries[skipnr]);
+ pr_err("Out-of-bounds %s at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n",
+ get_access_type(is_write), (void *)address,
left_of_object ? meta->addr - address : address - meta->addr,
left_of_object ? "left" : "right", object_index);
break;
}
case KFENCE_ERROR_UAF:
- pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]);
- pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n",
- (void *)address, object_index);
+ pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write),
+ (void *)stack_entries[skipnr]);
+ pr_err("Use-after-free %s at 0x" PTR_FMT " (in kfence-#%zd):\n",
+ get_access_type(is_write), (void *)address, object_index);
break;
case KFENCE_ERROR_CORRUPTION:
pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
@@ -213,8 +220,10 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs,
pr_cont(" (in kfence-#%zd):\n", object_index);
break;
case KFENCE_ERROR_INVALID:
- pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]);
- pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address);
+ pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write),
+ (void *)stack_entries[skipnr]);
+ pr_err("Invalid %s at 0x" PTR_FMT ":\n", get_access_type(is_write),
+ (void *)address);
break;
case KFENCE_ERROR_INVALID_FREE:
pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
--
2.25.1
1
0

[PATCH OLK-5.10 8/9] kfence, Documentation: add KFENCE documentation
by 1415317271@qq.com 23 Aug '21
by 1415317271@qq.com 23 Aug '21
23 Aug '21
From: Marco Elver <elver(a)google.com>
Add KFENCE documentation in dev-tools/kfence.rst, and add to index.
[elver(a)google.com: add missing copyright header to documentation]
Link: https://lkml.kernel.org/r/20210118092159.145934-4-elver@google.com
Link: https://lkml.kernel.org/r/20201103175841.3495947-8-elver@google.com
Signed-off-by: Alexander Potapenko <glider(a)google.com>
Signed-off-by: Marco Elver <elver(a)google.com>
Reviewed-by: Dmitry Vyukov <dvyukov(a)google.com>
Co-developed-by: Alexander Potapenko <glider(a)google.com>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andrey Konovalov <andreyknvl(a)google.com>
Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christopher Lameter <cl(a)linux.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Joern Engel <joern(a)purestorage.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Paul E. McKenney <paulmck(a)kernel.org>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: SeongJae Park <sjpark(a)amazon.de>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Will Deacon <will(a)kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Yingjie Shang <1415317271(a)qq.com>
Reviewed-by: Bixuan Cui <cuibixuan(a)huawei.com>
---
Documentation/dev-tools/index.rst | 1 +
Documentation/dev-tools/kfence.rst | 298 +++++++++++++++++++++++++++++
lib/Kconfig.kfence | 2 +
3 files changed, 301 insertions(+)
create mode 100644 Documentation/dev-tools/kfence.rst
diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst
index f7809c7b1ba9..1b1cf4f5c9d9 100644
--- a/Documentation/dev-tools/index.rst
+++ b/Documentation/dev-tools/index.rst
@@ -22,6 +22,7 @@ whole; patches welcome!
ubsan
kmemleak
kcsan
+ kfence
gdb-kernel-debugging
kgdb
kselftest
diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
new file mode 100644
index 000000000000..0e2fb6ef3016
--- /dev/null
+++ b/Documentation/dev-tools/kfence.rst
@@ -0,0 +1,298 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright (C) 2020, Google LLC.
+
+Kernel Electric-Fence (KFENCE)
+==============================
+
+Kernel Electric-Fence (KFENCE) is a low-overhead sampling-based memory safety
+error detector. KFENCE detects heap out-of-bounds access, use-after-free, and
+invalid-free errors.
+
+KFENCE is designed to be enabled in production kernels, and has near zero
+performance overhead. Compared to KASAN, KFENCE trades performance for
+precision. The main motivation behind KFENCE's design, is that with enough
+total uptime KFENCE will detect bugs in code paths not typically exercised by
+non-production test workloads. One way to quickly achieve a large enough total
+uptime is when the tool is deployed across a large fleet of machines.
+
+Usage
+-----
+
+To enable KFENCE, configure the kernel with::
+
+ CONFIG_KFENCE=y
+
+To build a kernel with KFENCE support, but disabled by default (to enable, set
+``kfence.sample_interval`` to non-zero value), configure the kernel with::
+
+ CONFIG_KFENCE=y
+ CONFIG_KFENCE_SAMPLE_INTERVAL=0
+
+KFENCE provides several other configuration options to customize behaviour (see
+the respective help text in ``lib/Kconfig.kfence`` for more info).
+
+Tuning performance
+~~~~~~~~~~~~~~~~~~
+
+The most important parameter is KFENCE's sample interval, which can be set via
+the kernel boot parameter ``kfence.sample_interval`` in milliseconds. The
+sample interval determines the frequency with which heap allocations will be
+guarded by KFENCE. The default is configurable via the Kconfig option
+``CONFIG_KFENCE_SAMPLE_INTERVAL``. Setting ``kfence.sample_interval=0``
+disables KFENCE.
+
+The KFENCE memory pool is of fixed size, and if the pool is exhausted, no
+further KFENCE allocations occur. With ``CONFIG_KFENCE_NUM_OBJECTS`` (default
+255), the number of available guarded objects can be controlled. Each object
+requires 2 pages, one for the object itself and the other one used as a guard
+page; object pages are interleaved with guard pages, and every object page is
+therefore surrounded by two guard pages.
+
+The total memory dedicated to the KFENCE memory pool can be computed as::
+
+ ( #objects + 1 ) * 2 * PAGE_SIZE
+
+Using the default config, and assuming a page size of 4 KiB, results in
+dedicating 2 MiB to the KFENCE memory pool.
+
+Note: On architectures that support huge pages, KFENCE will ensure that the
+pool is using pages of size ``PAGE_SIZE``. This will result in additional page
+tables being allocated.
+
+Error reports
+~~~~~~~~~~~~~
+
+A typical out-of-bounds access looks like this::
+
+ ==================================================================
+ BUG: KFENCE: out-of-bounds in test_out_of_bounds_read+0xa3/0x22b
+
+ Out-of-bounds access at 0xffffffffb672efff (1B left of kfence-#17):
+ test_out_of_bounds_read+0xa3/0x22b
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ kfence-#17 [0xffffffffb672f000-0xffffffffb672f01f, size=32, cache=kmalloc-32] allocated by task 507:
+ test_alloc+0xf3/0x25b
+ test_out_of_bounds_read+0x98/0x22b
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ CPU: 4 PID: 107 Comm: kunit_try_catch Not tainted 5.8.0-rc6+ #7
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+ ==================================================================
+
+The header of the report provides a short summary of the function involved in
+the access. It is followed by more detailed information about the access and
+its origin. Note that, real kernel addresses are only shown for
+``CONFIG_DEBUG_KERNEL=y`` builds.
+
+Use-after-free accesses are reported as::
+
+ ==================================================================
+ BUG: KFENCE: use-after-free in test_use_after_free_read+0xb3/0x143
+
+ Use-after-free access at 0xffffffffb673dfe0 (in kfence-#24):
+ test_use_after_free_read+0xb3/0x143
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ kfence-#24 [0xffffffffb673dfe0-0xffffffffb673dfff, size=32, cache=kmalloc-32] allocated by task 507:
+ test_alloc+0xf3/0x25b
+ test_use_after_free_read+0x76/0x143
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ freed by task 507:
+ test_use_after_free_read+0xa8/0x143
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ CPU: 4 PID: 109 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+ ==================================================================
+
+KFENCE also reports on invalid frees, such as double-frees::
+
+ ==================================================================
+ BUG: KFENCE: invalid free in test_double_free+0xdc/0x171
+
+ Invalid free of 0xffffffffb6741000:
+ test_double_free+0xdc/0x171
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ kfence-#26 [0xffffffffb6741000-0xffffffffb674101f, size=32, cache=kmalloc-32] allocated by task 507:
+ test_alloc+0xf3/0x25b
+ test_double_free+0x76/0x171
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ freed by task 507:
+ test_double_free+0xa8/0x171
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ CPU: 4 PID: 111 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+ ==================================================================
+
+KFENCE also uses pattern-based redzones on the other side of an object's guard
+page, to detect out-of-bounds writes on the unprotected side of the object.
+These are reported on frees::
+
+ ==================================================================
+ BUG: KFENCE: memory corruption in test_kmalloc_aligned_oob_write+0xef/0x184
+
+ Corrupted memory at 0xffffffffb6797ff9 [ 0xac . . . . . . ] (in kfence-#69):
+ test_kmalloc_aligned_oob_write+0xef/0x184
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ kfence-#69 [0xffffffffb6797fb0-0xffffffffb6797ff8, size=73, cache=kmalloc-96] allocated by task 507:
+ test_alloc+0xf3/0x25b
+ test_kmalloc_aligned_oob_write+0x57/0x184
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ CPU: 4 PID: 120 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+ ==================================================================
+
+For such errors, the address where the corruption occurred as well as the
+invalidly written bytes (offset from the address) are shown; in this
+representation, '.' denote untouched bytes. In the example above ``0xac`` is
+the value written to the invalid address at offset 0, and the remaining '.'
+denote that no following bytes have been touched. Note that, real values are
+only shown for ``CONFIG_DEBUG_KERNEL=y`` builds; to avoid information
+disclosure for non-debug builds, '!' is used instead to denote invalidly
+written bytes.
+
+And finally, KFENCE may also report on invalid accesses to any protected page
+where it was not possible to determine an associated object, e.g. if adjacent
+object pages had not yet been allocated::
+
+ ==================================================================
+ BUG: KFENCE: invalid access in test_invalid_access+0x26/0xe0
+
+ Invalid access at 0xffffffffb670b00a:
+ test_invalid_access+0x26/0xe0
+ kunit_try_run_case+0x51/0x85
+ kunit_generic_run_threadfn_adapter+0x16/0x30
+ kthread+0x137/0x160
+ ret_from_fork+0x22/0x30
+
+ CPU: 4 PID: 124 Comm: kunit_try_catch Tainted: G W 5.8.0-rc6+ #7
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
+ ==================================================================
+
+DebugFS interface
+~~~~~~~~~~~~~~~~~
+
+Some debugging information is exposed via debugfs:
+
+* The file ``/sys/kernel/debug/kfence/stats`` provides runtime statistics.
+
+* The file ``/sys/kernel/debug/kfence/objects`` provides a list of objects
+ allocated via KFENCE, including those already freed but protected.
+
+Implementation Details
+----------------------
+
+Guarded allocations are set up based on the sample interval. After expiration
+of the sample interval, the next allocation through the main allocator (SLAB or
+SLUB) returns a guarded allocation from the KFENCE object pool (allocation
+sizes up to PAGE_SIZE are supported). At this point, the timer is reset, and
+the next allocation is set up after the expiration of the interval. To "gate" a
+KFENCE allocation through the main allocator's fast-path without overhead,
+KFENCE relies on static branches via the static keys infrastructure. The static
+branch is toggled to redirect the allocation to KFENCE.
+
+KFENCE objects each reside on a dedicated page, at either the left or right
+page boundaries selected at random. The pages to the left and right of the
+object page are "guard pages", whose attributes are changed to a protected
+state, and cause page faults on any attempted access. Such page faults are then
+intercepted by KFENCE, which handles the fault gracefully by reporting an
+out-of-bounds access, and marking the page as accessible so that the faulting
+code can (wrongly) continue executing (set ``panic_on_warn`` to panic instead).
+
+To detect out-of-bounds writes to memory within the object's page itself,
+KFENCE also uses pattern-based redzones. For each object page, a redzone is set
+up for all non-object memory. For typical alignments, the redzone is only
+required on the unguarded side of an object. Because KFENCE must honor the
+cache's requested alignment, special alignments may result in unprotected gaps
+on either side of an object, all of which are redzoned.
+
+The following figure illustrates the page layout::
+
+ ---+-----------+-----------+-----------+-----------+-----------+---
+ | xxxxxxxxx | O : | xxxxxxxxx | : O | xxxxxxxxx |
+ | xxxxxxxxx | B : | xxxxxxxxx | : B | xxxxxxxxx |
+ | x GUARD x | J : RED- | x GUARD x | RED- : J | x GUARD x |
+ | xxxxxxxxx | E : ZONE | xxxxxxxxx | ZONE : E | xxxxxxxxx |
+ | xxxxxxxxx | C : | xxxxxxxxx | : C | xxxxxxxxx |
+ | xxxxxxxxx | T : | xxxxxxxxx | : T | xxxxxxxxx |
+ ---+-----------+-----------+-----------+-----------+-----------+---
+
+Upon deallocation of a KFENCE object, the object's page is again protected and
+the object is marked as freed. Any further access to the object causes a fault
+and KFENCE reports a use-after-free access. Freed objects are inserted at the
+tail of KFENCE's freelist, so that the least recently freed objects are reused
+first, and the chances of detecting use-after-frees of recently freed objects
+is increased.
+
+Interface
+---------
+
+The following describes the functions which are used by allocators as well as
+page handling code to set up and deal with KFENCE allocations.
+
+.. kernel-doc:: include/linux/kfence.h
+ :functions: is_kfence_address
+ kfence_shutdown_cache
+ kfence_alloc kfence_free __kfence_free
+ kfence_ksize kfence_object_start
+ kfence_handle_page_fault
+
+Related Tools
+-------------
+
+In userspace, a similar approach is taken by `GWP-ASan
+<http://llvm.org/docs/GwpAsan.html>`_. GWP-ASan also relies on guard pages and
+a sampling strategy to detect memory unsafety bugs at scale. KFENCE's design is
+directly influenced by GWP-ASan, and can be seen as its kernel sibling. Another
+similar but non-sampling approach, that also inspired the name "KFENCE", can be
+found in the userspace `Electric Fence Malloc Debugger
+<https://linux.die.net/man/3/efence>`_.
+
+In the kernel, several tools exist to debug memory access errors, and in
+particular KASAN can detect all bug classes that KFENCE can detect. While KASAN
+is more precise, relying on compiler instrumentation, this comes at a
+performance cost.
+
+It is worth highlighting that KASAN and KFENCE are complementary, with
+different target environments. For instance, KASAN is the better debugging-aid,
+where test cases or reproducers exists: due to the lower chance to detect the
+error, it would require more effort using KFENCE to debug. Deployments at scale
+that cannot afford to enable KASAN, however, would benefit from using KFENCE to
+discover bugs due to code paths not exercised by test cases or fuzzers.
diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence
index edfecb5d6165..605125ac2ae0 100644
--- a/lib/Kconfig.kfence
+++ b/lib/Kconfig.kfence
@@ -13,6 +13,8 @@ menuconfig KFENCE
to have negligible cost to permit enabling it in production
environments.
+ See <file:Documentation/dev-tools/kfence.rst> for more details.
+
Note that, KFENCE is not a substitute for explicit testing with tools
such as KASAN. KFENCE can detect a subset of bugs that KASAN can
detect, albeit at very different performance profiles. If you can
--
2.25.1
1
0

[PATCH OLK-5.10 7/9] kfence, kasan: make KFENCE compatible with KASAN
by 1415317271@qq.com 23 Aug '21
by 1415317271@qq.com 23 Aug '21
23 Aug '21
From: Alexander Potapenko <glider(a)google.com>
Make KFENCE compatible with KASAN. Currently this helps test KFENCE
itself, where KASAN can catch potential corruptions to KFENCE state, or
other corruptions that may be a result of freepointer corruptions in the
main allocators.
[akpm(a)linux-foundation.org: merge fixup]
[andreyknvl(a)google.com: untag addresses for KFENCE]
Link: https://lkml.kernel.org/r/9dc196006921b191d25d10f6e611316db7da2efc.16119461…
Link: https://lkml.kernel.org/r/20201103175841.3495947-7-elver@google.com
Signed-off-by: Marco Elver <elver(a)google.com>
Signed-off-by: Alexander Potapenko <glider(a)google.com>
Signed-off-by: Andrey Konovalov <andreyknvl(a)google.com>
Reviewed-by: Dmitry Vyukov <dvyukov(a)google.com>
Reviewed-by: Jann Horn <jannh(a)google.com>
Co-developed-by: Marco Elver <elver(a)google.com>
Cc: Andrey Konovalov <andreyknvl(a)google.com>
Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christopher Lameter <cl(a)linux.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Joern Engel <joern(a)purestorage.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Paul E. McKenney <paulmck(a)kernel.org>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: SeongJae Park <sjpark(a)amazon.de>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Will Deacon <will(a)kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Yingjie Shang <1415317271(a)qq.com>
Reviewed-by: Bixuan Cui <cuibixuan(a)huawei.com>
---
lib/Kconfig.kfence | 2 +-
mm/kasan/common.c | 6 ++++++
mm/kasan/generic.c | 3 ++-
3 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence
index b88ac9d6b2e6..edfecb5d6165 100644
--- a/lib/Kconfig.kfence
+++ b/lib/Kconfig.kfence
@@ -5,7 +5,7 @@ config HAVE_ARCH_KFENCE
menuconfig KFENCE
bool "KFENCE: low-overhead sampling-based memory safety error detector"
- depends on HAVE_ARCH_KFENCE && !KASAN && (SLAB || SLUB)
+ depends on HAVE_ARCH_KFENCE && (SLAB || SLUB)
select STACKTRACE
help
KFENCE is a low-overhead sampling-based detector of heap out-of-bounds
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 950fd372a07e..d9a25af29354 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -396,6 +396,9 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
tagged_object = object;
object = reset_tag(object);
+ if (is_kfence_address(object))
+ return false;
+
if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
object)) {
kasan_report_invalid_free(tagged_object, ip);
@@ -444,6 +447,9 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
if (unlikely(object == NULL))
return NULL;
+ if (is_kfence_address(kasan_reset_tag(object)))
+ return (void *)object;
+
redzone_start = round_up((unsigned long)(object + size),
KASAN_SHADOW_SCALE_SIZE);
redzone_end = round_up((unsigned long)object + cache->object_size,
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 2efc48444e77..c4c56ec8a472 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
+#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/linkage.h>
#include <linux/memblock.h>
@@ -332,7 +333,7 @@ void kasan_record_aux_stack(void *addr)
struct kasan_alloc_meta *alloc_info;
void *object;
- if (!(page && PageSlab(page)))
+ if (is_kfence_address(addr) || !(page && PageSlab(page)))
return;
cache = page->slab_cache;
--
2.25.1
1
0

23 Aug '21
From: Alexander Potapenko <glider(a)google.com>
Inserts KFENCE hooks into the SLUB allocator.
To pass the originally requested size to KFENCE, add an argument
'orig_size' to slab_alloc*(). The additional argument is required to
preserve the requested original size for kmalloc() allocations, which
uses size classes (e.g. an allocation of 272 bytes will return an object
of size 512). Therefore, kmem_cache::size does not represent the
kmalloc-caller's requested size, and we must introduce the argument
'orig_size' to propagate the originally requested size to KFENCE.
Without the originally requested size, we would not be able to detect
out-of-bounds accesses for objects placed at the end of a KFENCE object
page if that object is not equal to the kmalloc-size class it was
bucketed into.
When KFENCE is disabled, there is no additional overhead, since
slab_alloc*() functions are __always_inline.
Link: https://lkml.kernel.org/r/20201103175841.3495947-6-elver@google.com
Signed-off-by: Marco Elver <elver(a)google.com>
Signed-off-by: Alexander Potapenko <glider(a)google.com>
Reviewed-by: Dmitry Vyukov <dvyukov(a)google.com>
Reviewed-by: Jann Horn <jannh(a)google.com>
Co-developed-by: Marco Elver <elver(a)google.com>
Cc: Andrey Konovalov <andreyknvl(a)google.com>
Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christopher Lameter <cl(a)linux.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Joern Engel <joern(a)purestorage.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Paul E. McKenney <paulmck(a)kernel.org>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: SeongJae Park <sjpark(a)amazon.de>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Will Deacon <will(a)kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Yingjie Shang <1415317271(a)qq.com>
Reviewed-by: Bixuan Cui <cuibixuan(a)huawei.com>
---
include/linux/slub_def.h | 3 ++
mm/kfence/core.c | 2 ++
mm/slub.c | 61 ++++++++++++++++++++++++++++++----------
3 files changed, 51 insertions(+), 15 deletions(-)
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 1be0ed5befa1..dcde82a4434c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -7,6 +7,7 @@
*
* (C) 2007 SGI, Christoph Lameter
*/
+#include <linux/kfence.h>
#include <linux/kobject.h>
#include <linux/reciprocal_div.h>
@@ -185,6 +186,8 @@ static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
const struct page *page, void *obj)
{
+ if (is_kfence_address(obj))
+ return 0;
return __obj_to_index(cache, page_address(page), obj);
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 05c18aa11851..7692af715fdb 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -317,6 +317,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
/* Set required struct page fields. */
page = virt_to_page(meta->addr);
page->slab_cache = cache;
+ if (IS_ENABLED(CONFIG_SLUB))
+ page->objects = 1;
if (IS_ENABLED(CONFIG_SLAB))
page->s_mem = addr;
diff --git a/mm/slub.c b/mm/slub.c
index f06f002bb098..678c29105d7f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,7 @@
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
+#include <linux/kfence.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
@@ -1560,6 +1561,11 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
void *old_tail = *tail ? *tail : *head;
int rsize;
+ if (is_kfence_address(next)) {
+ slab_free_hook(s, next);
+ return true;
+ }
+
/* Head and tail of the reconstructed freelist */
*head = NULL;
*tail = NULL;
@@ -2801,7 +2807,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
* Otherwise we can simply pick the next object from the lockless free list.
*/
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
- gfp_t gfpflags, int node, unsigned long addr)
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
void *object;
struct kmem_cache_cpu *c;
@@ -2812,6 +2818,11 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
if (!s)
return NULL;
+
+ object = kfence_alloc(s, orig_size, gfpflags);
+ if (unlikely(object))
+ goto out;
+
redo:
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -2883,21 +2894,21 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(object, 0, s->object_size);
-
+out:
slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
return object;
}
static __always_inline void *slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, unsigned long addr)
+ gfp_t gfpflags, unsigned long addr, size_t orig_size)
{
- return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+ return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
}
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);
trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
s->size, gfpflags);
@@ -2909,7 +2920,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_, size);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
@@ -2920,7 +2931,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
s->object_size, s->size, gfpflags, node);
@@ -2934,7 +2945,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
int node, size_t size)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size);
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
@@ -2968,6 +2979,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
+ if (kfence_free(head))
+ return;
+
if (kmem_cache_debug(s) &&
!free_debug_processing(s, page, head, tail, cnt, addr))
return;
@@ -3212,6 +3226,13 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
df->s = cache_from_obj(s, object); /* Support for memcg */
}
+ if (is_kfence_address(object)) {
+ slab_free_hook(df->s, object);
+ __kfence_free(object);
+ p[size] = NULL; /* mark object processed */
+ return size;
+ }
+
/* Start new detached freelist */
df->page = page;
set_freepointer(df->s, object, NULL);
@@ -3287,8 +3308,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
c = this_cpu_ptr(s->cpu_slab);
for (i = 0; i < size; i++) {
- void *object = c->freelist;
+ void *object = kfence_alloc(s, s->object_size, flags);
+ if (unlikely(object)) {
+ p[i] = object;
+ continue;
+ }
+
+ object = c->freelist;
if (unlikely(!object)) {
/*
* We may have removed an object from c->freelist using
@@ -3953,7 +3980,7 @@ void *__kmalloc(size_t size, gfp_t flags)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, _RET_IP_);
+ ret = slab_alloc(s, flags, _RET_IP_, size);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
@@ -4001,7 +4028,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc_node(s, flags, node, _RET_IP_);
+ ret = slab_alloc_node(s, flags, node, _RET_IP_, size);
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
@@ -4027,6 +4054,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
struct kmem_cache *s;
unsigned int offset;
size_t object_size;
+ bool is_kfence = is_kfence_address(ptr);
ptr = kasan_reset_tag(ptr);
@@ -4039,10 +4067,13 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
to_user, 0, n);
/* Find offset within object. */
- offset = (ptr - page_address(page)) % s->size;
+ if (is_kfence)
+ offset = ptr - kfence_object_start(ptr);
+ else
+ offset = (ptr - page_address(page)) % s->size;
/* Adjust for redzone and reject if within the redzone. */
- if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
+ if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
if (offset < s->red_left_pad)
usercopy_abort("SLUB object in left red zone",
s->name, to_user, offset, n);
@@ -4459,7 +4490,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, caller);
+ ret = slab_alloc(s, gfpflags, caller, size);
/* Honor the call site pointer we received. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4490,7 +4521,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc_node(s, gfpflags, node, caller);
+ ret = slab_alloc_node(s, gfpflags, node, caller, size);
/* Honor the call site pointer we received. */
trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
--
2.25.1
1
0