From: YueHaibing <yuehaibing@huawei.com>

mainline inclusion
from mainline-v4.20-rc1
commit efbaec89c642cd1d4977fc6df9923697e1598d4e
category: other
bugzilla: 43460
CVE: NA

---------------------------------------

Remove duplicated include.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/syscall.c | 1 -
 1 file changed, 1 deletion(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb473159a55c..3e9ff9897c7c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -30,7 +30,6 @@
 #include <linux/cred.h>
 #include <linux/timekeeping.h>
 #include <linux/ctype.h>
-#include <linux/btf.h>
 #include <linux/nospec.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
From: Edward Cree <ecree@solarflare.com>

mainline inclusion
from mainline-v4.20-rc1
commit 679c782de14bd48c19dd74cd1af20a2bc05dd936
category: feature
bugzilla: 43460
CVE: NA

---------------------------------------

By giving each register its own liveness chain, we elide the skip_callee()
logic. Instead, each register's parent is the state it inherits from; both
check_func_call() and prepare_func_exit() automatically connect reg states
to the correct chain since when they copy the reg state across (r1-r5 into
the callee as args, and r0 out as the return value) they also copy the
parent pointer.
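For illustration only (a minimal sketch with simplified, made-up types; the
real change is in the diff below), the propagation this enables reduces to a
single walk up a per-register chain:

	enum { REG_LIVE_NONE = 0, REG_LIVE_READ = 1, REG_LIVE_WRITTEN = 2 };

	struct reg {
		struct reg *parent;	/* parentage chain for liveness */
		int live;
	};

	static void mark_read(const struct reg *state, struct reg *parent)
	{
		int writes = parent == state->parent; /* observe write marks */

		while (parent) {
			/* a read screened by an earlier write stops here */
			if (writes && (state->live & REG_LIVE_WRITTEN))
				break;
			/* otherwise we depend on the parent's value */
			parent->live |= REG_LIVE_READ;
			state = parent;
			parent = state->parent;
			writes = 1;
		}
	}

No frame-number bookkeeping is needed: whoever copied the register also
copied its parent pointer, so the chain already points at the right owner.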
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Conflicts:
	kernel/bpf/verifier.c
[liuxin: solve the conflicts in verifier.c]

Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/bpf_verifier.h |   8 +-
 kernel/bpf/verifier.c        | 183 +++++++++--------------------
 2 files changed, 47 insertions(+), 144 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 91393724e933..42dc8c2146f4 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -41,6 +41,7 @@ enum bpf_reg_liveness {
 };
 
 struct bpf_reg_state {
+	/* Ordering of fields matters. See states_equal() */
 	enum bpf_reg_type type;
 	union {
 		/* valid when type == PTR_TO_PACKET */
@@ -62,7 +63,6 @@ struct bpf_reg_state {
 	 * came from, when one is tested for != NULL.
 	 */
 	u32 id;
-	/* Ordering of fields matters. See states_equal() */
 	/* For scalar types (SCALAR_VALUE), this represents our knowledge of
 	 * the actual value.
 	 * For pointer types, this represents the variable part of the offset
@@ -79,15 +79,15 @@ struct bpf_reg_state {
 	s64 smax_value; /* maximum possible (s64)value */
 	u64 umin_value; /* minimum possible (u64)value */
 	u64 umax_value; /* maximum possible (u64)value */
+	/* parentage chain for liveness checking */
+	struct bpf_reg_state *parent;
 	/* Inside the callee two registers can be both PTR_TO_STACK like
 	 * R1=fp-8 and R2=fp-8, but one of them points to this function stack
 	 * while another to the caller's stack. To differentiate them 'frameno'
 	 * is used which is an index in bpf_verifier_state->frame[] array
 	 * pointing to bpf_func_state.
-	 * This field must be second to last, for states_equal() reasons.
 	 */
 	u32 frameno;
-	/* This field must be last, for states_equal() reasons. */
 	enum bpf_reg_liveness live;
 };
 
@@ -110,7 +110,6 @@ struct bpf_stack_state {
  */
 struct bpf_func_state {
 	struct bpf_reg_state regs[MAX_BPF_REG];
-	struct bpf_verifier_state *parent;
 	/* index of call instruction that called into this func */
 	int callsite;
 	/* stack frame number of this function state from pov of
@@ -132,7 +131,6 @@ struct bpf_func_state {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
-	struct bpf_verifier_state *parent;
 	u32 curframe;
 	bool speculative;
 };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index daf0a9637d73..2fa3e269fe1d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -380,9 +380,9 @@ static int copy_stack_state(struct bpf_func_state *dst,
 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
  * make it consume minimal amount of memory. check_stack_write() access from
  * the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which this function copies over. It points to previous bpf_verifier_state
- * which is never reallocated
+ * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state
+ * which this function copies over. It points to corresponding reg in previous
+ * bpf_verifier_state which is never reallocated
  */
 static int realloc_func_state(struct bpf_func_state *state, int size,
 			      bool copy_old)
@@ -467,7 +467,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	}
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
-	dst_state->parent = src->parent;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -739,6 +738,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
 	for (i = 0; i < MAX_BPF_REG; i++) {
 		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
+		regs[i].parent = NULL;
 	}
 
 	/* frame pointer */
@@ -883,74 +883,21 @@ static int check_subprogs(struct bpf_verifier_env *env)
 	return 0;
 }
 
-static
-struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
-				       const struct bpf_verifier_state *state,
-				       struct bpf_verifier_state *parent,
-				       u32 regno)
-{
-	struct bpf_verifier_state *tmp = NULL;
-
-	/* 'parent' could be a state of caller and
-	 * 'state' could be a state of callee. In such case
-	 * parent->curframe < state->curframe
-	 * and it's ok for r1 - r5 registers
-	 *
-	 * 'parent' could be a callee's state after it bpf_exit-ed.
-	 * In such case parent->curframe > state->curframe
-	 * and it's ok for r0 only
-	 */
-	if (parent->curframe == state->curframe ||
-	    (parent->curframe < state->curframe &&
-	     regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
-	    (parent->curframe > state->curframe &&
-	     regno == BPF_REG_0))
-		return parent;
-
-	if (parent->curframe > state->curframe &&
-	    regno >= BPF_REG_6) {
-		/* for callee saved regs we have to skip the whole chain
-		 * of states that belong to callee and mark as LIVE_READ
-		 * the registers before the call
-		 */
-		tmp = parent;
-		while (tmp && tmp->curframe != state->curframe) {
-			tmp = tmp->parent;
-		}
-		if (!tmp)
-			goto bug;
-		parent = tmp;
-	} else {
-		goto bug;
-	}
-	return parent;
-bug:
-	verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
-	verbose(env, "regno %d parent frame %d current frame %d\n",
-		regno, parent->curframe, state->curframe);
-	return NULL;
-}
-
+/* Parentage chain of this register (or stack slot) should take care of all
+ * issues like callee-saved registers, stack slot allocation time, etc.
+ */
 static int mark_reg_read(struct bpf_verifier_env *env,
-			 const struct bpf_verifier_state *state,
-			 struct bpf_verifier_state *parent,
-			 u32 regno)
+			 const struct bpf_reg_state *state,
+			 struct bpf_reg_state *parent)
 {
 	bool writes = parent == state->parent; /* Observe write marks */
 
-	if (regno == BPF_REG_FP)
-		/* We don't need to worry about FP liveness because it's read-only */
-		return 0;
-
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
-		if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
+		if (writes && state->live & REG_LIVE_WRITTEN)
 			break;
-		parent = skip_callee(env, state, parent, regno);
-		if (!parent)
-			return -EFAULT;
 		/* ... then we depend on parent's value */
-		parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
+		parent->live |= REG_LIVE_READ;
 		state = parent;
 		parent = state->parent;
 		writes = true;
@@ -976,7 +923,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "R%d !read_ok\n", regno);
 			return -EACCES;
 		}
-		return mark_reg_read(env, vstate, vstate->parent, regno);
+		/* We don't need to worry about FP liveness because it's read-only */
+		if (regno != BPF_REG_FP)
+			return mark_reg_read(env, &regs[regno],
+					     regs[regno].parent);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
@@ -1087,8 +1037,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
 	} else {
 		u8 type = STACK_MISC;
 
-		/* regular write of data into stack */
-		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
+		/* regular write of data into stack destroys any spilled ptr */
+		state->stack[spi].spilled_ptr.type = NOT_INIT;
 
 		/* only mark the slot as written if all 8 bytes were written
 		 * otherwise read propagation may incorrectly stop too soon
@@ -1113,61 +1063,6 @@ static int check_stack_write(struct bpf_verifier_env *env,
 	return 0;
 }
 
-/* registers of every function are unique and mark_reg_read() propagates
- * the liveness in the following cases:
- * - from callee into caller for R1 - R5 that were used as arguments
- * - from caller into callee for R0 that used as result of the call
- * - from caller to the same caller skipping states of the callee for R6 - R9,
- *   since R6 - R9 are callee saved by implicit function prologue and
- *   caller's R6 != callee's R6, so when we propagate liveness up to
- *   parent states we need to skip callee states for R6 - R9.
- *
- * stack slot marking is different, since stacks of caller and callee are
- * accessible in both (since caller can pass a pointer to caller's stack to
- * callee which can pass it to another function), hence mark_stack_slot_read()
- * has to propagate the stack liveness to all parent states at given frame number.
- * Consider code:
- * f1() {
- *   ptr = fp - 8;
- *   *ptr = ctx;
- *   call f2 {
- *      .. = *ptr;
- *   }
- *   .. = *ptr;
- * }
- * First *ptr is reading from f1's stack and mark_stack_slot_read() has
- * to mark liveness at the f1's frame and not f2's frame.
- * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
- * to propagate liveness to f2 states at f1's frame level and further into
- * f1 states at f1's frame level until write into that stack slot
- */
-static void mark_stack_slot_read(struct bpf_verifier_env *env,
-				 const struct bpf_verifier_state *state,
-				 struct bpf_verifier_state *parent,
-				 int slot, int frameno)
-{
-	bool writes = parent == state->parent; /* Observe write marks */
-
-	while (parent) {
-		if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
-			/* since LIVE_WRITTEN mark is only done for full 8-byte
-			 * write the read marks are conservative and parent
-			 * state may not even have the stack allocated. In such case
-			 * end the propagation, since the loop reached beginning
-			 * of the function
-			 */
-			break;
-		/* if read wasn't screened by an earlier write ... */
-		if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
-			break;
-		/* ... then we depend on parent's value */
-		parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
-		state = parent;
-		parent = state->parent;
-		writes = true;
-	}
-}
-
 static int check_stack_read(struct bpf_verifier_env *env,
 			    struct bpf_func_state *reg_state /* func where register points to */,
 			    int off, int size, int value_regno)
@@ -1205,8 +1100,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			 */
 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
 		}
-		mark_stack_slot_read(env, vstate, vstate->parent, spi,
-				     reg_state->frameno);
+		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+			      reg_state->stack[spi].spilled_ptr.parent);
 		return 0;
 	} else {
 		int zeros = 0;
@@ -1222,8 +1117,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 				off, i, size);
 			return -EACCES;
 		}
-		mark_stack_slot_read(env, vstate, vstate->parent, spi,
-				     reg_state->frameno);
+		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
+			      reg_state->stack[spi].spilled_ptr.parent);
 		if (value_regno >= 0) {
 			if (zeros == size) {
 				/* any size read into register is zero extended,
@@ -1927,8 +1822,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		/* reading any byte out of 8-byte 'spill_slot' will cause
 		 * the whole slot to be marked as 'read'
 		 */
-		mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
-				     spi, state->frameno);
+		mark_reg_read(env, &state->stack[spi].spilled_ptr,
+			      state->stack[spi].spilled_ptr.parent);
 	}
 	return update_stack_depth(env, state, off);
 }
@@ -2384,11 +2279,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			state->curframe + 1 /* frameno within this callchain */,
 			subprog /* subprog number within this prog */);
 
-	/* copy r1 - r5 args that callee can access */
+	/* copy r1 - r5 args that callee can access. The copy includes parent
+	 * pointers, which connects us up to the liveness chain
+	 */
 	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 		callee->regs[i] = caller->regs[i];
 
-	/* after the call regsiters r0 - r5 were scratched */
+	/* after the call registers r0 - r5 were scratched */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, caller->regs, caller_saved[i]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
@@ -4681,7 +4578,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 		/* explored state didn't use this */
 		return true;
 
-	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
+	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
 
 	if (rold->type == PTR_TO_STACK)
 		/* two stack pointers are equal only if they're pointing to
@@ -4920,7 +4817,7 @@ static bool states_equal(struct bpf_verifier_env *env,
  * equivalent state (jump target or such) we didn't arrive by the straight-line
  * code, so read marks in the state must propagate to the parent regardless
  * of the state's write marks. That's what 'parent == state->parent' comparison
- * in mark_reg_read() and mark_stack_slot_read() is for.
+ * in mark_reg_read() is for.
  */
 static int propagate_liveness(struct bpf_verifier_env *env,
 			      const struct bpf_verifier_state *vstate,
@@ -4941,7 +4838,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 		if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
 			continue;
 		if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
-			err = mark_reg_read(env, vstate, vparent, i);
+			err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
+					    &vparent->frame[vstate->curframe]->regs[i]);
 			if (err)
 				return err;
 		}
@@ -4956,7 +4854,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 			if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
 				continue;
 			if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
-				mark_stack_slot_read(env, vstate, vparent, i, frame);
+				mark_reg_read(env, &state->stack[i].spilled_ptr,
+					      &parent->stack[i].spilled_ptr);
 		}
 	}
 	return err;
@@ -4966,7 +4865,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl;
-	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
 
 	sl = env->explored_states[insn_idx];
@@ -5012,16 +4911,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		return -ENOMEM;
 
 	/* add new state to the head of linked list */
-	err = copy_verifier_state(&new_sl->state, cur);
+	new = &new_sl->state;
+	err = copy_verifier_state(new, cur);
 	if (err) {
-		free_verifier_state(&new_sl->state, false);
+		free_verifier_state(new, false);
 		kfree(new_sl);
 		return err;
 	}
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
 	/* connect new state to parentage chain */
-	cur->parent = &new_sl->state;
+	for (i = 0; i < BPF_REG_FP; i++)
+		cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
@@ -5034,9 +4935,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	/* all stack frames are accessible from callee, clear them all */
 	for (j = 0; j <= cur->curframe; j++) {
 		struct bpf_func_state *frame = cur->frame[j];
+		struct bpf_func_state *newframe = new->frame[j];
 
-		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
+		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
 			frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
+			frame->stack[i].spilled_ptr.parent =
+						&newframe->stack[i].spilled_ptr;
+		}
 	}
 	return 0;
 }
From: Edward Cree <ecree@solarflare.com>

mainline inclusion
from mainline-v4.20-rc1
commit 8efea21d333d21e1f9177579ffdc69556314f603
category: feature
bugzilla: 43460
CVE: NA

---------------------------------------

If a stack slot does not hold a spilled register (STACK_SPILL), then each
of its eight bytes could potentially have a different slot_type. This
information can be important for debugging, and previously we either did
not print anything for the stack slot, or just printed fp-X=0 in the case
where its first byte was STACK_ZERO.

Instead, print eight characters with either 0 (STACK_ZERO), m (STACK_MISC)
or ? (STACK_INVALID) for any stack slot which is neither STACK_SPILL nor
entirely STACK_INVALID.
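For example (hypothetical slot contents), a slot whose first byte is
STACK_MISC, whose next four bytes are STACK_ZERO and whose last three bytes
are STACK_INVALID would now be printed as

  fp-8=m0000???

instead of being skipped entirely.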
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/verifier.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2fa3e269fe1d..f58e7928768a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -263,6 +263,13 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
+static char slot_type_char[] = {
+	[STACK_INVALID]	= '?',
+	[STACK_SPILL]	= 'r',
+	[STACK_MISC]	= 'm',
+	[STACK_ZERO]	= '0',
+};
+
 static void print_liveness(struct bpf_verifier_env *env,
 			   enum bpf_reg_liveness live)
 {
@@ -349,15 +356,26 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		}
 	}
 	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		if (state->stack[i].slot_type[0] == STACK_SPILL) {
-			verbose(env, " fp%d",
-				(-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, state->stack[i].spilled_ptr.live);
+		char types_buf[BPF_REG_SIZE + 1];
+		bool valid = false;
+		int j;
+
+		for (j = 0; j < BPF_REG_SIZE; j++) {
+			if (state->stack[i].slot_type[j] != STACK_INVALID)
+				valid = true;
+			types_buf[j] = slot_type_char[
+					state->stack[i].slot_type[j]];
+		}
+		types_buf[BPF_REG_SIZE] = 0;
+		if (!valid)
+			continue;
+		verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+		print_liveness(env, state->stack[i].spilled_ptr.live);
+		if (state->stack[i].slot_type[0] == STACK_SPILL)
 			verbose(env, "=%s",
 				reg_type_str[state->stack[i].spilled_ptr.type]);
-		}
-		if (state->stack[i].slot_type[0] == STACK_ZERO)
-			verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
+		else
+			verbose(env, "=%s", types_buf);
 	}
 	verbose(env, "\n");
 }
From: Yonghong Song <yhs@fb.com>

mainline inclusion
from mainline-v4.20-rc1
commit c7b27c37af3da5a63f32b0bc99569e3069e4d9c1
category: feature
bugzilla: 43460
CVE: NA

---------------------------------------

Added bpffs pretty print for percpu arraymap, percpu hashmap
and percpu lru hashmap.

For each map <key, value> pair, the format is:

   <key_value>: {
	cpu0: <value_on_cpu0>
	cpu1: <value_on_cpu1>
	...
	cpun: <value_on_cpun>
   }

For example, on my VM, there are 4 cpus, and for test_btf
test in the next patch:

   cat /sys/fs/bpf/pprint_test_percpu_hash

You may get:

   ...
   43602: {
	cpu0: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO}
	cpu1: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO}
	cpu2: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO}
	cpu3: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO}
   }
   72847: {
	cpu0: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE}
	cpu1: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE}
	cpu2: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE}
	cpu3: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE}
   }
   ...
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/arraymap.c | 24 ++++++++++++++++++++++++
 kernel/bpf/hashtab.c  | 31 +++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0c17aab3ce5f..f9d24121be99 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -358,6 +358,29 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
+					   struct seq_file *m)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	void __percpu *pptr;
+	int cpu;
+
+	rcu_read_lock();
+
+	seq_printf(m, "%u: {\n", *(u32 *)key);
+	pptr = array->pptrs[index & array->index_mask];
+	for_each_possible_cpu(cpu) {
+		seq_printf(m, "\tcpu%d: ", cpu);
+		btf_type_seq_show(map->btf, map->btf_value_type_id,
+				  per_cpu_ptr(pptr, cpu), m);
+		seq_puts(m, "\n");
+	}
+	seq_puts(m, "}\n");
+
+	rcu_read_unlock();
+}
+
 static int array_map_check_btf(const struct bpf_map *map,
 			       const struct btf_type *key_type,
 			       const struct btf_type *value_type)
@@ -398,6 +421,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_lookup_elem = percpu_array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
+	.map_seq_show_elem = percpu_array_map_seq_show_elem,
 	.map_check_btf = array_map_check_btf,
 };
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3f3ed33bd2fd..5268e3de1bd7 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1296,6 +1296,35 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
 	return ret;
 }
 
+static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
+					  struct seq_file *m)
+{
+	struct htab_elem *l;
+	void __percpu *pptr;
+	int cpu;
+
+	rcu_read_lock();
+
+	l = __htab_map_lookup_elem(map, key);
+	if (!l) {
+		rcu_read_unlock();
+		return;
+	}
+
+	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+	seq_puts(m, ": {\n");
+	pptr = htab_elem_get_ptr(l, map->key_size);
+	for_each_possible_cpu(cpu) {
+		seq_printf(m, "\tcpu%d: ", cpu);
+		btf_type_seq_show(map->btf, map->btf_value_type_id,
+				  per_cpu_ptr(pptr, cpu), m);
+		seq_puts(m, "\n");
+	}
+	seq_puts(m, "}\n");
+
+	rcu_read_unlock();
+}
+
 const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
@@ -1304,6 +1333,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_lookup_elem = htab_percpu_map_lookup_elem,
 	.map_update_elem = htab_percpu_map_update_elem,
 	.map_delete_elem = htab_map_delete_elem,
+	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
 };
 
 const struct bpf_map_ops htab_lru_percpu_map_ops = {
@@ -1314,6 +1344,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
 	.map_update_elem = htab_lru_percpu_map_update_elem,
 	.map_delete_elem = htab_lru_map_delete_elem,
+	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
 };
 
 static int fd_htab_map_alloc_check(union bpf_attr *attr)
From: Yonghong Song <yhs@fb.com>

mainline inclusion
from mainline-v4.20-rc1
commit a7c19db38d62fc1ce797dba19936e9f81cf2b9fb
category: feature
bugzilla: 43460
CVE: NA

---------------------------------------

Added bpffs pretty print for program array map. For a particular
array index, if the program array points to a valid program, the
"<index>: <prog_id>" will be printed out like

   0: 6

which means bpf program with id "6" is installed at index "0".

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/arraymap.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index f9d24121be99..dded84cbe814 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -553,6 +553,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map)
 		fd_array_map_delete_elem(map, &i);
 }
 
+static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
+					 struct seq_file *m)
+{
+	void **elem, *ptr;
+	u32 prog_id;
+
+	rcu_read_lock();
+
+	elem = array_map_lookup_elem(map, key);
+	if (elem) {
+		ptr = READ_ONCE(*elem);
+		if (ptr) {
+			seq_printf(m, "%u: ", *(u32 *)key);
+			prog_id = prog_fd_array_sys_lookup_elem(ptr);
+			btf_type_seq_show(map->btf, map->btf_value_type_id,
+					  &prog_id, m);
+			seq_puts(m, "\n");
+		}
+	}
+
+	rcu_read_unlock();
+}
+
 const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -564,7 +587,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
 	.map_release_uref = bpf_fd_array_map_clear,
-	.map_check_btf = map_check_no_btf,
+	.map_seq_show_elem = prog_array_map_seq_show_elem,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
From: zhong jiang <zhongjiang@huawei.com>

mainline inclusion
from mainline-v4.20-rc1
commit 788758d1fe874fd20ecb0ab490552d94c024a9de
category: other
bugzilla: 43460
CVE: NA
---------------------------------------

consume_skb() already takes a NULL pointer into account, hence it is safe
to remove the redundant NULL pointer check before calling consume_skb().
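For reference, the pattern being relied on looks like this (a sketch, not
the exact kernel source): consume_skb() itself bails out on NULL, much like
kfree(NULL):

	void consume_skb(struct sk_buff *skb)
	{
		if (!skb)
			return;	/* NULL is tolerated; callers need no guard */
		/* ... drop one reference and free the skb on the last put ... */
	}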
Signed-off-by: zhong jiang <zhongjiang@huawei.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/sockmap.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0a0f2ec75370..d37a1a0a6e1e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -612,8 +612,7 @@ static int free_sg(struct sock *sk, int start,
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
 	}
-	if (md->skb)
-		consume_skb(md->skb);
+	consume_skb(md->skb);
 
 	return free;
 }
@@ -995,8 +994,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		if (!sg->length && md->sg_start == md->sg_end) {
 			list_del(&md->list);
-			if (md->skb)
-				consume_skb(md->skb);
+			consume_skb(md->skb);
 			kfree(md);
 		}
 	}
From: Yonghong Song <yhs@fb.com>

mainline inclusion
from mainline-v4.20-rc1
commit 5bf7a60b8e70969f65c961d7e2c4eb40eb2c664d
category: feature
bugzilla: 43460
CVE: NA
---------------------------------------

Currently, helper bpf_get_current_cgroup_id() is not permitted for
CGROUP_DEVICE type of programs. If the helper is used in such cases,
the verifier will log the following error:

  0: (bf) r6 = r1
  1: (69) r7 = *(u16 *)(r6 +0)
  2: (85) call bpf_get_current_cgroup_id#80
  unknown func bpf_get_current_cgroup_id#80
The bpf_get_current_cgroup_id() is useful for CGROUP_DEVICE type of
programs in order to customize action based on cgroup id. This patch
adds such support.
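As a minimal, hypothetical example of what this enables (libbpf-style;
ALLOWED_CGROUP_ID is a made-up constant for illustration):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	#define ALLOWED_CGROUP_ID 100	/* hypothetical cgroup id */

	SEC("cgroup/dev")
	int deny_other_cgroups(struct bpf_cgroup_dev_ctx *ctx)
	{
		/* helper is now callable from BPF_PROG_TYPE_CGROUP_DEVICE */
		if (bpf_get_current_cgroup_id() != ALLOWED_CGROUP_ID)
			return 0;	/* deny device access */
		return 1;		/* allow device access */
	}

	char _license[] SEC("license") = "GPL";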
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6a7d931bbc55..549f6fbcc461 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -677,6 +677,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_current_uid_gid_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+	case BPF_FUNC_get_current_cgroup_id:
+		return &bpf_get_current_cgroup_id_proto;
 	case BPF_FUNC_trace_printk:
 		if (capable(CAP_SYS_ADMIN))
 			return bpf_get_trace_printk_proto();
From: Roman Gushchin <guro@fb.com>

mainline inclusion
from mainline-v4.20-rc1
commit 8bad74f9840f87661f20ced3dc80c84ab4fd55a1
category: feature
bugzilla: 43460
CVE: NA
---------------------------------------

In order to introduce per-cpu cgroup storage, let's generalize the bpf
cgroup core to support multiple cgroup storage types. Potentially,
per-node cgroup storage can be added later.

This commit is mostly a formal change that replaces the cgroup_storage
pointer with an array of cgroup_storage pointers. It doesn't actually
introduce a new storage type; that will be done later.

Each bpf program is now able to have one cgroup storage of each type.
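For illustration, with names taken from the patch below, every site that
used to touch the single pointer now iterates over the storage types, e.g.:

	enum bpf_cgroup_storage_type stype;

	for_each_cgroup_storage_type(stype) {
		if (!storage[stype])
			continue;
		/* unlink/free/copy this particular storage type */
	}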
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/bpf-cgroup.h | 38 ++++++++++++++------
 include/linux/bpf.h        | 11 ++++--
 kernel/bpf/cgroup.c        | 74 ++++++++++++++++++++++++++------------
 kernel/bpf/helpers.c       | 15 ++++----
 kernel/bpf/local_storage.c | 18 ++++++----
 kernel/bpf/syscall.c       |  9 +++--
 kernel/bpf/verifier.c      |  8 +++--
 net/bpf/test_run.c         | 20 +++++++----
 8 files changed, 136 insertions(+), 57 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ad6b30137ac2..65e19fe60a49 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -2,6 +2,7 @@
 #ifndef _BPF_CGROUP_H
 #define _BPF_CGROUP_H
 
+#include <linux/bpf.h>
 #include <linux/errno.h>
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
@@ -22,7 +23,10 @@ struct bpf_cgroup_storage;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
-DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+
+#define for_each_cgroup_storage_type(stype) \
+	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
 
 struct bpf_cgroup_storage_map;
 
@@ -43,7 +47,7 @@ struct bpf_cgroup_storage {
 struct bpf_prog_list {
 	struct list_head node;
 	struct bpf_prog *prog;
-	struct bpf_cgroup_storage *storage;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 };
 
 struct bpf_prog_array;
@@ -101,18 +105,29 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage)
+static inline enum bpf_cgroup_storage_type cgroup_storage_type(
+	struct bpf_map *map)
 {
+	return BPF_CGROUP_STORAGE_SHARED;
+}
+
+static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
+					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
+{
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_storage_buffer *buf;
 
-	if (!storage)
-		return;
+	for_each_cgroup_storage_type(stype) {
+		if (!storage[stype])
+			continue;
 
-	buf = READ_ONCE(storage->buf);
-	this_cpu_write(bpf_cgroup_storage, &buf->data[0]);
+		buf = READ_ONCE(storage[stype]->buf);
+		this_cpu_write(bpf_cgroup_storage[stype], &buf->data[0]);
+	}
 }
 
-struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
+					enum bpf_cgroup_storage_type stype);
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
 			     struct cgroup *cgroup,
@@ -271,13 +286,14 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {}
+static inline void bpf_cgroup_storage_set(
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
 					    struct bpf_map *map) { return 0; }
 static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
 					      struct bpf_map *map) {}
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
-	struct bpf_prog *prog) { return 0; }
+	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
 static inline void bpf_cgroup_storage_free(
 	struct bpf_cgroup_storage *storage) {}
 
@@ -301,6 +317,8 @@ static inline void bpf_cgroup_storage_free(
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
+#define for_each_cgroup_storage_type(stype) for (; false; )
+
 #endif /* CONFIG_CGROUP_BPF */
 
 #endif /* _BPF_CGROUP_H */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 16f6beef5cad..fa76b98f49a6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -272,6 +272,13 @@ struct bpf_prog_offload {
 	u32 jited_len;
 };
 
+enum bpf_cgroup_storage_type {
+	BPF_CGROUP_STORAGE_SHARED,
+	__BPF_CGROUP_STORAGE_MAX
+};
+
+#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -289,7 +296,7 @@ struct bpf_prog_aux {
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
-	struct bpf_map *cgroup_storage;
+	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
 #ifdef CONFIG_SECURITY
 	void *security;
@@ -358,7 +365,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
  */
 struct bpf_prog_array_item {
 	struct bpf_prog *prog;
-	struct bpf_cgroup_storage *cgroup_storage;
+	struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 };
 
 struct bpf_prog_array {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 549f6fbcc461..00f6ed2e4f9a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -25,6 +25,7 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
  */
 void cgroup_bpf_put(struct cgroup *cgrp)
 {
+	enum bpf_cgroup_storage_type stype;
 	unsigned int type;
 
 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
@@ -34,8 +35,10 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 		list_for_each_entry_safe(pl, tmp, progs, node) {
 			list_del(&pl->node);
 			bpf_prog_put(pl->prog);
-			bpf_cgroup_storage_unlink(pl->storage);
-			bpf_cgroup_storage_free(pl->storage);
+			for_each_cgroup_storage_type(stype) {
+				bpf_cgroup_storage_unlink(pl->storage[stype]);
+				bpf_cgroup_storage_free(pl->storage[stype]);
+			}
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
@@ -97,6 +100,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
 				   enum bpf_attach_type type,
 				   struct bpf_prog_array __rcu **array)
 {
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_prog_array *progs;
 	struct bpf_prog_list *pl;
 	struct cgroup *p = cgrp;
@@ -125,7 +129,9 @@ static int compute_effective_progs(struct cgroup *cgrp,
 				continue;
 
 			progs->items[cnt].prog = pl->prog;
-			progs->items[cnt].cgroup_storage = pl->storage;
+			for_each_cgroup_storage_type(stype)
+				progs->items[cnt].cgroup_storage[stype] =
+					pl->storage[stype];
 			cnt++;
 		}
 	} while ((p = cgroup_parent(p)));
@@ -232,7 +238,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 {
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
-	struct bpf_cgroup_storage *storage, *old_storage = NULL;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
+		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_prog_list *pl;
 	bool pl_was_allocated;
 	int err;
@@ -254,34 +262,44 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
 		return -E2BIG;
 
-	storage = bpf_cgroup_storage_alloc(prog);
-	if (IS_ERR(storage))
-		return -ENOMEM;
+	for_each_cgroup_storage_type(stype) {
+		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(storage[stype])) {
+			storage[stype] = NULL;
+			for_each_cgroup_storage_type(stype)
+				bpf_cgroup_storage_free(storage[stype]);
+			return -ENOMEM;
+		}
+	}
 
 	if (flags & BPF_F_ALLOW_MULTI) {
 		list_for_each_entry(pl, progs, node) {
 			if (pl->prog == prog) {
 				/* disallow attaching the same prog twice */
-				bpf_cgroup_storage_free(storage);
+				for_each_cgroup_storage_type(stype)
+					bpf_cgroup_storage_free(storage[stype]);
 				return -EINVAL;
 			}
 		}
 
 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 		if (!pl) {
-			bpf_cgroup_storage_free(storage);
+			for_each_cgroup_storage_type(stype)
+				bpf_cgroup_storage_free(storage[stype]);
 			return -ENOMEM;
 		}
 
 		pl_was_allocated = true;
 		pl->prog = prog;
-		pl->storage = storage;
+		for_each_cgroup_storage_type(stype)
+			pl->storage[stype] = storage[stype];
 		list_add_tail(&pl->node, progs);
 	} else {
 		if (list_empty(progs)) {
 			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 			if (!pl) {
-				bpf_cgroup_storage_free(storage);
+				for_each_cgroup_storage_type(stype)
+					bpf_cgroup_storage_free(storage[stype]);
 				return -ENOMEM;
 			}
 			pl_was_allocated = true;
@@ -289,12 +307,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		} else {
 			pl = list_first_entry(progs, typeof(*pl), node);
 			old_prog = pl->prog;
-			old_storage = pl->storage;
-			bpf_cgroup_storage_unlink(old_storage);
+			for_each_cgroup_storage_type(stype) {
+				old_storage[stype] = pl->storage[stype];
+				bpf_cgroup_storage_unlink(old_storage[stype]);
+			}
 			pl_was_allocated = false;
 		}
 		pl->prog = prog;
-		pl->storage = storage;
+		for_each_cgroup_storage_type(stype)
+			pl->storage[stype] = storage[stype];
 	}
 
 	cgrp->bpf.flags[type] = flags;
@@ -304,21 +325,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		goto cleanup;
 
 	static_branch_inc(&cgroup_bpf_enabled_key);
-	if (old_storage)
-		bpf_cgroup_storage_free(old_storage);
+	for_each_cgroup_storage_type(stype) {
+		if (!old_storage[stype])
+			continue;
+		bpf_cgroup_storage_free(old_storage[stype]);
+	}
 	if (old_prog) {
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
-	bpf_cgroup_storage_link(storage, cgrp, type);
+	for_each_cgroup_storage_type(stype)
+		bpf_cgroup_storage_link(storage[stype], cgrp, type);
 	return 0;
 
 cleanup:
 	/* and cleanup the prog list */
 	pl->prog = old_prog;
-	bpf_cgroup_storage_free(pl->storage);
-	pl->storage = old_storage;
-	bpf_cgroup_storage_link(old_storage, cgrp, type);
+	for_each_cgroup_storage_type(stype) {
+		bpf_cgroup_storage_free(pl->storage[stype]);
+		pl->storage[stype] = old_storage[stype];
+		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
+	}
 	if (pl_was_allocated) {
 		list_del(&pl->node);
 		kfree(pl);
@@ -339,6 +366,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 unused_flags)
 {
 	struct list_head *progs = &cgrp->bpf.progs[type];
+	enum bpf_cgroup_storage_type stype;
 	u32 flags = cgrp->bpf.flags[type];
 	struct bpf_prog *old_prog = NULL;
 	struct bpf_prog_list *pl;
@@ -385,8 +413,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 
 	/* now can actually delete it from this cgroup list */
 	list_del(&pl->node);
-	bpf_cgroup_storage_unlink(pl->storage);
-	bpf_cgroup_storage_free(pl->storage);
+	for_each_cgroup_storage_type(stype) {
+		bpf_cgroup_storage_unlink(pl->storage[stype]);
+		bpf_cgroup_storage_free(pl->storage[stype]);
+	}
 	kfree(pl);
 	if (list_empty(progs))
 		/* last program was detached, reset flags to zero */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1991466b8327..9070b2ace6aa 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -194,16 +194,18 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
 	.ret_type	= RET_INTEGER,
 };
 
-DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+#ifdef CONFIG_CGROUP_BPF
+DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
-	/* map and flags arguments are not used now,
-	 * but provide an ability to extend the API
-	 * for other types of local storages.
-	 * verifier checks that their values are correct.
+	/* flags argument is not used now,
+	 * but provides an ability to extend the API.
+	 * verifier checks that its value is correct.
 	 */
-	return (unsigned long) this_cpu_read(bpf_cgroup_storage);
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+
+	return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]);
 }
 
 const struct bpf_func_proto bpf_get_local_storage_proto = {
@@ -214,3 +216,4 @@ const struct bpf_func_proto bpf_get_local_storage_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 #endif
+#endif
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index fc1605aee5ea..fc9835e69b83 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,7 +7,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 
-DEFINE_PER_CPU(void*, bpf_cgroup_storage);
+DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
 
@@ -255,6 +255,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
 
 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
 {
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
 	int ret = -EBUSY;
 
@@ -262,11 +263,12 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
 
 	if (map->prog && map->prog != prog)
 		goto unlock;
-	if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map)
+	if (prog->aux->cgroup_storage[stype] &&
+	    prog->aux->cgroup_storage[stype] != _map)
 		goto unlock;
 
 	map->prog = prog;
-	prog->aux->cgroup_storage = _map;
+	prog->aux->cgroup_storage[stype] = _map;
 	ret = 0;
 unlock:
 	spin_unlock_bh(&map->lock);
@@ -276,24 +278,26 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
 
 void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
 {
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
 
 	spin_lock_bh(&map->lock);
 	if (map->prog == prog) {
-		WARN_ON(prog->aux->cgroup_storage != _map);
+		WARN_ON(prog->aux->cgroup_storage[stype] != _map);
 		map->prog = NULL;
-		prog->aux->cgroup_storage = NULL;
+		prog->aux->cgroup_storage[stype] = NULL;
 	}
 	spin_unlock_bh(&map->lock);
 }
 
-struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog)
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
+					enum bpf_cgroup_storage_type stype)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_map *map;
 	u32 pages;
 
-	map = prog->aux->cgroup_storage;
+	map = prog->aux->cgroup_storage[stype];
 	if (!map)
 		return NULL;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3e9ff9897c7c..2dcd9bd50175 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1017,10 +1017,15 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 /* drop refcnt on maps used by eBPF program and free auxilary data */
 static void free_used_maps(struct bpf_prog_aux *aux)
 {
+	enum bpf_cgroup_storage_type stype;
 	int i;
 
-	if (aux->cgroup_storage)
-		bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage);
+	for_each_cgroup_storage_type(stype) {
+		if (!aux->cgroup_storage[stype])
+			continue;
+		bpf_cgroup_storage_release(aux->prog,
+					   aux->cgroup_storage[stype]);
+	}
 
 	for (i = 0; i < aux->used_map_cnt; i++)
 		bpf_map_put(aux->used_maps[i]);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f58e7928768a..62a016831f64 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5469,11 +5469,15 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 /* drop refcnt of maps used by the rejected program */
 static void release_maps(struct bpf_verifier_env *env)
 {
+	enum bpf_cgroup_storage_type stype;
 	int i;
 
-	if (env->prog->aux->cgroup_storage)
+	for_each_cgroup_storage_type(stype) {
+		if (!env->prog->aux->cgroup_storage[stype])
+			continue;
 		bpf_cgroup_storage_release(env->prog,
-			env->prog->aux->cgroup_storage);
+			env->prog->aux->cgroup_storage[stype]);
+	}
 
 	for (i = 0; i < env->used_map_cnt; i++)
 		bpf_map_put(env->used_maps[i]);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index f4078830ea50..0c423b8cd75c 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -12,7 +12,7 @@
 #include <linux/sched/signal.h>
 
 static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
-					    struct bpf_cgroup_storage *storage)
+		struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
 {
 	u32 ret;
 
@@ -28,13 +28,20 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
 
 static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
 {
-	struct bpf_cgroup_storage *storage = NULL;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 };
+	enum bpf_cgroup_storage_type stype;
 	u64 time_start, time_spent = 0;
 	u32 ret = 0, i;
 
-	storage = bpf_cgroup_storage_alloc(prog);
-	if (IS_ERR(storage))
-		return PTR_ERR(storage);
+	for_each_cgroup_storage_type(stype) {
+		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(storage[stype])) {
+			storage[stype] = NULL;
+			for_each_cgroup_storage_type(stype)
+				bpf_cgroup_storage_free(storage[stype]);
+			return -ENOMEM;
+		}
+	}
 
 	if (!repeat)
 		repeat = 1;
@@ -53,7 +60,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
 	do_div(time_spent, repeat);
 	*time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
 
-	bpf_cgroup_storage_free(storage);
+	for_each_cgroup_storage_type(stype)
+		bpf_cgroup_storage_free(storage[stype]);
 
 	return ret;
 }
From: Bo YU <tsu.yubo@gmail.com>

mainline inclusion
from mainline-v5.1-rc1
commit 71b91a506bb05f9aef3acd57af2e835d85721942
category: bugfix
bugzilla: 43460
CVE: NA
---------------------------------------

Sparse warning below:

  sudo make C=2 CF=-D__CHECK_ENDIAN__ M=net/bpf/

  CHECK   net/bpf//test_run.c
  net/bpf//test_run.c:19:77: warning: Using plain integer as NULL pointer
  ./include/linux/bpf-cgroup.h:295:77: warning: Using plain integer as NULL pointer
Fixes: 8bad74f9840f ("bpf: extend cgroup bpf core to allow multiple cgroup storage types")
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Bo YU <tsu.yubo@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/bpf-cgroup.h | 2 +-
 net/bpf/test_run.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 65e19fe60a49..0970af2cb70f 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -293,7 +293,7 @@ static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
 static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
 					      struct bpf_map *map) {}
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
-	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
+	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; }
 static inline void bpf_cgroup_storage_free(
 	struct bpf_cgroup_storage *storage) {}
 
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 0c423b8cd75c..cb161146d52b 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -28,7 +28,7 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
 
 static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
 {
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 };
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL };
 	enum bpf_cgroup_storage_type stype;
 	u64 time_start, time_spent = 0;
 	u32 ret = 0, i;
From: Andrii Nakryiko <andriin@fb.com>

mainline inclusion
from mainline-v5.6
commit 62039c30c19dcab96621e074aeeb90da7100def7
category: bugfix
bugzilla: 43460
CVE: NA
---------------------------------------

Local storage array isn't initialized, so if cgroup storage allocation
fails for BPF_CGROUP_STORAGE_SHARED, error handling code will attempt to
free uninitialized pointer for BPF_CGROUP_STORAGE_PERCPU storage type.
Avoid this by always initializing storage pointers to NULLs.
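Sketched (and simplified) from the attach path, the failure mode looks like
this:

	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
	/* without an '= {}' initializer these slots hold stack garbage */

	for_each_cgroup_storage_type(stype) {
		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
		if (IS_ERR(storage[stype])) {
			storage[stype] = NULL;
			/* this cleanup loop visits *every* slot, including
			 * types the loop above never reached; with an
			 * uninitialized array those are garbage pointers
			 */
			for_each_cgroup_storage_type(stype)
				bpf_cgroup_storage_free(storage[stype]);
			return -ENOMEM;
		}
	}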
Fixes: 8bad74f9840f ("bpf: extend cgroup bpf core to allow multiple cgroup storage types")
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200309222756.1018737-1-andriin@fb.com

Conflicts:
	kernel/bpf/cgroup.c
[liuxin: solve the conflict in cgroup.c]

Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 00f6ed2e4f9a..f574fe68868f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -238,10 +238,10 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 {
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
-		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
-	enum bpf_cgroup_storage_type stype;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+	struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 	struct bpf_prog_list *pl;
+	enum bpf_cgroup_storage_type stype;
 	bool pl_was_allocated;
 	int err;
From: Roman Gushchin <guro@fb.com>

mainline inclusion
from mainline-v4.20-rc1
commit f294b37ec7b24a574884cd157497a3748081c0f0
category: other
bugzilla: 43460
CVE: NA
---------------------------------------

To simplify the following introduction of per-cpu cgroup storage, let's
rework a bit the mechanism of passing a pointer to a cgroup storage into
bpf_get_local_storage(). Let's save a pointer to the corresponding
bpf_cgroup_storage structure, instead of a pointer to the actual buffer.

It will help us to handle per-cpu storage later, which has a different
way of accessing the actual data.
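In other words, the helper body ends up as the following (a sketch; the
actual hunks are in the diff below):

	/* the per-cpu slot now caches the bpf_cgroup_storage itself ... */
	storage = this_cpu_read(bpf_cgroup_storage[stype]);

	/* ... and the buffer is only dereferenced at access time */
	return (unsigned long)&READ_ONCE(storage->buf)->data[0];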
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/bpf-cgroup.h | 13 ++++---------
 kernel/bpf/helpers.c       |  8 ++++++--
 kernel/bpf/local_storage.c |  3 ++-
 3 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 0970af2cb70f..f4119e9de6ed 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -23,7 +23,8 @@ struct bpf_cgroup_storage;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
-DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage*,
+		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #define for_each_cgroup_storage_type(stype) \
 	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -115,15 +116,9 @@ static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
 					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
 {
 	enum bpf_cgroup_storage_type stype;
-	struct bpf_storage_buffer *buf;
-
-	for_each_cgroup_storage_type(stype) {
-		if (!storage[stype])
-			continue;
 
-		buf = READ_ONCE(storage[stype]->buf);
-		this_cpu_write(bpf_cgroup_storage[stype], &buf->data[0]);
-	}
+	for_each_cgroup_storage_type(stype)
+		this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
 }
 
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9070b2ace6aa..e42f8789b7ea 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -195,7 +195,8 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
 };
 
 #ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage*,
+		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
@@ -204,8 +205,11 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 * verifier checks that its value is correct.
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+	struct bpf_cgroup_storage *storage;
 
-	return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]);
+	storage = this_cpu_read(bpf_cgroup_storage[stype]);
+
+	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
 }
 
 const struct bpf_func_proto bpf_get_local_storage_proto = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index fc9835e69b83..e0b1d9ccde39 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,7 +7,8 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 
-DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DEFINE_PER_CPU(struct bpf_cgroup_storage*,
+		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
From: Roman Gushchin <guro@fb.com>

mainline inclusion
from mainline-v4.20-rc1
commit b741f1630346defcbc8cc60f1a2bdae8b3b0036f
category: feature
bugzilla: 43460
CVE: NA
---------------------------------------

This commit introduces per-cpu cgroup local storage.

Per-cpu cgroup local storage is very similar to simple cgroup storage
(let's call it shared), except all the data is per-cpu.

The main goal of the per-cpu variant is to implement super fast counters
(e.g. packet counters), which require neither lookups nor atomic
operations.

From userspace's point of view, accessing a per-cpu cgroup storage is
similar to other per-cpu map types (e.g. per-cpu hashmaps and arrays).

Writing to a per-cpu cgroup storage is not atomic, but is performed by
copying longs, so some minimal atomicity is here, exactly as with other
per-cpu maps.
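A minimal, hypothetical packet counter using the new map type (old-style
libbpf map definition; names invented for illustration):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct bpf_map_def SEC("maps") percpu_cnt = {
		.type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
		.key_size = sizeof(struct bpf_cgroup_storage_key),
		.value_size = sizeof(__u64),
	};

	SEC("cgroup/skb")
	int count_packets(struct __sk_buff *skb)
	{
		__u64 *cnt = bpf_get_local_storage(&percpu_cnt, 0);

		(*cnt)++;	/* per-cpu slot: no atomics, no lookup */
		return 1;	/* allow the packet */
	}

	char _license[] SEC("license") = "GPL";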
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: liuxin <liuxin264@huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/bpf-cgroup.h |  20 ++++-
 include/linux/bpf.h        |   1 +
 include/linux/bpf_types.h  |   1 +
 include/uapi/linux/bpf.h   |   1 +
 kernel/bpf/helpers.c       |   8 +-
 kernel/bpf/local_storage.c | 150 ++++++++++++++++++++++++++++++++-----
 kernel/bpf/syscall.c       |  11 ++-
 kernel/bpf/verifier.c      |  15 +++-
 8 files changed, 179 insertions(+), 28 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index f4119e9de6ed..30c9d0247e7f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -37,7 +37,10 @@ struct bpf_storage_buffer { };
struct bpf_cgroup_storage { - struct bpf_storage_buffer *buf; + union { + struct bpf_storage_buffer *buf; + void __percpu *percpu_buf; + }; struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_key key; struct list_head list; @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + return BPF_CGROUP_STORAGE_PERCPU; + return BPF_CGROUP_STORAGE_SHARED; }
@@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, + void *value, u64 flags); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -291,6 +301,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, + void *value) { + return 0; +} +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, + void *key, void *value, u64 flags) { + return 0; +}
 #define cgroup_bpf_enabled (0)
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index fa76b98f49a6..6bdc7157c232 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -274,6 +274,7 @@ struct bpf_prog_offload {
 enum bpf_cgroup_storage_type {
 	BPF_CGROUP_STORAGE_SHARED,
+	BPF_CGROUP_STORAGE_PERCPU,
 	__BPF_CGROUP_STORAGE_MAX
 };
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index cd26c090e7c0..b3f9156a519a 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -42,6 +42,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 71ca8c4dc290..a2b606060588 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -127,6 +127,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKHASH,
 	BPF_MAP_TYPE_CGROUP_STORAGE,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 };
 enum bpf_prog_type {

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e42f8789b7ea..6502115e8f55 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
 	struct bpf_cgroup_storage *storage;
+	void *ptr;
 	storage = this_cpu_read(bpf_cgroup_storage[stype]);
-	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
+	if (stype == BPF_CGROUP_STORAGE_SHARED)
+		ptr = &READ_ONCE(storage->buf)->data[0];
+	else
+		ptr = this_cpu_ptr(storage->percpu_buf);
+
+	return (unsigned long)ptr;
 }
 const struct bpf_func_proto bpf_get_local_storage_proto = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index e0b1d9ccde39..bed9d48a7ae9 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -153,6 +153,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	return 0;
 }
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
+				   void *value)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* per_cpu areas are zero-filled and bpf programs can only
+	 * access 'value_size' of them, so copying rounded areas
+	 * will not leak any kernel data
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(value + off,
+				per_cpu_ptr(storage->percpu_buf, cpu), size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
+				     void *value, u64 map_flags)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
+		return -EINVAL;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* the user space will provide round_up(value_size, 8) bytes that
+	 * will be copied into per-cpu area. bpf programs can only access
+	 * value_size of it. During lookup the same extra bytes will be
+	 * returned or zeros which were zero-filled by percpu_alloc,
+	 * so no kernel data leaks possible
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
+				value + off, size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
 				       void *_next_key)
 {
@@ -291,60 +356,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
 	spin_unlock_bh(&map->lock);
 }
+static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
+{
+	size_t size;
+
+	if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) {
+		size = sizeof(struct bpf_storage_buffer) + map->value_size;
+		*pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
+				  PAGE_SIZE) >> PAGE_SHIFT;
+	} else {
+		size = map->value_size;
+		*pages = round_up(round_up(size, 8) * num_possible_cpus(),
+				  PAGE_SIZE) >> PAGE_SHIFT;
+	}
+
+	return size;
+}
+
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 					enum bpf_cgroup_storage_type stype)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_map *map;
+	gfp_t flags;
+	size_t size;
 	u32 pages;
 	map = prog->aux->cgroup_storage[stype];
 	if (!map)
 		return NULL;
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+	size = bpf_cgroup_storage_calculate_size(map, &pages);
+
 	if (bpf_map_charge_memlock(map, pages))
 		return ERR_PTR(-EPERM);
 	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
 			       __GFP_ZERO | GFP_USER, map->numa_node);
-	if (!storage) {
-		bpf_map_uncharge_memlock(map, pages);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!storage)
+		goto enomem;
-	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
-				    map->value_size, __GFP_ZERO | GFP_USER,
-				    map->numa_node);
-	if (!storage->buf) {
-		bpf_map_uncharge_memlock(map, pages);
-		kfree(storage);
-		return ERR_PTR(-ENOMEM);
+	flags = __GFP_ZERO | GFP_USER;
+
+	if (stype == BPF_CGROUP_STORAGE_SHARED) {
+		storage->buf = kmalloc_node(size, flags, map->numa_node);
+		if (!storage->buf)
+			goto enomem;
+	} else {
+		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
+		if (!storage->percpu_buf)
+			goto enomem;
 	}
 	storage->map = (struct bpf_cgroup_storage_map *)map;
 	return storage;
+
+enomem:
+	bpf_map_uncharge_memlock(map, pages);
+	kfree(storage);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	kfree(storage->buf);
+	kfree(storage);
+}
+
+static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	free_percpu(storage->percpu_buf);
+	kfree(storage);
+}
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
 {
-	u32 pages;
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_map *map;
+	u32 pages;
 	if (!storage)
 		return;
 	map = &storage->map->map;
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+
+	bpf_cgroup_storage_calculate_size(map, &pages);
 	bpf_map_uncharge_memlock(map, pages);
-	kfree_rcu(storage->buf, rcu);
-	kfree_rcu(storage, rcu);
+	stype = cgroup_storage_type(map);
+	if (stype == BPF_CGROUP_STORAGE_SHARED)
+		call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
+	else
+		call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
 }
 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2dcd9bd50175..9bdbe025d874 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
 	else if (IS_FD_MAP(map))
 		value_size = sizeof(u32);
@@ -710,6 +711,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
 		err = bpf_stackmap_copy(map, key, value);
 	} else if (IS_FD_ARRAY(map)) {
@@ -801,7 +804,8 @@ static int map_update_elem(union bpf_attr *attr)
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
 	else
 		value_size = map->value_size;
@@ -836,6 +840,9 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_update(map, key, value, attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_update(map, key, value,
+						       attr->flags);
 	} else if (IS_FD_ARRAY(map)) {
 		rcu_read_lock();
 		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 62a016831f64..240bdbde8a15 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2060,6 +2060,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_STORAGE:
+	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
 		if (func_id != BPF_FUNC_get_local_storage)
 			goto error;
 		break;
@@ -2150,7 +2151,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_FUNC_get_local_storage:
-		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+		    map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 			goto error;
 		break;
 	case BPF_FUNC_sk_select_reuseport:
@@ -5347,6 +5349,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 	return 0;
 }
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+{
+	return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+		map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+}
+
 /* look for pseudo eBPF instructions that access map FDs and
  * replace them with actual map pointers
  */
@@ -5437,10 +5445,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			}
 			env->used_maps[env->used_map_cnt++] = map;
-			if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+			if (bpf_map_is_cgroup_storage(map) &&
 			    bpf_cgroup_storage_assign(env->prog, map)) {
-				verbose(env,
-					"only one cgroup storage is allowed\n");
+				verbose(env, "only one cgroup storage of each type is allowed\n");
 				fdput(f);
 				return -EBUSY;
 			}
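To spell out the new verifier rule above (illustrative sketch only; names and the selftests-style map definitions are made up): one storage map of each type may now be used by the same program, while a second map of the same type still fails to load with -EBUSY:

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") shared_store = {
	.type = BPF_MAP_TYPE_CGROUP_STORAGE,
	.key_size = sizeof(struct bpf_cgroup_storage_key),
	.value_size = sizeof(__u64),
};

struct bpf_map_def SEC("maps") percpu_store = {
	.type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
	.key_size = sizeof(struct bpf_cgroup_storage_key),
	.value_size = sizeof(__u64),
};

SEC("cgroup/skb")
int use_both(struct __sk_buff *skb)
{
	__u64 *shared = bpf_get_local_storage(&shared_store, 0);
	__u64 *percpu = bpf_get_local_storage(&percpu_store, 0);

	(*percpu)++;				/* this CPU's slot */
	__sync_fetch_and_add(shared, 1);	/* shared: atomic add */

	return 1;
}

char _license[] SEC("license") = "GPL";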
From: Roman Gushchin guro@fb.com
mainline inclusion from mainline-v4.20-rc1 commit c6fdcd6e0cc4dc316e3eb261025fb0abd69540b9 category: feature bugzilla: 43460 CVE: NA
--------------------------------------- Explicitly forbid creating a map of per-cpu cgroup local storages. This matches the behavior of shared cgroup storage; an illustrative sketch follows below.
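An illustrative sketch (not part of this patch) of what is now rejected, using the raw bpf(2) syscall; inner_fd is assumed to be the fd of an existing per-cpu cgroup storage map:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int try_outer_map(int inner_fd)
{
	union bpf_attr attr = {};

	attr.map_type     = BPF_MAP_TYPE_ARRAY_OF_MAPS;
	attr.key_size     = sizeof(__u32);
	attr.value_size   = sizeof(__u32);	/* values hold inner map fds */
	attr.max_entries  = 1;
	attr.inner_map_fd = inner_fd;

	/* bpf_map_meta_alloc() now rejects both cgroup storage flavors:
	 * this returns -1 with errno set to the kernel's ENOTSUPP (524).
	 */
	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}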
Signed-off-by: Roman Gushchin guro@fb.com
Acked-by: Song Liu songliubraving@fb.com
Cc: Daniel Borkmann daniel@iogearbox.net
Cc: Alexei Starovoitov ast@kernel.org
Signed-off-by: Daniel Borkmann daniel@iogearbox.net
Signed-off-by: liuxin liuxin264@huawei.com
Reviewed-by: Cheng Jian cj.chengjian@huawei.com
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 kernel/bpf/map_in_map.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 9670ee5ee74e..52378d3e34b3 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -25,7 +25,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 	 * in the verifier is not enough.
 	 */
 	if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
-	    inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) {
+	    inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+	    inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
 		fdput(f);
 		return ERR_PTR(-ENOTSUPP);
 	}