From: Jan Kara jack@suse.cz
mainline inclusion from mainline-5.12-rc1 commit 5ac83c644f5fb924f0b2c09102ab82fc788f8411 category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4SW26 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
This reverts commit b445547ec1bbd3e7bf4b1c142550942f70527d95.
Since both mq-deadline and BFQ completely ignore the hctx they are passed to their dispatch function and dispatch whatever request they deem fit, checking whether any request for a particular hctx is queued is just pointless, since we'll very likely get a request from a different hctx anyway. In the following commit we'll deal with lock contention in these IO schedulers in the presence of multiple HW queues in a different way.
Signed-off-by: Jan Kara jack@suse.cz Reviewed-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk
Conflicts: block/bfq-iosched.c
Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/bfq-iosched.c | 5 ----- block/blk-mq.c | 1 - block/mq-deadline.c | 6 ------ include/linux/blk-mq.h | 4 ---- 4 files changed, 16 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 70f2aeadd21c..2247db842985 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4651,9 +4651,6 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
 
-	if (!atomic_read(&hctx->elevator_queued))
-		return false;
-
 	/*
 	 * Avoiding lock: a race on bfqd->busy_queues should cause at
 	 * most a call to dispatch for nothing
@@ -5570,7 +5567,6 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 		bfq_insert_request(hctx, rq, at_head);
-		atomic_inc(&hctx->elevator_queued);
 	}
 }
 
@@ -5935,7 +5931,6 @@ static void bfq_finish_requeue_request(struct request *rq)
 			bfq_update_inject_limit(bfqd, bfqq);
 
 		bfq_completed_request(bfqq, bfqd);
-		atomic_dec(&rq->mq_hctx->elevator_queued);
 	}
 	bfq_finish_requeue_request_body(bfqq);
 	spin_unlock_irqrestore(&bfqd->lock, flags);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b8cf684030dc..8428624ac42f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2748,7 +2748,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 		goto free_hctx;
 
 	atomic_set(&hctx->nr_active, 0);
-	atomic_set(&hctx->elevator_queued, 0);
 	if (node == NUMA_NO_NODE)
 		node = set->numa_node;
 	hctx->numa_node = node;
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 43994cce1eb2..42b6e9dbe7c7 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -386,8 +386,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	spin_lock(&dd->lock);
 	rq = __dd_dispatch_request(dd);
 	spin_unlock(&dd->lock);
-	if (rq)
-		atomic_dec(&rq->mq_hctx->elevator_queued);
 
 	return rq;
 }
@@ -539,7 +537,6 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 		dd_insert_request(hctx, rq, at_head);
-		atomic_inc(&hctx->elevator_queued);
 	}
 	spin_unlock(&dd->lock);
 }
@@ -586,9 +583,6 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
 
-	if (!atomic_read(&hctx->elevator_queued))
-		return false;
-
 	return !list_empty_careful(&dd->dispatch) ||
 		!list_empty_careful(&dd->fifo_list[0]) ||
 		!list_empty_careful(&dd->fifo_list[1]);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3134aaf9032a..adcbef9705ca 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -141,10 +141,6 @@ struct blk_mq_hw_ctx {
 	 * shared across request queues.
 	 */
 	atomic_t		nr_active;
-	/**
-	 * @elevator_queued: Number of queued requests on hctx.
-	 */
-	atomic_t		elevator_queued;
 
 	/** @cpuhp_online: List to store request if CPU is going to die */
 	struct hlist_node	cpuhp_online;
From: Jan Kara jack@suse.cz
mainline inclusion from mainline-5.12-rc1 commit b6e68ee82585f2ee890b0a897a6aacbf49a467bb category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4SW26 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Currently, when a non-mq-aware IO scheduler (BFQ, mq-deadline) is used for a queue with multiple HW queues, the performance is rather bad. The problem is that these IO schedulers use queue-wide locking, and their dispatch function does not respect the hctx it is passed in and returns any request it finds appropriate. Thus locality of request access is broken and dispatch from multiple CPUs just contends on IO scheduler locks. For these IO schedulers there's little point in dispatching from multiple CPUs. Instead dispatch always only from a single CPU to limit contention.
Below is a comparison of dbench runs on XFS filesystem where the storage is a raid card with 64 HW queues and to it attached a single rotating disk. BFQ is used as IO scheduler:
clients            MQ                     SQ                 MQ-Patched
Amean 1        39.12 (0.00%)          43.29 * -10.67%*       36.09 *   7.74%*
Amean 2       128.58 (0.00%)         101.30 *  21.22%*       96.14 *  25.23%*
Amean 4       577.42 (0.00%)         494.47 *  14.37%*      508.49 *  11.94%*
Amean 8       610.95 (0.00%)         363.86 *  40.44%*      362.12 *  40.73%*
Amean 16      391.78 (0.00%)         261.49 *  33.25%*      282.94 *  27.78%*
Amean 32      324.64 (0.00%)         267.71 *  17.54%*      233.00 *  28.23%*
Amean 64      295.04 (0.00%)         253.02 *  14.24%*      242.37 *  17.85%*
Amean 512   10281.61 (0.00%)       10211.16 *   0.69%*    10447.53 *  -1.61%*
Numbers are times so lower is better. MQ is stock 5.10-rc6 kernel. SQ is the same kernel with megaraid_sas.host_tagset_enable=0 so that the card advertises just a single HW queue. MQ-Patched is a kernel with this patch applied.
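For reference, the SQ numbers above come from forcing single-queue mode via the module parameter already mentioned, e.g. on the kernel command line:

    megaraid_sas.host_tagset_enable=0

(or by passing host_tagset_enable=0 when the megaraid_sas module is loaded).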
You can see multiple hardware queues heavily hurt performance in combination with BFQ. The patch restores the performance.
Signed-off-by: Jan Kara jack@suse.cz Reviewed-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq.c | 66 ++++++++++++++++++++++++++++++++++++---- block/kyber-iosched.c | 1 + include/linux/elevator.h | 2 ++ 3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8428624ac42f..e0b833120498 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1663,6 +1663,42 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queue);
 
+/*
+ * Is the request queue handled by an IO scheduler that does not respect
+ * hardware queues when dispatching?
+ */
+static bool blk_mq_has_sqsched(struct request_queue *q)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.dispatch_request &&
+	    !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
+		return true;
+	return false;
+}
+
+/*
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
+ * scheduler.
+ */
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	/*
+	 * If the IO scheduler does not respect hardware queues when
+	 * dispatching, we just don't bother with multiple HW queues and
+	 * dispatch from hctx for the current CPU since running multiple queues
+	 * just causes lock contention inside the scheduler and pointless cache
+	 * bouncing.
+	 */
+	hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
+				     raw_smp_processor_id());
+	if (!blk_mq_hctx_stopped(hctx))
+		return hctx;
+	return NULL;
+}
+
 /**
  * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
  * @q: Pointer to the request queue to run.
@@ -1670,14 +1706,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queue);
  */
 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx, *sq_hctx;
 	int i;
 
+	sq_hctx = NULL;
+	if (blk_mq_has_sqsched(q))
+		sq_hctx = blk_mq_get_sq_hctx(q);
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (blk_mq_hctx_stopped(hctx))
 			continue;
-
-		blk_mq_run_hw_queue(hctx, async);
+		/*
+		 * Dispatch from this hctx either if there's no hctx preferred
+		 * by IO scheduler or if it has requests that bypass the
+		 * scheduler.
+		 */
+		if (!sq_hctx || sq_hctx == hctx ||
+		    !list_empty_careful(&hctx->dispatch))
+			blk_mq_run_hw_queue(hctx, async);
 	}
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);
@@ -1689,14 +1734,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
  */
 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx, *sq_hctx;
 	int i;
 
+	sq_hctx = NULL;
+	if (blk_mq_has_sqsched(q))
+		sq_hctx = blk_mq_get_sq_hctx(q);
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (blk_mq_hctx_stopped(hctx))
 			continue;
-
-		blk_mq_delay_run_hw_queue(hctx, msecs);
+		/*
+		 * Dispatch from this hctx either if there's no hctx preferred
+		 * by IO scheduler or if it has requests that bypass the
+		 * scheduler.
+		 */
+		if (!sq_hctx || sq_hctx == hctx ||
+		    !list_empty_careful(&hctx->dispatch))
+			blk_mq_delay_run_hw_queue(hctx, msecs);
 	}
 }
 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index d2648356e430..448ae410f510 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -1027,6 +1027,7 @@ static struct elevator_type kyber_sched = {
 #endif
 	.elevator_attrs = kyber_sched_attrs,
 	.elevator_name = "kyber",
+	.elevator_features = ELEVATOR_F_MQ_AWARE,
 	.elevator_owner = THIS_MODULE,
 };
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 820563e85c41..1363b5858486 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -188,6 +188,8 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 
 /* Supports zoned block devices sequential write constraint */
 #define ELEVATOR_F_ZBD_SEQ_WRITE	(1U << 0)
+/* Supports scheduling on multiple hardware queues */
+#define ELEVATOR_F_MQ_AWARE		(1U << 1)
 
 #endif /* CONFIG_BLOCK */
 #endif
From: Jiazi Li jqqlijiazi@gmail.com
mainline inclusion from mainline-v5.15-rc6 commit d208b89401e0 category: panic bugzilla: 185514 https://gitee.com/openeuler/kernel/issues/I4V6FT?from=project-issue CVE: NA
-------------------------------------------------
dm_io_dec_pending() calls end_io_acct() first and will then decrement the md in-flight pending count. But if a task is swapping the DM table at the same time, this can result in a crash due to mempool->elements being NULL:
task1                                task2
do_resume
 ->do_suspend
  ->dm_wait_for_completion
                                     bio_endio
                                      ->clone_endio
                                       ->dm_io_dec_pending
                                        ->end_io_acct
                                         ->wakeup task1
 ->dm_swap_table
  ->__bind
   ->__bind_mempools
    ->bioset_exit
     ->mempool_exit
                                       ->free_io
[ 67.330330] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
......
[ 67.330494] pstate: 80400085 (Nzcv daIf +PAN -UAO)
[ 67.330510] pc : mempool_free+0x70/0xa0
[ 67.330515] lr : mempool_free+0x4c/0xa0
[ 67.330520] sp : ffffff8008013b20
[ 67.330524] x29: ffffff8008013b20 x28: 0000000000000004
[ 67.330530] x27: ffffffa8c2ff40a0 x26: 00000000ffff1cc8
[ 67.330535] x25: 0000000000000000 x24: ffffffdada34c800
[ 67.330541] x23: 0000000000000000 x22: ffffffdada34c800
[ 67.330547] x21: 00000000ffff1cc8 x20: ffffffd9a1304d80
[ 67.330552] x19: ffffffdada34c970 x18: 000000b312625d9c
[ 67.330558] x17: 00000000002dcfbf x16: 00000000000006dd
[ 67.330563] x15: 000000000093b41e x14: 0000000000000010
[ 67.330569] x13: 0000000000007f7a x12: 0000000034155555
[ 67.330574] x11: 0000000000000001 x10: 0000000000000001
[ 67.330579] x9 : 0000000000000000 x8 : 0000000000000000
[ 67.330585] x7 : 0000000000000000 x6 : ffffff80148b5c1a
[ 67.330590] x5 : ffffff8008013ae0 x4 : 0000000000000001
[ 67.330596] x3 : ffffff80080139c8 x2 : ffffff801083bab8
[ 67.330601] x1 : 0000000000000000 x0 : ffffffdada34c970
[ 67.330609] Call trace:
[ 67.330616]  mempool_free+0x70/0xa0
[ 67.330627]  bio_put+0xf8/0x110
[ 67.330638]  dec_pending+0x13c/0x230
[ 67.330644]  clone_endio+0x90/0x180
[ 67.330649]  bio_endio+0x198/0x1b8
[ 67.330655]  dec_pending+0x190/0x230
[ 67.330660]  clone_endio+0x90/0x180
[ 67.330665]  bio_endio+0x198/0x1b8
[ 67.330673]  blk_update_request+0x214/0x428
[ 67.330683]  scsi_end_request+0x2c/0x300
[ 67.330688]  scsi_io_completion+0xa0/0x710
[ 67.330695]  scsi_finish_command+0xd8/0x110
[ 67.330700]  scsi_softirq_done+0x114/0x148
[ 67.330708]  blk_done_softirq+0x74/0xd0
[ 67.330716]  __do_softirq+0x18c/0x374
[ 67.330724]  irq_exit+0xb4/0xb8
[ 67.330732]  __handle_domain_irq+0x84/0xc0
[ 67.330737]  gic_handle_irq+0x148/0x1b0
[ 67.330744]  el1_irq+0xe8/0x190
[ 67.330753]  lpm_cpuidle_enter+0x4f8/0x538
[ 67.330759]  cpuidle_enter_state+0x1fc/0x398
[ 67.330764]  cpuidle_enter+0x18/0x20
[ 67.330772]  do_idle+0x1b4/0x290
[ 67.330778]  cpu_startup_entry+0x20/0x28
[ 67.330786]  secondary_start_kernel+0x160/0x170
Fix this by: 1) Establishing pointers to 'struct dm_io' members in dm_io_dec_pending() so that they may be passed into end_io_acct() _after_ free_io() is called. 2) Moving end_io_acct() after free_io().
Cc: stable@vger.kernel.org Signed-off-by: Jiazi Li lijiazi@xiaomi.com Signed-off-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3403722b1688..2cc3419b4889 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -607,18 +607,17 @@ static void start_io_acct(struct dm_io *io)
 				    false, 0, &io->stats_aux);
 }
 
-static void end_io_acct(struct dm_io *io)
+static void end_io_acct(struct mapped_device *md, struct bio *bio,
+			unsigned long start_time, struct dm_stats_aux *stats_aux)
 {
-	struct mapped_device *md = io->md;
-	struct bio *bio = io->orig_bio;
-	unsigned long duration = jiffies - io->start_time;
+	unsigned long duration = jiffies - start_time;
 
-	bio_end_io_acct(bio, io->start_time);
+	bio_end_io_acct(bio, start_time);
 
 	if (unlikely(dm_stats_used(&md->stats)))
 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
 				    bio->bi_iter.bi_sector, bio_sectors(bio),
-				    true, duration, &io->stats_aux);
+				    true, duration, stats_aux);
 
 	/* nudge anyone waiting on suspend queue */
 	if (unlikely(wq_has_sleeper(&md->wait)))
@@ -903,6 +902,8 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
 	blk_status_t io_error;
 	struct bio *bio;
 	struct mapped_device *md = io->md;
+	unsigned long start_time = 0;
+	struct dm_stats_aux stats_aux;
 
 	/* Push-back supersedes any I/O errors */
 	if (unlikely(error)) {
@@ -929,8 +930,10 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
 
 		io_error = io->status;
 		bio = io->orig_bio;
-		end_io_acct(io);
+		start_time = io->start_time;
+		stats_aux = io->stats_aux;
 		free_io(md, io);
+		end_io_acct(md, bio, start_time, &stats_aux);
 
 		if (io_error == BLK_STS_DM_REQUEUE)
 			return;
From: Frederic Weisbecker frederic@kernel.org
mainline inclusion from mainline-v5.13-rc1 commit b2fcf2102049f6e56981e0ab3d9b633b8e2741da category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4V8KN CVE: NA
-------------------------------------------------------------------------
This sequence of events can lead to a failure to requeue a CPU's ->nocb_timer:
1. There are no callbacks queued for any CPU covered by CPU 0-2's ->nocb_gp_kthread. Note that ->nocb_gp_kthread is associated with CPU 0.
2. CPU 1 enqueues its first callback with interrupts disabled, and thus must defer awakening its ->nocb_gp_kthread. It therefore queues its rcu_data structure's ->nocb_timer. At this point, CPU 1's rdp->nocb_defer_wakeup is RCU_NOCB_WAKE.
3. CPU 2, which shares the same ->nocb_gp_kthread, also enqueues a callback, but with interrupts enabled, allowing it to directly awaken the ->nocb_gp_kthread.
4. The newly awakened ->nocb_gp_kthread associates both CPU 1's and CPU 2's callbacks with a future grace period and arranges for that grace period to be started.
5. This ->nocb_gp_kthread goes to sleep waiting for the end of this future grace period.
6. This grace period elapses before CPU 1's timer fires. This is normally improbable given that the timer is set for only one jiffy, but timers can be delayed. Besides, it is possible that the kernel was built with CONFIG_RCU_STRICT_GRACE_PERIOD=y.
7. The grace period ends, so rcu_gp_kthread awakens the ->nocb_gp_kthread, which in turn awakens both CPU 1's and CPU 2's ->nocb_cb_kthread. Then ->nocb_gp_kthread sleeps waiting for more newly queued callbacks.
8. CPU 1's ->nocb_cb_kthread invokes its callback, then sleeps waiting for more invocable callbacks.
9. Note that neither kthread updated any ->nocb_timer state, so CPU 1's ->nocb_defer_wakeup is still set to RCU_NOCB_WAKE.
10. CPU 1 enqueues its second callback, this time with interrupts enabled so it can directly wake ->nocb_gp_kthread. It does so by calling wake_nocb_gp(), which also cancels the pending timer that got queued in step 2. But that doesn't reset CPU 1's ->nocb_defer_wakeup, which is still set to RCU_NOCB_WAKE. So CPU 1's ->nocb_defer_wakeup and its ->nocb_timer are now desynchronized.
11. ->nocb_gp_kthread associates the callback queued in 10 with a new grace period, arranges for that grace period to start and sleeps waiting for it to complete.
12. The grace period ends, rcu_gp_kthread awakens ->nocb_gp_kthread, which in turn wakes up CPU 1's ->nocb_cb_kthread which then invokes the callback queued in 10.
13. CPU 1 enqueues its third callback, this time with interrupts disabled so it must queue a timer for a deferred wakeup. However the value of its ->nocb_defer_wakeup is RCU_NOCB_WAKE which incorrectly indicates that a timer is already queued. Instead, CPU 1's ->nocb_timer was cancelled in 10. CPU 1 therefore fails to queue the ->nocb_timer.
14. CPU 1 has its pending callback and it may go unnoticed until some other CPU ever wakes up ->nocb_gp_kthread or CPU 1 ever calls an explicit deferred wakeup, for example, during idle entry.
This commit fixes this bug by resetting rdp->nocb_defer_wakeup every time we delete the ->nocb_timer.
It is quite possible that there is a similar scenario involving ->nocb_bypass_timer and ->nocb_defer_wakeup. However, despite some effort from several people, a failure scenario has not yet been located. However, that by no means guarantees that no such scenario exists. Finding a failure scenario is left as an exercise for the reader, and the "Fixes:" tag below relates to ->nocb_bypass_timer instead of ->nocb_timer.
Fixes: d1b222c6be1f ("rcu/nocb: Add bypass callback queueing") Cc: stable@vger.kernel.org Cc: Josh Triplett josh@joshtriplett.org Cc: Lai Jiangshan jiangshanlai@gmail.com Cc: Joel Fernandes joel@joelfernandes.org Cc: Boqun Feng boqun.feng@gmail.com Reviewed-by: Neeraj Upadhyay neeraju@codeaurora.org Signed-off-by: Frederic Weisbecker frederic@kernel.org Signed-off-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Zhen Lei thunder.leizhen@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/rcu/tree_plugin.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a222faffbafa..029fb69b549d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1646,7 +1646,11 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
 		rcu_nocb_unlock_irqrestore(rdp, flags);
 		return false;
 	}
-	del_timer(&rdp->nocb_timer);
+
+	if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
+		WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+		del_timer(&rdp->nocb_timer);
+	}
 	rcu_nocb_unlock_irqrestore(rdp, flags);
 	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 	if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
@@ -2167,7 +2171,6 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
 		return false;
 	}
 	ndw = READ_ONCE(rdp->nocb_defer_wakeup);
-	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
 	ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-v5.15-rc6 commit b4459b11e840 category: panic bugzilla: 185513 https://gitee.com/openeuler/kernel/issues/I4V82O?from=project-issue CVE: NA
-------------------------------------------------
DM uses blk-mq's quiesce/unquiesce to stop/start device mapper queue.
But blk-mq's unquiesce may come from outside events, such as elevator switch, updating nr_requests or others, and request may come during suspend, so simply ask for blk-mq to requeue it.
Fixes one kernel panic issue when running updating nr_requests and dm-mpath suspend/resume stress test.
Cc: stable@vger.kernel.org Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm-core.h | 13 +++++++++++++ drivers/md/dm-rq.c | 8 ++++++++ drivers/md/dm.c | 13 ------------- 3 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 3db92d9a030b..595e87e03d43 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -122,6 +122,19 @@ struct mapped_device {
 	struct srcu_struct io_barrier;
 };
 
+/*
+ * Bits for the flags field of struct mapped_device.
+ */
+#define DMF_BLOCK_IO_FOR_SUSPEND 0
+#define DMF_SUSPENDED 1
+#define DMF_FROZEN 2
+#define DMF_FREEING 3
+#define DMF_DELETING 4
+#define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_DEFERRED_REMOVE 6
+#define DMF_SUSPENDED_INTERNALLY 7
+#define DMF_POST_SUSPENDING 8
+
 void disable_discard(struct mapped_device *md);
 void disable_write_same(struct mapped_device *md);
 void disable_write_zeroes(struct mapped_device *md);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index be708b7c66a1..a75929b222a4 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -490,6 +490,14 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct mapped_device *md = tio->md;
 	struct dm_target *ti = md->immutable_target;
 
+	/*
+	 * blk-mq's unquiesce may come from outside events, such as
+	 * elevator switch, updating nr_requests or others, and request may
+	 * come during suspend, so simply ask for blk-mq to requeue it.
+	 */
+	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)))
+		return BLK_STS_RESOURCE;
+
 	if (unlikely(!ti)) {
 		int srcu_idx;
 		struct dm_table *map;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2cc3419b4889..0f8e669f0de3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -132,19 +132,6 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 
 #define MINOR_ALLOCED ((void *)-1)
 
-/*
- * Bits for the md->flags field.
- */
-#define DMF_BLOCK_IO_FOR_SUSPEND 0
-#define DMF_SUSPENDED 1
-#define DMF_FROZEN 2
-#define DMF_FREEING 3
-#define DMF_DELETING 4
-#define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_DEFERRED_REMOVE 6
-#define DMF_SUSPENDED_INTERNALLY 7
-#define DMF_POST_SUSPENDING 8
-
 #define DM_NUMA_NODE NUMA_NO_NODE
 static int dm_numa_node = DM_NUMA_NODE;
On 3/8/22 10:05 PM, Zheng Zengkai wrote:
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-v5.15-rc6 commit b4459b11e840
Unless I missed something, the above mainline commit only has the change below.
commit b4459b11e84092658fa195a2587aff3b9637f0e7
Author: Ming Lei ming.lei@redhat.com
Date:   Thu Sep 23 17:11:31 2021 +0800

    dm rq: don't queue request to blk-mq during DM suspend

    DM uses blk-mq's quiesce/unquiesce to stop/start device mapper queue.

    But blk-mq's unquiesce may come from outside events, such as elevator
    switch, updating nr_requests or others, and request may come during
    suspend, so simply ask for blk-mq to requeue it.

    Fixes one kernel panic issue when running updating nr_requests and
    dm-mpath suspend/resume stress test.

    Cc: stable@vger.kernel.org
    Signed-off-by: Ming Lei ming.lei@redhat.com
    Signed-off-by: Mike Snitzer snitzer@redhat.com

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 5b95eea517d1..a896dea9750e 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -490,6 +490,14 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct mapped_device *md = tio->md;
 	struct dm_target *ti = md->immutable_target;
 
+	/*
+	 * blk-mq's unquiesce may come from outside events, such as
+	 * elevator switch, updating nr_requests or others, and request may
+	 * come during suspend, so simply ask for blk-mq to requeue it.
+	 */
+	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)))
+		return BLK_STS_RESOURCE;
+
 	if (unlikely(!ti)) {
 		int srcu_idx;
 		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
category: panic bugzilla: 185513 https://gitee.com/openeuler/kernel/issues/I4V82O?from=project-issue CVE: NA
DM uses blk-mq's quiesce/unquiesce to stop/start device mapper queue.
But blk-mq's unquiesce may come from outside events, such as elevator switch, updating nr_requests or others, and request may come during suspend, so simply ask for blk-mq to requeue it.
Fixes one kernel panic issue when running updating nr_requests and dm-mpath suspend/resume stress test.
Cc: stable@vger.kernel.org Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
drivers/md/dm-core.h | 13 +++++++++++++ drivers/md/dm-rq.c | 8 ++++++++ drivers/md/dm.c | 13 ------------- 3 files changed, 21 insertions(+), 13 deletions(-)
But this patch changed two more files besides dm-rq.c, so it looks more like it was cherry-picked from e2118b3c3d94289852417f70ec128c25f4833aad. If so, please add another patch for the two files.
Thanks, Guoqing
From: Chao Liu liuchao173@huawei.com
euler inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4VPIB CVE: NA
-------------------------------------------------
If this config is enabled, block mappings are not used and the linear map is built from 4 KB page table entries. As a result, the TLB miss rate is high, affecting performance.
For example, tested with the libMicro benchmark:

                  enable       disable      Improve
memsetP2_10m    3540.37760   2129.715200    66.2%
memset_4k          0.38400      0.204800    87.5%
mprot_twz8k        7.16800      3.072000    133.3%
unmap_ra8k         7.93600      4.096000    93.8%
unmap_wa128k      68.86400     33.024000    108.5%
If this option is set to 'n', the additional protection can still be turned on at boot time with rodata=full, as shown below.
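For example, on a kernel built with this option disabled, the stricter mappings can be requested per boot by appending the parameter to the kernel command line:

    rodata=full

If CONFIG_PTDUMP_DEBUGFS is enabled, the resulting linear-map granularity can be checked in /sys/kernel/debug/kernel_page_tables.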
Signed-off-by: Chao Liu liuchao173@huawei.com Reviewed-by: Kai Liu kai.liu@suse.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 8d150178b5b8..18836a160cf4 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -434,7 +434,7 @@ CONFIG_ARM64_CPU_PARK=y
 # CONFIG_XEN is not set
 CONFIG_FORCE_MAX_ZONEORDER=11
 CONFIG_UNMAP_KERNEL_AT_EL0=y
-CONFIG_RODATA_FULL_DEFAULT_ENABLED=y
+# CONFIG_RODATA_FULL_DEFAULT_ENABLED is not set
 CONFIG_ARM64_PMEM_RESERVE=y
 CONFIG_ARM64_PMEM_LEGACY=m
 # CONFIG_ARM64_SW_TTBR0_PAN is not set
From: Eric Dumazet edumazet@google.com
net-next inclusion from net-next-v5.17-rc5 commit e5f80fcf869a18cc750d6b350bbfac82df292e0b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4VZN0?from=project-issue
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?...
--------------------------------
IPv6 addrconf notifiers want the loopback device to be the last device being dismantled at netns deletion.

This caused many limitations and workarounds.
Back in linux-5.3, Mahesh added a per host blackhole_netdev that can be used whenever we need to make sure objects no longer refer to a disappearing device.
If we attach to blackhole_netdev an ip6_ptr (allocate an idev), then we can use this special device (which is never freed) in place of the loopback_dev (which can be freed).
This will permit improvements in netdev_run_todo() and other parts of the stack which had steps to make sure loopback_dev was the last device to disappear.
Signed-off-by: Eric Dumazet edumazet@google.com Cc: Mahesh Bandewar maheshb@google.com Signed-off-by: David S. Miller davem@davemloft.net Conflicts: net/ipv6/addrconf.c Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv6/addrconf.c | 78 +++++++++++++++++++-------------------------- net/ipv6/route.c | 21 +++++------- 2 files changed, 40 insertions(+), 59 deletions(-)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 29526937077b..71ee4aec4af8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -368,7 +368,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 
 	ASSERT_RTNL();
 
-	if (dev->mtu < IPV6_MIN_MTU)
+	if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev)
 		return ERR_PTR(-EINVAL);
 
 	ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL);
@@ -395,21 +395,22 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	/* We refer to the device */
 	dev_hold(dev);
 
-	if (snmp6_alloc_dev(ndev) < 0) {
-		netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
-			   __func__);
-		neigh_parms_release(&nd_tbl, ndev->nd_parms);
-		dev_put(dev);
-		kfree(ndev);
-		return ERR_PTR(err);
-	}
+	if (dev != blackhole_netdev) {
+		if (snmp6_alloc_dev(ndev) < 0) {
+			netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
+				   __func__);
+			neigh_parms_release(&nd_tbl, ndev->nd_parms);
+			dev_put(dev);
+			kfree(ndev);
+			return ERR_PTR(err);
+		}
 
-	if (snmp6_register_dev(ndev) < 0) {
-		netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
-			   __func__, dev->name);
-		goto err_release;
+		if (snmp6_register_dev(ndev) < 0) {
+			netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
+				   __func__, dev->name);
+			goto err_release;
+		}
 	}
-
 	/* One reference from device. */
 	refcount_set(&ndev->refcnt, 1);
 
@@ -440,25 +441,28 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 
 	ipv6_mc_init_dev(ndev);
 	ndev->tstamp = jiffies;
-	err = addrconf_sysctl_register(ndev);
-	if (err) {
-		ipv6_mc_destroy_dev(ndev);
-		snmp6_unregister_dev(ndev);
-		goto err_release;
+	if (dev != blackhole_netdev) {
+		err = addrconf_sysctl_register(ndev);
+		if (err) {
+			ipv6_mc_destroy_dev(ndev);
+			snmp6_unregister_dev(ndev);
+			goto err_release;
+		}
 	}
 	/* protected by rtnl_lock */
 	rcu_assign_pointer(dev->ip6_ptr, ndev);
 
-	/* Join interface-local all-node multicast group */
-	ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
-
-	/* Join all-node multicast group */
-	ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+	if (dev != blackhole_netdev) {
+		/* Join interface-local all-node multicast group */
+		ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
 
-	/* Join all-router multicast group if forwarding is set */
-	if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
-		ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+		/* Join all-node multicast group */
+		ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
 
+		/* Join all-router multicast group if forwarding is set */
+		if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
+			ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+	}
 	return ndev;
 
 err_release:
@@ -7115,26 +7119,8 @@ int __init addrconf_init(void)
 		goto out_nowq;
 	}
 
-	/* The addrconf netdev notifier requires that loopback_dev
-	 * has it's ipv6 private information allocated and setup
-	 * before it can bring up and give link-local addresses
-	 * to other devices which are up.
-	 *
-	 * Unfortunately, loopback_dev is not necessarily the first
-	 * entry in the global dev_base list of net devices.  In fact,
-	 * it is likely to be the very last entry on that list.
-	 * So this causes the notifier registry below to try and
-	 * give link-local addresses to all devices besides loopback_dev
-	 * first, then loopback_dev, which cases all the non-loopback_dev
-	 * devices to fail to get a link-local address.
-	 *
-	 * So, as a temporary fix, allocate the ipv6 structure for
-	 * loopback_dev first by hand.
-	 * Longer term, all of the dependencies ipv6 has upon the loopback
-	 * device and it being up should be removed.
-	 */
 	rtnl_lock();
-	idev = ipv6_add_dev(init_net.loopback_dev);
+	idev = ipv6_add_dev(blackhole_netdev);
 	rtnl_unlock();
 	if (IS_ERR(idev)) {
 		err = PTR_ERR(idev);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 654bf4ca6126..668b36b42699 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -156,14 +156,10 @@ void rt6_uncached_list_del(struct rt6_info *rt)
 	}
 }
 
-static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
+static void rt6_uncached_list_flush_dev(struct net_device *dev)
 {
-	struct net_device *loopback_dev = net->loopback_dev;
 	int cpu;
 
-	if (dev == loopback_dev)
-		return;
-
 	for_each_possible_cpu(cpu) {
 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
 		struct rt6_info *rt;
@@ -174,7 +170,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 			struct net_device *rt_dev = rt->dst.dev;
 
 			if (rt_idev->dev == dev) {
-				rt->rt6i_idev = in6_dev_get(loopback_dev);
+				rt->rt6i_idev = in6_dev_get(blackhole_netdev);
 				in6_dev_put(rt_idev);
 			}
 
@@ -372,13 +368,12 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
 	struct inet6_dev *idev = rt->rt6i_idev;
-	struct net_device *loopback_dev =
-		dev_net(dev)->loopback_dev;
 
-	if (idev && idev->dev != loopback_dev) {
-		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
-		if (loopback_idev) {
-			rt->rt6i_idev = loopback_idev;
+	if (idev && idev->dev != blackhole_netdev) {
+		struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);
+
+		if (blackhole_idev) {
+			rt->rt6i_idev = blackhole_idev;
 			in6_dev_put(idev);
 		}
 	}
@@ -4799,7 +4794,7 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
 void rt6_disable_ip(struct net_device *dev, unsigned long event)
 {
 	rt6_sync_down_dev(dev, event);
-	rt6_uncached_list_flush_dev(dev_net(dev), dev);
+	rt6_uncached_list_flush_dev(dev);
 	neigh_ifdown(&nd_tbl, dev);
 }
From: Jakub Kicinski kuba@kernel.org
net-next inclusion from net-next-v5.17-rc5 commit ae68db14b6164ce46beffaf35eb7c9bb2f92fee3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4VZN0?from=project-issue
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?...
--------------------------------
In prep for unregistering netdevs out of order move the netdev state validation and change outside of the loop.
While at it modernize this code and use WARN() instead of pr_err() + dump_stack().
Reviewed-by: Eric Dumazet edumazet@google.com Reviewed-by: Xin Long lucien.xin@gmail.com Link: https://lore.kernel.org/r/20220215225310.3679266-1-kuba@kernel.org Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/dev.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 738632e04653..2c2fd376306c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10336,6 +10336,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
  */
 void netdev_run_todo(void)
 {
+	struct net_device *dev, *tmp;
 	struct list_head list;
 #ifdef CONFIG_LOCKDEP
 	struct list_head unlink_list;
@@ -10356,24 +10357,23 @@ void netdev_run_todo(void)
 
 	__rtnl_unlock();
 
-
 	/* Wait for rcu callbacks to finish before next phase */
 	if (!list_empty(&list))
 		rcu_barrier();
 
-	while (!list_empty(&list)) {
-		struct net_device *dev
-			= list_first_entry(&list, struct net_device, todo_list);
-		list_del(&dev->todo_list);
-
+	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
-			pr_err("network todo '%s' but state %d\n",
-			       dev->name, dev->reg_state);
-			dump_stack();
+			netdev_WARN(dev, "run_todo but not unregistering\n");
+			list_del(&dev->todo_list);
 			continue;
 		}
 
 		dev->reg_state = NETREG_UNREGISTERED;
+	}
+
+	while (!list_empty(&list)) {
+		dev = list_first_entry(&list, struct net_device, todo_list);
+		list_del(&dev->todo_list);
 
 		netdev_wait_allrefs(dev);
From: Jakub Kicinski kuba@kernel.org
net-next inclusion from net-next-v5.17-rc5 commit faab39f63c1fc4bcdf135690f03bd596b578c67e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4VZN0?from=project-issue
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?...
--------------------------------
Sprinkle for each loops to allow netdevices to be unregistered out of order, as their refs are released.
This prevents problems caused by dependencies between netdevs which want to release references in their ->priv_destructor. See commit d6ff94afd90b ("vlan: move dev_put into vlan_dev_uninit") for example.
Eric has removed the only known ordering requirement in commit c002496babfd ("Merge branch 'ipv6-loopback'") so let's try this and see if anything explodes...
Reviewed-by: Eric Dumazet edumazet@google.com Reviewed-by: Xin Long lucien.xin@gmail.com Link: https://lore.kernel.org/r/20220215225310.3679266-2-kuba@kernel.org Signed-off-by: Jakub Kicinski kuba@kernel.org Conflicts: net/core/dev.c Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/dev.c | 59 +++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 25 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 2c2fd376306c..d035234ce00f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10244,8 +10244,8 @@ EXPORT_SYMBOL(netdev_refcnt_read);
 #define WAIT_REFS_MIN_MSECS 1
 #define WAIT_REFS_MAX_MSECS 250
 /**
- * netdev_wait_allrefs - wait until all references are gone.
- * @dev: target net_device
+ * netdev_wait_allrefs_any - wait until all references are gone.
+ * @list: list of net_devices to wait on
  *
  * This is called when unregistering network devices.
  *
@@ -10255,37 +10255,45 @@ EXPORT_SYMBOL(netdev_refcnt_read);
  * We can get stuck here if buggy protocols don't correctly
  * call dev_put.
  */
-static void netdev_wait_allrefs(struct net_device *dev)
+static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
 {
 	unsigned long rebroadcast_time, warning_time;
-	int wait = 0, refcnt;
+	struct net_device *dev;
+	int wait = 0;
 
-	linkwatch_forget_dev(dev);
+	list_for_each_entry(dev, list, todo_list)
+		linkwatch_forget_dev(dev);
 
 	rebroadcast_time = warning_time = jiffies;
-	refcnt = netdev_refcnt_read(dev);
 
-	while (refcnt != 0) {
+	list_for_each_entry(dev, list, todo_list)
+		if (netdev_refcnt_read(dev) == 0)
+			return dev;
+
+	while (true) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_lock();
 
 			/* Rebroadcast unregister notification */
-			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+			list_for_each_entry(dev, list, todo_list)
+				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
 			__rtnl_unlock();
 			rcu_barrier();
 			rtnl_lock();
 
-			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
-				     &dev->state)) {
-				/* We must not have linkwatch events
-				 * pending on unregister. If this
-				 * happens, we simply run the queue
-				 * unscheduled, resulting in a noop
-				 * for this device.
-				 */
-				linkwatch_run_queue();
-			}
+			list_for_each_entry(dev, list, todo_list)
+				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+					     &dev->state)) {
+					/* We must not have linkwatch events
+					 * pending on unregister. If this
+					 * happens, we simply run the queue
+					 * unscheduled, resulting in a noop
+					 * for this device.
+					 */
+					linkwatch_run_queue();
+					break;
+				}
 
 			__rtnl_unlock();
 
@@ -10300,11 +10308,14 @@ static void netdev_wait_allrefs(struct net_device *dev)
 		wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
 	}
 
-		refcnt = netdev_refcnt_read(dev);
+		list_for_each_entry(dev, list, todo_list)
+			if (netdev_refcnt_read(dev) == 0)
+				return dev;
 
-		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
-			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
-				 dev->name, refcnt);
+		if (time_after(jiffies, warning_time + 10 * HZ)) {
+			list_for_each_entry(dev, list, todo_list)
+				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+					 dev->name, netdev_refcnt_read(dev));
 			warning_time = jiffies;
 		}
 	}
@@ -10372,11 +10383,9 @@ void netdev_run_todo(void)
 	}
 
 	while (!list_empty(&list)) {
-		dev = list_first_entry(&list, struct net_device, todo_list);
+		dev = netdev_wait_allrefs_any(&list);
 		list_del(&dev->todo_list);
 
-		netdev_wait_allrefs(dev);
-
 		/* paranoia */
 		BUG_ON(netdev_refcnt_read(dev));
 		BUG_ON(!list_empty(&dev->ptype_all));
From: Eric Dumazet edumazet@google.com
net-next inclusion from net-next-v5.17-rc5 commit 86213f80da1b1d007721cc22e04b5f5d0da33127 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4VZN0?from=project-issue
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?...
--------------------------------
If the list of devices has N elements, netdev_wait_allrefs_any() is called N times, and linkwatch_forget_dev() is called N*(N-1)/2 times.
Fix this by calling linkwatch_forget_dev() only once per device.
Fixes: faab39f63c1f ("net: allow out-of-order netdev unregistration") Signed-off-by: Eric Dumazet edumazet@google.com Link: https://lore.kernel.org/r/20220218065430.2613262-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/dev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index d035234ce00f..b54a4158e1db 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10261,9 +10261,6 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
 	struct net_device *dev;
 	int wait = 0;
 
-	list_for_each_entry(dev, list, todo_list)
-		linkwatch_forget_dev(dev);
-
 	rebroadcast_time = warning_time = jiffies;
 
 	list_for_each_entry(dev, list, todo_list)
@@ -10380,6 +10377,7 @@ void netdev_run_todo(void)
 		}
 
 		dev->reg_state = NETREG_UNREGISTERED;
+		linkwatch_forget_dev(dev);
 	}
 
 	while (!list_empty(&list)) {
From: Ido Schimmel idosch@nvidia.com
net-next inclusion from net-next-v5.17-rc5 commit dd263a8cb1941d2d34a55633bd5366d9bebf4be8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4VZN0?from=project-issue
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?...
--------------------------------
Whenever rt6_uncached_list_flush_dev() swaps rt->rt6i_idev to the blackhole device, parts of the IPv6 stack might still need to increment one SNMP counter.
Root cause, patch from Ido, changelog from Eric :)
This bug suggests that we need to audit rt->rt6i_idev usages and make sure they are properly using RCU protection.
Fixes: e5f80fcf869a ("ipv6: give an IPv6 dev to blackhole_netdev") Signed-off-by: Ido Schimmel idosch@nvidia.com Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Conflicts: net/ipv6/addrconf.c Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv6/addrconf.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 71ee4aec4af8..f88023a34d0a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -395,16 +395,16 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	/* We refer to the device */
 	dev_hold(dev);
 
-	if (dev != blackhole_netdev) {
-		if (snmp6_alloc_dev(ndev) < 0) {
-			netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
-				   __func__);
-			neigh_parms_release(&nd_tbl, ndev->nd_parms);
-			dev_put(dev);
-			kfree(ndev);
-			return ERR_PTR(err);
-		}
+	if (snmp6_alloc_dev(ndev) < 0) {
+		netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
+			   __func__);
+		neigh_parms_release(&nd_tbl, ndev->nd_parms);
+		dev_put(dev);
+		kfree(ndev);
+		return ERR_PTR(err);
+	}
 
+	if (dev != blackhole_netdev) {
 		if (snmp6_register_dev(ndev) < 0) {
 			netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
 				   __func__, dev->name);
From: Jialin Zhang zhangjialin11@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4JZ0H CVE: NA
-------------------------------
Use the kABI reference checking tool as follows:
./scripts/check-kabi -k Module.symvers.baseline -s Module.symvers
A python error occurred, and the following traceback is printed:
Traceback (most recent call last):
  File "./scripts/check-kabi", line 144, in <module>
    load_symvers(symvers,symvers_file)
  File "./scripts/check-kabi", line 45, in load_symvers
    checksum,symbol,directory,type = string.split(in_line)
ValueError: too many values to unpack
It is because the Module.symvers file change its line format in the following commits, and the namespace field may be empty: cb9b55d21fe0 ("modpost: add support for symbol namespaces") 5190044c2965 ("modpost: move the namespace field in Module.symvers last")
In order to solve this problem, use '\t' to split each line and add a variable to hold the namespace field, as illustrated below.
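For illustration, after those commits each Module.symvers line carries five tab-separated fields, with the (possibly empty) namespace last; the symbols below are made up:

    0x12345678 <tab> symbol_foo <tab> vmlinux <tab> EXPORT_SYMBOL <tab>
    0x9abcdef0 <tab> symbol_bar <tab> drivers/foo/bar <tab> EXPORT_SYMBOL_GPL <tab> BAR_NS

A plain whitespace split() yields four tokens for the first line and five for the second, while splitting on '\t' always yields exactly five fields.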
Fixes: 9fc7fbafe3d3 ("kabi: add kABI reference checking tool") Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- scripts/check-kabi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/scripts/check-kabi b/scripts/check-kabi
index e3ec97cebffa..b1b55d512e61 100755
--- a/scripts/check-kabi
+++ b/scripts/check-kabi
@@ -42,7 +42,7 @@ def load_symvers(symvers,filename):
 			break
 		if in_line == "\n":
 			continue
-		checksum,symbol,directory,type = string.split(in_line)
+		checksum,symbol,directory,type,namespace = string.split(in_line, sep='\t')
 
 		symvers[symbol] = in_line[0:-1]
 
@@ -57,7 +57,7 @@ def load_kabi(kabi,filename):
 			break
 		if in_line == "\n":
 			continue
-		checksum,symbol,directory,type = string.split(in_line)
+		checksum,symbol,directory,type,namespace = string.split(in_line, sep='\t')
 
 		kabi[symbol] = in_line[0:-1]
 
@@ -70,9 +70,9 @@ def check_kabi(symvers,kabi):
 	moved_symbols=[]
 
 	for symbol in kabi:
-		abi_hash,abi_sym,abi_dir,abi_type = string.split(kabi[symbol])
+		abi_hash,abi_sym,abi_dir,abi_type,namespace = string.split(kabi[symbol], sep='\t')
 		if symvers.has_key(symbol):
-			sym_hash,sym_sym,sym_dir,sym_type = string.split(symvers[symbol])
+			sym_hash,sym_sym,sym_dir,sym_type,namespace = string.split(symvers[symbol], sep='\t')
 			if abi_hash != sym_hash:
 				fail=1
 				changed_symbols.append(symbol)
From: Wang ShaoBo bobo.shaobowang@huawei.com
mainline inclusion from mainline-v5.16-rc1 commit 4cc4cc28ec4154c4f1395648ab67ac9fd3e71fdc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WA1K CVE: NA
--------------------------------
When testing cpu online and offline, warning happened like this:
[ 146.746743] WARNING: CPU: 92 PID: 974 at kernel/sched/topology.c:2215 build_sched_domains+0x81c/0x11b0
[ 146.749988] CPU: 92 PID: 974 Comm: kworker/92:2 Not tainted 5.15.0 #9
[ 146.750402] Hardware name: Huawei TaiShan 2280 V2/BC82AMDDA, BIOS 1.79 08/21/2021
[ 146.751213] Workqueue: events cpuset_hotplug_workfn
[ 146.751629] pstate: 00400009 (nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 146.752048] pc : build_sched_domains+0x81c/0x11b0
[ 146.752461] lr : build_sched_domains+0x414/0x11b0
[ 146.752860] sp : ffff800040a83a80
[ 146.753247] x29: ffff800040a83a80 x28: ffff20801f13a980 x27: ffff20800448ae00
[ 146.753644] x26: ffff800012a858e8 x25: ffff800012ea48c0 x24: 0000000000000000
[ 146.754039] x23: ffff800010ab7d60 x22: ffff800012f03758 x21: 000000000000005f
[ 146.754427] x20: 000000000000005c x19: ffff004080012840 x18: ffffffffffffffff
[ 146.754814] x17: 3661613030303230 x16: 30303078303a3239 x15: ffff800011f92b48
[ 146.755197] x14: ffff20be3f95cef6 x13: 2e6e69616d6f642d x12: 6465686373204c4c
[ 146.755578] x11: ffff20bf7fc83a00 x10: 0000000000000040 x9 : 0000000000000000
[ 146.755957] x8 : 0000000000000002 x7 : ffffffffe0000000 x6 : 0000000000000002
[ 146.756334] x5 : 0000000090000000 x4 : 00000000f0000000 x3 : 0000000000000001
[ 146.756705] x2 : 0000000000000080 x1 : ffff800012f03860 x0 : 0000000000000001
[ 146.757070] Call trace:
[ 146.757421]  build_sched_domains+0x81c/0x11b0
[ 146.757771]  partition_sched_domains_locked+0x57c/0x978
[ 146.758118]  rebuild_sched_domains_locked+0x44c/0x7f0
[ 146.758460]  rebuild_sched_domains+0x2c/0x48
[ 146.758791]  cpuset_hotplug_workfn+0x3fc/0x888
[ 146.759114]  process_one_work+0x1f4/0x480
[ 146.759429]  worker_thread+0x48/0x460
[ 146.759734]  kthread+0x158/0x168
[ 146.760030]  ret_from_fork+0x10/0x20
[ 146.760318] ---[ end trace 82c44aad6900e81a ]---
For some architectures like risc-v and arm64 which use the common code clear_cpu_topology() when shutting down CPUx, if CONFIG_SCHED_CLUSTER is set, cluster_sibling in the cpu_topology of each sibling adjacent to CPUx is not cleared. This causes the check in topology_span_sane() to fail and the topology rebuild to fail when the CPU comes back online.
Different siblings' cluster_sibling in cpu_topology[] when CPU92 goes offline (CPUs 92, 93, 94, 95 are in one cluster):

Before revision:
CPU             [92]    [93]      [94]      [95]
cluster_sibling [92]    [92-95]   [92-95]   [92-95]

After revision:
CPU             [92]    [93]      [94]      [95]
cluster_sibling [92]    [93-95]   [93-95]   [93-95]
Fixes: c5e22feffdd7 ("topology: Represent clusters of CPUs within a die") Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Dietmar Eggemann dietmar.eggemann@arm.com Acked-by: Barry Song song.bao.hua@hisilicon.com Tested-by: Dietmar Eggemann dietmar.eggemann@arm.com Link: https://lore.kernel.org/r/20211110095856.469360-1-bobo.shaobowang@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/base/arch_topology.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 21e63b6fab83..0d19fe5c99d4 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -586,6 +586,8 @@ void remove_cpu_topology(unsigned int cpu)
 		cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
 	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
 		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+	for_each_cpu(sibling, topology_cluster_cpumask(cpu))
+		cpumask_clear_cpu(cpu, topology_cluster_cpumask(sibling));
 	for_each_cpu(sibling, topology_llc_cpumask(cpu))
 		cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));
From: Zhen Lei thunder.leizhen@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4V6F8 CVE: NA
-------------------------------------------------------------------------
access_ok() is used to preliminarily check 'uaddr' to avoid unnecessary page faults caused by invalid input. The page fault handler then does the accurate address verification based on the task's mm. access_ok() is also used to check against get_fs(); see the comments of __access_ok().

But the support for get_fs() on arm64 has now been removed by commit edf84200127a ("arm64: uaccess: remove set_fs()"). So access_ok() no longer needs to perform such strict checks for compat tasks.

Removing the is_compat_task() check can improve the performance of syscalls. For example, all test items of libMicro can be improved by 4.89% on average.

The next patch will avoid calling is_ilp32_compat_task() by default by disabling its build option, because ILP32 has specific requirements.
Signed-off-by: Zhen Lei thunder.leizhen@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Liu Chao (CR) liuchao173@huawei.com Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 8e6f1af816c9..617cdd40c61b 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -49,7 +49,7 @@
 #define TASK_SIZE_64		(UL(1) << vabits_actual)
 
 #ifdef CONFIG_COMPAT
-#define TASK_SIZE_MAX		(is_compat_task() ? \
+#define TASK_SIZE_MAX		(is_ilp32_compat_task() ? \
 				UL(0x100000000) : (UL(1) << VA_BITS))
 #if defined(CONFIG_ARM64_64K_PAGES) && defined(CONFIG_KUSER_HELPERS)
 /*
From: Zhen Lei thunder.leizhen@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4V6F8 CVE: NA
-------------------------------------------------------------------------
No one currently uses ILP32, so disable it to simplify the code logic of TASK_SIZE_MAX, thereby improving benchmark performance. For example, all test items of libMicro can be improved by 4.89% on average.
Signed-off-by: Zhen Lei thunder.leizhen@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Liu Chao (CR) liuchao173@huawei.com Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 - 1 file changed, 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 18836a160cf4..d8b8ffbd3f78 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -439,7 +439,6 @@ CONFIG_ARM64_PMEM_RESERVE=y
 CONFIG_ARM64_PMEM_LEGACY=m
 # CONFIG_ARM64_SW_TTBR0_PAN is not set
 CONFIG_ARM64_TAGGED_ADDR_ABI=y
-CONFIG_ARM64_ILP32=y
 CONFIG_AARCH32_EL0=y
 # CONFIG_KUSER_HELPERS is not set
 CONFIG_ARMV8_DEPRECATED=y
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I47W8L CVE: NA
---------------------------
We detected a performance deterioration when using Unixbench and used bisection to locate the patch 7e66740ad725 ("MPAM / ACPI: Refactoring MPAM init process and set MPAM ACPI as entrance"). Comparing the two commits df5defd901ff ("KVM: X86: MMU: Use the correct inherited permissions to get shadow page") and ac4dbb7554ef ("ACPI 6.x: Add definitions for MPAM table"), we get the following test result:
CMD: ./Run -c xx context1
RESULT:
+-------------UnixBench context1-----------+
+---------+--------------+-----------------+
+         + ac4dbb7554ef +  df5defd901ff   +
+---------+--------------+-----------------+
+  Cores  +    Score     +      Score      +
+---------+--------------+-----------------+
+    1    +    522.8     +      535.7      +
+---------+--------------+-----------------+
+   24    +   11231.5    +     12111.2     +
+---------+--------------+-----------------+
+   48    +    8535.1    +      8745.1     +
+---------+--------------+-----------------+
+   72    +   10821.9    +     10343.8     +
+---------+--------------+-----------------+
+   96    +   15238.5    +     42947.8     +
+---------+--------------+-----------------+
We found a clear difference in latency sampling when using the perf tool:
HEAD:ac4dbb7554ef                                    HEAD:df5defd901ff

45.18%  [kernel]  [k] ktime_get_coarse_real_ts64  ->  1.78%  [kernel]  [k] ktime_get_coarse_real_ts64
...
  65.87 │ dmb ishld    // smp_rmb()
Through ftrace we got the call trace and counted the number of visits to ktime_get_coarse_real_ts64, which frequently accesses tk_core.seq and tk_core.timekeeper.tkr_mono:
- 48.86%  [kernel]  [k] ktime_get_coarse_real_ts64
   - 5.76% ktime_get_coarse_real_ts64    # about 111437657 times per 10 seconds
      - 14.70% __audit_syscall_entry
               syscall_trace_enter
               el0_svc_common
               el0_svc_handler
             + el0_svc
      - 2.85% current_time
So this may be performance degradation caused by interference between accesses to different fields. We compared the .bss and .data sections of the two versions:
HEAD:ac4dbb7554ef
`-> ffff00000962e680 l     O .bss   0000000000000110 tk_core
    ffff000009355680 l     O .data  0000000000000078 tk_fast_mono
    ffff0000093557a0 l     O .data  0000000000000090 dummy_clock
    ffff000009355700 l     O .data  0000000000000078 tk_fast_raw
    ffff000009355778 l     O .data  0000000000000028 timekeeping_syscore_ops
    ffff00000962e640 l     O .bss   0000000000000008 cycles_at_suspend

HEAD:df5defd901ff
`-> ffff00000957dbc0 l     O .bss   0000000000000110 tk_core
    ffff0000092b4e80 l     O .data  0000000000000078 tk_fast_mono
    ffff0000092b4fa0 l     O .data  0000000000000090 dummy_clock
    ffff0000092b4f00 l     O .data  0000000000000078 tk_fast_raw
    ffff0000092b4f78 l     O .data  0000000000000028 timekeeping_syscore_ops
    ffff00000957db80 l     O .bss   0000000000000008 cycles_at_suspend
Comparing the two versions' tk_core addresses: ffff00000962e680 (ac4dbb7554ef) is 128-byte aligned, while ffff00000957dbc0 (df5defd901ff) is only 64-byte aligned, so the memory layout of tk_core has undergone a subtle change:
HEAD:ac4dbb7554ef
`->                 |<---------former 64 Bytes--------->|<----------latter 64 Bytes---------->|
0xffff00000962e680_>|<-seq 8Bytes->|<-tkr_mono 56Bytes->|<-tkr_raw 56Bytes->|<-xtime_sec 8Bytes->|
0xffff00000962e700_>...

HEAD:df5defd901ff
`->                 |<-------former 64 Bytes------->|<--------latter 64 Bytes--------->|
0xffff00000957db80_>|<-Other variables 64Bytes->|<-seq 8Bytes->|<-tkr_mono 56Bytes->|
0xffff00000957dc00_>..
We verified that the tkr_raw and xtime_sec fields interfere strongly with the seq and tkr_mono fields because of frequent load/store operations; this causes the well-known false sharing.

We add a 64-byte padding field to tk_core (reserved for any future use) and keep tk_core 128-byte aligned; this prevents the way tk_core's layout is stored from changing again. With this solution, the layout of tk_core is always:
crash> struct -o tk_core_t
struct tk_core_t {
    [0] u64 padding[8];
   [64] seqcount_t seq;
   [72] struct timekeeper timekeeper;
}
SIZE: 336

crash> struct -o timekeeper
struct timekeeper {
    [0] struct tk_read_base tkr_mono;
   [56] struct tk_read_base tkr_raw;
  [112] u64 xtime_sec;
  [120] unsigned long ktime_sec;
  ...
}
SIZE: 264
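The same idea in a minimal, self-contained C sketch (illustrative type names, not the kernel's own code): the 64-byte padding plus forced 128-byte alignment keeps seq/tkr_mono on one 128-byte LLC line and pushes the frequently written fields onto the next.

#include <assert.h>
#include <stddef.h>

struct read_mostly { char bytes[56]; }; /* stand-in for tkr_mono */
struct write_often { char bytes[56]; }; /* stand-in for tkr_raw  */

struct tk_core_sketch {
	unsigned long long padding[8];  /* 64 bytes reserved, as in the patch */
	unsigned long long seq;         /* hot: starts the second 64-byte half */
	struct read_mostly tkr_mono;    /* hot: fills the line up to byte 128 */
	struct write_often tkr_raw;     /* begins the next 128-byte LLC line */
	unsigned long long xtime_sec;
} __attribute__((__aligned__(128)));

int main(void)
{
	/* seq..tkr_mono end exactly at offset 128, so they never share a
	 * 128-byte LLC line with the frequently written fields below. */
	assert(offsetof(struct tk_core_sketch, seq) == 64);
	assert(offsetof(struct tk_core_sketch, tkr_raw) == 128);
	return 0;
}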
After applying our solution:
+---------+--------------+
+         + Our solution +
+---------+--------------+
+  Cores  +    Score     +
+---------+--------------+
+    1    +    548.9     +
+---------+--------------+
+   24    +   11018.3    +
+---------+--------------+
+   48    +    8938.2    +
+---------+--------------+
+   72    +   14610.7    +
+---------+--------------+
+   96    +   40811.7    +
+---------+--------------+
Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/Kconfig | 9 +++++++++ arch/arm64/include/asm/cache.h | 6 ++++++ kernel/time/timekeeping.c | 7 +++++++ 3 files changed, 22 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c0f6a275f798..9c6ad627ba4f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1155,6 +1155,15 @@ config ARCH_WANT_HUGE_PMD_SHARE config ARCH_HAS_CACHE_LINE_SIZE def_bool y
+config ARCH_LLC_128_LINE_SIZE + bool "Force 128 bytes alignment for fitting LLC cacheline" + depends on ARM64 + default y + help + As specific machine's LLC cacheline size may be up to + 128 bytes, gaining performance improvement from fitting + 128 Bytes LLC cache aligned. + config ARCH_HAS_FILTER_PGPROT def_bool y
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index cf8e78585865..f7e1d1bb8172 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -31,6 +31,12 @@ #define L1_CACHE_SHIFT (6) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
+#ifdef CONFIG_ARCH_LLC_128_LINE_SIZE +#ifndef ____cacheline_aligned_128 +#define ____cacheline_aligned_128 __attribute__((__aligned__(128))) +#endif +#endif +
#define CLIDR_LOUU_SHIFT 27 #define CLIDR_LOC_SHIFT 24 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cc4dc2857a87..72cad9bf19d7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -46,9 +46,16 @@ DEFINE_RAW_SPINLOCK(timekeeper_lock); * cache line. */ static struct { +#ifdef CONFIG_ARCH_LLC_128_LINE_SIZE + u64 padding[8]; +#endif seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; +#ifdef CONFIG_ARCH_LLC_128_LINE_SIZE +} tk_core ____cacheline_aligned_128 = { +#else } tk_core ____cacheline_aligned = { +#endif .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), };
From: Keqian Zhu zhukeqian1@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WK5B CVE: NA
------------------------------
The previous patch extended the dirty log feature for SMMUv3 nested mode, but forgot to remove the S1 mapping restriction.
Fixes: 97e11307edcc (iommu/arm-smmu-v3: Using HTTU with SMMU STE and stage 2 TTD) Signed-off-by: Keqian Zhu zhukeqian1@huawei.com Tested-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index f6868511ad01..e463fd31d268 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3533,7 +3533,7 @@ static int arm_smmu_switch_dirty_log(struct iommu_domain *domain, bool enable,
if (!(smmu->features & ARM_SMMU_FEAT_HD)) return -ENODEV; - if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS) return -EINVAL;
if (enable) { @@ -3574,7 +3574,7 @@ static int arm_smmu_sync_dirty_log(struct iommu_domain *domain,
if (!(smmu->features & ARM_SMMU_FEAT_HD)) return -ENODEV; - if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS) return -EINVAL;
if (!ops || !ops->sync_dirty_log) { @@ -3603,7 +3603,7 @@ static int arm_smmu_clear_dirty_log(struct iommu_domain *domain,
if (!(smmu->features & ARM_SMMU_FEAT_HD)) return -ENODEV; - if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS) return -EINVAL;
if (!ops || !ops->clear_dirty_log) {
From: Keqian Zhu zhukeqian1@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WK5B CVE: NA
------------------------------
The iommu_domain may contain more than one DMA range, each of which can be dirty-log tracked separately, so it is hard to track the dirty log status of the whole iommu_domain. The upper layer (e.g. vfio) should make sure it is doing the right thing.
Fixes: bbf3b39e5576 (iommu: Introduce dirty log tracking framework) Signed-off-by: Keqian Zhu zhukeqian1@huawei.com Tested-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/iommu.c | 25 ++++--------------------- include/linux/iommu.h | 1 - 2 files changed, 4 insertions(+), 22 deletions(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 25b3b8386ca9..9e7de0e5b9e8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3017,13 +3017,6 @@ int iommu_switch_dirty_log(struct iommu_domain *domain, bool enable, }
mutex_lock(&domain->switch_log_lock); - if (enable && domain->dirty_log_tracking) { - ret = -EBUSY; - goto out; - } else if (!enable && !domain->dirty_log_tracking) { - ret = -EINVAL; - goto out; - }
pr_debug("switch_dirty_log %s for: iova 0x%lx size 0x%zx\n", enable ? "enable" : "disable", iova, size); @@ -3046,11 +3039,9 @@ int iommu_switch_dirty_log(struct iommu_domain *domain, bool enable, if (flush) iommu_flush_iotlb_all(domain);
- if (!ret) { - domain->dirty_log_tracking = enable; + if (!ret) trace_switch_dirty_log(orig_iova, orig_size, enable); - } -out: + mutex_unlock(&domain->switch_log_lock); return ret; } @@ -3077,10 +3068,6 @@ int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova, }
mutex_lock(&domain->switch_log_lock); - if (!domain->dirty_log_tracking) { - ret = -EINVAL; - goto out; - }
pr_debug("sync_dirty_log for: iova 0x%lx size 0x%zx\n", iova, size);
@@ -3101,7 +3088,7 @@ int iommu_sync_dirty_log(struct iommu_domain *domain, unsigned long iova,
if (!ret) trace_sync_dirty_log(orig_iova, orig_size); -out: + mutex_unlock(&domain->switch_log_lock); return ret; } @@ -3163,10 +3150,6 @@ int iommu_clear_dirty_log(struct iommu_domain *domain, }
mutex_lock(&domain->switch_log_lock); - if (!domain->dirty_log_tracking) { - ret = -EINVAL; - goto out; - }
start = (iova - base_iova) >> bitmap_pgshift; end = start + (size >> bitmap_pgshift); @@ -3182,7 +3165,7 @@ int iommu_clear_dirty_log(struct iommu_domain *domain,
if (flush) iommu_flush_iotlb_all(domain); -out: + mutex_unlock(&domain->switch_log_lock); return ret; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 591a6c5d2ddf..8baf5ed66a84 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -87,7 +87,6 @@ struct iommu_domain { void *handler_token; struct iommu_domain_geometry geometry; void *iova_cookie; - bool dirty_log_tracking; struct mutex switch_log_lock; KABI_RESERVE(1) KABI_RESERVE(2)
From: Keqian Zhu zhukeqian1@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WK5B CVE: NA
------------------------------
When a newly created iommu_domain is not HWDBM capable, the vfio_iommu's num_non_hwdbm_domains field should be increased, but it was not. Fix this bug.
Fixes: d2373c56459f (vfio/iommu_type1: Add HWDBM status maintenance) Signed-off-by: Keqian Zhu zhukeqian1@huawei.com Tested-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Kunkun Jiang jiangkunkun@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/vfio/vfio_iommu_type1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5daceec48811..f556b572c86d 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2420,7 +2420,7 @@ static void vfio_iommu_update_hwdbm(struct vfio_iommu *iommu, bool num_non_hwdbm_zeroed = false; bool log_enabled, should_enable;
- if (old_hwdbm && !new_hwdbm && attach) { + if ((old_hwdbm || singular) && !new_hwdbm && attach) { iommu->num_non_hwdbm_domains++; } else if (!old_hwdbm && new_hwdbm && !attach) { iommu->num_non_hwdbm_domains--;
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WKMY CVE: NA
--------------------------------
cfs_bandwidth_usage_inc() needs to hold jump_label_mutex and might sleep, so we cannot call it in atomic context. Fix this by moving cfs_bandwidth_usage_{inc,dec}() out of the RCU read-side critical section.
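A kernel-style sketch of the corrected ordering (mirroring the diff below; not standalone code): the sleeping static-key update happens first, and only then is the task-group tree walked under RCU.

/* Sleeping part first: cfs_bandwidth_usage_inc() may block on
 * jump_label_mutex, so it must run outside any atomic section. */
cpus_read_lock();
if (qos_level == -1)
	cfs_bandwidth_usage_inc();
else
	cfs_bandwidth_usage_dec();
cpus_read_unlock();

/* Atomic from here on: no sleeping calls inside the RCU read side. */
rcu_read_lock();
walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level));
rcu_read_unlock();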
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46d219b87109..a8358ddba7ef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8507,13 +8507,10 @@ static int tg_change_scheduler(struct task_group *tg, void *data) struct cgroup_subsys_state *css = &tg->css;
tg->qos_level = qos_level; - if (qos_level == -1) { + if (qos_level == -1) policy = SCHED_IDLE; - cfs_bandwidth_usage_inc(); - } else { + else policy = SCHED_NORMAL; - cfs_bandwidth_usage_dec(); - }
param.sched_priority = 0; css_task_iter_start(css, 0, &it); @@ -8541,6 +8538,13 @@ static int cpu_qos_write(struct cgroup_subsys_state *css, if (tg->qos_level == -1 && qos_level == 0) return -EINVAL;
+ cpus_read_lock(); + if (qos_level == -1) + cfs_bandwidth_usage_inc(); + else + cfs_bandwidth_usage_dec(); + cpus_read_unlock(); + rcu_read_lock(); walk_tg_tree_from(tg, tg_change_scheduler, tg_nop, (void *)(&qos_level)); rcu_read_unlock();
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA
----------------------------------------------
Commit cc3d801fa10f ("efi: Make efi_find_mirror() public") adds the efi_find_mirror() declaration to linux/efi.h, but forgets to drop the old one in arch/x86/include/asm/efi.h; kill it.
Fixes: cc3d801fa10f ("efi: Make efi_find_mirror() public") Signed-off-by: Ma Wupeng mawupeng1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/include/asm/efi.h | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 3be8754408d5..3f58bc3fb550 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -353,7 +353,6 @@ static inline bool efi_is_64bit(void) extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr);
-extern void efi_find_mirror(void); extern void efi_reserve_boot_services(void); #else static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} @@ -365,9 +364,6 @@ static inline bool efi_is_table_address(unsigned long phys_addr) { return false; } -static inline void efi_find_mirror(void) -{ -} static inline void efi_reserve_boot_services(void) { }
From: Tong Tiangen tongtiangen@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM10 CVE: NA
--------------------------------
During copy_from_user() processing, the kernel may trigger a RAS exception when reading pages. In this solution, we identify this scenario in the kernel's do_sea() processing path, send a SIGBUS signal to the process that triggered copy_from_user() and isolate the memory pages, preventing a kernel panic.
At the same time, we use the cmdline (uce_kernel_recovery) or procfs (/proc/sys/kernel/uce_kernel_recovery) to switch this feature on/off.
Usage:
1. Each bit controls whether this feature is turned on for a scene: 1 means turned on and 0 means turned off.
2. Bit2 represents the copy_from_user scene; other bits are currently reserved.
e.g. to enable this feature for the copy_from_user scene, either:
1. echo 4 > /proc/sys/kernel/uce_kernel_recovery, or
2. add uce_kernel_recovery=4 to the cmdline.
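A tiny userspace sketch of the bit test this tunable implies (the scene numbering follows the table added below; this is illustrative, not the kernel implementation):

#include <stdbool.h>
#include <stdio.h>

/* Bit2 gates the copy_from_user scene, per the documentation above. */
#define UCE_SCENE_COPY_FROM_USER 2

static bool scene_enabled(unsigned int mask, unsigned int bit)
{
	return (mask >> bit) & 0x1;
}

int main(void)
{
	unsigned int mask = 4; /* echo 4 > /proc/sys/kernel/uce_kernel_recovery */

	printf("copy_from_user recovery: %s\n",
	       scene_enabled(mask, UCE_SCENE_COPY_FROM_USER) ? "on" : "off");
	return 0;
}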
Signed-off-by: Tong Tiangen tongtiangen@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/sysctl/kernel.rst | 17 ++ arch/arm64/Kconfig | 9 + arch/arm64/include/asm/exception.h | 13 ++ arch/arm64/lib/copy_from_user.S | 11 ++ arch/arm64/mm/Makefile | 2 + arch/arm64/mm/fault.c | 4 + arch/arm64/mm/uce_kernel_recovery.c | 198 ++++++++++++++++++++ 7 files changed, 254 insertions(+) create mode 100644 arch/arm64/mm/uce_kernel_recovery.c
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 7d5e8a67c775..56bb3afe3794 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1526,3 +1526,20 @@ is 10 seconds.
The softlockup threshold is (``2 * watchdog_thresh``). Setting this tunable to zero will disable lockup detection altogether. + +uce_kernel_recovery(ARM64 only) +=============================== + +This value can be used to control whether panic the kernel when UCE RAS +errors occur in a specific scenario. Each bit controls a scene, 1 means +avoid kernel panic when encountering UCE RAS error in this scenario, and +0 means kernel panic. + +Current usage of each bit: + +============ ============== +bit0 reserved +bit1 reserved +bit2 copy_from_user +bit3 ~ bit31 reserved +============ ============== diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9c6ad627ba4f..259e4a18377c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1634,6 +1634,15 @@ config ARM64_CNP at runtime, and does not affect PEs that do not implement this feature.
+config ARM64_UCE_KERNEL_RECOVERY + bool "arm64 uce kernel recovery for special scenario" + depends on ACPI_APEI_SEA + help + With ARM v8.2 RAS Extension, SEA are usually triggered when memory + error are consumed. In some cases, if the error address is in a + user page there is a chance to recover. we can isolate this page + and killing process instead of die. + endmenu
menu "ARMv8.3 architectural features" diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 0756191f44f6..731cf01d9296 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -19,6 +19,19 @@ #define __exception_irq_entry __kprobes #endif
+#ifdef CONFIG_ARM64_UCE_KERNEL_RECOVERY +bool arm64_process_kernel_sea(unsigned long addr, unsigned int esr, + struct pt_regs *regs, int sig, + int code, void __user *siaddr); +#else +static inline bool arm64_process_kernel_sea(unsigned long addr, unsigned int esr, + struct pt_regs *regs, int sig, + int code, void __user *siaddr) +{ + return false; +} +#endif + static inline u32 disr_to_esr(u64 disr) { unsigned int esr = ESR_ELx_EC_SERROR << ESR_ELx_EC_SHIFT; diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 2cf999e41d30..100de4e2d9ee 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -60,6 +60,17 @@ SYM_FUNC_START(__arch_copy_from_user) #include "copy_template.S" mov x0, #0 // Nothing to copy ret + +/* + * In feature CONFIG_ARM64_UCE_KERNEL_RECOVERY, if RAS error is triggered + * in copy_from_user(), RAS error is processed in do_sea() and + * copy_from_user_sea_fallback will be assigned to regs->pc, finally return + * here to continue processing. + */ + .global copy_from_user_sea_fallback +copy_from_user_sea_fallback: + sub x0, end, dst // bytes not copied + ret SYM_FUNC_END(__arch_copy_from_user) EXPORT_SYMBOL(__arch_copy_from_user)
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 42e107d6da4f..3634ad81bdf1 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -11,6 +11,8 @@ obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ARM64_MTE) += mteswap.o KASAN_SANITIZE_physaddr.o += n
+obj-$(CONFIG_ARM64_UCE_KERNEL_RECOVERY) += uce_kernel_recovery.o + obj-$(CONFIG_KASAN) += kasan_init.o KASAN_SANITIZE_kasan_init.o := n
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3fc5aceb72eb..7da2f8118b35 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -653,6 +653,10 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) siaddr = NULL; else siaddr = (void __user *)addr; + + if (arm64_process_kernel_sea(addr, esr, regs, inf->sig, inf->code, siaddr)) + return 0; + arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
return 0; diff --git a/arch/arm64/mm/uce_kernel_recovery.c b/arch/arm64/mm/uce_kernel_recovery.c new file mode 100644 index 000000000000..c654dc6c4dfd --- /dev/null +++ b/arch/arm64/mm/uce_kernel_recovery.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "ARM64 UCE: " fmt + +#include <linux/acpi.h> +#include <linux/kallsyms.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/sysctl.h> + +#include <asm/acpi.h> +#include <asm/exception.h> +#include <asm/system_misc.h> +#include <asm/traps.h> +#include <asm/esr.h> + +struct uce_kernel_recovery_info { + int (*fn)(void); + const char *name; + unsigned long addr; + unsigned long size; +}; + +int copy_from_user_sea_fallback(void); + +static int kernel_access_sea_recovery; +static int kernel_uce_recovery_sysctl_max = 7; + +#define UCE_KER_REC_NUM ARRAY_SIZE(reco_info) +static struct uce_kernel_recovery_info reco_info[] = { + {NULL, NULL, 0, 0}, /* reserved */ + {NULL, NULL, 0, 0}, /* reserved */ + {copy_from_user_sea_fallback, "__arch_copy_from_user", (unsigned long)__arch_copy_from_user, 0}, +}; + +static struct ctl_table uce_kernel_recovery_ctl_table[] = { + { + .procname = "uce_kernel_recovery", + .data = &kernel_access_sea_recovery, + .maxlen = sizeof(kernel_access_sea_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &kernel_uce_recovery_sysctl_max, + }, + { } +}; + +static int __init kernel_access_sea_recovery_init(void) +{ + unsigned long addr, size, offset; + unsigned int i; + + for (i = 0; i < UCE_KER_REC_NUM; i++) { + addr = reco_info[i].addr; + + if (!addr) + continue; + + if (!kallsyms_lookup_size_offset(addr, &size, &offset)) { + pr_info("symbol %s lookup addr fail.\n", + reco_info[i].name); + size = 0; + } + + reco_info[i].size = size; + } + + if (!register_sysctl("kernel", uce_kernel_recovery_ctl_table)) + pr_err("register sysctl table fail.\n"); + + return 1; +} +fs_initcall(kernel_access_sea_recovery_init); + +static int __init enable_kernel_access_sea_recovery(char *str) +{ + int max = (1 << UCE_KER_REC_NUM) - 1; + int val; + + if (kstrtoint(str, 0, &val)) + return -EINVAL; + + if (val < 0 || val > max) { + pr_info("invalid uce_kernel_recovery value %d", val); + return -EINVAL; + } + + kernel_access_sea_recovery = val; + + return 1; +} +__setup("uce_kernel_recovery=", enable_kernel_access_sea_recovery); + +/* + * what is kernel recovery? + * If the process's private data is accessed in the kernel mode to trigger + * special sea fault, it can controlled by killing the process and isolating + * the failure pages instead of die. + */ +static int is_in_kernel_recovery(unsigned int esr, struct pt_regs *regs) +{ + /* + * target insn: ldp-pre, ldp-post, ldp-offset, + * ldr-64bit-pre/pose, ldr-32bit-pre/post, ldrb-pre/post, ldrh-pre/post + */ + u32 target_insn[] = {0xa8c, 0xa9c, 0xa94, 0xf84, 0x784, 0x384, 0xb84}; + void *pc = (void *)instruction_pointer(regs); + struct uce_kernel_recovery_info *info; + bool insn_match = false; + u32 insn; + int i; + + pr_emerg("%s-%d, kernel recovery: 0x%x, esr: 0x%08x -- %s, %pS\n", + current->comm, current->pid, kernel_access_sea_recovery, esr, + esr_get_class_string(esr), pc); + + if (aarch64_insn_read((void *)pc, &insn)) { + pr_emerg("insn read fail.\n"); + return -EFAULT; + } + + /* + * We process special ESR: + * EC : 0b100101 Data Abort taken without a change in Exception level. 
+ * DFSC : 0b010000 Synchronous External abort, not on translation table + * walk or hardware update of translation table. + * eg: 0x96000610 + */ + if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR || + (esr & ESR_ELx_FSC) != ESR_ELx_FSC_EXTABT) { + pr_emerg("esr not match.\n"); + return -EINVAL; + } + + insn = (insn >> 20) & 0xffc; + for (i = 0; i < ARRAY_SIZE(target_insn); i++) { + if (insn == target_insn[i]) { + insn_match = true; + break; + } + } + + if (!insn_match) { + pr_emerg("insn 0x%x is not match.\n", insn); + return -EINVAL; + } + + for (i = 0; i < UCE_KER_REC_NUM; i++) { + if (!((kernel_access_sea_recovery >> i) & 0x1)) + continue; + + info = &reco_info[i]; + if (info->fn && regs->pc >= info->addr && + regs->pc < (info->addr + info->size)) { + pr_emerg("total match %s success.\n", info->name); + return i; + } + } + + pr_emerg("scene is not match, kernel recovery %d.\n", + kernel_access_sea_recovery); + return -EINVAL; +} + +bool arm64_process_kernel_sea(unsigned long addr, unsigned int esr, + struct pt_regs *regs, int sig, + int code, void __user *siaddr) +{ + int idx; + + if (user_mode(regs) || apei_claim_sea(regs) < 0) + return false; + + if (!current->mm || !kernel_access_sea_recovery) { + pr_emerg("kernel recovery %d, %s-%d is %s-thread.\n", + kernel_access_sea_recovery, + current->comm, current->pid, + (current->mm) ? "user" : "kernel"); + + return false; + } + + idx = is_in_kernel_recovery(esr, regs); + if (idx < 0 || idx >= UCE_KER_REC_NUM) { + pr_emerg("Uncorrected hardware memory error (sence not match or sence switch is off) in kernel-access\n"); + return false; + } + + current->thread.fault_address = 0; + current->thread.fault_code = esr; + regs->pc = (unsigned long)reco_info[idx].fn; + + arm64_force_sig_fault(sig, code, siaddr, + "Uncorrected hardware memory use with kernel recovery in kernel-access\n"); + + return true; +}
From: Tong Tiangen tongtiangen@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4PM10 CVE: NA
--------------------------------
Signed-off-by: Tong Tiangen tongtiangen@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d8b8ffbd3f78..401ab0f99631 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -464,6 +464,7 @@ CONFIG_ARM64_UAO=y CONFIG_ARM64_PMEM=y CONFIG_ARM64_RAS_EXTN=y CONFIG_ARM64_CNP=y +CONFIG_ARM64_UCE_KERNEL_RECOVERY=y # end of ARMv8.2 architectural features
#
From: Zheng Yejian zhengyejian1@huawei.com
Offering: HULK hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4WBFN
--------------------------------
Refer to the following procedure:

klp_init_object
  klp_init_object_loaded
    klp_find_object_symbol  <-- 1. oops happens here when old_name is NULL!
  klp_init_func             <-- 2. currently old_name is first checked here
This problem was introduced in commit 7834e94cd8b7 ("livepatch/arm64: Fix func size less than limit"), which exchanged the order of 'klp_init_func' and 'klp_init_object_loaded' and thus caused old_name to be used before it is checked.

We move these checks before 'klp_init_object_loaded' and add several logs to tell why a check failed.
Fixes: 7834e94cd8b7 ("livepatch/arm64: Fix func size less than limit") Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/livepatch/core.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-)
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 2da8b922278a..d34b68614f2c 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -953,19 +953,6 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) int ret; #endif
- if (!func->old_name) - return -EINVAL; - - /* - * NOPs get the address later. The patched module must be loaded, - * see klp_init_object_loaded(). - */ - if (!func->new_func && !func->nop) - return -EINVAL; - - if (strlen(func->old_name) >= KSYM_NAME_LEN) - return -EINVAL; - INIT_LIST_HEAD(&func->stack_node); func->patched = false;
@@ -1082,6 +1069,24 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) return -EINVAL; + klp_for_each_func(obj, func) { + if (!func->old_name) { + pr_err("old name is invalid\n"); + return -EINVAL; + } + /* + * NOPs get the address later. The patched module must be loaded, + * see klp_init_object_loaded(). + */ + if (!func->new_func && !func->nop) { + pr_err("new_func is invalid\n"); + return -EINVAL; + } + if (strlen(func->old_name) >= KSYM_NAME_LEN) { + pr_err("function old name is too long\n"); + return -EINVAL; + } + }
obj->patched = false; obj->mod = NULL;
From: Zhang Wensheng zhangwensheng5@huawei.com
hulk inclusion category: bugfix bugzilla: 185755, https://gitee.com/openeuler/kernel/issues/I4WTCI CVE: NA
Reference: https://lore.kernel.org/lkml/8b6032c7-c971-d79b-4a41-271d3cc0efdd@huawei.com...
--------------------------------
KASAN reports a use-after-free when running a normal scsi-mq test:
[69832.239032] ==================================================================
[69832.241810] BUG: KASAN: use-after-free in bfq_dispatch_request+0x1045/0x44b0
[69832.243267] Read of size 8 at addr ffff88802622ba88 by task kworker/3:1H/155
[69832.244656]
[69832.245007] CPU: 3 PID: 155 Comm: kworker/3:1H Not tainted 5.10.0-10295-g576c6382529e #8
[69832.246626] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[69832.249069] Workqueue: kblockd blk_mq_run_work_fn
[69832.250022] Call Trace:
[69832.250541]  dump_stack+0x9b/0xce
[69832.251232]  ? bfq_dispatch_request+0x1045/0x44b0
[69832.252243]  print_address_description.constprop.6+0x3e/0x60
[69832.253381]  ? __cpuidle_text_end+0x5/0x5
[69832.254211]  ? vprintk_func+0x6b/0x120
[69832.254994]  ? bfq_dispatch_request+0x1045/0x44b0
[69832.255952]  ? bfq_dispatch_request+0x1045/0x44b0
[69832.256914]  kasan_report.cold.9+0x22/0x3a
[69832.257753]  ? bfq_dispatch_request+0x1045/0x44b0
[69832.258755]  check_memory_region+0x1c1/0x1e0
[69832.260248]  bfq_dispatch_request+0x1045/0x44b0
[69832.261181]  ? bfq_bfqq_expire+0x2440/0x2440
[69832.262032]  ? blk_mq_delay_run_hw_queues+0xf9/0x170
[69832.263022]  __blk_mq_do_dispatch_sched+0x52f/0x830
[69832.264011]  ? blk_mq_sched_request_inserted+0x100/0x100
[69832.265101]  __blk_mq_sched_dispatch_requests+0x398/0x4f0
[69832.266206]  ? blk_mq_do_dispatch_ctx+0x570/0x570
[69832.267147]  ? __switch_to+0x5f4/0xee0
[69832.267898]  blk_mq_sched_dispatch_requests+0xdf/0x140
[69832.268946]  __blk_mq_run_hw_queue+0xc0/0x270
[69832.269840]  blk_mq_run_work_fn+0x51/0x60
[69832.278170]  process_one_work+0x6d4/0xfe0
[69832.278984]  worker_thread+0x91/0xc80
[69832.279726]  ? __kthread_parkme+0xb0/0x110
[69832.280554]  ? process_one_work+0xfe0/0xfe0
[69832.281414]  kthread+0x32d/0x3f0
[69832.282082]  ? kthread_park+0x170/0x170
[69832.282849]  ret_from_fork+0x1f/0x30
[69832.283573]
[69832.283886] Allocated by task 7725:
[69832.284599]  kasan_save_stack+0x19/0x40
[69832.285385]  __kasan_kmalloc.constprop.2+0xc1/0xd0
[69832.286350]  kmem_cache_alloc_node+0x13f/0x460
[69832.287237]  bfq_get_queue+0x3d4/0x1140
[69832.287993]  bfq_get_bfqq_handle_split+0x103/0x510
[69832.289015]  bfq_init_rq+0x337/0x2d50
[69832.289749]  bfq_insert_requests+0x304/0x4e10
[69832.290634]  blk_mq_sched_insert_requests+0x13e/0x390
[69832.291629]  blk_mq_flush_plug_list+0x4b4/0x760
[69832.292538]  blk_flush_plug_list+0x2c5/0x480
[69832.293392]  io_schedule_prepare+0xb2/0xd0
[69832.294209]  io_schedule_timeout+0x13/0x80
[69832.295014]  wait_for_common_io.constprop.1+0x13c/0x270
[69832.296137]  submit_bio_wait+0x103/0x1a0
[69832.296932]  blkdev_issue_discard+0xe6/0x160
[69832.297794]  blk_ioctl_discard+0x219/0x290
[69832.298614]  blkdev_common_ioctl+0x50a/0x1750
[69832.304715]  blkdev_ioctl+0x470/0x600
[69832.305474]  block_ioctl+0xde/0x120
[69832.306232]  vfs_ioctl+0x6c/0xc0
[69832.306877]  __se_sys_ioctl+0x90/0xa0
[69832.307629]  do_syscall_64+0x2d/0x40
[69832.308362]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[69832.309382]
[69832.309701] Freed by task 155:
[69832.310328]  kasan_save_stack+0x19/0x40
[69832.311121]  kasan_set_track+0x1c/0x30
[69832.311868]  kasan_set_free_info+0x1b/0x30
[69832.312699]  __kasan_slab_free+0x111/0x160
[69832.313524]  kmem_cache_free+0x94/0x460
[69832.314367]  bfq_put_queue+0x582/0x940
[69832.315112]  __bfq_bfqd_reset_in_service+0x166/0x1d0
[69832.317275]  bfq_bfqq_expire+0xb27/0x2440
[69832.318084]  bfq_dispatch_request+0x697/0x44b0
[69832.318991]  __blk_mq_do_dispatch_sched+0x52f/0x830
[69832.319984]  __blk_mq_sched_dispatch_requests+0x398/0x4f0
[69832.321087]  blk_mq_sched_dispatch_requests+0xdf/0x140
[69832.322225]  __blk_mq_run_hw_queue+0xc0/0x270
[69832.323114]  blk_mq_run_work_fn+0x51/0x60
[69832.323942]  process_one_work+0x6d4/0xfe0
[69832.324772]  worker_thread+0x91/0xc80
[69832.325518]  kthread+0x32d/0x3f0
[69832.326205]  ret_from_fork+0x1f/0x30
[69832.326932]
[69832.338297] The buggy address belongs to the object at ffff88802622b968
[69832.338297]  which belongs to the cache bfq_queue of size 512
[69832.340766] The buggy address is located 288 bytes inside of
[69832.340766]  512-byte region [ffff88802622b968, ffff88802622bb68)
[69832.343091] The buggy address belongs to the page:
[69832.344097] page:ffffea0000988a00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88802622a528 pfn:0x26228
[69832.346214] head:ffffea0000988a00 order:2 compound_mapcount:0 compound_pincount:0
[69832.347719] flags: 0x1fffff80010200(slab|head)
[69832.348625] raw: 001fffff80010200 ffffea0000dbac08 ffff888017a57650 ffff8880179fe840
[69832.354972] raw: ffff88802622a528 0000000000120008 00000001ffffffff 0000000000000000
[69832.356547] page dumped because: kasan: bad access detected
[69832.357652]
[69832.357970] Memory state around the buggy address:
[69832.358926]  ffff88802622b980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[69832.360358]  ffff88802622ba00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[69832.361810] >ffff88802622ba80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[69832.363273]                                            ^
[69832.363975]  ffff88802622bb00: fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc
[69832.375960]  ffff88802622bb80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[69832.377405] ==================================================================
In the bfq_dispatch_request() function, we may have the following call chain:

bfq_dispatch_request
  __bfq_dispatch_request
    bfq_select_queue
      bfq_bfqq_expire
        __bfq_bfqd_reset_in_service
          bfq_put_queue
            kmem_cache_free

In this call chain, in_serv_queue has been expired and meets the conditions to be freed, so the memory in_serv_queue points to has been released by the time __bfq_dispatch_request() returns. When bfq_dispatch_request() then reads the flags value through in_serv_queue to compute idle_timer_disabled, a use-after-free happens.

Fix the problem by checking in_serv_queue == bfqd->in_service_queue, and only read idle_timer_disabled through in_serv_queue when the two are equal. If the memory in_serv_queue points to has been released, this check avoids the use-after-free; and if in_serv_queue has been expired or finished, idle_timer_disabled stays false, which has no effect on bfq_update_dispatch_stats().
Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Reviewed-by: Laibin Qiu qiulaibin@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/bfq-iosched.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2247db842985..abba0b7c8d71 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4804,7 +4804,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; struct bfq_queue *in_serv_queue; - bool waiting_rq, idle_timer_disabled; + bool waiting_rq, idle_timer_disabled = false;
spin_lock_irq(&bfqd->lock);
@@ -4812,14 +4812,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
rq = __bfq_dispatch_request(hctx); - - idle_timer_disabled = - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + if (in_serv_queue == bfqd->in_service_queue) { + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + }
spin_unlock_irq(&bfqd->lock); - - bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, - idle_timer_disabled); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, + idle_timer_disabled);
return rq; }
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I4S8DW
---------------------------
When a tag set is shared, most disks may be issuing a small amount of IO while only a few issue a large amount. The current approach limits the maximum number of tags a disk can get to an equal share of the total tags, so the few heavily loaded disks can't get enough tags even though many tags are still free in the tag set.

We add 'pending_queues' to blk_mq_tag_set to count how many queues can't get a driver tag. If this value is zero, there is no need to limit the maximum number of available tags.

On the other hand, if a queue stops issuing IO, 'active_queues' will not be decreased for a period of time (the request timeout), so many tags remain unavailable because the maximum number of available tags is set to max(total tags / active_queues, 4). Thus we also decrease it when 'nr_active' is 0.
This functionality is enabled by default, to disable it, add "blk_mq.unfair_dtag=0" to boot cmd.
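A minimal sketch (illustrative names, not the kernel code) of the limit this series relaxes: when no queue is pending, a queue may use the whole tag space; otherwise it is capped at its fair share.

#include <stdio.h>

static unsigned int available_tags(unsigned int total_tags,
				   unsigned int active_queues,
				   unsigned int pending_queues)
{
	unsigned int share;

	if (pending_queues == 0)      /* no queue is starved of driver tags */
		return total_tags;

	share = total_tags / active_queues;
	return share > 4 ? share : 4; /* max(total tags / active_queues, 4) */
}

int main(void)
{
	/* One heavy queue among 8 queues sharing 256 tags. */
	printf("limited: %u\n", available_tags(256, 8, 1)); /* 32 */
	printf("relaxed: %u\n", available_tags(256, 8, 0)); /* 256 */
	return 0;
}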
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-debugfs.c | 2 ++ block/blk-mq-tag.c | 41 ++++++++++++++++++++++++++++++++++++++++- block/blk-mq-tag.h | 29 +++++++++++++++++++++++++++-- block/blk-mq.c | 24 ++++++++++++++++++++---- block/blk-mq.h | 5 +++++ include/linux/blk-mq.h | 4 ++++ include/linux/blkdev.h | 2 ++ 7 files changed, 100 insertions(+), 7 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b5f26082b959..91a3314dde8b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -453,6 +453,8 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m, seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); seq_printf(m, "active_queues=%d\n", atomic_read(&tags->active_queues)); + seq_printf(m, "pending_queues=%d\n", + atomic_read(&tags->pending_queues));
seq_puts(m, "\nbitmap_tags:\n"); sbitmap_queue_show(tags->bitmap_tags, m); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 16ad9e656610..69f5c170d1f4 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -73,6 +73,40 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) blk_mq_tag_wakeup_all(tags, false); }
+void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (blk_mq_is_sbitmap_shared(hctx->flags)) { + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) && + !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) + atomic_inc(&set->pending_queues_shared_sbitmap); + } else { + if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) && + !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) + atomic_inc(&hctx->tags->pending_queues); + } +} + +void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->tags; + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (blk_mq_is_sbitmap_shared(hctx->flags)) { + if (!test_and_clear_bit(QUEUE_FLAG_HCTX_WAIT, + &q->queue_flags)) + return; + atomic_dec(&set->pending_queues_shared_sbitmap); + } else { + if (!test_and_clear_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) + return; + atomic_dec(&tags->pending_queues); + } +} + static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { @@ -111,8 +145,11 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != BLK_MQ_NO_TAG) goto found_tag;
- if (data->flags & BLK_MQ_REQ_NOWAIT) + if (data->flags & BLK_MQ_REQ_NOWAIT) { + if (!data->q->elevator) + blk_mq_dtag_busy(data->hctx); return BLK_MQ_NO_TAG; + }
ws = bt_wait_ptr(bt, data->hctx); do { @@ -139,6 +176,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != BLK_MQ_NO_TAG) break;
+ if (!data->q->elevator) + blk_mq_dtag_busy(data->hctx); bt_prev = bt; io_schedule();
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 674b0d80f4d2..33579a1e967e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -4,6 +4,7 @@
#include <linux/kabi.h>
+extern bool mq_unfair_dtag; /* * Tag address space map. */ @@ -12,6 +13,11 @@ struct blk_mq_tags { unsigned int nr_reserved_tags;
atomic_t active_queues; + /* + * If multiple queues share a tag set, pending_queues record the + * number of queues that can't get driver tag. + */ + atomic_t pending_queues;
struct sbitmap_queue *bitmap_tags; struct sbitmap_queue *breserved_tags; @@ -73,8 +79,11 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, };
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); -extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); +extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx); +extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx); +extern void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx); +extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx); +
static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { @@ -92,6 +101,22 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) __blk_mq_tag_idle(hctx); }
+static inline void blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!(mq_unfair_dtag && (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))) + return; + + __blk_mq_dtag_busy(hctx); +} + +static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (!(mq_unfair_dtag && (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))) + return; + + __blk_mq_dtag_idle(hctx); +} + static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { diff --git a/block/blk-mq.c b/block/blk-mq.c index e0b833120498..3220c68f4503 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -41,6 +41,9 @@ #include "blk-mq-sched.h" #include "blk-rq-qos.h"
+bool mq_unfair_dtag = true; +module_param_named(unfair_dtag, mq_unfair_dtag, bool, 0444); + static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
static void blk_mq_poll_stats_start(struct request_queue *q); @@ -535,8 +538,13 @@ void blk_mq_free_request(struct request *rq) }
ctx->rq_completed[rq_is_sync(rq)]++; - if (rq->rq_flags & RQF_MQ_INFLIGHT) + if (rq->rq_flags & RQF_MQ_INFLIGHT) { __blk_mq_dec_active_requests(hctx); + if (mq_unfair_dtag && !__blk_mq_active_requests(hctx)) { + blk_mq_tag_idle(hctx); + blk_mq_dtag_idle(hctx); + } + }
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->backing_dev_info); @@ -1003,8 +1011,10 @@ static void blk_mq_timeout_work(struct work_struct *work) */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ - if (blk_mq_hw_queue_mapped(hctx)) + if (blk_mq_hw_queue_mapped(hctx)) { blk_mq_tag_idle(hctx); + blk_mq_dtag_idle(hctx); + } } } blk_queue_exit(q); @@ -1109,8 +1119,10 @@ static bool __blk_mq_get_driver_tag(struct request *rq) }
tag = __sbitmap_queue_get(bt); - if (tag == BLK_MQ_NO_TAG) + if (tag == BLK_MQ_NO_TAG) { + blk_mq_dtag_busy(rq->mq_hctx); return false; + }
rq->tag = tag + tag_offset; return true; @@ -2711,8 +2723,10 @@ static void blk_mq_exit_hctx(struct request_queue *q, { struct request *flush_rq = hctx->fq->flush_rq;
- if (blk_mq_hw_queue_mapped(hctx)) + if (blk_mq_hw_queue_mapped(hctx)) { blk_mq_tag_idle(hctx); + blk_mq_dtag_idle(hctx); + }
blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], set->queue_depth, flush_rq); @@ -3033,6 +3047,7 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; } else { blk_mq_tag_idle(hctx); + blk_mq_dtag_idle(hctx); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } @@ -3589,6 +3604,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (blk_mq_is_sbitmap_shared(set->flags)) { atomic_set(&set->active_queues_shared_sbitmap, 0); + atomic_set(&set->pending_queues_shared_sbitmap, 0);
if (blk_mq_init_shared_sbitmap(set, set->flags)) { ret = -ENOMEM; diff --git a/block/blk-mq.h b/block/blk-mq.h index 7f3194657dff..bb58c68b8274 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -316,10 +316,15 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct request_queue *q = hctx->queue; struct blk_mq_tag_set *set = q->tag_set;
+ if (mq_unfair_dtag && + !atomic_read(&set->pending_queues_shared_sbitmap)) + return true; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; users = atomic_read(&set->active_queues_shared_sbitmap); } else { + if (mq_unfair_dtag && !atomic_read(&hctx->tags->pending_queues)) + return true; if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; users = atomic_read(&hctx->tags->active_queues); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index adcbef9705ca..f0b5519a3f5d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -266,6 +266,7 @@ struct blk_mq_tag_set { unsigned int flags; void *driver_data; atomic_t active_queues_shared_sbitmap; + atomic_t pending_queues_shared_sbitmap;
struct sbitmap_queue __bitmap_tags; struct sbitmap_queue __breserved_tags; @@ -442,6 +443,9 @@ enum { /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE = 3,
+ /* hw queue is waiting for driver tag */ + BLK_MQ_S_DTAG_WAIT = 4, + BLK_MQ_MAX_DEPTH = 10240,
BLK_MQ_CPU_WORK_BATCH = 8, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eae4a046037e..f1513c3308fb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -640,6 +640,8 @@ struct request_queue { #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ +/*at least one blk-mq hctx can't get driver tag */ +#define QUEUE_FLAG_HCTX_WAIT 30
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I4S8DW
---------------------------
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-debugfs.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 91a3314dde8b..f7ababd06f77 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -204,6 +204,26 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf, return count; }
+static int queue_tag_set_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct blk_mq_tag_set *set = q->tag_set; + + seq_printf(m, "nr_hw_queues=%u\n", set->nr_hw_queues); + seq_printf(m, "queue_depth=%u\n", set->queue_depth); + seq_printf(m, "reserved_tags=%u\n", set->reserved_tags); + seq_printf(m, "cmd_size=%u\n", set->cmd_size); + seq_printf(m, "numa_node=%d\n", set->numa_node); + seq_printf(m, "timeout=%u\n", set->timeout); + seq_printf(m, "flags=%u\n", set->flags); + seq_printf(m, "active_queues_shared_sbitmap=%d\n", + atomic_read(&set->active_queues_shared_sbitmap)); + seq_printf(m, "pending_queues_shared_sbitmap=%d\n", + atomic_read(&set->pending_queues_shared_sbitmap)); + + return 0; +} + static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, @@ -211,6 +231,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "state", 0600, queue_state_show, queue_state_write }, { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, + { "tag_set", 0400, queue_tag_set_show, NULL }, { }, };
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I4S8DW
---------------------------
If pending_queues is increased once, it will only be decreased when nr_active drops to zero, which leads to under-utilization of host tags: while pending_queues is non-zero, the tags available to a queue are max(host tags / active_queues, 4) instead of what the queue actually needs.

Fix it by adding an expiration time for the increase of pending_queues and decreasing it when that time expires, so pending_queues drops back to zero when there are no tag allocation failures, and the whole host tag space becomes available to the queue again.
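A userspace sketch of the jiffies-style expiry test (the 5-second window and names follow the patch below; time_before() is re-implemented here for illustration):

#include <stdbool.h>
#include <stdio.h>

#define HZ 250UL
#define BLK_MQ_DTAG_WAIT_EXPIRE (5 * HZ)

/* Wrap-safe comparison, like the kernel's time_before(). */
static bool time_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

static bool dtag_wait_expired(unsigned long now, unsigned long wait_start)
{
	return !time_before(now, wait_start + BLK_MQ_DTAG_WAIT_EXPIRE);
}

int main(void)
{
	unsigned long start = 1000;

	printf("%d\n", dtag_wait_expired(start + 2 * HZ, start)); /* 0: keep pending */
	printf("%d\n", dtag_wait_expired(start + 6 * HZ, start)); /* 1: decrease */
	return 0;
}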
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-debugfs.c | 29 +++++++++++++++++++++++++++++ block/blk-mq-tag.c | 34 +++++++++++++++++++++++++++++++--- block/blk-mq-tag.h | 6 +++--- block/blk-mq.c | 11 +++++++---- include/linux/blk-mq.h | 6 ++++++ include/linux/blkdev.h | 1 + 6 files changed, 77 insertions(+), 10 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f7ababd06f77..f3a263a1bb43 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -224,6 +224,19 @@ static int queue_tag_set_show(void *data, struct seq_file *m) return 0; }
+static int queue_dtag_wait_time_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + unsigned int time = 0; + + if (test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) + time = jiffies_to_msecs(jiffies - READ_ONCE(q->dtag_wait_time)); + + seq_printf(m, "%u\n", time); + + return 0; +} + static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, @@ -232,6 +245,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, { "tag_set", 0400, queue_tag_set_show, NULL }, + { "dtag_wait_time_ms", 0400, queue_dtag_wait_time_show, NULL }, { }, };
@@ -651,6 +665,20 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m) return 0; }
+static int hctx_dtag_wait_time_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + unsigned int time = 0; + + if (test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) + time = jiffies_to_msecs(jiffies - + READ_ONCE(hctx->dtag_wait_time)); + + seq_printf(m, "%u\n", time); + + return 0; +} + #define CTX_RQ_SEQ_OPS(name, type) \ static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \ __acquires(&ctx->lock) \ @@ -821,6 +849,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, + {"dtag_wait_time_ms", 0400, hctx_dtag_wait_time_show}, {}, };
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 69f5c170d1f4..64c9633d5d5a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -15,6 +15,8 @@ #include "blk-mq.h" #include "blk-mq-tag.h"
+#define BLK_MQ_DTAG_WAIT_EXPIRE (5 * HZ) + /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail @@ -80,29 +82,53 @@ void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx) struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) && - !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) + !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) { + WRITE_ONCE(q->dtag_wait_time, jiffies); atomic_inc(&set->pending_queues_shared_sbitmap); + } } else { if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) && - !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) + !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) { + WRITE_ONCE(hctx->dtag_wait_time, jiffies); atomic_inc(&hctx->tags->pending_queues); + } } }
-void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx) +void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force) { struct blk_mq_tags *tags = hctx->tags; struct request_queue *q = hctx->queue; struct blk_mq_tag_set *set = q->tag_set;
if (blk_mq_is_sbitmap_shared(hctx->flags)) { + if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) + return; + + if (!force && time_before(jiffies, + READ_ONCE(q->dtag_wait_time) + + BLK_MQ_DTAG_WAIT_EXPIRE)) + return; + if (!test_and_clear_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) return; + + WRITE_ONCE(q->dtag_wait_time, jiffies); atomic_dec(&set->pending_queues_shared_sbitmap); } else { + if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) + return; + + if (!force && time_before(jiffies, + READ_ONCE(hctx->dtag_wait_time) + + BLK_MQ_DTAG_WAIT_EXPIRE)) + return; + if (!test_and_clear_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) return; + + WRITE_ONCE(hctx->dtag_wait_time, jiffies); atomic_dec(&tags->pending_queues); } } @@ -206,6 +232,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) sbitmap_finish_wait(bt, ws, &wait);
found_tag: + if (!data->q->elevator) + blk_mq_dtag_idle(data->hctx, false); /* * Give up this allocation if the hctx is inactive. The caller will * retry on an active hctx. diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 33579a1e967e..25f30fa99857 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -82,7 +82,7 @@ enum { extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx); extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx); extern void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx); -extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx); +extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force);
static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) @@ -109,12 +109,12 @@ static inline void blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx) __blk_mq_dtag_busy(hctx); }
-static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx) +static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force) { if (!(mq_unfair_dtag && (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))) return;
- __blk_mq_dtag_idle(hctx); + __blk_mq_dtag_idle(hctx, force); }
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, diff --git a/block/blk-mq.c b/block/blk-mq.c index 3220c68f4503..c3beaca1f4fb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -542,7 +542,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_dec_active_requests(hctx); if (mq_unfair_dtag && !__blk_mq_active_requests(hctx)) { blk_mq_tag_idle(hctx); - blk_mq_dtag_idle(hctx); + blk_mq_dtag_idle(hctx, true); } }
@@ -1013,7 +1013,7 @@ static void blk_mq_timeout_work(struct work_struct *work) /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) { blk_mq_tag_idle(hctx); - blk_mq_dtag_idle(hctx); + blk_mq_dtag_idle(hctx, true); } } } @@ -1124,6 +1124,7 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return false; }
+ blk_mq_dtag_idle(rq->mq_hctx, false); rq->tag = tag + tag_offset; return true; } @@ -2725,7 +2726,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (blk_mq_hw_queue_mapped(hctx)) { blk_mq_tag_idle(hctx); - blk_mq_dtag_idle(hctx); + blk_mq_dtag_idle(hctx, true); }
blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], @@ -2825,6 +2826,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; + hctx->dtag_wait_time = jiffies;
INIT_LIST_HEAD(&hctx->hctx_list);
@@ -3047,7 +3049,7 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; } else { blk_mq_tag_idle(hctx); - blk_mq_dtag_idle(hctx); + blk_mq_dtag_idle(hctx, true); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } @@ -3375,6 +3377,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock);
q->nr_requests = set->queue_depth; + q->dtag_wait_time = jiffies;
/* * Default to classic polling diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f0b5519a3f5d..b2db9a5c10e8 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -172,6 +172,12 @@ struct blk_mq_hw_ctx { */ struct list_head hctx_list;
+ /** + * @dtag_wait_time: record when hardware queue is pending, specifically + * when BLK_MQ_S_DTAG_WAIT is set in state. + */ + unsigned long dtag_wait_time; + KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f1513c3308fb..433485f8b1cc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -602,6 +602,7 @@ struct request_queue { #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS];
+ unsigned long dtag_wait_time; KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3)
From: Daniel Borkmann daniel@iogearbox.net
stable inclusion from stable-v5.10.92 commit 35ab8c9085b0af847df7fac9571ccd26d9f0f513 bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ no upstream commit given; implicitly fixed through the larger refactoring in c25b2ae136039ffa820c26138ed4a5e5f3ab3841 ]
While auditing some other code, I noticed missing checks inside the pointer arithmetic simulation, more specifically, adjust_ptr_min_max_vals(). Several *_OR_NULL types are not rejected whereas they are _required_ to be rejected given the expectation is that they get promoted into a 'real' pointer type for the success case, that is, after an explicit != NULL check.
One case which stands out, and is accessible from unprivileged BPF (if enabled, given it is disabled by default), is the BPF ring buffer. From crafting a PoC, the NULL check can be bypassed through an offset, and its id marking will then lead to promotion of mem_or_null to a mem type.

The bpf_ringbuf_reserve() helper can trigger this case through passing of reserved flags, for example.
func#0 @0
0: R1=ctx(id=0,off=0,imm=0) R10=fp0
0: (7a) *(u64 *)(r10 -8) = 0
1: R1=ctx(id=0,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm
1: (18) r1 = 0x0
3: R1_w=map_ptr(id=0,off=0,ks=0,vs=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm
3: (b7) r2 = 8
4: R1_w=map_ptr(id=0,off=0,ks=0,vs=0,imm=0) R2_w=invP8 R10=fp0 fp-8_w=mmmmmmmm
4: (b7) r3 = 0
5: R1_w=map_ptr(id=0,off=0,ks=0,vs=0,imm=0) R2_w=invP8 R3_w=invP0 R10=fp0 fp-8_w=mmmmmmmm
5: (85) call bpf_ringbuf_reserve#131
6: R0_w=mem_or_null(id=2,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2
6: (bf) r6 = r0
7: R0_w=mem_or_null(id=2,ref_obj_id=2,off=0,imm=0) R6_w=mem_or_null(id=2,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2
7: (07) r0 += 1
8: R0_w=mem_or_null(id=2,ref_obj_id=2,off=1,imm=0) R6_w=mem_or_null(id=2,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2
8: (15) if r0 == 0x0 goto pc+4
 R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2
9: R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2
9: (62) *(u32 *)(r6 +0) = 0
10: R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2
10: (bf) r1 = r6
11: R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R1_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2
11: (b7) r2 = 0
12: R0_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R1_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R2_w=invP0 R6_w=mem(id=0,ref_obj_id=2,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm refs=2
12: (85) call bpf_ringbuf_submit#132
13: R6=invP(id=0) R10=fp0 fp-8=mmmmmmmm
13: (b7) r0 = 0
14: R0_w=invP0 R6=invP(id=0) R10=fp0 fp-8=mmmmmmmm
14: (95) exit

from 8 to 13: safe
processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 0
OK
All three commits, that is b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support"), 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it"), and afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier"), share the same root cause, and their *_OR_NULL type counterparts must be rejected in adjust_ptr_min_max_vals().
Make the test more robust by reusing the reg_type_may_be_null() helper, such that we catch all *_OR_NULL types we have today and may add in the future.
Note that pointer arithmetic on PTR_TO_BTF_ID, PTR_TO_RDONLY_BUF, and PTR_TO_RDWR_BUF is generally allowed.
Fixes: b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Fixes: afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier") Signed-off-by: Daniel Borkmann daniel@iogearbox.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4e28961cfa53..b43c9de34a2c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6037,16 +6037,16 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: +reject: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; default: + if (reg_type_may_be_null(ptr_reg->type)) + goto reject; break; }
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit d639b9d13a39cf15639cbe6e8b2c43eb60148a73 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
There are some common properties shared between bpf reg, ret and arg values. For instance, a value may be a NULL pointer, or a pointer to read-only memory. Previously, enumeration was used to express these properties: for example, in order to test whether a reg value can be NULL, reg_type_may_be_null() simply enumerates all types that are possibly NULL. The problem with this approach is that it does not scale and causes a lot of duplication, since the properties can be combined; a type could be MAYBE_NULL, RDONLY, or both.
This patch series rewrites the layout of reg_type, arg_type and ret_type, so that common properties can be extracted and represented as composable flags. For example, one can write
ARG_PTR_TO_MEM | PTR_MAYBE_NULL
which is equivalent to the previous
ARG_PTR_TO_MEM_OR_NULL
A type such as ARG_PTR_TO_MEM is called a "base type" in this patch. Base types can be extended with flags: a flag occupies the higher bits while the base type sits in the lower bits.
This patch in particular sets up a set of macros for this purpose. The following patches will rewrite arg_types, ret_types and reg_types respectively.
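As a self-contained illustration of this layout (plain userspace C; the enum value is made up, and BPF_BASE_TYPE_MASK is written out instead of using the kernel's GENMASK()), the helpers below mirror the base_type()/type_flag() accessors added in the diff:

  #include <assert.h>
  #include <stdio.h>

  #define BPF_BASE_TYPE_BITS  8
  #define BPF_BASE_TYPE_MASK  ((1u << BPF_BASE_TYPE_BITS) - 1)  /* == GENMASK(7, 0) */
  #define PTR_MAYBE_NULL      (1u << (0 + BPF_BASE_TYPE_BITS))

  enum { ARG_PTR_TO_MEM = 4 };  /* illustrative value, not the kernel's */

  static unsigned int base_type(unsigned int type)
  {
          return type & BPF_BASE_TYPE_MASK;   /* low 8 bits: the base type */
  }

  static unsigned int type_flag(unsigned int type)
  {
          return type & ~BPF_BASE_TYPE_MASK;  /* higher bits: composable flags */
  }

  int main(void)
  {
          unsigned int t = ARG_PTR_TO_MEM | PTR_MAYBE_NULL; /* ex ARG_PTR_TO_MEM_OR_NULL */

          assert(base_type(t) == ARG_PTR_TO_MEM);
          assert(type_flag(t) == PTR_MAYBE_NULL);
          printf("base=%u flags=0x%x\n", base_type(t), type_flag(t));
          return 0;
  }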
Signed-off-by: Hao Luo haoluo@google.com Signed-off-by: Alexei Starovoitov ast@kernel.org Link: https://lore.kernel.org/bpf/20211217003152.48334-2-haoluo@google.com Conflicts: include/linux/bpf.h include/linux/bpf_verifier.h Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/bpf.h | 43 ++++++++++++++++++++++++++++++++++++ include/linux/bpf_verifier.h | 13 +++++++++++ 2 files changed, 56 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d526073fe354..f5efbbb1d5ee 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -254,6 +254,29 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
extern const struct bpf_map_ops bpf_map_offload_ops;
+/* bpf_type_flag contains a set of flags that are applicable to the values of + * arg_type, ret_type and reg_type. For example, a pointer value may be null, + * or a memory is read-only. We classify types into two categories: base types + * and extended types. Extended types are base types combined with a type flag. + * + * Currently there are no more than 32 base types in arg_type, ret_type and + * reg_types. + */ +#define BPF_BASE_TYPE_BITS 8 + +enum bpf_type_flag { + /* PTR may be NULL. */ + PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, +}; + +/* Max number of base types. */ +#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) + +/* Max number of all types. */ +#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -296,7 +319,13 @@ enum bpf_arg_type { ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
/* type of values returned from helper functions */ enum bpf_return_type { @@ -311,7 +340,14 @@ enum bpf_return_type { RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + __BPF_RET_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL @@ -410,7 +446,14 @@ enum bpf_reg_type { PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + __BPF_REG_TYPE_MAX, + + /* This must be the last entry. Its purpose is to ensure the enum is + * wide enough to hold the higher bits reserved for bpf_type_flag. + */ + __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; +static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
/* The information passed from prog-specific *_is_valid_access * back to the verifier. diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6e330ff2f28d..b04dba78100a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -480,5 +480,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *tgt_prog, u32 btf_id, struct bpf_attach_target_info *tgt_info); +#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) + +/* extract base type from bpf_{arg, return, reg}_type. */ +static inline u32 base_type(u32 type) +{ + return type & BPF_BASE_TYPE_MASK; +} + +/* extract flags from an extended type. See bpf_type_flag in bpf.h. */ +static inline u32 type_flag(u32 type) +{ + return type & ~BPF_BASE_TYPE_MASK; +}
#endif /* _LINUX_BPF_VERIFIER_H */
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit 48946bd6a5d695c50b34546864b79c1f910a33c1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We have introduced a new type to make bpf_arg composable, by reserving high bits of bpf_arg to represent flags of a type.
One of the flags is PTR_MAYBE_NULL, which indicates a pointer may be NULL. When this flag is applied to an arg_type, it means the arg can accept a NULL pointer. This patch switches the qualified arg_types to use this flag. The arg_types changed in this patch include:
1. ARG_PTR_TO_MAP_VALUE_OR_NULL
2. ARG_PTR_TO_MEM_OR_NULL
3. ARG_PTR_TO_CTX_OR_NULL
4. ARG_PTR_TO_SOCKET_OR_NULL
5. ARG_PTR_TO_ALLOC_MEM_OR_NULL
6. ARG_PTR_TO_STACK_OR_NULL
This patch does not eliminate the use of these arg_types; instead, it makes them aliases of 'ARG_XXX | PTR_MAYBE_NULL'.
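Because the aliases are ordinary enumerator definitions, every existing user of the *_OR_NULL names keeps compiling unchanged. A one-line sanity check, assuming the enum definitions from the diff below:

  /* Both spellings denote the same integer value: */
  _Static_assert(ARG_PTR_TO_MAP_VALUE_OR_NULL ==
                 (ARG_PTR_TO_MAP_VALUE | PTR_MAYBE_NULL),
                 "alias and composed form must be identical");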
Signed-off-by: Hao Luo haoluo@google.com Signed-off-by: Alexei Starovoitov ast@kernel.org Link: https://lore.kernel.org/bpf/20211217003152.48334-3-haoluo@google.com Conflicts: include/linux/bpf.h kernel/bpf/verifier.c Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/bpf.h | 12 +++++++----- kernel/bpf/verifier.c | 36 +++++++++++++----------------------- 2 files changed, 20 insertions(+), 28 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f5efbbb1d5ee..e28616839b7c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -288,13 +288,11 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */
/* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. @@ -304,22 +302,26 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */
ARG_PTR_TO_CTX, /* pointer to context */ - ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ - ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ - ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX,
+ /* Extended arg_types. */ + ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, + ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, + ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, + ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, + ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b43c9de34a2c..a99b2b586dc3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -437,13 +437,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; }
-static bool arg_type_may_be_null(enum bpf_arg_type type) +static bool type_may_be_null(u32 type) { - return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_CTX_OR_NULL || - type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + return type & PTR_MAYBE_NULL; }
/* Determine whether the function releases some resources allocated by another @@ -4250,9 +4246,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { - return type == ARG_PTR_TO_MEM || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_UNINIT_MEM; + return base_type(type) == ARG_PTR_TO_MEM || + base_type(type) == ARG_PTR_TO_UNINIT_MEM; }
static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -4379,26 +4374,21 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_MAP_PTR] = &const_map_ptr_types, [ARG_PTR_TO_CTX] = &context_types, - [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, #ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, #endif [ARG_PTR_TO_SOCKET] = &fullsock_types, - [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, - [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, @@ -4413,7 +4403,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const struct bpf_reg_types *compatible; int i, j;
- compatible = compatible_reg_types[arg_type]; + compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); return -EFAULT; @@ -4494,15 +4484,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; }
- if (arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; }
- if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + if (register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking. */ @@ -4549,10 +4538,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE || - (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && - !register_is_null(reg)) || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (type_may_be_null(arg_type) && register_is_null(reg)) + return 0; + /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit 3c4807322660d4290ac9062c034aed6b87243861 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We have introduced a new type to make bpf_ret composable, by reserving high bits to represent flags.
One of the flags is PTR_MAYBE_NULL, which indicates a pointer may be NULL. When applying this flag to ret_types, it means the returned value could be a NULL pointer. This patch switches the qualified ret_types to use this flag. The ret_types changed in this patch include:
1. RET_PTR_TO_MAP_VALUE_OR_NULL
2. RET_PTR_TO_SOCKET_OR_NULL
3. RET_PTR_TO_TCP_SOCK_OR_NULL
4. RET_PTR_TO_SOCK_COMMON_OR_NULL
5. RET_PTR_TO_ALLOC_MEM_OR_NULL
6. RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL
7. RET_PTR_TO_BTF_ID_OR_NULL
This patch doesn't eliminate the use of these names; instead, it makes them aliases of 'RET_PTR_TO_XXX | PTR_MAYBE_NULL'.
Signed-off-by: Hao Luo haoluo@google.com Signed-off-by: Alexei Starovoitov ast@kernel.org Link: https://lore.kernel.org/bpf/20211217003152.48334-4-haoluo@google.com Conflicts: kernel/bpf/verifier.c Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/bpf.h | 20 ++++++++++++------- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 45 ++++++++++++++++++++++--------------------- 3 files changed, 37 insertions(+), 30 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e28616839b7c..760d6eb52049 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -334,16 +334,22 @@ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ - RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ - RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ - RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ - RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ - RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ - RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ - RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ + RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ + RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ + RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX,
+ /* Extended ret_types. */ + RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, + RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, + RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, + RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb4350de9f11..2a04e6eafca5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -636,7 +636,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a99b2b586dc3..ab65184e64ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5322,6 +5322,7 @@ static int check_reference_leak(struct bpf_verifier_env *env) static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; + enum bpf_return_type ret_type; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; @@ -5433,13 +5434,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* update return register (already marked as written above) */ - if (fn->ret_type == RET_INTEGER) { + ret_type = fn->ret_type; + if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (fn->ret_type == RET_VOID) { + } else if (ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || - fn->ret_type == RET_PTR_TO_MAP_VALUE) { + } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -5452,28 +5453,27 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (type_may_be_null(ret_type)) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; if (map_value_has_spin_lock(meta.map_ptr)) regs[BPF_REG_0].id = ++env->id_gen; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; } - } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0); @@ -5492,30 +5492,31 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_MEM_OR_NULL : PTR_TO_MEM; regs[BPF_REG_0].mem_size = tsize; } else { regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_BTF_ID_OR_NULL : PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id;
mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { - verbose(env, "invalid return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "invalid return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), + func_id); return -EINVAL; } regs[BPF_REG_0].btf_id = ret_btf_id; } else { - verbose(env, "unknown return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "unknown return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; }
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit c25b2ae136039ffa820c26138ed4a5e5f3ab3841 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We have introduced a new type to make bpf_reg composable, by allocating bits in the type to represent flags.
One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. This patch switches the qualified reg_types to use this flag. The reg_types changed in this patch include:
1. PTR_TO_MAP_VALUE_OR_NULL
2. PTR_TO_SOCKET_OR_NULL
3. PTR_TO_SOCK_COMMON_OR_NULL
4. PTR_TO_TCP_SOCK_OR_NULL
5. PTR_TO_BTF_ID_OR_NULL
6. PTR_TO_MEM_OR_NULL
7. PTR_TO_RDONLY_BUF_OR_NULL
8. PTR_TO_RDWR_BUF_OR_NULL
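One payoff of the flag encoding shows up in mark_ptr_or_null_reg() in the diff below: promotion after an explicit NULL check collapses from a per-type else-if chain to a single bit clear. A schematic sketch of the two outcomes, leaving out the PTR_TO_MAP_VALUE inner-map special case:

  /* after the program has compared the register against NULL */
  if (is_null)
          reg->type = SCALAR_VALUE;       /* NULL branch: no longer a pointer */
  else
          reg->type &= ~PTR_MAYBE_NULL;   /* e.g. sock_or_null -> sock, for any base */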
Signed-off-by: Hao Luo haoluo@google.com Signed-off-by: Alexei Starovoitov ast@kernel.org Link: https://lore.kernel.org/r/20211217003152.48334-5-haoluo@google.com Conflicts: include/linux/bpf.h include/linux/bpf_verifier.h kernel/bpf/verifier.c Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/netronome/nfp/bpf/fw.h | 4 +- include/linux/bpf.h | 16 +- include/linux/bpf_verifier.h | 4 + kernel/bpf/btf.c | 7 +- kernel/bpf/map_iter.c | 4 +- kernel/bpf/verifier.c | 273 +++++++++----------- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 2 +- 8 files changed, 144 insertions(+), 168 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h index 4268a7e0f344..33f9058ed32e 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h @@ -13,8 +13,8 @@ */ #define NFP_BPF_SCALAR_VALUE 1 #define NFP_BPF_MAP_VALUE 4 -#define NFP_BPF_STACK 6 -#define NFP_BPF_PACKET_DATA 8 +#define NFP_BPF_STACK 5 +#define NFP_BPF_PACKET_DATA 7
enum bpf_cap_tlv_type { NFP_BPF_CAP_TYPE_FUNC = 1, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 760d6eb52049..221661ea6035 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -417,18 +417,14 @@ enum bpf_reg_type { PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ - PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ - PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ - PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need @@ -446,16 +442,20 @@ enum bpf_reg_type { * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ - PTR_TO_BTF_ID_OR_NULL, PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ - PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ __BPF_REG_TYPE_MAX,
+ /* Extended reg_types. */ + PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, + PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, + PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, + PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, + PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, + PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b04dba78100a..efc26c4b5f7b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -17,6 +17,8 @@ * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) +/* size of type_str_buf in bpf_verifier. */ +#define TYPE_STR_BUF_LEN 64
/* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that @@ -434,6 +436,8 @@ struct bpf_verifier_env { u32 peak_states; /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; + /* buffer used in reg_type_str() to generate reg_type string */ + char type_str_buf[TYPE_STR_BUF_LEN]; };
__printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index aaf2fbaa0cc7..538f7adc61c4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4525,10 +4525,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + u32 type, flag;
+ type = base_type(ctx_arg_info->reg_type); + flag = type_flag(ctx_arg_info->reg_type); if (ctx_arg_info->offset == off && - (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || - ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + (type == PTR_TO_RDWR_BUF || type == PTR_TO_RDONLY_BUF) && + (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6a9542af4212..631f0e44b7a9 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, }, };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ab65184e64ad..0498f77ba666 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -404,18 +404,6 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; }
-static bool reg_type_may_be_null(enum bpf_reg_type type) -{ - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL || - type == PTR_TO_RDONLY_BUF_OR_NULL || - type == PTR_TO_RDWR_BUF_OR_NULL; -} - static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && @@ -424,12 +412,9 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_MEM || - type == PTR_TO_MEM_OR_NULL; + return base_type(type) == PTR_TO_SOCKET || + base_type(type) == PTR_TO_TCP_SOCK || + base_type(type) == PTR_TO_MEM; }
static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -492,37 +477,50 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; }
-/* string representation of 'enum bpf_reg_type' */ -static const char * const reg_type_str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "inv", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", - [PTR_TO_TP_BUFFER] = "tp_buffer", - [PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_MEM_OR_NULL] = "mem_or_null", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", - [PTR_TO_RDWR_BUF] = "rdwr_buf", - [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", -}; +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +static const char *reg_type_str(struct bpf_verifier_env *env, + enum bpf_reg_type type) +{ + char postfix[16] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_RDONLY_BUF] = "rdonly_buf", + [PTR_TO_RDWR_BUF] = "rdwr_buf", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID || + base_type(type) == PTR_TO_PERCPU_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s", + str[base_type(type)], postfix); + return env->type_str_buf; +}
static char slot_type_char[] = { [STACK_INVALID] = '?', @@ -574,7 +572,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " R%d", i); print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && @@ -582,9 +580,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || - t == PTR_TO_BTF_ID_OR_NULL || - t == PTR_TO_PERCPU_BTF_ID) + if (base_type(t) == PTR_TO_BTF_ID || + base_type(t) == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -593,9 +590,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); - else if (t == CONST_PTR_TO_MAP || - t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) + else if (base_type(t) == CONST_PTR_TO_MAP || + base_type(t) == PTR_TO_MAP_VALUE) verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); @@ -665,7 +661,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (state->stack[i].slot_type[0] == STACK_SPILL) { reg = &state->stack[i].spilled_ptr; t = reg->type; - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) @@ -1564,7 +1560,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, break; if (parent->live & REG_LIVE_DONE) { verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str[parent->type], + reg_type_str(env, parent->type), parent->var_off.value, parent->off); return -EFAULT; } @@ -2190,9 +2186,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
static bool is_spillable_regtype(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_MAP_VALUE: - case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -2201,21 +2196,14 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: case PTR_TO_RDONLY_BUF: - case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: - case PTR_TO_RDWR_BUF_OR_NULL: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: - case PTR_TO_MEM_OR_NULL: return true; default: return false; @@ -3016,7 +3004,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type;
- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) + if (base_type(*reg_type) == PTR_TO_BTF_ID) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -3082,7 +3070,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, }
verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str[reg->type], off, size); + regno, reg_type_str(env, reg->type), off, size);
return -EACCES; } @@ -3822,7 +3810,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (reg_type_may_be_null(reg_type)) + if (type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -3830,8 +3818,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) + if (base_type(reg_type) == PTR_TO_BTF_ID) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -3882,7 +3869,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -3901,7 +3888,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_RDONLY_BUF) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_buffer_access(env, reg, regno, off, size, false, @@ -3917,7 +3904,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EACCES; }
@@ -3960,7 +3947,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; }
@@ -4156,9 +4143,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, register_is_null(reg)) return 0;
- verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); + verbose(env, "R%d type=%s ", regno, + reg_type_str(env, reg->type)); + verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; } } @@ -4418,10 +4405,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; }
- verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, type)); for (j = 0; j + 1 < i; j++) - verbose(env, "%s, ", reg_type_str[compatible->types[j]]); - verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); + verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES;
found: @@ -5323,6 +5310,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn { const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; + enum bpf_type_flag ret_flag; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; @@ -5435,6 +5423,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
/* update return register (already marked as written above) */ ret_type = fn->ret_type; + ret_flag = type_flag(fn->ret_type); if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); @@ -5453,25 +5442,23 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - if (type_may_be_null(ret_type)) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - if (map_value_has_spin_lock(meta.map_ptr)) - regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; + if (!type_may_be_null(ret_type) && + map_value_has_spin_lock(meta.map_ptr)) { + regs[BPF_REG_0].id = ++env->id_gen; } } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; @@ -5491,21 +5478,17 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = - (ret_type & PTR_MAYBE_NULL) ? - PTR_TO_MEM_OR_NULL : PTR_TO_MEM; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = - (ret_type & PTR_MAYBE_NULL) ? - PTR_TO_BTF_ID_OR_NULL : PTR_TO_BTF_ID; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id;
mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", @@ -5520,7 +5503,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; }
- if (reg_type_may_be_null(regs[BPF_REG_0].type)) + if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen;
if (is_ptr_cast_function(func_id)) { @@ -5621,25 +5604,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", - reg_type_str[type], val); + reg_type_str(env, type), val); return false; }
if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str[type], reg->off); + reg_type_str(env, type), reg->off); return false; }
if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", - reg_type_str[type]); + reg_type_str(env, type)); return false; }
if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "value %lld makes %s pointer be out of bounds\n", - smin, reg_type_str[type]); + smin, reg_type_str(env, type)); return false; }
@@ -6016,11 +5999,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; }
- switch (ptr_reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: + if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; + } + + switch (base_type(ptr_reg->type)) { case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) @@ -6033,10 +6018,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_XDP_SOCK: reject: verbose(env, "R%d pointer arithmetic on %s prohibited\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; default: - if (reg_type_may_be_null(ptr_reg->type)) + if (type_may_be_null(ptr_reg->type)) goto reject; break; } @@ -7714,7 +7699,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id && + if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer @@ -7728,7 +7713,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + } else if (base_type(reg->type) == PTR_TO_MAP_VALUE) { const struct bpf_map *map = reg->map_ptr;
if (map->inner_map_meta) { @@ -7742,21 +7727,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } else { reg->type = PTR_TO_MAP_VALUE; } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; + } else { + reg->type &= ~PTR_MAYBE_NULL; } + if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, @@ -8103,7 +8077,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - reg_type_may_be_null(dst_reg->type)) { + type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ @@ -8332,7 +8306,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (is_subprog) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } return 0; @@ -8393,7 +8367,7 @@ static int check_return_code(struct bpf_verifier_env *env)
if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; }
@@ -9141,7 +9115,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; if (rcur->type == NOT_INIT) return false; - switch (rold->type) { + switch (base_type(rold->type)) { case SCALAR_VALUE: if (env->explore_alu_limits) return false; @@ -9162,6 +9136,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return false; } case PTR_TO_MAP_VALUE: + /* a PTR_TO_MAP_VALUE could be safe to use as a + * PTR_TO_MAP_VALUE_OR_NULL into the same map. + * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- + * checked, doing so could have affected others with the same + * id, and we can't check for that because we lost the id when + * we converted to a PTR_TO_MAP_VALUE. + */ + if (type_may_be_null(rold->type)) { + if (!type_may_be_null(rcur->type)) + return false; + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + return false; + /* Check our ids match any regs they're supposed to */ + return check_ids(rold->id, rcur->id, idmap); + } + /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. * 'id' is not compared, since it's only used for maps with @@ -9173,20 +9163,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_MAP_VALUE_OR_NULL: - /* a PTR_TO_MAP_VALUE could be safe to use as a - * PTR_TO_MAP_VALUE_OR_NULL into the same map. - * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- - * checked, doing so could have affected others with the same - * id, and we can't check for that because we lost the id when - * we converted to a PTR_TO_MAP_VALUE. - */ - if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) - return false; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) - return false; - /* Check our ids match any regs they're supposed to */ - return check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -9215,11 +9191,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -9729,17 +9702,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* Return true if it's OK to have the same insn return a different type. */ static bool reg_type_mismatch_ok(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_CTX: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -9957,7 +9926,7 @@ static int do_check(struct bpf_verifier_env *env) if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; }
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index c907f0dc7f87..f01693d8d73e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -858,7 +858,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 4ea5bc65848f..af34a8ee3dec 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1608,7 +1608,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, },
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit 20b2aff4bc15bda809f994761d5719827d66c0b4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
This patch introduces a flag MEM_RDONLY to tag a reg value that points to read-only memory. It makes the following changes:
1. PTR_TO_RDWR_BUF -> PTR_TO_BUF
2. PTR_TO_RDONLY_BUF -> PTR_TO_BUF | MEM_RDONLY
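To see how two independent flags compose on a single base type, a small standalone sketch (userspace C, illustrative enum value); the string rendering mirrors the prefix/postfix logic that reg_type_str() gains in this patch:

  #include <stdio.h>

  #define BPF_BASE_TYPE_BITS  8
  #define PTR_MAYBE_NULL      (1u << (0 + BPF_BASE_TYPE_BITS))
  #define MEM_RDONLY          (1u << (1 + BPF_BASE_TYPE_BITS))

  enum { PTR_TO_BUF = 12 };  /* illustrative value, not the kernel's */

  int main(void)
  {
          unsigned int t = PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY;

          /* prefix for MEM_RDONLY, postfix for PTR_MAYBE_NULL */
          printf("%s%s%s\n",
                 (t & MEM_RDONLY) ? "rdonly_" : "",
                 "buf",
                 (t & PTR_MAYBE_NULL) ? "_or_null" : "");  /* -> rdonly_buf_or_null */
          return 0;
  }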
Signed-off-by: Hao Luo haoluo@google.com Signed-off-by: Alexei Starovoitov ast@kernel.org Link: https://lore.kernel.org/bpf/20211217003152.48334-6-haoluo@google.com Conflicts: kernel/bpf/verifier.c Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/bpf.h | 8 ++-- kernel/bpf/btf.c | 3 +- kernel/bpf/map_iter.c | 4 +- kernel/bpf/verifier.c | 83 +++++++++++++++++++++++---------------- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 2 +- 6 files changed, 59 insertions(+), 43 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 221661ea6035..c7cc8be46b7d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,7 +268,10 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS),
- __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, + /* MEM is read-only. */ + MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_RDONLY, };
/* Max number of base types. */ @@ -443,8 +446,7 @@ enum bpf_reg_type { * an explicit null check is required for this struct. */ PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ + PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ __BPF_REG_TYPE_MAX,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 538f7adc61c4..9d6c55138571 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4529,8 +4529,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
type = base_type(ctx_arg_info->reg_type); flag = type_flag(ctx_arg_info->reg_type); - if (ctx_arg_info->offset == off && - (type == PTR_TO_RDWR_BUF || type == PTR_TO_RDONLY_BUF) && + if (ctx_arg_info->offset == off && type == PTR_TO_BUF && (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 631f0e44b7a9..b0fa190b0979 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0498f77ba666..161673bee204 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -417,6 +417,11 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) base_type(type) == PTR_TO_MEM; }
+static bool type_is_rdonly_mem(u32 type) +{ + return type & MEM_RDONLY; +} + static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { return type == ARG_PTR_TO_SOCK_COMMON; @@ -485,7 +490,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}; + char postfix[16] = {0}, prefix[16] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "inv", @@ -505,8 +510,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_BUF] = "buf", };
if (type & PTR_MAYBE_NULL) { @@ -517,8 +521,11 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); }
- snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s", - str[base_type(type)], postfix); + if (type & MEM_RDONLY) + strncpy(prefix, "rdonly_", 16); + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); return env->type_str_buf; }
@@ -2200,8 +2207,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_RDONLY_BUF: - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: return true; @@ -3885,22 +3891,27 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (reg->type == PTR_TO_RDONLY_BUF) { - if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); - return -EACCES; + } else if (base_type(reg->type) == PTR_TO_BUF) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + const char *buf_info; + u32 *max_access; + + if (rdonly_mem) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; } + err = check_buffer_access(env, reg, regno, off, size, false, - "rdonly", - &env->prog->aux->max_rdonly_access); - if (!err && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, false, - "rdwr", - &env->prog->aux->max_rdwr_access); - if (!err && t == BPF_READ && value_regno >= 0) + buf_info, max_access); + if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, @@ -4103,8 +4114,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + const char *buf_info; + u32 *max_access;
- switch (reg->type) { + switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, @@ -4120,18 +4133,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); - case PTR_TO_RDONLY_BUF: - if (meta && meta->raw_mode) - return -EACCES; - return check_buffer_access(env, reg, regno, reg->off, - access_size, zero_size_allowed, - "rdonly", - &env->prog->aux->max_rdonly_access); - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) + return -EACCES; + + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; + } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - "rdwr", - &env->prog->aux->max_rdwr_access); + buf_info, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -4334,8 +4349,8 @@ static const struct bpf_reg_types mem_types = { PTR_TO_PACKET_META, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_RDONLY_BUF, - PTR_TO_RDWR_BUF, + PTR_TO_BUF, + PTR_TO_BUF | MEM_RDONLY, }, };
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index f01693d8d73e..5b61e99b8d63 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -858,7 +858,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index af34a8ee3dec..4534cd6be0c8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1608,7 +1608,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, },
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit cf9f2f8d62eca810afbd1ee6cc0800202b000e57 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Remove PTR_TO_MEM_OR_NULL and replace it with PTR_TO_MEM combined with the flag PTR_MAYBE_NULL.
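For illustration only (not part of this patch), the composed form decomposes with the helpers used elsewhere in this series:

  /* sketch: base_type() masks off the flag bits, the flags are
   * tested individually
   */
  u32 type = PTR_TO_MEM | PTR_MAYBE_NULL;

  if (base_type(type) == PTR_TO_MEM && type_may_be_null(type)) {
          /* equivalent to the removed PTR_TO_MEM_OR_NULL */
  }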
Signed-off-by: Hao Luo haoluo@google.com Signed-off-by: Alexei Starovoitov ast@kernel.org Link: https://lore.kernel.org/bpf/20211217003152.48334-7-haoluo@google.com Conflicts: kernel/bpf/btf.c kernel/bpf/verifier.c Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c7cc8be46b7d..3eb4dcb2655a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -456,7 +456,6 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, - PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM,
/* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag.
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit 34d3a78c681e8e7844b43d1a2f4671a04249c821 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Tag the return type of {per, this}_cpu_ptr with MEM_RDONLY. The value returned by this pair of helpers is a kernel object, which cannot be updated by bpf programs. Previously these two helpers returned PTR_TO_MEM for kernel objects of scalar type, which allowed one to directly modify the memory. Now, with the MEM_RDONLY tagging, the verifier will reject programs that write into MEM_RDONLY-tagged memory.
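As a minimal sketch of the new behavior (assuming a percpu ksym such as bpf_prog_active, as in the selftest added later in this series), a program along these lines is now rejected at load time:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern const int bpf_prog_active __ksym;

  SEC("raw_tp/sys_enter")
  int handler(const void *ctx)
  {
          int *p = (int *)bpf_this_cpu_ptr(&bpf_prog_active);

          *p = -1;  /* write into MEM_RDONLY memory, verifier rejects */
          return 0;
  }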
Fixes: 63d9b80dcf2c ("bpf: Introducte bpf_this_cpu_ptr()") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Fixes: 4976b718c355 ("bpf: Introduce pseudo_btf_id") Signed-off-by: Hao Luo haoluo@google.com Signed-off-by: Alexei Starovoitov ast@kernel.org Link: https://lore.kernel.org/bpf/20211217003152.48334-8-haoluo@google.com Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2a04e6eafca5..45e72c262caf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -636,7 +636,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; @@ -649,7 +649,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 161673bee204..c8c71b9250f2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3779,15 +3779,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } - } else if (reg->type == PTR_TO_MEM) { + } else if (base_type(reg->type) == PTR_TO_MEM) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + + if (type_may_be_null(reg->type)) { + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + + if (t == BPF_WRITE && rdonly_mem) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into mem\n", value_regno); return -EACCES; } + err = check_mem_region_access(env, regno, off, size, reg->mem_size, false); - if (!err && t == BPF_READ && value_regno >= 0) + if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -5496,6 +5511,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). + */ + ret_flag &= ~MEM_RDONLY; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } @@ -8147,7 +8169,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->type = aux->btf_var.reg_type; - switch (dst_reg->type) { + switch (base_type(dst_reg->type)) { case PTR_TO_MEM: dst_reg->mem_size = aux->btf_var.mem_size; break; @@ -10149,7 +10171,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, tname, PTR_ERR(ret)); return -EINVAL; } - aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID;
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit 216e3cd2f28dbbf1fe86848e0e29e6693b9f0a20 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Some helper functions may modify their arguments, for example bpf_d_path, bpf_get_stack etc. Previously, their argument types were marked as ARG_PTR_TO_MEM, which is compatible with read-only mem types, such as PTR_TO_RDONLY_BUF. Therefore it was legitimate, but technically incorrect, to modify read-only memory by passing it into one of these helper functions.
This patch tags the bpf_args that are compatible with immutable memory with the MEM_RDONLY flag. Arguments that don't have this flag will only be compatible with mutable memory types, preventing the helper from modifying read-only memory. The bpf_args that have MEM_RDONLY are compatible with both mutable and immutable memory.
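A sketch of the resulting compatibility rule, mirroring the fold added to check_reg_type() below (simplified to a single expected type; not the literal kernel code):

  static bool arg_accepts_reg(u32 arg_type, u32 reg_type)
  {
          /* an arg tagged MEM_RDONLY tolerates both flavors, so the
           * flag is folded away before comparison
           */
          if (arg_type & MEM_RDONLY)
                  reg_type &= ~MEM_RDONLY;

          /* an untagged arg keeps MEM_RDONLY set on read-only
           * registers, so PTR_TO_MEM | MEM_RDONLY won't match
           */
          return reg_type == PTR_TO_MEM;
  }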
Signed-off-by: Hao Luo haoluo@google.com Signed-off-by: Alexei Starovoitov ast@kernel.org Link: https://lore.kernel.org/bpf/20211217003152.48334-9-haoluo@google.com Conflicts: kernel/bpf/btf.c kernel/bpf/helpers.c kernel/bpf/syscall.c kernel/trace/bpf_trace.c net/core/filter.c Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/bpf.h | 4 ++- kernel/bpf/cgroup.c | 2 +- kernel/bpf/helpers.c | 6 ++-- kernel/bpf/ringbuf.c | 2 +- kernel/bpf/verifier.c | 20 +++++++++++-- kernel/trace/bpf_trace.c | 22 +++++++------- net/core/filter.c | 62 ++++++++++++++++++++-------------------- 7 files changed, 67 insertions(+), 51 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3eb4dcb2655a..ea822e084793 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,7 +268,9 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS),
- /* MEM is read-only. */ + /* MEM is read-only. When applied on bpf_arg, it indicates the arg is + * compatible with both mutable and immutable memory. + */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS),
__BPF_TYPE_LAST_FLAG = MEM_RDONLY, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8f53be0558c1..7e1e5051468f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1703,7 +1703,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, };
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 45e72c262caf..4bb5921a7d21 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -499,7 +499,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -527,7 +527,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -599,7 +599,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, };
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f9913bc65ef8..e69d067f7e7f 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -463,7 +463,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c8c71b9250f2..bd2349fb83fc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4365,7 +4365,6 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_BUF, - PTR_TO_BUF | MEM_RDONLY, }, };
@@ -4426,6 +4425,21 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EFAULT; }
+ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY + * + * Same for MAYBE_NULL: + * + * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL + * + * Therefore we fold these flags depending on the arg_type before comparison. + */ + if (arg_type & MEM_RDONLY) + type &= ~MEM_RDONLY; + if (arg_type & PTR_MAYBE_NULL) + type &= ~PTR_MAYBE_NULL; + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -4435,14 +4449,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; }
- verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, type)); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES;
found: - if (type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID) { if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1228e56853e0..b838148d73fc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -342,7 +342,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, };
@@ -545,7 +545,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, };
@@ -754,9 +754,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, };
@@ -771,7 +771,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, };
@@ -795,7 +795,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -956,7 +956,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, };
@@ -1229,7 +1229,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1404,7 +1404,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, };
@@ -1626,7 +1626,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, };
@@ -1680,7 +1680,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/net/core/filter.c b/net/core/filter.c index 42d15942ce90..51514e41025e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1712,7 +1712,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -2020,9 +2020,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2560,7 +2560,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -4177,7 +4177,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, };
@@ -4191,7 +4191,7 @@ const struct bpf_func_proto bpf_skb_output_proto = { .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, };
@@ -4374,7 +4374,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -4400,7 +4400,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, };
@@ -4570,7 +4570,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, };
@@ -4584,7 +4584,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = { .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, };
@@ -5000,7 +5000,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, };
@@ -5034,7 +5034,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, };
@@ -5209,7 +5209,7 @@ static const struct bpf_func_proto bpf_bind_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, };
@@ -5671,7 +5671,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE };
@@ -5681,7 +5681,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE };
@@ -5724,7 +5724,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE };
@@ -5812,7 +5812,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE };
@@ -6037,7 +6037,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6056,7 +6056,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6075,7 +6075,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6112,7 +6112,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6135,7 +6135,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6158,7 +6158,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6177,7 +6177,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6196,7 +6196,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6215,7 +6215,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6528,9 +6528,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, };
@@ -6597,9 +6597,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, };
@@ -6828,7 +6828,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, };
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit 9497c458c10b049438ef6e6ddda898edbc3ec6a8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WRPV CVE: CVE-2022-0500
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
This test verifies that a ksym of non-struct type cannot be directly updated.
Signed-off-by: Hao Luo haoluo@google.com Signed-off-by: Alexei Starovoitov ast@kernel.org Acked-by: Andrii Nakryiko andrii@kernel.org Link: https://lore.kernel.org/bpf/20211217003152.48334-10-haoluo@google.com Conflicts: tools/testing/selftests/bpf/prog_tests/ksyms_btf.c Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../selftests/bpf/prog_tests/ksyms_btf.c | 14 +++++++++ .../bpf/progs/test_ksyms_btf_write_check.c | 29 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c
diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index b58b775d19f3..97f38d4f6a26 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -6,6 +6,7 @@ #include <bpf/btf.h> #include "test_ksyms_btf.skel.h" #include "test_ksyms_btf_null_check.skel.h" +#include "test_ksyms_btf_write_check.skel.h"
static int duration;
@@ -81,6 +82,16 @@ static void test_null_check(void) test_ksyms_btf_null_check__destroy(skel); }
+static void test_write_check(void) +{ + struct test_ksyms_btf_write_check *skel; + + skel = test_ksyms_btf_write_check__open_and_load(); + CHECK(skel, "skel_open", "unexpected load of a prog writing to ksym memory\n"); + + test_ksyms_btf_write_check__destroy(skel); +} + void test_ksyms_btf(void) { int percpu_datasec; @@ -106,4 +117,7 @@ void test_ksyms_btf(void)
if (test__start_subtest("null_check")) test_null_check(); + + if (test__start_subtest("write_check")) + test_write_check(); } diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c new file mode 100644 index 000000000000..2180c41cd890 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google */ + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> + +extern const int bpf_prog_active __ksym; /* int type global var. */ + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + int *active; + __u32 cpu; + + cpu = bpf_get_smp_processor_id(); + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* Kernel memory obtained from bpf_{per,this}_cpu_ptr + * is read-only, should _not_ pass verification. + */ + /* WRITE_ONCE */ + *(volatile int *)active = -1; + } + + return 0; +} + +char _license[] SEC("license") = "GPL";
From: Daniel Borkmann daniel@iogearbox.net
mainline inclusion from mainline-v5.17-rc1 commit be80a1d3f9dbe5aee79a325964f7037fe2d92f30 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WT90 CVE: CVE-2021-4204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Generalize the check_ctx_reg() helper function into a more generically named one so that it can be reused for other register types as well, to check whether their offset is non-zero. No functional change.
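The two conditions the helper enforces are unchanged (shown here as a sketch, using the fields visible in the diff below):

  if (reg->off)                   /* constant offset was added */
          return -EACCES;
  if (!tnum_is_const(reg->var_off) || reg->var_off.value)
          return -EACCES;         /* variable offset was added */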
Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: John Fastabend john.fastabend@gmail.com Acked-by: Alexei Starovoitov ast@kernel.org Conflicts: include/linux/bpf_verifier.h kernel/bpf/btf.c Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/bpf_verifier.h | 4 ++-- kernel/bpf/btf.c | 2 +- kernel/bpf/verifier.c | 21 +++++++++++---------- 3 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index efc26c4b5f7b..2f6a60c76fbd 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -469,8 +469,8 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
-int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno);
/* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9d6c55138571..ce69bb25ede8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5207,7 +5207,7 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } - if (check_ctx_reg(env, ®[i + 1], i + 1)) + if (check_ptr_off_reg(env, ®[i + 1], i + 1)) goto out; continue; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bd2349fb83fc..f6691c77df03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3359,16 +3359,16 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif
-int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { - /* Access to ctx or passing it to a helper is only allowed in - * its original, unmodified form. + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. */
if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", - regno, reg->off); + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); return -EACCES; }
@@ -3376,7 +3376,8 @@ int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; }
@@ -3814,7 +3815,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; }
- err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err;
@@ -4533,7 +4534,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err;
if (type == PTR_TO_CTX) { - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; } @@ -8301,7 +8302,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; }
- err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + err = check_ptr_off_reg(env, ®s[ctx_reg], ctx_reg); if (err < 0) return err;
From: Daniel Borkmann daniel@iogearbox.net
mainline inclusion from mainline-v5.17-rc1 commit d400a6cf1c8a57cdf10f35220ead3284320d85ff category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WT90 CVE: CVE-2021-4204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
As with other pointer types where we use ldimm64, clear the register content to zero first, and then populate the PTR_TO_FUNC type and subprogno number. Currently this is not done, which leads to reuse of stale register tracking data.
Given that for special ldimm64 cases we always clear the register offset, make this common for all cases so it won't be forgotten in the future.
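A sketch of the resulting order of operations for the PTR_TO_FUNC case (illustrative only; 'subprogno' stands for the resolved subprogram number):

  /* clear stale tracking data (bounds, id, off) first ... */
  mark_reg_known_zero(env, regs, insn->dst_reg);

  /* ... then assign the new type and its payload */
  dst_reg->type = PTR_TO_FUNC;
  dst_reg->subprogno = subprogno;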
Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: John Fastabend john.fastabend@gmail.com Acked-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6691c77df03..c9804aee021d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8180,9 +8180,13 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; }
- if (insn->src_reg == BPF_PSEUDO_BTF_ID) { - mark_reg_known_zero(env, regs, insn->dst_reg); + /* All special src_reg cases are listed below. From this point onwards + * we either succeed and assign a corresponding dst_reg->type after + * zeroing the offset, or fail and reject the program. + */ + mark_reg_known_zero(env, regs, insn->dst_reg);
+ if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; switch (base_type(dst_reg->type)) { case PTR_TO_MEM: @@ -8200,7 +8204,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) }
map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map;
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
From: Daniel Borkmann daniel@iogearbox.net
mainline inclusion from mainline-v5.17-rc1 commit 6788ab23508bddb0a9d88e104284922cb2c22b77 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WT90 CVE: CVE-2021-4204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Right now the assertion in check_ptr_off_reg() is only enforced for the register type PTR_TO_CTX (and open coded also for PTR_TO_BTF_ID). However, this is insufficient since many other PTR_TO_* register types, such as PTR_TO_FUNC, do not handle/expect register offsets when passed to helper functions.
Given this can slip through easily when adding new types, make this an explicit allow-list and reject all other current and future types by default.
Also, extend check_ptr_off_reg() to handle PTR_TO_BTF_ID as well instead of duplicating the check. For PTR_TO_BTF_ID, reg->off is used by BTF to match the expected BTF id when a struct offset is used. This part still needs to be allowed, but a dynamic offset from the tnum must be rejected.
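A sketch of the resulting policy (the fixed_off_ok parameter appears in the diff below):

  /* PTR_TO_BTF_ID: a fixed reg->off is allowed since BTF uses it to
   * match struct members; a variable offset is still rejected
   */
  err = __check_ptr_off_reg(env, reg, regno, type == PTR_TO_BTF_ID);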
Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: John Fastabend john.fastabend@gmail.com Acked-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/bpf/verifier.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c9804aee021d..16a9461b95fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3359,14 +3359,15 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif
-int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper * is only allowed in its original, unmodified form. */
- if (reg->off) { + if (!fixed_off_ok && reg->off) { verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", reg_type_str(env, reg->type), regno, reg->off); return -EACCES; @@ -3384,6 +3385,12 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, return 0; }
+int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -4473,12 +4480,6 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, kernel_type_name(*arg_btf_id)); return -EACCES; } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } }
return 0; @@ -4533,10 +4534,25 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err;
- if (type == PTR_TO_CTX) { - err = check_ptr_off_reg(env, reg, regno); + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + break; + /* All the rest must be rejected: */ + default: + err = __check_ptr_off_reg(env, reg, regno, + type == PTR_TO_BTF_ID); if (err < 0) return err; + break; }
skip_type_check:
From: Daniel Borkmann daniel@iogearbox.net
mainline inclusion from mainline-v5.17-rc1 commit 64620e0a1e712a778095bd35cbb277dc2259281f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WT90 CVE: CVE-2021-4204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Both bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument. They both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL.
Meaning, after a NULL check in the code, the verifier will promote the register type in the non-NULL branch to a PTR_TO_MEM and in the NULL branch to a known zero scalar. Generally, pointer arithmetic on PTR_TO_MEM is allowed, so the PTR_TO_MEM register could end up carrying an offset.
The ARG_PTR_TO_ALLOC_MEM type expects a PTR_TO_MEM register. However, the non-NULL result from bpf_ringbuf_reserve() must be fed into either bpf_ringbuf_submit() or bpf_ringbuf_discard() with its original, unmodified offset, since the kernel then reads out the struct bpf_ringbuf_hdr mapping located right before the sample.
The verifier failed to enforce a zero offset, so an out-of-bounds access could be triggered, which could be used to escalate privileges if unprivileged BPF were enabled (it is disabled by default in the kernel).
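For reference, a simplified sketch of why the offset matters, modeled on the commit path in kernel/bpf/ringbuf.c: the record header sits immediately before the sample pointer, so any non-zero offset shifts the header access out of bounds.

  /* simplified from bpf_ringbuf_commit() */
  struct bpf_ringbuf_hdr *hdr = sample - BPF_RINGBUF_HDR_SZ;
  /* with sample += <offset>, this header access lands out of bounds */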
Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: tr3e.wang@gmail.com (SecCoder Security Lab) Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: John Fastabend john.fastabend@gmail.com Acked-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 16a9461b95fb..affd3854aef3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4545,9 +4545,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type == ARG_PTR_TO_ALLOC_MEM) + goto force_off_check; break; /* All the rest must be rejected: */ default: +force_off_check: err = __check_ptr_off_reg(env, reg, regno, type == PTR_TO_BTF_ID); if (err < 0)
From: Daniel Borkmann daniel@iogearbox.net
mainline inclusion from mainline-v5.17-rc1 commit a672b2e36a648afb04ad3bda93b6bda947a479a5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WT90 CVE: CVE-2021-4204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Both bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument, and thus both expect the result from a prior bpf_ringbuf_reserve() call, which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL.
While the non-NULL memory from bpf_ringbuf_reserve() can be passed to other helpers, the two sinks (bpf_ringbuf_submit(), bpf_ringbuf_discard()) right now only enforce a register type of PTR_TO_MEM.
This can lead to potential type confusion, since it would allow other PTR_TO_MEM memory that did not come from bpf_ringbuf_reserve() to be passed into the two sinks.
Add a new MEM_ALLOC composable type attribute for PTR_TO_MEM, and enforce that:
- bpf_ringbuf_reserve() returns NULL or PTR_TO_MEM | MEM_ALLOC
- bpf_ringbuf_submit() and bpf_ringbuf_discard() only take
  PTR_TO_MEM | MEM_ALLOC but not plain PTR_TO_MEM arguments via
  ARG_PTR_TO_ALLOC_MEM
- however, other helpers might treat PTR_TO_MEM | MEM_ALLOC as plain
  PTR_TO_MEM to populate the memory area when they use
  ARG_PTR_TO_{UNINIT_,}MEM in their func proto description
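A minimal BPF C sketch of the enforced flow (assuming a hypothetical ringbuf map 'rb'; memory of plain PTR_TO_MEM type, e.g. from bpf_this_cpu_ptr(), is now rejected by the two sinks):

  void *sample = bpf_ringbuf_reserve(&rb, 8, 0);  /* PTR_TO_MEM | MEM_ALLOC */

  if (!sample)
          return 0;
  /* ok: MEM_ALLOC matches ARG_PTR_TO_ALLOC_MEM */
  bpf_ringbuf_submit(sample, 0);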
Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: John Fastabend john.fastabend@gmail.com Acked-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/bpf.h | 9 +++++++-- kernel/bpf/verifier.c | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea822e084793..5443178fffbb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,7 +273,12 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS),
- __BPF_TYPE_LAST_FLAG = MEM_RDONLY, + /* MEM was "allocated" from a different helper, and cannot be mixed + * with regular non-MEM_ALLOC'ed MEM types. + */ + MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_ALLOC, };
/* Max number of base types. */ @@ -352,7 +357,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID,
/* This must be the last entry. Its purpose is to ensure the enum is diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index affd3854aef3..ac38a3dac298 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -523,6 +523,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
if (type & MEM_RDONLY) strncpy(prefix, "rdonly_", 16); + if (type & MEM_ALLOC) + strncpy(prefix, "alloc_", 16);
snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -4372,6 +4374,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_PACKET_META, PTR_TO_MAP_VALUE, PTR_TO_MEM, + PTR_TO_MEM | MEM_ALLOC, PTR_TO_BUF, }, }; @@ -4388,7 +4391,7 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -4542,6 +4545,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK:
From: Gilad Reti gilad.reti@gmail.com
mainline inclusion from mainline-v5.11-rc5 commit 4237e9f4a96228ccc8a7abe5e4b30834323cd353 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WT90 CVE: CVE-2021-4204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Add a test to check that the verifier is able to recognize spilling of PTR_TO_MEM registers, by reserving a ringbuf buffer, forcing the spill of a pointer holding the buffer address to the stack, filling it back in from the stack and writing to the memory area pointed to by it.
The patch was partially contributed by CyberArk Software, Inc.
Signed-off-by: Gilad Reti gilad.reti@gmail.com Signed-off-by: Alexei Starovoitov ast@kernel.org Acked-by: Yonghong Song yhs@fb.com Acked-by: KP Singh kpsingh@kernel.org Link: https://lore.kernel.org/bpf/20210113053810.13518-2-gilad.reti@gmail.com Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/testing/selftests/bpf/test_verifier.c | 12 +++++++- .../selftests/bpf/verifier/spill_fill.c | 30 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index a4c55fcb0e7b..0fc813235575 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -50,7 +50,7 @@ #define MAX_INSNS BPF_MAXINSNS #define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 -#define MAX_NR_MAPS 20 +#define MAX_NR_MAPS 21 #define MAX_TEST_RUNS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 @@ -87,6 +87,7 @@ struct bpf_test { int fixup_sk_storage_map[MAX_FIXUPS]; int fixup_map_event_output[MAX_FIXUPS]; int fixup_map_reuseport_array[MAX_FIXUPS]; + int fixup_map_ringbuf[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; uint32_t insn_processed; @@ -640,6 +641,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, int *fixup_sk_storage_map = test->fixup_sk_storage_map; int *fixup_map_event_output = test->fixup_map_event_output; int *fixup_map_reuseport_array = test->fixup_map_reuseport_array; + int *fixup_map_ringbuf = test->fixup_map_ringbuf;
if (test->fill_helper) { test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); @@ -817,6 +819,14 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, fixup_map_reuseport_array++; } while (*fixup_map_reuseport_array); } + if (*fixup_map_ringbuf) { + map_fds[20] = create_map(BPF_MAP_TYPE_RINGBUF, 0, + 0, 4096); + do { + prog[*fixup_map_ringbuf].imm = map_fds[20]; + fixup_map_ringbuf++; + } while (*fixup_map_ringbuf); + } }
struct libcap { diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index 45d43bf82f26..0b943897aaf6 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -28,6 +28,36 @@ .result = ACCEPT, .result_unpriv = ACCEPT, }, +{ + "check valid spill/fill, ptr to mem", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = ACCEPT, + .result_unpriv = ACCEPT, +}, { "check corrupted spill/fill", .insns = {
From: Daniel Borkmann daniel@iogearbox.net
mainline inclusion from mainline-v5.17-rc1 commit 722e4db3ae0d52b2e3801280afbe19cf2d188e91 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WT90 CVE: CVE-2021-4204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Assert that the verifier is rejecting invalid offsets on the ringbuf entries:
# ./test_verifier | grep ring
#947/u ringbuf: invalid reservation offset 1 OK
#947/p ringbuf: invalid reservation offset 1 OK
#948/u ringbuf: invalid reservation offset 2 OK
#948/p ringbuf: invalid reservation offset 2 OK
Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: John Fastabend john.fastabend@gmail.com Acked-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../testing/selftests/bpf/verifier/ringbuf.c | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 tools/testing/selftests/bpf/verifier/ringbuf.c
diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c new file mode 100644 index 000000000000..68cae6947cc4 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/ringbuf.c @@ -0,0 +1,64 @@ +{ + "ringbuf: invalid reservation offset 1", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xcafe), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "dereference of modified mem ptr R1", +}, +{ + "ringbuf: invalid reservation offset 2", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* add invalid offset to reserved ringbuf memory */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0xcafe), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "R7 min value is outside of the allowed memory range", +},
From: Hao Luo haoluo@google.com
mainline inclusion from mainline-v5.17-rc1 commit 44bab87d8ca6f0544a9f8fc97bdf33aa5b3c899e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4WT90 CVE: CVE-2021-4204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The second parameter of bpf_d_path() can only accept writable memory. Rdonly_mem obtained from bpf_per_cpu_ptr() cannot be passed into bpf_d_path() for modification. This patch adds a selftest to verify this behavior.
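For contrast, a buffer the verifier does accept is writable program stack. The sketch below shows the accepted counterpart of the failing program added in the diff; it is illustrative only and not part of this patch (the section, program name, and buffer size are assumptions, and it presumes the usual selftests build environment with vmlinux.h and libbpf headers):

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical accepted counterpart: bpf_d_path() writing into a
 * writable stack buffer instead of read-only per-cpu memory.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("fentry/security_inode_getattr")
int BPF_PROG(d_path_writable_buf, struct path *path)
{
	/* PTR_TO_STACK is writable memory, so the helper may store here */
	char buf[64];

	bpf_d_path(path, buf, sizeof(buf));
	return 0;
}

char _license[] SEC("license") = "GPL";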
Signed-off-by: Hao Luo haoluo@google.com
Signed-off-by: Andrii Nakryiko andrii@kernel.org
Acked-by: Yonghong Song yhs@fb.com
Link: https://lore.kernel.org/bpf/20220106205525.2116218-1-haoluo@google.com
Signed-off-by: Pu Lehui pulehui@huawei.com
Reviewed-by: Kuohai Xu xukuohai@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 .../testing/selftests/bpf/prog_tests/d_path.c | 22 ++++++++++++++-
 .../bpf/progs/test_d_path_check_rdonly_mem.c  | 28 +++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c

diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c
index 0a577a248d34..8c8bc495e7f4 100644
--- a/tools/testing/selftests/bpf/prog_tests/d_path.c
+++ b/tools/testing/selftests/bpf/prog_tests/d_path.c
@@ -9,6 +9,7 @@
 #define MAX_FILES	7
 
 #include "test_d_path.skel.h"
+#include "test_d_path_check_rdonly_mem.skel.h"
 
 static int duration;
 
@@ -99,7 +100,7 @@ static int trigger_fstat_events(pid_t pid)
 	return ret;
 }
 
-void test_d_path(void)
+static void test_d_path_basic(void)
 {
 	struct test_d_path__bss *bss;
 	struct test_d_path *skel;
@@ -155,3 +156,22 @@ void test_d_path(void)
 cleanup:
 	test_d_path__destroy(skel);
 }
+
+static void test_d_path_check_rdonly_mem(void)
+{
+	struct test_d_path_check_rdonly_mem *skel;
+
+	skel = test_d_path_check_rdonly_mem__open_and_load();
+	CHECK(skel, "skel_open", "unexpected_load_overwriting_rdonly_mem");
+
+	test_d_path_check_rdonly_mem__destroy(skel);
+}
+
+void test_d_path(void)
+{
+	if (test__start_subtest("basic"))
+		test_d_path_basic();
+
+	if (test__start_subtest("check_rdonly_mem"))
+		test_d_path_check_rdonly_mem();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c
new file mode 100644
index 000000000000..27c27cff6a3a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Google */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+extern const int bpf_prog_active __ksym;
+
+SEC("fentry/security_inode_getattr")
+int BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat,
+	     __u32 request_mask, unsigned int query_flags)
+{
+	void *active;
+	__u32 cpu;
+
+	cpu = bpf_get_smp_processor_id();
+	active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
+	if (active) {
+		/* FAIL here! 'active' points to readonly memory. bpf helpers
+		 * that update its arguments can not write into it.
+		 */
+		bpf_d_path(path, active, sizeof(int));
+	}
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
From: Daniel Borkmann daniel@iogearbox.net
mainline inclusion
from mainline-v5.17-rc1
commit 37c8d4807d1b8b521b30310dce97f6695dc2c2c6
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4WT90
CVE: CVE-2021-4204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Add two tests: one which asserts that ring buffer memory can be passed to other helpers for populating its entry area, and another where the verifier rejects a different type of memory being passed to bpf_ringbuf_submit().
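In restricted C, the contract exercised by the new ACCEPT test looks roughly as follows (an assumed equivalent for illustration only; the actual test in the diff is written with raw instruction macros, the map and program names here are made up, and it presumes a kernel carrying the mainline fix these tests accompany):

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 1 << 12);
} rb SEC(".maps");

SEC("xdp")
int rb_fill_via_helper(struct xdp_md *ctx)
{
	struct bpf_fib_lookup *params;

	/* reserve an entry in the ring buffer */
	params = bpf_ringbuf_reserve(&rb, sizeof(*params), 0);
	if (!params)
		return XDP_PASS;

	/* a helper may populate the reserved entry in place */
	bpf_fib_lookup(ctx, params, sizeof(*params), 0);

	/* the unmodified reservation pointer must be submitted */
	bpf_ringbuf_submit(params, 0);
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";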
Signed-off-by: Daniel Borkmann daniel@iogearbox.net
Acked-by: John Fastabend john.fastabend@gmail.com
Acked-by: Alexei Starovoitov ast@kernel.org
Signed-off-by: Pu Lehui pulehui@huawei.com
Reviewed-by: Kuohai Xu xukuohai@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 .../testing/selftests/bpf/prog_tests/d_path.c | 14 ++++++++
 .../bpf/progs/test_d_path_check_types.c       | 32 ++++++++++++++++++
 .../testing/selftests/bpf/verifier/ringbuf.c  | 33 ++++++++++++++++++-
 3 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_types.c

diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c
index 8c8bc495e7f4..85f1386a97e6 100644
--- a/tools/testing/selftests/bpf/prog_tests/d_path.c
+++ b/tools/testing/selftests/bpf/prog_tests/d_path.c
@@ -10,6 +10,7 @@
 
 #include "test_d_path.skel.h"
 #include "test_d_path_check_rdonly_mem.skel.h"
+#include "test_d_path_check_types.skel.h"
 
 static int duration;
 
@@ -167,6 +168,16 @@ static void test_d_path_check_rdonly_mem(void)
 	test_d_path_check_rdonly_mem__destroy(skel);
 }
 
+static void test_d_path_check_types(void)
+{
+	struct test_d_path_check_types *skel;
+
+	skel = test_d_path_check_types__open_and_load();
+	CHECK(skel, "skel_open", "unexpected_load_passing_wrong_type");
+
+	test_d_path_check_types__destroy(skel);
+}
+
 void test_d_path(void)
 {
 	if (test__start_subtest("basic"))
@@ -174,4 +185,7 @@ void test_d_path(void)
 
 	if (test__start_subtest("check_rdonly_mem"))
 		test_d_path_check_rdonly_mem();
+
+	if (test__start_subtest("check_alloc_mem"))
+		test_d_path_check_types();
 }
diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_types.c b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c
new file mode 100644
index 000000000000..7e02b7361307
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_d_path_check_types.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+extern const int bpf_prog_active __ksym;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, 1 << 12);
+} ringbuf SEC(".maps");
+
+SEC("fentry/security_inode_getattr")
+int BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat,
+	     __u32 request_mask, unsigned int query_flags)
+{
+	void *active;
+	u32 cpu;
+
+	cpu = bpf_get_smp_processor_id();
+	active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
+	if (active) {
+		/* FAIL here! 'active' points to 'regular' memory. It
+		 * cannot be submitted to ring buffer.
+		 */
+		bpf_ringbuf_submit(active, 0);
+	}
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/verifier/ringbuf.c b/tools/testing/selftests/bpf/verifier/ringbuf.c
index 68cae6947cc4..b64d33e4833c 100644
--- a/tools/testing/selftests/bpf/verifier/ringbuf.c
+++ b/tools/testing/selftests/bpf/verifier/ringbuf.c
@@ -28,7 +28,7 @@
 	},
 	.fixup_map_ringbuf = { 1 },
 	.result = REJECT,
-	.errstr = "dereference of modified mem ptr R1",
+	.errstr = "dereference of modified alloc_mem ptr R1",
 },
 {
 	"ringbuf: invalid reservation offset 2",
@@ -62,3 +62,34 @@
 	.result = REJECT,
 	.errstr = "R7 min value is outside of the allowed memory range",
 },
+{
+	"ringbuf: check passing rb mem to helpers",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+	/* reserve 8 byte ringbuf memory */
+	BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_MOV64_IMM(BPF_REG_2, 8),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
+	BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+	/* check whether the reservation was successful */
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	/* pass allocated ring buffer memory to fib lookup */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+	BPF_MOV64_IMM(BPF_REG_3, 8),
+	BPF_MOV64_IMM(BPF_REG_4, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_fib_lookup),
+	/* submit the ringbuf memory */
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_ringbuf = { 2 },
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.result = ACCEPT,
+},
From: Laibin Qiu qiulaibin@huawei.com
hulk inclusion
category: bugfix
bugzilla: 185779, https://gitee.com/openeuler/kernel/issues/I4WFIY
CVE: NA
-------------------------------------------------
1. In the current flow, every bio gets the BIO_THROTTLED flag set after __blk_throtl_bio().

2. If a bio needs to be throttled, the dispatch timer is started and the bio is not submitted directly; it is submitted from blk_throtl_dispatch_work_fn() once the timer expires. In the current flow, however, BIO_THROTTLED is only set on the bio after the timer has been started. If the bio has already been dispatched and completed by then, setting the flag touches freed memory, causing the use-after-free below.
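Put differently, the pre-patch ordering permits this interleaving (a simplified sketch using the function names from the traces below, not literal kernel source):

/*
 * CPU 0: blk_throtl_bio()               CPU 1: throttle dispatch work
 * -------------------------------       ------------------------------
 * spin_lock_irq(&q->queue_lock);
 * // bio queued to tg, timer armed
 * spin_unlock_irq(&q->queue_lock);
 *                                       blk_throtl_dispatch_work_fn()
 *                                       // submits the queued bio
 *                                       bio_endio() -> bio_put()
 *                                       // bio freed
 * bio_set_flag(bio, BIO_THROTTLED);
 * // writes to freed bio: use-after-free
 */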
BUG: KASAN: use-after-free in blk_throtl_bio+0x12f0/0x2c70
Read of size 2 at addr ffff88801b8902d4 by task fio/26380

 dump_stack+0x9b/0xce
 print_address_description.constprop.6+0x3e/0x60
 kasan_report.cold.9+0x22/0x3a
 blk_throtl_bio+0x12f0/0x2c70
 submit_bio_checks+0x701/0x1550
 submit_bio_noacct+0x83/0xc80
 submit_bio+0xa7/0x330
 mpage_readahead+0x380/0x500
 read_pages+0x1c1/0xbf0
 page_cache_ra_unbounded+0x471/0x6f0
 do_page_cache_ra+0xda/0x110
 ondemand_readahead+0x442/0xae0
 page_cache_async_ra+0x210/0x300
 generic_file_buffered_read+0x4d9/0x2130
 generic_file_read_iter+0x315/0x490
 blkdev_read_iter+0x113/0x1b0
 aio_read+0x2ad/0x450
 io_submit_one+0xc8e/0x1d60
 __se_sys_io_submit+0x125/0x350
 do_syscall_64+0x2d/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Allocated by task 26380:
 kasan_save_stack+0x19/0x40
 __kasan_kmalloc.constprop.2+0xc1/0xd0
 kmem_cache_alloc+0x146/0x440
 mempool_alloc+0x125/0x2f0
 bio_alloc_bioset+0x353/0x590
 mpage_alloc+0x3b/0x240
 do_mpage_readpage+0xddf/0x1ef0
 mpage_readahead+0x264/0x500
 read_pages+0x1c1/0xbf0
 page_cache_ra_unbounded+0x471/0x6f0
 do_page_cache_ra+0xda/0x110
 ondemand_readahead+0x442/0xae0
 page_cache_async_ra+0x210/0x300
 generic_file_buffered_read+0x4d9/0x2130
 generic_file_read_iter+0x315/0x490
 blkdev_read_iter+0x113/0x1b0
 aio_read+0x2ad/0x450
 io_submit_one+0xc8e/0x1d60
 __se_sys_io_submit+0x125/0x350
 do_syscall_64+0x2d/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 0:
 kasan_save_stack+0x19/0x40
 kasan_set_track+0x1c/0x30
 kasan_set_free_info+0x1b/0x30
 __kasan_slab_free+0x111/0x160
 kmem_cache_free+0x94/0x460
 mempool_free+0xd6/0x320
 bio_free+0xe0/0x130
 bio_put+0xab/0xe0
 bio_endio+0x3a6/0x5d0
 blk_update_request+0x590/0x1370
 scsi_end_request+0x7d/0x400
 scsi_io_completion+0x1aa/0xe50
 scsi_softirq_done+0x11b/0x240
 blk_mq_complete_request+0xd4/0x120
 scsi_mq_done+0xf0/0x200
 virtscsi_vq_done+0xbc/0x150
 vring_interrupt+0x179/0x390
 __handle_irq_event_percpu+0xf7/0x490
 handle_irq_event_percpu+0x7b/0x160
 handle_irq_event+0xcc/0x170
 handle_edge_irq+0x215/0xb20
 common_interrupt+0x60/0x120
 asm_common_interrupt+0x1e/0x40
Fix this by setting BIO_THROTTLED while still holding the queue_lock, so the flag is set before the dispatch work can dequeue, complete, and free the bio.
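Condensed view of the resulting ordering (sketch only; the diff below is the actual change):

/*
 * spin_lock_irq(&q->queue_lock);
 * // bio queued to tg, timer armed
 * bio_set_flag(bio, BIO_THROTTLED);    // flag set under the lock
 * if (locked)
 *	spin_unlock_irq(&q->queue_lock); // dispatch work can only
 *					 // dequeue the bio after this
 */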
Signed-off-by: Laibin Qiu qiulaibin@huawei.com
Reviewed-by: Hou Tao houtao1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/blk-throttle.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 942776a10599..95a13da1f343 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2266,13 +2266,16 @@ bool blk_throtl_bio(struct bio *bio)
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
 	bool throttled = false;
+	bool locked = true;
 	struct throtl_data *td = tg->td;
 
 	rcu_read_lock();
 
 	/* see throtl_charge_bio() */
-	if (bio_flagged(bio, BIO_THROTTLED))
+	if (bio_flagged(bio, BIO_THROTTLED)) {
+		locked = false;
 		goto out;
+	}
 
 	if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
 		blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
@@ -2280,8 +2283,10 @@ bool blk_throtl_bio(struct bio *bio)
 		blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
 	}
 
-	if (!tg->has_rules[rw])
+	if (!tg->has_rules[rw]) {
+		locked = false;
 		goto out;
+	}
 
 	spin_lock_irq(&q->queue_lock);
 
@@ -2336,7 +2341,7 @@ bool blk_throtl_bio(struct bio *bio)
 		sq = sq->parent_sq;
 		tg = sq_to_tg(sq);
 		if (!tg)
-			goto out_unlock;
+			goto out;
 	}
 
 	/* out-of-limit, queue to @tg */
@@ -2364,8 +2369,6 @@ bool blk_throtl_bio(struct bio *bio)
 		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
 	}
 
-out_unlock:
-	spin_unlock_irq(&q->queue_lock);
 out:
 	bio_set_flag(bio, BIO_THROTTLED);
 
@@ -2373,6 +2376,9 @@ bool blk_throtl_bio(struct bio *bio)
 	if (throttled || !td->track_bio_latency)
 		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
 #endif
+	if (locked)
+		spin_unlock_irq(&q->queue_lock);
+
 	rcu_read_unlock();
 	return throttled;
 }