From: Xiaoyao Li <xiaoyao.li@intel.com>
stable inclusion from linux-4.19.207 commit f109e8a1678ce920b7f0df7865f2a31754ec5d1c
--------------------------------
[ Upstream commit c53c6b7409f4cd9e542991b53d597fbe2751d7db ]
Per the SDM, CPUID(0x14,1).EAX[2:0] reports the number of configurable address ranges for filtering, not EAX[1:0].
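[Editorial illustration, not part of the patch: a minimal userspace sketch of what the wider mask changes, assuming an x86 GCC/clang toolchain that provides __get_cpuid_count() from <cpuid.h>. With the old 2-bit mask, a hypothetical CPU reporting e.g. 4 ranges (binary 100) would be misread as 0.]

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

    /* CPUID leaf 0x14, sub-leaf 1: Intel PT capability enumeration */
    if (!__get_cpuid_count(0x14, 1, &eax, &ebx, &ecx, &edx)) {
        fprintf(stderr, "CPUID leaf 0x14 not supported\n");
        return 1;
    }
    printf("EAX[2:0] (correct): %u address ranges\n", eax & 0x7);
    printf("EAX[1:0] (old bug): %u address ranges\n", eax & 0x3);
    return 0;
}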
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Link: https://lkml.kernel.org/r/20210824040622.4081502-1-xiaoyao.li@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/x86/events/intel/pt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index db969f3f175cb..774fb0f0bf6df 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -69,7 +69,7 @@ static struct pt_cap_desc {
 	PT_CAP(topa_multiple_entries,	0, CPUID_ECX, BIT(1)),
 	PT_CAP(single_range_output,	0, CPUID_ECX, BIT(2)),
 	PT_CAP(payloads_lip,		0, CPUID_ECX, BIT(31)),
-	PT_CAP(num_address_ranges,	1, CPUID_EAX, 0x3),
+	PT_CAP(num_address_ranges,	1, CPUID_EAX, 0x7),
 	PT_CAP(mtc_periods,		1, CPUID_EAX, 0xffff0000),
 	PT_CAP(cycle_thresholds,	1, CPUID_EBX, 0xffff),
 	PT_CAP(psb_periods,		1, CPUID_EBX, 0xffff0000),
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
stable inclusion from linux-4.19.207 commit 69178f2d652e64991dc812be24d0776a731f447d
--------------------------------
commit e1e84eb58eb494b77c8389fc6308b5042dcce791 upstream.
As per RFC792, ICMP errors should be sent to the source host.
However, in configurations with Virtual Routing and Forwarding tables, looking up which routing table to use is currently done by using the destination net_device.
commit 9d1a6c4ea43e ("net: icmp_route_lookup should use rt dev to determine L3 domain") changes the interface passed to l3mdev_master_ifindex() and inet_addr_type_dev_table() from skb_in->dev to skb_dst(skb_in)->dev. This effectively uses the destination device rather than the source device for choosing which routing table should be used to lookup where to send the ICMP error.
Therefore, if the source and destination interfaces are within separate VRFs, or one in the global routing table and the other in a VRF, looking up the source host in the destination interface's routing table will fail if the destination interface's routing table contains no route to the source host.
One observable effect of this issue is that traceroute does not work in the following cases:
- Route leaking between global routing table and VRF
- Route leaking between VRFs
Preferably use the source device routing table when sending ICMP error messages. If no source device is set, fall back on the destination device routing table. Else, use the main routing table (index 0).
[ It has been pointed out that a similar issue may exist with ICMP errors triggered when forwarding between network namespaces. It would be worthwhile to investigate, but is outside of the scope of this investigation. ]
[ It has also been pointed out that a similar issue exists with unreachable / fragmentation needed messages, which can be triggered by changing the MTU of eth1 in r1 to 1400 and running:
ip netns exec h1 ping -s 1450 -Mdo -c1 172.16.2.2
Some investigation points to raw_icmp_error() and raw_err() as being involved in this last scenario. The focus of this patch is TTL expired ICMP messages, which go through icmp_route_lookup. Investigation of failure modes related to raw_icmp_error() is beyond this investigation's scope. ]
Fixes: 9d1a6c4ea43e ("net: icmp_route_lookup should use rt dev to determine L3 domain")
Link: https://tools.ietf.org/html/rfc792
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/ipv4/icmp.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 88d676beda399..8d68298b066dc 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -467,6 +467,23 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	local_bh_enable();
 }
 
+/*
+ * The device used for looking up which routing table to use for sending an ICMP
+ * error is preferably the source whenever it is set, which should ensure the
+ * icmp error can be sent to the source host, else lookup using the routing
+ * table of the destination device, else use the main routing table (index 0).
+ */
+static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb)
+{
+	struct net_device *route_lookup_dev = NULL;
+
+	if (skb->dev)
+		route_lookup_dev = skb->dev;
+	else if (skb_dst(skb))
+		route_lookup_dev = skb_dst(skb)->dev;
+	return route_lookup_dev;
+}
+
 static struct rtable *icmp_route_lookup(struct net *net,
 					struct flowi4 *fl4,
 					struct sk_buff *skb_in,
@@ -475,6 +492,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 					int type, int code,
 					struct icmp_bxm *param)
 {
+	struct net_device *route_lookup_dev;
 	struct rtable *rt, *rt2;
 	struct flowi4 fl4_dec;
 	int err;
@@ -489,7 +507,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->flowi4_proto = IPPROTO_ICMP;
 	fl4->fl4_icmp_type = type;
 	fl4->fl4_icmp_code = code;
-	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
+	route_lookup_dev = icmp_get_route_lookup_dev(skb_in);
+	fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev);
 
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
 	rt = ip_route_output_key_hash(net, fl4, skb_in);
@@ -513,7 +532,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 		if (err)
 			goto relookup_failed;
 
-		if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev,
+		if (inet_addr_type_dev_table(net, route_lookup_dev,
 					     fl4_dec.saddr) == RTN_LOCAL) {
 			rt2 = __ip_route_output_key(net, &fl4_dec);
 			if (IS_ERR(rt2))
From: Peter Zijlstra <peterz@infradead.org>
stable inclusion from linux-4.19.207 commit b7a041073f4e72083dcd688a003a59bf9d9c9124
--------------------------------
[ Upstream commit 048661a1f963e9517630f080687d48af79ed784c ]
Yanfei reported that setting HANDOFF should not depend on recomputing @first, only on the state of @first, which would then give:
	if (ww_ctx || !first)
		first = __mutex_waiter_is_first(lock, &waiter);
	if (first)
		__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
But because 'ww_ctx || !first' is basically 'always' and the test for first is relatively cheap, omit that first branch entirely.
Reported-by: Yanfei Xu <yanfei.xu@windriver.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Waiman Long <longman@redhat.com>
Reviewed-by: Yanfei Xu <yanfei.xu@windriver.com>
Link: https://lore.kernel.org/r/20210630154114.896786297@infradead.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/locking/mutex.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 354151fef06ae..fbc62d360419d 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -911,7 +911,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
 {
 	struct mutex_waiter waiter;
-	bool first = false;
 	struct ww_mutex *ww;
 	int ret;
 
@@ -986,6 +985,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 	set_current_state(state);
 	for (;;) {
+		bool first;
+
 		/*
 		 * Once we hold wait_lock, we're serialized against
 		 * mutex_unlock() handing the lock off to us, do a trylock
@@ -1014,15 +1015,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		spin_unlock(&lock->wait_lock);
 		schedule_preempt_disabled();
 
-		/*
-		 * ww_mutex needs to always recheck its position since its waiter
-		 * list is not FIFO ordered.
-		 */
-		if (ww_ctx || !first) {
-			first = __mutex_waiter_is_first(lock, &waiter);
-			if (first)
-				__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
-		}
+		first = __mutex_waiter_is_first(lock, &waiter);
+		if (first)
+			__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
 
 		set_current_state(state);
 		/*
From: Quentin Perret <qperret@google.com>
stable inclusion from linux-4.19.207 commit bc0d543f53e0d73059cf10733be0109984dadd44
--------------------------------
[ Upstream commit f95091536f78971b269ec321b057b8d630b0ad8a ]
It is possible for sched_getattr() to incorrectly report the state of the reset_on_fork flag when called on a deadline task.
Indeed, if the flag was set on a deadline task using sched_setattr() with flags (SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_KEEP_PARAMS), then p->sched_reset_on_fork will be set, but __setscheduler() will bail out early, which means that the dl_se->flags will not get updated by __setscheduler_params()->__setparam_dl(). Consequently, if sched_getattr() is then called on the task, __getparam_dl() will override kattr.sched_flags with the now out-of-date copy in dl_se->flags and report the stale value to userspace.
To fix this, make sure to only copy the flags that are relevant to sched_deadline to and from the dl_se->flags field.
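[Editorial illustration, not part of the patch: a hypothetical userspace reproducer sketch. The struct layout is trimmed to the fields used; the flag values are copied from the upstream uapi headers. SCHED_FLAG_KEEP_PARAMS only exists on v5.3+ kernels, so this exercises the upstream bug rather than the 4.19 backport directly, and it must run as root.]

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr {
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime;
    uint64_t sched_deadline;
    uint64_t sched_period;
};

#define SCHED_DEADLINE           6
#define SCHED_FLAG_RESET_ON_FORK 0x01
#define SCHED_FLAG_KEEP_PARAMS   0x10

int main(void)
{
    struct sched_attr attr = {
        .size           = sizeof(attr),
        .sched_policy   = SCHED_DEADLINE,
        .sched_runtime  = 10 * 1000 * 1000,  /* 10 ms */
        .sched_deadline = 30 * 1000 * 1000,  /* 30 ms */
        .sched_period   = 30 * 1000 * 1000,  /* 30 ms */
    };

    /* Make the calling task SCHED_DEADLINE. */
    if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
        perror("sched_setattr");
        return 1;
    }

    /* Set reset_on_fork while keeping the DL parameters. */
    attr.sched_flags = SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_KEEP_PARAMS;
    if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
        perror("sched_setattr(flags)");
        return 1;
    }

    /* Unfixed kernels print 0x0 here instead of RESET_ON_FORK (0x1). */
    memset(&attr, 0, sizeof(attr));
    if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0)) {
        perror("sched_getattr");
        return 1;
    }
    printf("sched_flags = 0x%llx\n", (unsigned long long)attr.sched_flags);
    return 0;
}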
Signed-off-by: Quentin Perret <qperret@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210727101103.2729607-2-qperret@google.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/deadline.c | 7 ++++---
 kernel/sched/sched.h    | 2 ++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 50a35d7b1eda5..6b1ecc86362b0 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2620,7 +2620,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_runtime = attr->sched_runtime;
 	dl_se->dl_deadline = attr->sched_deadline;
 	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
-	dl_se->flags = attr->sched_flags;
+	dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
 	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
 	dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
 }
@@ -2633,7 +2633,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
 	attr->sched_runtime = dl_se->dl_runtime;
 	attr->sched_deadline = dl_se->dl_deadline;
 	attr->sched_period = dl_se->dl_period;
-	attr->sched_flags = dl_se->flags;
+	attr->sched_flags &= ~SCHED_DL_FLAGS;
+	attr->sched_flags |= dl_se->flags;
 }
 
 /*
@@ -2708,7 +2709,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
 	if (dl_se->dl_runtime != attr->sched_runtime ||
 	    dl_se->dl_deadline != attr->sched_deadline ||
 	    dl_se->dl_period != attr->sched_period ||
-	    dl_se->flags != attr->sched_flags)
+	    dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
 		return true;
 
 	return false;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ca2fd2d6171f1..e6238db9dc996 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -218,6 +218,8 @@ static inline int task_has_dl_policy(struct task_struct *p)
  */
 #define SCHED_FLAG_SUGOV	0x10000000
 
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
 static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
 {
 #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
stable inclusion from linux-4.19.207 commit 293fe77dbfe68c5794be4e76c9212003e9304d24
--------------------------------
[ Upstream commit b4da13aa28d4fd0071247b7b41c579ee8a86c81a ]
A missing clock update is causing the following warning:
 rq->clock_update_flags < RQCF_ACT_SKIP
 WARNING: CPU: 112 PID: 2041 at kernel/sched/sched.h:1453 sub_running_bw.isra.0+0x190/0x1a0
 ...
 CPU: 112 PID: 2041 Comm: sugov:112 Tainted: G W 5.14.0-rc1 #1
 Hardware name: WIWYNN Mt.Jade Server System B81.030Z1.0007/Mt.Jade Motherboard, BIOS 1.6.20210526 (SCP: 1.06.20210526) 2021/05/26
 ...
 Call trace:
  sub_running_bw.isra.0+0x190/0x1a0
  migrate_task_rq_dl+0xf8/0x1e0
  set_task_cpu+0xa8/0x1f0
  try_to_wake_up+0x150/0x3d4
  wake_up_q+0x64/0xc0
  __up_write+0xd0/0x1c0
  up_write+0x4c/0x2b0
  cppc_set_perf+0x120/0x2d0
  cppc_cpufreq_set_target+0xe0/0x1a4 [cppc_cpufreq]
  __cpufreq_driver_target+0x74/0x140
  sugov_work+0x64/0x80
  kthread_worker_fn+0xe0/0x230
  kthread+0x138/0x140
  ret_from_fork+0x10/0x18
The task causing this is the `cppc_fie` DL task introduced by commit 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance").
With CONFIG_ACPI_CPPC_CPUFREQ_FIE=y and the schedutil cpufreq governor on a slow-switching system (like this Ampere Altra WIWYNN Mt. Jade Arm server):
DL task `curr=sugov:112` lets `p=cppc_fie` migrate and since the latter is in `non_contending` state, migrate_task_rq_dl() calls
sub_running_bw()->__sub_running_bw()->cpufreq_update_util()-> rq_clock()->assert_clock_updated()
on p.
Fix this by updating the clock for a non_contending task in migrate_task_rq_dl() before calling sub_running_bw().
Reported-by: Bruno Goncalves <bgoncalv@redhat.com>
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20210804135925.3734605-1-dietmar.eggemann@arm.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sched/deadline.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6b1ecc86362b0..6c4f93af15dbf 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1654,6 +1654,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
 	 */
 	raw_spin_lock(&rq->lock);
 	if (p->dl.dl_non_contending) {
+		update_rq_clock(rq);
 		sub_running_bw(&p->dl, &rq->dl);
 		p->dl.dl_non_contending = 0;
 		/*
From: Thomas Gleixner <tglx@linutronix.de>
stable inclusion from linux-4.19.207 commit e6c3fefc6bb11bef1bd8adfe37e0f317303ab751
--------------------------------
[ Upstream commit 627ef5ae2df8eeccb20d5af0e4cfa4df9e61ed28 ]
If __hrtimer_start_range_ns() is invoked with an already armed hrtimer then the timer has to be canceled first and then added back. If the timer is the first expiring timer then on removal the clockevent device is reprogrammed to the next expiring timer to avoid that the pending expiry fires needlessly.
If the new expiry time ends up being the first expiry again, then the clock event device has to be reprogrammed again.

Avoid this by checking whether the timer is the first to expire and, in that case, keep the timer on the current CPU and delay the reprogramming up to the point where the timer has been enqueued again.
Reported-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210713135157.873137732@linutronix.de
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/time/hrtimer.c | 60 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 7 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 0e04b24cec818..32ee24f5142ab 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1020,12 +1020,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
  * remove hrtimer, called with base lock held
  */
 static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+	       bool restart, bool keep_local)
 {
 	u8 state = timer->state;
 
 	if (state & HRTIMER_STATE_ENQUEUED) {
-		int reprogram;
+		bool reprogram;
 
 		/*
 		 * Remove the timer and force reprogramming when high
@@ -1038,8 +1039,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
 		debug_deactivate(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 
+		/*
+		 * If the timer is not restarted then reprogramming is
+		 * required if the timer is local. If it is local and about
+		 * to be restarted, avoid programming it twice (on removal
+		 * and a moment later when it's requeued).
+		 */
 		if (!restart)
 			state = HRTIMER_STATE_INACTIVE;
+		else
+			reprogram &= !keep_local;
 
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
@@ -1093,9 +1102,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 				    struct hrtimer_clock_base *base)
 {
 	struct hrtimer_clock_base *new_base;
+	bool force_local, first;
 
-	/* Remove an active timer from the queue: */
-	remove_hrtimer(timer, base, true);
+	/*
+	 * If the timer is on the local cpu base and is the first expiring
+	 * timer then this might end up reprogramming the hardware twice
+	 * (on removal and on enqueue). To avoid that by prevent the
+	 * reprogram on removal, keep the timer local to the current CPU
+	 * and enforce reprogramming after it is queued no matter whether
+	 * it is the new first expiring timer again or not.
+	 */
+	force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+	force_local &= base->cpu_base->next_timer == timer;
+
+	/*
+	 * Remove an active timer from the queue. In case it is not queued
+	 * on the current CPU, make sure that remove_hrtimer() updates the
+	 * remote data correctly.
+	 *
+	 * If it's on the current CPU and the first expiring timer, then
+	 * skip reprogramming, keep the timer local and enforce
+	 * reprogramming later if it was the first expiring timer. This
+	 * avoids programming the underlying clock event twice (once at
+	 * removal and once after enqueue).
+	 */
+	remove_hrtimer(timer, base, true, force_local);
 
 	if (mode & HRTIMER_MODE_REL)
 		tim = ktime_add_safe(tim, base->get_time());
@@ -1105,9 +1136,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
 	/* Switch the timer base, if necessary: */
-	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+	if (!force_local) {
+		new_base = switch_hrtimer_base(timer, base,
+					       mode & HRTIMER_MODE_PINNED);
+	} else {
+		new_base = base;
+	}
+
+	first = enqueue_hrtimer(timer, new_base, mode);
+	if (!force_local)
+		return first;
 
-	return enqueue_hrtimer(timer, new_base, mode);
+	/*
+	 * Timer was forced to stay on the current CPU to avoid
+	 * reprogramming on removal and enqueue. Force reprogram the
+	 * hardware by evaluating the new first expiring timer.
+	 */
+	hrtimer_force_reprogram(new_base->cpu_base, 1);
+	return 0;
 }
 
 /**
@@ -1168,7 +1214,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
 	base = lock_hrtimer_base(timer, &flags);
 
 	if (!hrtimer_callback_running(timer))
-		ret = remove_hrtimer(timer, base, false);
+		ret = remove_hrtimer(timer, base, false, false);
 
 	unlock_hrtimer_base(timer, &flags);
From: Desmond Cheong Zhi Xi <desmondcheongzx@gmail.com>
stable inclusion from linux-4.19.207 commit 64dd1fbb0bb743ccd2fb420c441f4ca9732f598f
--------------------------------
[ Upstream commit 2f488f698fda820f8e6fa0407630154eceb145d6 ]
There is an existing lock hierarchy of &dev->event_lock --> &fasync_struct.fa_lock --> &f->f_owner.lock from the following call chain:
input_inject_event():
  spin_lock_irqsave(&dev->event_lock, ...);
  input_handle_event():
    input_pass_values():
      input_to_handler():
        evdev_events():
          evdev_pass_values():
            spin_lock(&client->buffer_lock);
            __pass_event():
              kill_fasync():
                kill_fasync_rcu():
                  read_lock(&fa->fa_lock);
                  send_sigio():
                    read_lock_irqsave(&fown->lock, ...);
&dev->event_lock is HARDIRQ-safe, so interrupts have to be disabled while grabbing &fasync_struct.fa_lock, otherwise we invert the lock hierarchy. However, since kill_fasync(), which calls kill_fasync_rcu(), is an exported symbol, it is not necessarily called with interrupts disabled.
As kill_fasync_rcu may be called with interrupts disabled (for example, in the call chain above), we replace calls to read_lock/read_unlock on &fasync_struct.fa_lock in kill_fasync_rcu with read_lock_irqsave/read_unlock_irqrestore.
Signed-off-by: Desmond Cheong Zhi Xi <desmondcheongzx@gmail.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/fcntl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9ed564c1bcffd..bd9fe9b85a855 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1013,13 +1013,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 {
 	while (fa) {
 		struct fown_struct *fown;
+		unsigned long flags;
 
 		if (fa->magic != FASYNC_MAGIC) {
 			printk(KERN_ERR "kill_fasync: bad magic number in "
 			       "fasync_struct!\n");
 			return;
 		}
-		read_lock(&fa->fa_lock);
+		read_lock_irqsave(&fa->fa_lock, flags);
 		if (fa->fa_file) {
 			fown = &fa->fa_file->f_owner;
 			/* Don't send SIGURG to processes which have not set a
@@ -1028,7 +1029,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 			if (!(sig == SIGURG && fown->signum == 0))
 				send_sigio(fown, fa->fa_fd, band);
 		}
-		read_unlock(&fa->fa_lock);
+		read_unlock_irqrestore(&fa->fa_lock, flags);
 		fa = rcu_dereference(fa->fa_next);
 	}
 }
From: Martin KaFai Lau <kafai@fb.com>
stable inclusion from linux-4.19.207 commit 15bf24c5fb7948564397b450ead0441143d42af2
--------------------------------
[ Upstream commit 525e2f9fd0229eb10cb460a9e6d978257f24804e ]
st->bucket stores the current bucket number. st->offset stores the offset, within this bucket, of the sk to be shown by seq_show(). Thus, st->offset only makes sense within the same st->bucket.
These two variables are an optimization for the common no-lseek case. When resuming the seq_file iteration (i.e. seq_start()), tcp_seek_last_pos() tries to continue from the st->offset at bucket st->bucket.
However, it is possible that the bucket pointed to by st->bucket has changed, and st->offset may end up skipping the whole st->bucket without finding an sk. In this case, tcp_seek_last_pos() currently continues to satisfy the offset condition in the next (and incorrect) bucket. Instead, regardless of the offset value, the first sk of the next bucket should be returned. Thus, a "bucket == st->bucket" check is added to tcp_seek_last_pos().
The chance of hitting this is small, and the issue is a decade old, so this is targeted at the next tree.
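[Editorial illustration, not kernel code: a toy model of the resume guard. The saved offset is replayed only while the iterator is still inside the bucket the offset was recorded in; once it falls through to the next bucket, the first entry there is returned.]

#include <stdio.h>

#define NBUCKETS 4

/* entries per bucket; bucket 1 was emptied since the offset was saved */
static const int bucket_len[NBUCKETS] = { 3, 0, 2, 1 };

struct iter { int bucket, idx; };

static int next_entry(struct iter *it)
{
    while (it->bucket < NBUCKETS) {
        if (it->idx < bucket_len[it->bucket]) {
            it->idx++;          /* yield entry it->idx - 1 */
            return 1;
        }
        it->bucket++;           /* fall through to the next bucket */
        it->idx = 0;
    }
    return 0;
}

int main(void)
{
    int saved_bucket = 1, offset = 1;   /* position saved by a prior read */
    struct iter it = { .bucket = saved_bucket, .idx = 0 };
    int ok = next_entry(&it);

    /* The fix: stop replaying the offset once we leave the saved bucket. */
    while (offset-- && ok && it.bucket == saved_bucket)
        ok = next_entry(&it);

    if (ok)     /* prints "resume at bucket 2, entry 0" */
        printf("resume at bucket %d, entry %d\n", it.bucket, it.idx - 1);
    return 0;
}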
Fixes: a8b690f98baf ("tcp: Fix slowness in read /proc/net/tcp")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210701200541.1033917-1-kafai@fb.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/ipv4/tcp_ipv4.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index eb7f770a6443d..f1ad62ad5c742 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2270,6 +2270,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
 static void *tcp_seek_last_pos(struct seq_file *seq)
 {
 	struct tcp_iter_state *st = seq->private;
+	int bucket = st->bucket;
 	int offset = st->offset;
 	int orig_num = st->num;
 	void *rc = NULL;
@@ -2280,7 +2281,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 			break;
 		st->state = TCP_SEQ_STATE_LISTENING;
 		rc = listening_get_next(seq, NULL);
-		while (offset-- && rc)
+		while (offset-- && rc && bucket == st->bucket)
 			rc = listening_get_next(seq, rc);
 		if (rc)
 			break;
@@ -2291,7 +2292,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 		if (st->bucket > tcp_hashinfo.ehash_mask)
 			break;
 		rc = established_get_first(seq);
-		while (offset-- && rc)
+		while (offset-- && rc && bucket == st->bucket)
 			rc = established_get_next(seq, rc);
 	}
From: "Rafael J. Wysocki" rafael.j.wysocki@intel.com
stable inclusion from linux-4.19.207 commit c5c62f4c936407fb734cae64700f20b68273b059
--------------------------------
[ Upstream commit da9f2150684ea684a7ddd6d7f0e38b2bdf43dcd8 ]
It is inconsistent to return PCI_D0 from pci_target_state() instead of the original target state if 'wakeup' is true and the device cannot signal PME from D0.
This only happens when the device cannot signal PME from the original target state and any shallower power states (including D0), and that case is effectively equivalent to the one in which PME signaling is not supported at all. Since the original target state is returned in the latter case, make the function do that in the former one too.
Link: https://lore.kernel.org/linux-pm/3149540.aeNJFYEL58@kreacher/
Fixes: 666ff6f83e1d ("PCI/PM: Avoid using device_may_wakeup() for runtime PM")
Reported-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reported-by: Utkarsh H Patel <utkarsh.h.patel@intel.com>
Reported-by: Koba Ko <koba.ko@canonical.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/pci/pci.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 671b67e5b864f..92b6c96b63a49 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2346,16 +2346,20 @@ static pci_power_t pci_target_state(struct pci_dev *dev, bool wakeup)
 	if (dev->current_state == PCI_D3cold)
 		target_state = PCI_D3cold;
 
-	if (wakeup) {
+	if (wakeup && dev->pme_support) {
+		pci_power_t state = target_state;
+
 		/*
 		 * Find the deepest state from which the device can generate
 		 * PME#.
 		 */
-		if (dev->pme_support) {
-			while (target_state
-			       && !(dev->pme_support & (1 << target_state)))
-				target_state--;
-		}
+		while (state && !(dev->pme_support & (1 << state)))
+			state--;
+
+		if (state)
+			return state;
+		else if (dev->pme_support & 1)
+			return PCI_D0;
 	}
 
 	return target_state;
From: "Rafael J. Wysocki" rafael.j.wysocki@intel.com
stable inclusion from linux-4.19.207 commit d4bcd29b50d1a33111d3bacd7974f0d0bbec484f
--------------------------------
[ Upstream commit 0e00392a895c95c6d12d42158236c8862a2f43f2 ]
PME signaling is only enabled by __pci_enable_wake() if the target device can signal PME from the given target power state (to avoid pointless reconfiguration of the device), but if the hierarchy above the device goes into D3cold, the device itself will end up in D3cold too, so if it can signal PME from D3cold, it should be enabled to do so in __pci_enable_wake().
[Note that if the device does not end up in D3cold and it cannot signal PME from the original target power state, it will not signal PME, so in that case the behavior does not change.]
Link: https://lore.kernel.org/linux-pm/3149540.aeNJFYEL58@kreacher/
Fixes: 5bcc2fb4e815 ("PCI PM: Simplify PCI wake-up code")
Reported-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reported-by: Utkarsh H Patel <utkarsh.h.patel@intel.com>
Reported-by: Koba Ko <koba.ko@canonical.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/pci/pci.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 92b6c96b63a49..7d73ebc920098 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2242,7 +2242,14 @@ static int __pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable
 	if (enable) {
 		int error;
 
-		if (pci_pme_capable(dev, state))
+		/*
+		 * Enable PME signaling if the device can signal PME from
+		 * D3cold regardless of whether or not it can signal PME from
+		 * the current target state, because that will allow it to
+		 * signal PME when the hierarchy above it goes into D3cold and
+		 * the device itself ends up in D3cold as a result of that.
+		 */
+		if (pci_pme_capable(dev, state) || pci_pme_capable(dev, PCI_D3cold))
 			pci_pme_active(dev, true);
 		else
 			ret = 1;
From: Len Baker <len.baker@gmx.com>
stable inclusion from linux-4.19.207 commit bea655491daf39f1934a71bf576bf3499092d3a4
--------------------------------
[ Upstream commit f980d055a0f858d73d9467bb0b570721bbfcdfb8 ]
strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated.
Also, the strnlen() call does not avoid the read overflow in strlcpy() when a non-NUL-terminated string is passed.

So, replace this block with a call to kstrndup(), which avoids this type of overflow and does the same.
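[Editorial illustration, not part of the patch: a userspace sketch of the difference. The BSD-style strlcpy() is written out below since glibc does not ship one, and strndup() plays the role of kstrndup().]

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* BSD-style strlcpy, written out here for the demonstration. */
static size_t demo_strlcpy(char *dst, const char *src, size_t size)
{
    size_t srclen = strlen(src);    /* scans to the NUL: unbounded read */

    if (size) {
        size_t n = srclen < size - 1 ? srclen : size - 1;

        memcpy(dst, src, n);
        dst[n] = '\0';
    }
    return srclen;  /* returning strlen(src) forces the full scan */
}

int main(void)
{
    char not_terminated[4] = { 'a', 'b', 'c', 'd' };    /* no NUL */
    char dst[8];
    char *copy;

    demo_strlcpy(dst, "hello", sizeof(dst));    /* fine: src has a NUL */

    /*
     * demo_strlcpy(dst, not_terminated, sizeof(dst)) would run strlen()
     * off the end of the 4-byte buffer -- the linear read overflow the
     * patch removes. strndup() reads at most 4 bytes and always
     * NUL-terminates the copy, like kstrndup() in the kernel:
     */
    copy = strndup(not_terminated, sizeof(not_terminated));
    if (copy) {
        printf("%s %s\n", dst, copy);   /* prints "hello abcd" */
        free(copy);
    }
    return 0;
}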
Fixes: 066ce6899484d ("cifs: rename cifs_strlcpy_to_host and make it use new functions")
Signed-off-by: Len Baker <len.baker@gmx.com>
Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/cifs/cifs_unicode.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 9986817532b10..7932e20555d2b 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -371,14 +371,9 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
 		if (!dst)
 			return NULL;
 		cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
-			       NO_MAP_UNI_RSVD);
+				NO_MAP_UNI_RSVD);
 	} else {
-		len = strnlen(src, maxlen);
-		len++;
-		dst = kmalloc(len, GFP_KERNEL);
-		if (!dst)
-			return NULL;
-		strlcpy(dst, src, len);
+		dst = kstrndup(src, maxlen, GFP_KERNEL);
 	}
 
 	return dst;
From: Andy Duan <fugang.duan@nxp.com>
stable inclusion from linux-4.19.207 commit 600624ecbe35e0359806b107f247811b1f79ad25
--------------------------------
[ Upstream commit d5c38948448abc2bb6b36dbf85a554bf4748885e ]
The register offset needs to be applied to mapbase as well. dma_tx/rx_request use the physical address of UARTDATA. The register offset is currently only applied to membase (the corresponding virtual address) but not to mapbase.
Fixes: 24b1e5f0e83c ("tty: serial: lpuart: add imx7ulp support")
Reviewed-by: Leonard Crestez <leonard.crestez@nxp.com>
Signed-off-by: Adriana Reus <adriana.reus@nxp.com>
Signed-off-by: Sherry Sun <sherry.sun@nxp.com>
Signed-off-by: Andy Duan <fugang.duan@nxp.com>
Link: https://lore.kernel.org/r/20210819021033.32606-1-sherry.sun@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/tty/serial/fsl_lpuart.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index ee8a5cb61a5f1..c0ebcd34cadb7 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -2161,7 +2161,7 @@ static int lpuart_probe(struct platform_device *pdev)
 		return PTR_ERR(sport->port.membase);
 
 	sport->port.membase += sdata->reg_off;
-	sport->port.mapbase = res->start;
+	sport->port.mapbase = res->start + sdata->reg_off;
 	sport->port.dev = &pdev->dev;
 	sport->port.type = PORT_LPUART;
 	ret = platform_get_irq(pdev, 0);
From: Xiyu Yang <xiyuyang19@fudan.edu.cn>
stable inclusion from linux-4.19.207 commit 5129766a9797ea212087314fafc68268f1bf8515
--------------------------------
[ Upstream commit c66070125837900163b81a03063ddd657a7e9bfb ]
The reference counting issue happens in one exception handling path of cbq_change_class(). When failing to get the tcf_block, the function forgets to drop the reference on "rtab" taken by qdisc_get_rtab(), causing a refcount leak.

Fix this issue by jumping to the "failure" label when getting the tcf_block fails.
Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure")
Signed-off-by: Xiyu Yang <xiyuyang19@fudan.edu.cn>
Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Link: https://lore.kernel.org/r/1630252681-71588-1-git-send-email-xiyuyang19@fudan...
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 net/sched/sch_cbq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index ebc3c8c7e6661..bc62e1b246539 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1616,7 +1616,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
 	if (err) {
 		kfree(cl);
-		return err;
+		goto failure;
 	}
 
 	if (tca[TCA_RATE]) {
From: Nguyen Dinh Phi <phind.uet@gmail.com>
stable inclusion from linux-4.19.207 commit f7bffefa322a3d5a292c0b7a9b93302b392928f6
--------------------------------
commit bb2853a6a421a052268eee00fd5d3f6b3504b2b1 upstream.
The ops->receive_buf() may be accessed concurrently from these two functions. If the driver flushes data to the line discipline receive_buf() method while tiocsti() is waiting for the ops->receive_buf() to finish its work, the data race will happen.
For example:

tty_ioctl                  | tty_ldisc_receive_buf
 ->tiocsti                 |  ->tty_port_default_receive_buf
  ->tty_ldisc_receive_buf  |   ->hci_uart_tty_receive
   ->hci_uart_tty_receive  |    ->h4_recv
    ->h4_recv              |
In this case, the h4 receive buffer will be overwritten by the latecomer, and we will lose the data.

Hence, change the tiocsti() function to use the exclusive lock interface from tty_buffer to avoid the data race.
Reported-by: syzbot+97388eb9d31b997fe1d0@syzkaller.appspotmail.com
Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Nguyen Dinh Phi <phind.uet@gmail.com>
Link: https://lore.kernel.org/r/20210823000641.2082292-1-phind.uet@gmail.com
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/tty/tty_io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index dd9017f98a31e..3025c39ba6b17 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -2175,8 +2175,6 @@ static int tty_fasync(int fd, struct file *filp, int on)
  * Locking:
  *	Called functions take tty_ldiscs_lock
  *	current->signal->tty check is safe without locks
- *
- * FIXME: may race normal receive processing
  */
 
 static int tiocsti(struct tty_struct *tty, char __user *p)
@@ -2192,8 +2190,10 @@ static int tiocsti(struct tty_struct *tty, char __user *p)
 	ld = tty_ldisc_ref_wait(tty);
 	if (!ld)
 		return -EIO;
+	tty_buffer_lock_exclusive(tty->port);
 	if (ld->ops->receive_buf)
 		ld->ops->receive_buf(tty, &ch, &mbz, 1);
+	tty_buffer_unlock_exclusive(tty->port);
 	tty_ldisc_deref(ld);
 	return 0;
 }
From: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
stable inclusion from linux-4.19.207 commit 34b23fc32c05f14fd21caa32544f3720e1527a8d
--------------------------------
commit 1a519dc7a73c977547d8b5108d98c6e769c89f4b upstream.
When running as Xen PV guest, masking MSI-X is a responsibility of the hypervisor. The guest has no write access to the relevant BAR at all - when it tries to, it results in a crash like this:
 BUG: unable to handle page fault for address: ffffc9004069100c
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0003) - permissions violation
 RIP: e030:__pci_enable_msix_range.part.0+0x26b/0x5f0
 e1000e_set_interrupt_capability+0xbf/0xd0 [e1000e]
 e1000_probe+0x41f/0xdb0 [e1000e]
 local_pci_probe+0x42/0x80
 (...)
The recently introduced function msix_mask_all() does not check the global variable pci_msi_ignore_mask, which is set by Xen PV to bypass the masking of MSI[-X] interrupts.
Add the check to make this function XEN PV compatible.
Fixes: 7d5ec3d36123 ("PCI/MSI: Mask all unused MSI-X entries")
Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20210826170342.135172-1-marmarek@invisiblethingsla...
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/pci/msi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 60f52a84fbaf2..219bad4c12641 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -765,6 +765,9 @@ static void msix_mask_all(void __iomem *base, int tsize)
 	u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
 	int i;
 
+	if (pci_msi_ignore_mask)
+		return;
+
 	for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
 		writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
 }
From: Jiong Wang <jiong.wang@netronome.com>
stable inclusion from linux-4.19.207 commit 79aba0ac3df1a604e843780b17c37646e175b4f8
--------------------------------
commit 0bae2d4d62d523f06ff1a8e88ce38b45400acd28 upstream.
The verifier is supposed to support sharing a stack slot allocated to a ptr with SCALAR_VALUE for privileged programs. However, this doesn't happen in some cases.

The reason is that the verifier does not clear the STACK_SPILL slot_type for all bytes; it only clears some of them, while using:
slot_type[0] == STACK_SPILL
as the convention for checking that a slot holds a ptr type.

So, the consequence of the partial clearing of slot_type is that the verifier can treat a partially overwritten ptr slot, which should now be a SCALAR_VALUE slot, still as a ptr slot, and reject some valid programs.

Before this patch, test_xdp_noinline.o under bpf selftests, and bpf_lxc.o and bpf_netdev.o under the Cilium bpf repo, were rejected due to this issue when built with -mattr=+alu32. After this patch, they are all accepted.
There is no processed insn number change before and after this patch on Cilium bpf programs.
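[Editorial illustration, not kernel code: a toy model of the fix. The names mirror the verifier's, but this just shows why a partial write into a spilled-pointer slot must downgrade all eight bytes, given the slot_type[0] == STACK_SPILL convention.]

#include <stdio.h>

enum slot { STACK_INVALID, STACK_SPILL, STACK_MISC };

#define BPF_REG_SIZE 8

static enum slot slot_type[BPF_REG_SIZE];

/* model a store of `size` bytes at byte offset `off` within the slot */
static void stack_write(int off, int size)
{
    int i;

    /* the fix: any write into a spilled-ptr slot downgrades it entirely */
    if (slot_type[0] == STACK_SPILL)
        for (i = 0; i < BPF_REG_SIZE; i++)
            slot_type[i] = STACK_MISC;

    for (i = off; i < off + size; i++)
        slot_type[i] = STACK_MISC;
}

int main(void)
{
    int i;

    for (i = 0; i < BPF_REG_SIZE; i++)
        slot_type[i] = STACK_SPILL;     /* a pointer was spilled here */

    stack_write(1, 2);  /* scalar store clobbers part of the slot */

    /* without the downgrade, slot_type[0] would still say STACK_SPILL */
    printf("slot still looks like a ptr? %s\n",
           slot_type[0] == STACK_SPILL ? "yes (bug)" : "no (fixed)");
    return 0;
}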
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
[OP: adjusted context for 4.19]
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/verifier.c                       |  5 +++
 tools/testing/selftests/bpf/test_verifier.c | 34 +++++++++++++++++++--
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bbfa4cf2f3209..87ca7d21038f9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1043,6 +1043,10 @@ static int check_stack_write(struct bpf_verifier_env *env,
 
 		/* regular write of data into stack destroys any spilled ptr */
 		state->stack[spi].spilled_ptr.type = NOT_INIT;
+		/* Mark slots as STACK_MISC if they belonged to spilled ptr. */
+		if (state->stack[spi].slot_type[0] == STACK_SPILL)
+			for (i = 0; i < BPF_REG_SIZE; i++)
+				state->stack[spi].slot_type[i] = STACK_MISC;
 
 		/* only mark the slot as written if all 8 bytes were written
 		 * otherwise read propagation may incorrectly stop too soon
@@ -1059,6 +1063,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
 		if (reg && register_is_null(reg))
 			type = STACK_ZERO;
 
+		/* Mark slots affected by this stack write. */
 		for (i = 0; i < size; i++)
 			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
 				type;
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index e1e4b6ab83f74..aaee2b4e2fcd2 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -956,14 +956,44 @@ static struct bpf_test tests[] = {
 		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
 		/* mess up with R1 pointer on stack */
 		BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23),
-		/* fill back into R0 should fail */
+		/* fill back into R0 is fine for priv.
+		 * R0 now becomes SCALAR_VALUE.
+		 */
 		BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+		/* Load from R0 should fail. */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8),
 		BPF_EXIT_INSN(),
 		},
 		.errstr_unpriv = "attempt to corrupt spilled",
-		.errstr = "corrupted spill",
+		.errstr = "R0 invalid mem access 'inv",
 		.result = REJECT,
 	},
+	{
+		"check corrupted spill/fill, LSB",
+		.insns = {
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+		BPF_ST_MEM(BPF_H, BPF_REG_10, -8, 0xcafe),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+		BPF_EXIT_INSN(),
+		},
+		.errstr_unpriv = "attempt to corrupt spilled",
+		.result_unpriv = REJECT,
+		.result = ACCEPT,
+		.retval = POINTER_VALUE,
+	},
+	{
+		"check corrupted spill/fill, MSB",
+		.insns = {
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+		BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0x12345678),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+		BPF_EXIT_INSN(),
+		},
+		.errstr_unpriv = "attempt to corrupt spilled",
+		.result_unpriv = REJECT,
+		.result = ACCEPT,
+		.retval = POINTER_VALUE,
+	},
 	{
 		"invalid src register in STX",
 		.insns = {
From: Andrey Ignatov <rdna@fb.com>
stable inclusion from linux-4.19.207 commit caee4103f09e3dcce2ec0d0faf6ff245a6cffbad
--------------------------------
commit 2011fccfb61bbd1d7c8864b2b3ed7012342e9ba3 upstream.
Currently there is a difference in how the verifier checks memory access for helper arguments of type PTR_TO_MAP_VALUE and PTR_TO_STACK with regard to the variable part of the offset.

check_map_access, which is used for PTR_TO_MAP_VALUE, can handle variable offsets just fine, so a BPF program can call a helper like this:
some_helper(map_value_ptr + off, size);
, where the offset is unknown at load time, but is checked by the program to be in a safe range (off >= 0 && off + size < map_value_size).
But this is not the case for check_stack_boundary, which is used for PTR_TO_STACK; the same code with a pointer to stack is rejected by the verifier:
some_helper(stack_value_ptr + off, size);
For example:

 0: (7a) *(u64 *)(r10 -16) = 0
 1: (7a) *(u64 *)(r10 -8) = 0
 2: (61) r2 = *(u32 *)(r1 +0)
 3: (57) r2 &= 4
 4: (17) r2 -= 16
 5: (0f) r2 += r10
 6: (18) r1 = 0xffff888111343a80
 8: (85) call bpf_map_lookup_elem#1
 invalid variable stack read R2 var_off=(0xfffffffffffffff0; 0x4)
Add support for variable offset access to check_stack_boundary so that if the offset is checked by the program to be in a safe range, it is accepted by the verifier.
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
[OP: replace reg_state(env, regno) helper with "cur_regs(env) + regno"]
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Conflicts:
	kernel/bpf/verifier.c
[yyl: adjust context]
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/verifier.c | 77 ++++++++++++++++++++++++++++++-------------
 1 file changed, 54 insertions(+), 23 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 87ca7d21038f9..9cac7cc5fb638 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1795,6 +1795,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 				BPF_SIZE(insn->code), BPF_WRITE, -1, true);
 }
 
+static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
+				  int off, int access_size,
+				  bool zero_size_allowed)
+{
+	struct bpf_reg_state *reg = cur_regs(env) + regno;
+
+	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+		if (tnum_is_const(reg->var_off)) {
+			verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
+				regno, off, access_size);
+		} else {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
+				regno, tn_buf, access_size);
+		}
+		return -EACCES;
+	}
+	return 0;
+}
+
 /* when register 'regno' is passed into function that will read 'access_size'
  * bytes from that pointer, make sure that it's within stack boundary
  * and all elements of stack are initialized.
@@ -1807,7 +1830,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 {
 	struct bpf_reg_state *reg = cur_regs(env) + regno;
 	struct bpf_func_state *state = func(env, reg);
-	int off, i, j, slot, spi;
+	int err, min_off, max_off, i, j, slot, spi;
 
 	if (reg->type != PTR_TO_STACK) {
 		/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -1821,21 +1844,23 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		return -EACCES;
 	}
 
-	/* Only allow fixed-offset stack reads */
-	if (!tnum_is_const(reg->var_off)) {
-		char tn_buf[48];
-
-		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose(env, "invalid variable stack read R%d var_off=%s\n",
-			regno, tn_buf);
-		return -EACCES;
-	}
-	off = reg->off + reg->var_off.value;
-	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
-	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
-		verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
-			regno, off, access_size);
-		return -EACCES;
+	if (tnum_is_const(reg->var_off)) {
+		min_off = max_off = reg->var_off.value + reg->off;
+		err = __check_stack_boundary(env, regno, min_off, access_size,
+					     zero_size_allowed);
+		if (err)
+			return err;
+	} else {
+		min_off = reg->smin_value + reg->off;
+		max_off = reg->umax_value + reg->off;
+		err = __check_stack_boundary(env, regno, min_off, access_size,
+					     zero_size_allowed);
+		if (err)
+			return err;
+		err = __check_stack_boundary(env, regno, max_off, access_size,
+					     zero_size_allowed);
+		if (err)
+			return err;
 	}
 
 	if (meta && meta->raw_mode) {
@@ -1844,10 +1869,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		return 0;
 	}
 
-	for (i = 0; i < access_size; i++) {
+	for (i = min_off; i < max_off + access_size; i++) {
 		u8 *stype;
 
-		slot = -(off + i) - 1;
+		slot = -i - 1;
 		spi = slot / BPF_REG_SIZE;
 		if (state->allocated_stack <= slot)
 			goto err;
@@ -1867,11 +1892,17 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 			state->stack[spi].slot_type[j] = STACK_MISC;
 			goto mark;
 		}
-
-err:
-		verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
-			off, i, access_size);
+err:
+		if (tnum_is_const(reg->var_off)) {
+			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
+				min_off, i - min_off, access_size);
+		} else {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
+				tn_buf, i - min_off, access_size);
+		}
 		return -EACCES;
 mark:
 		/* reading any byte out of 8-byte 'spill_slot' will cause
@@ -1880,7 +1911,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		mark_reg_read(env, &state->stack[spi].spilled_ptr,
 			      state->stack[spi].spilled_ptr.parent);
 	}
-	return update_stack_depth(env, state, off);
+	return update_stack_depth(env, state, min_off);
 }
 
 static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
From: Andrey Ignatov <rdna@fb.com>
stable inclusion from linux-4.19.207 commit 148339cb18ed38a4b1de2269d1a4227fd51376ef
--------------------------------
commit f2bcd05ec7b839ff826d2008506ad2d2dff46a59 upstream.
It's hard to guarantee that the whole memory is marked as initialized on helper return if uninitialized stack is accessed with a variable offset, since the specific bounds are unknown to the verifier. This may cause uninitialized stack leaking.
Reject such an access in check_stack_boundary to prevent possible leaking.
There are no known use-cases for indirect uninitialized stack access with variable offset so it shouldn't break anything.
Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/verifier.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9cac7cc5fb638..3e43fa744b470 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1851,6 +1851,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		if (err)
 			return err;
 	} else {
+		/* Only initialized buffer on stack is allowed to be accessed
+		 * with variable offset. With uninitialized buffer it's hard to
+		 * guarantee that whole memory is marked as initialized on
+		 * helper return since specific bounds are unknown what may
+		 * cause uninitialized stack leaking.
+		 */
+		if (meta && meta->raw_mode)
+			meta = NULL;
+
 		min_off = reg->smin_value + reg->off;
 		max_off = reg->umax_value + reg->off;
 		err = __check_stack_boundary(env, regno, min_off, access_size,
From: Andrey Ignatov <rdna@fb.com>
stable inclusion from linux-4.19.207 commit 14cf676ba6a0fb5e495baf843d750070f627d7e8
--------------------------------
commit 088ec26d9c2da9d879ab73e3f4117f9df6c566ee upstream.
Proper support of indirect stack access with variable offset in unprivileged mode (!root) requires corresponding support in Spectre masking for stack ALU in retrieve_ptr_limit().
There are no use-cases for variable offset in unprivileged mode, though, so make the verifier reject such accesses for simplicity.

Pointer arithmetic is one (and the only?) way to produce a variable offset, and it is already rejected in unpriv mode, so the verifier won't even get to a helper function whose argument contains a variable offset, e.g.:
 0: (7a) *(u64 *)(r10 -16) = 0
 1: (7a) *(u64 *)(r10 -8) = 0
 2: (61) r2 = *(u32 *)(r1 +0)
 3: (57) r2 &= 4
 4: (17) r2 -= 16
 5: (0f) r2 += r10
 variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1
 R2 stack pointer arithmetic goes out of range, prohibited for !root
Still it looks like a good idea to reject variable offset indirect stack access for unprivileged mode in check_stack_boundary() explicitly.
Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
[OP: drop comment in retrieve_ptr_limit()]
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/verifier.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3e43fa744b470..ce65c69007ab9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1851,6 +1851,19 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		if (err)
 			return err;
 	} else {
+		/* Variable offset is prohibited for unprivileged mode for
+		 * simplicity since it requires corresponding support in
+		 * Spectre masking for stack ALU.
+		 * See also retrieve_ptr_limit().
+		 */
+		if (!env->allow_ptr_leaks) {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
+				regno, tn_buf);
+			return -EACCES;
+		}
 		/* Only initialized buffer on stack is allowed to be accessed
 		 * with variable offset. With uninitialized buffer it's hard to
 		 * guarantee that whole memory is marked as initialized on
From: Andrey Ignatov <rdna@fb.com>
stable inclusion from linux-4.19.207 commit 7667818ef188832f69e2cf9cfed56e03a8f7436b
--------------------------------
commit 107c26a70ca81bfc33657366ad69d02fdc9efc9d upstream.
As discussed in [1], the max value of a variable offset has to be checked for overflow on stack access, otherwise the verifier would accept code like this:
 0: (b7) r2 = 6
 1: (b7) r3 = 28
 2: (7a) *(u64 *)(r10 -16) = 0
 3: (7a) *(u64 *)(r10 -8) = 0
 4: (79) r4 = *(u64 *)(r1 +168)
 5: (c5) if r4 s< 0x0 goto pc+4
 R1=ctx(id=0,off=0,imm=0) R2=inv6 R3=inv28 R4=inv(id=0,umax_value=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0,call_-1 fp-8=mmmmmmmm fp-16=mmmmmmmm
 6: (17) r4 -= 16
 7: (0f) r4 += r10
 8: (b7) r5 = 8
 9: (85) call bpf_getsockopt#57
 10: (b7) r0 = 0
 11: (95) exit
, where R4 obviously has an unbounded max value.

Fix it by checking that reg->smax_value is inside the (-BPF_MAX_VAR_OFF; BPF_MAX_VAR_OFF) range.
reg->smax_value is used instead of reg->umax_value because stack pointers are calculated using negative offset from fp. This is opposite to e.g. map access where offset must be non-negative and where umax_value is used.
Also dedicated verbose logs are added for both min and max bound check failures to have diagnostics consistent with variable offset handling in check_map_access().
[1] https://marc.info/?l=linux-netdev&m=155433357510597&w=2
Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/bpf/verifier.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce65c69007ab9..eb601aefc5ce1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1873,16 +1873,28 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		if (meta && meta->raw_mode)
 			meta = NULL;
 
+		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+		    reg->smax_value <= -BPF_MAX_VAR_OFF) {
+			verbose(env, "R%d unbounded indirect variable offset stack access\n",
+				regno);
+			return -EACCES;
+		}
 		min_off = reg->smin_value + reg->off;
-		max_off = reg->umax_value + reg->off;
+		max_off = reg->smax_value + reg->off;
 		err = __check_stack_boundary(env, regno, min_off, access_size,
 					     zero_size_allowed);
-		if (err)
+		if (err) {
+			verbose(env, "R%d min value is outside of stack bound\n",
+				regno);
 			return err;
+		}
 		err = __check_stack_boundary(env, regno, max_off, access_size,
 					     zero_size_allowed);
-		if (err)
+		if (err) {
+			verbose(env, "R%d max value is outside of stack bound\n",
+				regno);
 			return err;
+		}
 	}
 
 	if (meta && meta->raw_mode) {
From: Andrey Ignatov <rdna@fb.com>
stable inclusion from linux-4.19.207 commit 2a28675d4b5cff2bfa0b3475c53edba1f5ec8401
--------------------------------
commit 8ff80e96e3ccea5ff0a890d4f18997e0344dbec2 upstream.
Test different scenarios of indirect variable-offset stack access: out of bound access (>0), min_off below initialized part of the stack, max_off+size above initialized part of the stack, initialized stack.
Example of output:

  ...
  #856/p indirect variable-offset stack access, out of bound OK
  #857/p indirect variable-offset stack access, max_off+size > max_initialized OK
  #858/p indirect variable-offset stack access, min_off < min_initialized OK
  #859/p indirect variable-offset stack access, ok OK
  ...
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
[OP: backport to 4.19]
Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 tools/testing/selftests/bpf/test_verifier.c | 79 ++++++++++++++++++++-
 1 file changed, 77 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index aaee2b4e2fcd2..36895ba9ec344 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -8473,7 +8473,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_LWT_IN,
 	},
 	{
-		"indirect variable-offset stack access",
+		"indirect variable-offset stack access, out of bound",
 		.insns = {
 		/* Fill the top 8 bytes of the stack */
 		BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
@@ -8494,10 +8494,85 @@ static struct bpf_test tests[] = {
 		BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 5 },
-		.errstr = "variable stack read R2",
+		.errstr = "invalid stack type R2 var_off",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_LWT_IN,
 	},
+	{
+		"indirect variable-offset stack access, max_off+size > max_initialized",
+		.insns = {
+		/* Fill only the second from top 8 bytes of the stack. */
+		BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+		/* Get an unknown value. */
+		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+		/* Make it small and 4-byte aligned. */
+		BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+		BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+		/* Add it to fp. We now have either fp-12 or fp-16, but we don't know
+		 * which. fp-12 size 8 is partially uninitialized stack.
+		 */
+		BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+		/* Dereference it indirectly. */
+		BPF_LD_MAP_FD(BPF_REG_1, 0),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 5 },
+		.errstr = "invalid indirect read from stack var_off",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_LWT_IN,
+	},
+	{
+		"indirect variable-offset stack access, min_off < min_initialized",
+		.insns = {
+		/* Fill only the top 8 bytes of the stack. */
+		BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+		/* Get an unknown value */
+		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+		/* Make it small and 4-byte aligned. */
+		BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+		BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+		/* Add it to fp. We now have either fp-12 or fp-16, but we don't know
+		 * which. fp-16 size 8 is partially uninitialized stack.
+		 */
+		BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+		/* Dereference it indirectly. */
+		BPF_LD_MAP_FD(BPF_REG_1, 0),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 5 },
+		.errstr = "invalid indirect read from stack var_off",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_LWT_IN,
+	},
+	{
+		"indirect variable-offset stack access, ok",
+		.insns = {
+		/* Fill the top 16 bytes of the stack. */
+		BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+		BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+		/* Get an unknown value. */
+		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+		/* Make it small and 4-byte aligned. */
+		BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+		BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+		/* Add it to fp. We now have either fp-12 or fp-16, we don't know
+		 * which, but either way it points to initialized stack.
+		 */
+		BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+		/* Dereference it indirectly. */
+		BPF_LD_MAP_FD(BPF_REG_1, 0),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 6 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_LWT_IN,
+	},
 	{
 		"direct stack access with 32-bit wraparound. test1",
 		.insns = {
From: Alexei Starovoitov ast@kernel.org
stable inclusion from linux-4.19.207 commit fc578c64398df4f93fd7a6143e218281cc7c4348
--------------------------------
commit fc559a70d57c6ee5443f7a750858503e94cdc941 upstream.
Fix tests that incorrectly assumed that the verifier cannot track constants through the stack.
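For reference, a minimal sketch of the pattern of the fix (the macros are the selftest's own; the comments are ours, and the ctx offset mirrors the hunks below):

    /* Before: the verifier now tracks this constant through the stack
     * spill/fill that follows, so the "variable" size is actually known.
     */
    BPF_MOV64_IMM(BPF_REG_2, 16),

    /* After: load something the verifier cannot know at verification
     * time, so the spilled value really is unknown.
     */
    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),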
Signed-off-by: Alexei Starovoitov ast@kernel.org Acked-by: Andrii Nakryiko andriin@fb.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net [OP: backport to 4.19] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/testing/selftests/bpf/test_verifier.c | 31 +++++++++++---------- 1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 36895ba9ec344..43bf89e25c195 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -3873,7 +3873,8 @@ static struct bpf_test tests[] = { offsetof(struct __sk_buff, data)), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct __sk_buff, data_end)), - BPF_MOV64_IMM(BPF_REG_0, 0xffffffff), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffff), @@ -6539,9 +6540,9 @@ static struct bpf_test tests[] = { { "helper access to variable memory: stack, bitwise AND, zero included", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), - BPF_MOV64_IMM(BPF_REG_2, 16), BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), @@ -6556,9 +6557,9 @@ static struct bpf_test tests[] = { { "helper access to variable memory: stack, bitwise AND + JMP, wrong max", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), - BPF_MOV64_IMM(BPF_REG_2, 16), BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 65), @@ -6632,9 +6633,9 @@ static struct bpf_test tests[] = { { "helper access to variable memory: stack, JMP, bounds + offset", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), - BPF_MOV64_IMM(BPF_REG_2, 16), BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 5), @@ -6653,9 +6654,9 @@ static struct bpf_test tests[] = { { "helper access to variable memory: stack, JMP, wrong max", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), - BPF_MOV64_IMM(BPF_REG_2, 16), BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 65, 4), @@ -6673,9 +6674,9 @@ static struct bpf_test tests[] = { { "helper access to variable memory: stack, JMP, no max check", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), - BPF_MOV64_IMM(BPF_REG_2, 16), BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_MOV64_IMM(BPF_REG_4, 0), @@ -6693,9 +6694,9 @@ static struct bpf_test tests[] = { { "helper access to variable memory: stack, JMP, no min check", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), - BPF_MOV64_IMM(BPF_REG_2, 16), BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3), @@ -6711,9 +6712,9 @@ static struct bpf_test tests[] = { { "helper access to variable memory: stack, JMP (signed), no min check", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), - BPF_MOV64_IMM(BPF_REG_2, 16), 
BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3), @@ -6755,6 +6756,7 @@ static struct bpf_test tests[] = { { "helper access to variable memory: map, JMP, wrong max", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), @@ -6762,7 +6764,7 @@ static struct bpf_test tests[] = { BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), - BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_6), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, @@ -6774,7 +6776,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map2 = { 4 }, .errstr = "invalid access to map value, value_size=48 off=0 size=49", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6809,6 +6811,7 @@ static struct bpf_test tests[] = { { "helper access to variable memory: map adjusted, JMP, wrong max", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), @@ -6817,7 +6820,7 @@ static struct bpf_test tests[] = { BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 20), - BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_6), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, @@ -6829,7 +6832,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .fixup_map2 = { 3 }, + .fixup_map2 = { 4 }, .errstr = "R1 min value is outside of the array range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -6851,8 +6854,8 @@ static struct bpf_test tests[] = { { "helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)", .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_1, 0), - BPF_MOV64_IMM(BPF_REG_2, 1), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), @@ -7078,6 +7081,7 @@ static struct bpf_test tests[] = { { "helper access to variable memory: 8 bytes leak", .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), BPF_MOV64_IMM(BPF_REG_0, 0), @@ -7088,7 +7092,6 @@ static struct bpf_test tests[] = { BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_MOV64_IMM(BPF_REG_2, 1), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63),
From: Lorenz Bauer lmb@cloudflare.com
stable inclusion from linux-4.19.207 commit a4fe956b03316516ce0eca037d480d270b9bed4c
--------------------------------
commit c9e73e3d2b1eb1ea7ff068e05007eec3bd8ef1c9 upstream.
func_states_equal() makes a very short-lived allocation for idmap, probably because it's too large to fit on the stack. However, the function is called quite often, leading to a lot of alloc/free churn. Replace the temporary allocation with dedicated scratch space in struct bpf_verifier_env.
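A minimal userspace sketch of the resulting pattern (names and sizes are illustrative, not the kernel's):

    #include <stdbool.h>
    #include <string.h>

    #define ID_MAP_SIZE 64                     /* illustrative bound */

    struct id_pair { unsigned int old, cur; };

    struct env {
        /* Scratch space owned by the long-lived env, so the hot path
         * below no longer allocates and frees on every invocation. */
        struct id_pair idmap_scratch[ID_MAP_SIZE];
    };

    static bool states_equal(struct env *env)
    {
        /* A memset per call replaces an alloc/free pair per call and
         * removes the allocation-failure error path entirely. */
        memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
        /* ... compare states using env->idmap_scratch ... */
        return true;
    }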
Signed-off-by: Lorenz Bauer lmb@cloudflare.com Signed-off-by: Alexei Starovoitov ast@kernel.org Acked-by: Edward Cree ecree.xilinx@gmail.com Link: https://lore.kernel.org/bpf/20210429134656.122225-4-lmb@cloudflare.com [OP: adjusted context for 4.19] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/bpf_verifier.h | 8 +++++++ kernel/bpf/verifier.c | 42 +++++++++++------------------------- 2 files changed, 21 insertions(+), 29 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e64ac93f7f4c0..729c65b320d47 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -127,6 +127,13 @@ struct bpf_func_state { struct bpf_stack_state *stack; };
+struct bpf_id_pair { + u32 old; + u32 cur; +}; + +/* Maximum number of register states that can exist at once */ +#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) #define MAX_CALL_FRAMES 8 struct bpf_verifier_state { /* call stack tracking */ @@ -213,6 +220,7 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; + struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE]; u32 subprog_cnt; };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb601aefc5ce1..1cdad85d230fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4841,13 +4841,6 @@ static bool range_within(struct bpf_reg_state *old, old->smax_value >= cur->smax_value; }
-/* Maximum number of register states that can exist at once */ -#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) -struct idpair { - u32 old; - u32 cur; -}; - /* If in the old state two registers had the same id, then they need to have * the same id in the new state as well. But that id could be different from * the old state, so we need to track the mapping from old to new ids. @@ -4858,11 +4851,11 @@ struct idpair { * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. */ -static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) +static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) { unsigned int i;
- for (i = 0; i < ID_MAP_SIZE; i++) { + for (i = 0; i < BPF_ID_MAP_SIZE; i++) { if (!idmap[i].old) { /* Reached an empty slot; haven't seen this id before */ idmap[i].old = old_id; @@ -4879,7 +4872,7 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
/* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { bool equal;
@@ -4983,7 +4976,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
static bool stacksafe(struct bpf_func_state *old, struct bpf_func_state *cur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { int i, spi;
@@ -5069,29 +5062,20 @@ static bool stacksafe(struct bpf_func_state *old, * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool func_states_equal(struct bpf_func_state *old, +static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, struct bpf_func_state *cur) { - struct idpair *idmap; - bool ret = false; int i;
- idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); - /* If we failed to allocate the idmap, just say it's not safe */ - if (!idmap) - return false; + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + for (i = 0; i < MAX_BPF_REG; i++) + if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + return false;
- for (i = 0; i < MAX_BPF_REG; i++) { - if (!regsafe(&old->regs[i], &cur->regs[i], idmap)) - goto out_free; - } + if (!stacksafe(old, cur, env->idmap_scratch)) + return false;
- if (!stacksafe(old, cur, idmap)) - goto out_free; - ret = true; -out_free: - kfree(idmap); - return ret; + return true; }
static bool states_equal(struct bpf_verifier_env *env, @@ -5115,7 +5099,7 @@ static bool states_equal(struct bpf_verifier_env *env, for (i = 0; i <= old->curframe; i++) { if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(old->frame[i], cur->frame[i])) + if (!func_states_equal(env, old->frame[i], cur->frame[i])) return false; } return true;
From: Daniel Borkmann daniel@iogearbox.net
stable inclusion from linux-4.19.207 commit a386fceed74e9f1125104eea7e66cff187edeff7
--------------------------------
commit e042aa532c84d18ff13291d00620502ce7a38dda upstream.
In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we narrowed the offset mask for unprivileged pointer arithmetic in order to mitigate a corner case where in the speculative domain it is possible to advance, for example, the map value pointer by up to value_size-1 out-of- bounds in order to leak kernel memory via side-channel to user space.
The verifier's state pruning for scalars leaves one corner case open. In a first verification path, R_x holds an unknown scalar with an aux->alu_limit of e.g. 7. In a second verification path, that same register R_x, here denoted as R_x', holds an unknown scalar with tighter bounds, and would thus satisfy range_within(R_x, R_x') as well as tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3. Given the second path fits the register constraints for pruning, the final generated mask from aux->alu_limit will remain at 7. While technically not wrong for the non-speculative domain, it would be possible to craft similar cases where the mask would be too wide as in 7fedb63a8307.
One way to fix it is to detect the presence of unknown scalar map pointer arithmetic and force a deeper search on unknown scalars to ensure that we do not run into a masking mismatch.
Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: Alexei Starovoitov ast@kernel.org [OP: adjusted context for 4.19] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 729c65b320d47..4acd06cca703d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -215,6 +215,7 @@ struct bpf_verifier_env { struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ + bool explore_alu_limits; bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1cdad85d230fb..2c091df344735 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2903,6 +2903,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + /* Limit pruning on unknown scalars to enable deep search for + * potential masking differences from other program paths. + */ + if (!off_is_imm) + env->explore_alu_limits = true; }
err = update_alu_sanitation_state(aux, alu_state, alu_limit); @@ -4871,8 +4877,8 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) }
/* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_id_pair *idmap) +static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) { bool equal;
@@ -4898,6 +4904,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; switch (rold->type) { case SCALAR_VALUE: + if (env->explore_alu_limits) + return false; if (rcur->type == SCALAR_VALUE) { /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && @@ -4974,9 +4982,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; }
-static bool stacksafe(struct bpf_func_state *old, - struct bpf_func_state *cur, - struct bpf_id_pair *idmap) +static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_id_pair *idmap) { int i, spi;
@@ -5018,9 +5025,8 @@ static bool stacksafe(struct bpf_func_state *old, continue; if (old->stack[spi].slot_type[0] != STACK_SPILL) continue; - if (!regsafe(&old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, - idmap)) + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap)) /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -5069,10 +5075,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + if (!regsafe(env, &old->regs[i], &cur->regs[i], + env->idmap_scratch)) return false;
- if (!stacksafe(old, cur, env->idmap_scratch)) + if (!stacksafe(env, old, cur, env->idmap_scratch)) return false;
return true;
From: Mark Rutland mark.rutland@arm.com
stable inclusion from linux-4.19.207 commit 79d15fc5646f79e769b90957370a84c0ba981c85
--------------------------------
commit 90268574a3e8a6b883bd802d702a2738577e1006 upstream.
The `compute_indices` and `populate_entries` macros operate on inclusive bounds, and thus the `map_memory` macro which uses them also operates on inclusive bounds.
We pass `_end` and `_idmap_text_end` to `map_memory`, but these are exclusive bounds, and if one of these is sufficiently aligned (as a result of kernel configuration, physical placement, and KASLR), then:
* In `compute_indices`, the computed `iend` will be in the page/block *after* the final byte of the intended mapping.
* In `populate_entries`, an unnecessary entry will be created at the end of each level of table. At the leaf level, this entry will map up to SWAPPER_BLOCK_SIZE bytes of physical addresses that we did not intend to map.
As we may map up to SWAPPER_BLOCK_SIZE bytes more than intended, we may violate the boot protocol and map physical address past the 2MiB-aligned end address we are permitted to map. As we map these with Normal memory attributes, this may result in further problems depending on what these physical addresses correspond to.
The final entry at each level may require an additional table at that level. As EARLY_ENTRIES() calculates an inclusive bound, we allocate enough memory for this.
Avoid the extraneous mapping by having map_memory convert the exclusive end address to an inclusive end address by subtracting one, and do likewise in EARLY_ENTRIES() when calculating the number of required tables. For clarity, comments are updated to more clearly document which boundaries the macros operate on. For consistency with the other macros, the comments in map_memory are also updated to describe `vstart` and `vend` as virtual addresses.
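A userspace illustration of the off-by-one (shift and addresses are example values, and the EARLY_KASLR slack term is dropped for clarity):

    #include <stdio.h>

    int main(void)
    {
        unsigned long shift  = 21;          /* e.g. 2MiB blocks */
        unsigned long vstart = 0;
        unsigned long vend   = 1UL << 21;   /* exclusive, block-aligned end */

        /* Inclusive-style math applied to an exclusive bound counts the
         * block after the final byte of the mapping: */
        unsigned long before = (vend >> shift) - (vstart >> shift) + 1;
        /* Converting the exclusive end to an inclusive one first: */
        unsigned long after = ((vend - 1) >> shift) - (vstart >> shift) + 1;

        printf("before=%lu after=%lu\n", before, after);   /* 2 vs 1 */
        return 0;
    }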
Fixes: 0370b31e4845 ("arm64: Extend early page table code to allow for larger kernels") Cc: stable@vger.kernel.org # 4.16.x Signed-off-by: Mark Rutland mark.rutland@arm.com Cc: Anshuman Khandual anshuman.khandual@arm.com Cc: Ard Biesheuvel ard.biesheuvel@linaro.org Cc: Steve Capper steve.capper@arm.com Cc: Will Deacon will@kernel.org Acked-by: Will Deacon will@kernel.org Link: https://lore.kernel.org/r/20210823101253.55567-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas catalin.marinas@arm.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/include/asm/kernel-pgtable.h | 4 ++-- arch/arm64/kernel/head.S | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index a780f6714b445..74ab40b76ad53 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -76,8 +76,8 @@ #define EARLY_KASLR (0) #endif
-#define EARLY_ENTRIES(vstart, vend, shift) (((vend) >> (shift)) \ - - ((vstart) >> (shift)) + 1 + EARLY_KASLR) +#define EARLY_ENTRIES(vstart, vend, shift) \ + ((((vend) - 1) >> (shift)) - ((vstart) >> (shift)) + 1 + EARLY_KASLR)
#define EARLY_PGDS(vstart, vend) (EARLY_ENTRIES(vstart, vend, PGDIR_SHIFT))
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 34bd0790d1e68..9f083b11efe25 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -202,7 +202,7 @@ ENDPROC(preserve_boot_args) * to be composed of multiple pages. (This effectively scales the end index). * * vstart: virtual address of start of range - * vend: virtual address of end of range + * vend: virtual address of end of range - we map [vstart, vend] * shift: shift used to transform virtual address into index * ptrs: number of entries in page table * istart: index in table corresponding to vstart @@ -239,17 +239,18 @@ ENDPROC(preserve_boot_args) * * tbl: location of page table * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) - * vstart: start address to map - * vend: end address to map - we map [vstart, vend] + * vstart: virtual address of start of range + * vend: virtual address of end of range - we map [vstart, vend - 1] * flags: flags to use to map last level entries * phys: physical address corresponding to vstart - physical memory is contiguous * pgds: the number of pgd entries * * Temporaries: istart, iend, tmp, count, sv - these need to be different registers - * Preserves: vstart, vend, flags - * Corrupts: tbl, rtbl, istart, iend, tmp, count, sv + * Preserves: vstart, flags + * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv */ .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv + sub \vend, \vend, #1 add \rtbl, \tbl, #PAGE_SIZE mov \sv, \rtbl mov \count, #0
From: Damien Le Moal damien.lemoal@wdc.com
stable inclusion from linux-4.19.207 commit 35a276f173356448cd07d83238d2bbc30796b267
--------------------------------
commit a680dd72ec336b81511e3bff48efac6dbfa563e7 upstream.
For a request that has a priority level equal to or larger than IOPRIO_BE_NR, bfq_set_next_ioprio_data() prints a critical warning but defaults to setting the request new_ioprio field to IOPRIO_BE_NR. This is not consistent with the warning and the allowed values for priority levels. Fix this by setting the request new_ioprio field to IOPRIO_BE_NR - 1, the lowest priority level allowed.
Cc: stable@vger.kernel.org Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Signed-off-by: Damien Le Moal damien.lemoal@wdc.com Reviewed-by: Hannes Reinecke hare@suse.de Link: https://lore.kernel.org/r/20210811033702.368488-2-damien.lemoal@wdc.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c15e2c112ea8a..c20e6b772b4cd 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4242,7 +4242,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) if (bfqq->new_ioprio >= IOPRIO_BE_NR) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR; + bfqq->new_ioprio = IOPRIO_BE_NR - 1; }
bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
From: Krzysztof Wilczyński kw@linux.com
stable inclusion from linux-4.19.207 commit 60242386c7ef9b084fc138e3d2b7bb99042fd779
--------------------------------
commit a8bd29bd49c4156ea0ec5a97812333e2aeef44e7 upstream.
The pciconfig_read() syscall reads PCI configuration space using hardware-dependent config accessors.
If the read fails on PCI, most accessors don't return an error; they pretend the read was successful and got ~0 data from the device, so the syscall returns success with ~0 data in the buffer.
When the accessor does return an error, pciconfig_read() normally fills the user's buffer with ~0 and returns an error in errno. But after e4585da22ad0 ("pci syscall.c: Switch to refcounting API"), we don't fill the buffer with ~0 for the EPERM "user lacks CAP_SYS_ADMIN" error.
Userspace may rely on the ~0 data to detect errors, but after e4585da22ad0, that would not detect CAP_SYS_ADMIN errors.
Restore the original behaviour of filling the buffer with ~0 when the CAP_SYS_ADMIN check fails.
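For illustration only, here is the userspace-visible contract this restores: failed reads surface as all-ones data. A sketch of a consumer (the sysfs path is a placeholder; reading config space via sysfs is just one way to observe the same convention):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Placeholder device path -- substitute a real one. */
        FILE *f = fopen("/sys/bus/pci/devices/0000:00:00.0/config", "rb");
        uint32_t dword = 0;

        if (!f)
            return 1;
        if (fread(&dword, sizeof(dword), 1, f) != 1) {
            fclose(f);
            return 1;
        }
        fclose(f);

        /* By convention, ~0 data means "the read did not work" (or the
         * device really returned all-ones, which callers must tolerate). */
        if (dword == 0xffffffffu)
            printf("config read failed\n");
        else
            printf("vendor/device dword: 0x%08x\n", dword);
        return 0;
    }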
[bhelgaas: commit log, fold in Nathan's fix https://lore.kernel.org/r/20210803200836.500658-1-nathan@kernel.org] Fixes: e4585da22ad0 ("pci syscall.c: Switch to refcounting API") Link: https://lore.kernel.org/r/20210729233755.1509616-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński kw@linux.com Signed-off-by: Bjorn Helgaas bhelgaas@google.com Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index d96626c614f56..cdd71126fa468 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -21,8 +21,10 @@ SYSCALL_DEFINE5(pciconfig_read, unsigned long, bus, unsigned long, dfn, long err; long cfg_ret;
+ err = -EPERM; + dev = NULL; if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + goto error;
err = -ENODEV; dev = pci_get_domain_bus_and_slot(0, bus, dfn);
From: Nadav Amit namit@vmware.com
stable inclusion from linux-4.19.207 commit e826df5864e449f89c2d056aab998c813d40c932
--------------------------------
[ Upstream commit 22e5fe2a2a279d9a6fcbdfb4dffe73821bef1c90 ]
userfaultfd assumes that the enabled features are set once and never changed after UFFDIO_API ioctl succeeded.
However, currently, UFFDIO_API can be called concurrently from two different threads, succeed on both threads and leave userfaultfd's features in a non-deterministic state. Theoretically, other uffd operations (ioctls and page faults) can be dispatched while being adversely affected by such feature changes.
Moreover, the writes to ctx->state and ctx->features are not ordered, which can - theoretically, again - let userfaultfd_ioctl() think that userfaultfd API completed, while the features are still not initialized.
To avoid races, it is arguably best to get rid of ctx->state. Since there are only 2 states, record the API initialization in ctx->features as the uppermost bit and remove ctx->state.
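A minimal userspace sketch of the idiom, using C11 atomics where the kernel uses cmpxchg() (names are ours): fold "initialized" into a spare feature bit and publish it atomically, so concurrent initializers cannot both succeed.

    #include <stdatomic.h>
    #include <stdio.h>

    #define FEATURE_INITIALIZED (1u << 31)   /* spare uppermost bit */

    static _Atomic unsigned int features;

    static int api_init(unsigned int requested)
    {
        unsigned int expected = 0;
        unsigned int val = requested | FEATURE_INITIALIZED;

        /* Only the first caller transitions 0 -> val; any later or
         * concurrent caller observes nonzero features and fails. */
        if (!atomic_compare_exchange_strong(&features, &expected, val))
            return -1;   /* the kernel returns -EINVAL here */
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n", api_init(0x7), api_init(0x7));   /* 0 then -1 */
        return 0;
    }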
Link: https://lkml.kernel.org/r/20210808020724.1022515-3-namit@vmware.com Fixes: 9cd75c3cd4c3d ("userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor") Signed-off-by: Nadav Amit namit@vmware.com Cc: Alexander Viro viro@zeniv.linux.org.uk Cc: Andrea Arcangeli aarcange@redhat.com Cc: Axel Rasmussen axelrasmussen@google.com Cc: Jens Axboe axboe@kernel.dk Cc: Mike Rapoport rppt@linux.vnet.ibm.com Cc: Peter Xu peterx@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/userfaultfd.c | 93 +++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 48 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ff06dfaf6cd70..803c61f66d54e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -35,11 +35,6 @@ static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; int enable_userswap; #endif
-enum userfaultfd_state { - UFFD_STATE_WAIT_API, - UFFD_STATE_RUNNING, -}; - /* * Start with fault_pending_wqh and fault_wqh so they're more likely * to be in the same cacheline. @@ -71,8 +66,6 @@ struct userfaultfd_ctx { unsigned int flags; /* features requested from the userspace */ unsigned int features; - /* state machine */ - enum userfaultfd_state state; /* released */ bool released; /* memory mappings are changing because of non-cooperative event */ @@ -106,6 +99,14 @@ struct userfaultfd_wake_range { unsigned long len; };
+/* internal indication that UFFD_API ioctl was successfully executed */ +#define UFFD_FEATURE_INITIALIZED (1u << 31) + +static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) +{ + return ctx->features & UFFD_FEATURE_INITIALIZED; +} + static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -707,7 +708,6 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
atomic_set(&ctx->refcount, 1); ctx->flags = octx->flags; - ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; ctx->mmap_changing = false; @@ -993,38 +993,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
poll_wait(file, &ctx->fd_wqh, wait);
- switch (ctx->state) { - case UFFD_STATE_WAIT_API: + if (!userfaultfd_is_initialized(ctx)) return EPOLLERR; - case UFFD_STATE_RUNNING: - /* - * poll() never guarantees that read won't block. - * userfaults can be waken before they're read(). - */ - if (unlikely(!(file->f_flags & O_NONBLOCK))) - return EPOLLERR; - /* - * lockless access to see if there are pending faults - * __pollwait last action is the add_wait_queue but - * the spin_unlock would allow the waitqueue_active to - * pass above the actual list_add inside - * add_wait_queue critical section. So use a full - * memory barrier to serialize the list_add write of - * add_wait_queue() with the waitqueue_active read - * below. - */ - ret = 0; - smp_mb(); - if (waitqueue_active(&ctx->fault_pending_wqh)) - ret = EPOLLIN; - else if (waitqueue_active(&ctx->event_wqh)) - ret = EPOLLIN; - - return ret; - default: - WARN_ON_ONCE(1); + + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) return EPOLLERR; - } + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = EPOLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = EPOLLIN; + + return ret; }
static const struct file_operations userfaultfd_fops; @@ -1218,7 +1213,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, struct uffd_msg msg; int no_wait = file->f_flags & O_NONBLOCK;
- if (ctx->state == UFFD_STATE_WAIT_API) + if (!userfaultfd_is_initialized(ctx)) return -EINVAL;
for (;;) { @@ -1842,9 +1837,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, static inline unsigned int uffd_ctx_features(__u64 user_features) { /* - * For the current set of features the bits just coincide + * For the current set of features the bits just coincide. Set + * UFFD_FEATURE_INITIALIZED to mark the features as enabled. */ - return (unsigned int)user_features; + return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; }
/* @@ -1857,12 +1853,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, { struct uffdio_api uffdio_api; void __user *buf = (void __user *)arg; + unsigned int ctx_features; int ret; __u64 features;
- ret = -EINVAL; - if (ctx->state != UFFD_STATE_WAIT_API) - goto out; ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; @@ -1879,9 +1873,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; - ctx->state = UFFD_STATE_RUNNING; + /* only enable the requested features for this uffd context */ - ctx->features = uffd_ctx_features(features); + ctx_features = uffd_ctx_features(features); + ret = -EINVAL; + if (cmpxchg(&ctx->features, 0, ctx_features) != 0) + goto err_out; + ret = 0; out: return ret; @@ -1898,7 +1896,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, int ret = -EINVAL; struct userfaultfd_ctx *ctx = file->private_data;
- if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) + if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx)) return -EINVAL;
switch(cmd) { @@ -1996,7 +1994,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) atomic_set(&ctx->refcount, 1); ctx->flags = flags; ctx->features = 0; - ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; ctx->mmap_changing = false; ctx->mm = current->mm;
From: "Rafael J. Wysocki" rafael.j.wysocki@intel.com
stable inclusion from linux-4.19.207 commit 6d941bd6366a08bf5c660dbd4f8345427d56aefe
--------------------------------
[ Upstream commit 14858dcc3b3587f4bb5c48e130ee7d68fc2b0a29 ]
Updating the current_state field of struct pci_dev the way it is done in pci_enable_device_flags() before calling do_pci_enable_device() may not work. For example, if the given PCI device depends on an ACPI power resource whose _STA method initially returns 0 ("off"), but the config space of the PCI device is accessible and the power state retrieved from the PCI_PM_CTRL register is D0, the current_state field in the struct pci_dev representing that device will get out of sync with the power.state of its ACPI companion object and that will lead to power management issues going forward.
To avoid such issues, make pci_enable_device_flags() call pci_update_current_state() which takes ACPI device power management into account, if present, to retrieve the current power state of the device.
Link: https://lore.kernel.org/lkml/20210314000439.3138941-1-luzmaximilian@gmail.co... Reported-by: Maximilian Luz luzmaximilian@gmail.com Signed-off-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Tested-by: Maximilian Luz luzmaximilian@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/pci.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7d73ebc920098..e43b11f539d90 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1663,11 +1663,7 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags) * so that things like MSI message writing will behave as expected * (e.g. if the device really is in D0 at enable time). */ - if (dev->pm_cap) { - u16 pmcsr; - pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); - dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); - } + pci_update_current_state(dev, dev->current_state);
if (atomic_inc_return(&dev->enable_cnt) > 1) return 0; /* already enabled */
From: Yufeng Mo moyufeng@huawei.com
stable inclusion from linux-4.19.207 commit 25a35dd83627d33ff4304b656f17928efa339a09
--------------------------------
[ Upstream commit 220ade77452c15ecb1ab94c3f8aaeb6d033c3582 ]
Some time ago, I reported a calltrace issue, "did not find a suitable aggregator"; please see [1]. After a period of analysis and reproduction, I found that this problem is caused by concurrency.
Before the problem occurs, the bond structure is as follows:
bond0 - slaver0(eth0) - agg0.lag_ports -> port0 - port1
                      \ port0
      \ slaver1(eth1) - agg1.lag_ports -> NULL
                      \ port1
If we run 'ifenslave bond0 -d eth1', the process is like below:
executing __bond_release_one()
     |
bond_upper_dev_unlink() [step1]
     |
     |        bond_3ad_lacpdu_recv()
     |          ->bond_3ad_rx_indication()
     |            spin_lock_bh()
     |            ->ad_rx_machine()
     |              ->__record_pdu() [step2]
     |            spin_unlock_bh()
     |
     |        bond_3ad_state_machine_handler()
     |          spin_lock_bh()
     |          ->ad_port_selection_logic()
     |            ->try to find free aggregator [step3]
     |            ->try to find suitable aggregator [step4]
     |            ->did not find a suitable aggregator [step5]
     |          spin_unlock_bh()
     |
bond_3ad_unbind_slave()
  spin_lock_bh()
  spin_unlock_bh()
step1: already removed slaver1(eth1) from list, but port1 remains
step2: receive a lacpdu and update port0
step3: port0 will be removed from agg0.lag_ports. The struct is
       "agg0.lag_ports -> port1" now, and agg0 is not free. At the same
       time, slaver1/agg1 has been removed from the list by step1. So we
       can't find a free aggregator now.
step4: can't find suitable aggregator because of step2
step5: cause a calltrace since port->aggregator is NULL
To solve this concurrency problem, put bond_upper_dev_unlink() after bond_3ad_unbind_slave(). In this way, we can invalidate the port first and then skip it in bond_3ad_state_machine_handler(). This eliminates the situation where the slave has been removed from the list but the port is still valid.
[1]https://lore.kernel.org/netdev/10374.1611947473@famine/
Signed-off-by: Yufeng Mo moyufeng@huawei.com Acked-by: Jay Vosburgh jay.vosburgh@canonical.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/bonding/bond_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index dd853d85afdd3..c952ab169e4e0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1917,7 +1917,6 @@ static int __bond_release_one(struct net_device *bond_dev, /* recompute stats just before removing the slave */ bond_get_stats(bond->dev, &bond->bond_stats);
- bond_upper_dev_unlink(bond, slave); /* unregister rx_handler early so bond_handle_frame wouldn't be called * for this slave anymore. */ @@ -1926,6 +1925,8 @@ static int __bond_release_one(struct net_device *bond_dev, if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_unbind_slave(slave);
+ bond_upper_dev_unlink(bond, slave); + if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, slave);
From: Ding Hui dinghui@sangfor.com.cn
stable inclusion from linux-4.19.207 commit 1c409595b46bc405e9d4761d449d9307e009fc6f
--------------------------------
[ Upstream commit d72c74197b70bc3c95152f351a568007bffa3e11 ]
smb_buf is allocated by small_smb_init_no_tc(), and buf type is CIFS_SMALL_BUFFER, so we should use cifs_small_buf_release() to release it in failed path.
Signed-off-by: Ding Hui dinghui@sangfor.com.cn Reviewed-by: Paulo Alcantara (SUSE) pc@cjr.nz Signed-off-by: Steve French stfrench@microsoft.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/cifs/sess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index aa23c00367ec4..0113dba28eb09 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -602,7 +602,7 @@ sess_alloc_buffer(struct sess_data *sess_data, int wct) return 0;
out_free_smb_buf: - kfree(smb_buf); + cifs_small_buf_release(smb_buf); sess_data->iov[0].iov_base = NULL; sess_data->iov[0].iov_len = 0; sess_data->buf0_type = CIFS_NO_BUFFER;
From: chenying chenying.kernel@bytedance.com
stable inclusion from linux-4.19.207 commit 1bc41e47e0ee05367b2efbdd95bdc2d4e7fcc164
--------------------------------
commit 52d5a0c6bd8a89f460243ed937856354f8f253a3 upstream.
If ovl_instantiate() returns an error, ovl_cleanup() will be called and will try to remove newdentry from wdir, but newdentry has already been moved to udir at this time. This triggers BUG_ON(victim->d_parent->d_inode != dir) in fs/namei.c:may_delete().
Signed-off-by: chenying chenying.kernel@bytedance.com Fixes: 01b39dcc9568 ("ovl: use inode_insert5() to hash a newly created inode") Link: https://lore.kernel.org/linux-unionfs/e6496a94-a161-dc04-c38a-d2544633acb4@b... Cc: stable@vger.kernel.org # v4.18 Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/overlayfs/dir.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index d0d4b80dff9a7..72e95c9c244de 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -519,8 +519,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, goto out_cleanup; } err = ovl_instantiate(dentry, inode, newdentry, hardlink); - if (err) - goto out_cleanup; + if (err) { + ovl_cleanup(udir, newdentry); + dput(newdentry); + } out_dput: dput(upper); out_unlock:
From: "Maciej W. Rozycki" macro@orcam.me.uk
stable inclusion from linux-4.19.207 commit 33b8fcdedbde2b2a508c3d78d4bb34e4067391a0
--------------------------------
commit 44d01fc86d952f5a8b8b32bdb4841504d5833d95 upstream.
Update BusLogic driver's messaging system to use pr_cont() for continuation lines, bringing messy output:
pci 0000:00:13.0: PCI->APIC IRQ transform: INT A -> IRQ 17
scsi: ***** BusLogic SCSI Driver Version 2.1.17 of 12 September 2013 *****
scsi: Copyright 1995-1998 by Leonard N. Zubkoff lnz@dandelion.com
scsi0: Configuring BusLogic Model BT-958 PCI Wide Ultra SCSI Host Adapter
scsi0: Firmware Version: 5.07B, I/O Address: 0x7000, IRQ Channel: 17/Level
scsi0: PCI Bus: 0, Device: 19, Address: 0xE0012000, Host Adapter SCSI ID: 7
scsi0: Parity Checking: Enabled, Extended Translation: Enabled
scsi0: Synchronous Negotiation: Ultra, Wide Negotiation: Enabled
scsi0: Disconnect/Reconnect: Enabled, Tagged Queuing: Enabled
scsi0: Scatter/Gather Limit: 128 of 8192 segments, Mailboxes: 211
scsi0: Driver Queue Depth: 211, Host Adapter Queue Depth: 192
scsi0: Tagged Queue Depth: Automatic
, Untagged Queue Depth: 3
scsi0: SCSI Bus Termination: Both Enabled
, SCAM: Disabled
scsi0: *** BusLogic BT-958 Initialized Successfully ***
scsi host0: BusLogic BT-958
back to order:
pci 0000:00:13.0: PCI->APIC IRQ transform: INT A -> IRQ 17
scsi: ***** BusLogic SCSI Driver Version 2.1.17 of 12 September 2013 *****
scsi: Copyright 1995-1998 by Leonard N. Zubkoff lnz@dandelion.com
scsi0: Configuring BusLogic Model BT-958 PCI Wide Ultra SCSI Host Adapter
scsi0: Firmware Version: 5.07B, I/O Address: 0x7000, IRQ Channel: 17/Level
scsi0: PCI Bus: 0, Device: 19, Address: 0xE0012000, Host Adapter SCSI ID: 7
scsi0: Parity Checking: Enabled, Extended Translation: Enabled
scsi0: Synchronous Negotiation: Ultra, Wide Negotiation: Enabled
scsi0: Disconnect/Reconnect: Enabled, Tagged Queuing: Enabled
scsi0: Scatter/Gather Limit: 128 of 8192 segments, Mailboxes: 211
scsi0: Driver Queue Depth: 211, Host Adapter Queue Depth: 192
scsi0: Tagged Queue Depth: Automatic, Untagged Queue Depth: 3
scsi0: SCSI Bus Termination: Both Enabled, SCAM: Disabled
scsi0: *** BusLogic BT-958 Initialized Successfully ***
scsi host0: BusLogic BT-958
Diagnostic output, such as that produced with the BusLogic=TraceConfiguration parameter, is also affected: it becomes vertical and therefore hard to read. This has now been corrected, e.g.:
pci 0000:00:13.0: PCI->APIC IRQ transform: INT A -> IRQ 17
blogic_cmd(86) Status = 30: 4 ==> 4: FF 05 93 00
blogic_cmd(95) Status = 28: (Modify I/O Address)
blogic_cmd(91) Status = 30: 1 ==> 1: 01
blogic_cmd(04) Status = 30: 4 ==> 4: 41 41 35 30
blogic_cmd(8D) Status = 30: 14 ==> 14: 45 DC 00 20 00 00 00 00 00 40 30 37 42 1D
scsi: ***** BusLogic SCSI Driver Version 2.1.17 of 12 September 2013 *****
scsi: Copyright 1995-1998 by Leonard N. Zubkoff lnz@dandelion.com
blogic_cmd(04) Status = 30: 4 ==> 4: 41 41 35 30
blogic_cmd(0B) Status = 30: 3 ==> 3: 00 08 07
blogic_cmd(0D) Status = 30: 34 ==> 34: 03 01 07 04 00 00 00 00 00 00 00 00 00 00 00 00 FF 42 44 46 FF 00 00 00 00 00 00 00 00 00 FF 00 FF 00
blogic_cmd(8D) Status = 30: 14 ==> 14: 45 DC 00 20 00 00 00 00 00 40 30 37 42 1D
blogic_cmd(84) Status = 30: 1 ==> 1: 37
blogic_cmd(8B) Status = 30: 5 ==> 5: 39 35 38 20 20
blogic_cmd(85) Status = 30: 1 ==> 1: 42
blogic_cmd(86) Status = 30: 4 ==> 4: FF 05 93 00
blogic_cmd(91) Status = 30: 64 ==> 64: 41 46 3E 20 39 35 38 20 20 00 C4 00 04 01 07 2F 07 04 35 FF FF FF FF FF FF FF FF FF FF 01 00 FE FF 08 FF FF 00 00 00 00 00 00 00 01 00 01 00 00 FF FF 00 00 00 00 00 00 00 00 00 00 00 00 00 FC
scsi0: Configuring BusLogic Model BT-958 PCI Wide Ultra SCSI Host Adapter
etc.
Link: https://lore.kernel.org/r/alpine.DEB.2.21.2104201940430.44318@angie.orcam.me... Fixes: 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") Cc: stable@vger.kernel.org # v4.9+ Acked-by: Khalid Aziz khalid@gonehiking.org Signed-off-by: Maciej W. Rozycki macro@orcam.me.uk Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/BusLogic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 0d4ffe0ae3065..9840f4b06d781 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -3605,7 +3605,7 @@ static void blogic_msg(enum blogic_msglevel msglevel, char *fmt, if (buf[0] != '\n' || len > 1) printk("%sscsi%d: %s", blogic_msglevelmap[msglevel], adapter->host_no, buf); } else - printk("%s", buf); + pr_cont("%s", buf); } else { if (begin) { if (adapter != NULL && adapter->adapter_initd) @@ -3613,7 +3613,7 @@ static void blogic_msg(enum blogic_msglevel msglevel, char *fmt, else printk("%s%s", blogic_msglevelmap[msglevel], buf); } else - printk("%s", buf); + pr_cont("%s", buf); } begin = (buf[len - 1] == '\n'); }
From: Liu Zixian liuzixian4@huawei.com
stable inclusion from linux-4.19.207 commit 2fed7f8eda3211190a27caddd6ba8fd728f7b17b
--------------------------------
commit 13db8c50477d83ad3e3b9b0ae247e5cd833a7ae4 upstream.
After fork, the child process gets an incorrect (2x) hugetlb_usage. If a process uses five 2MB hugetlb pages in an anonymous mapping,
HugetlbPages: 10240 kB
and then forks, the child will show,
HugetlbPages: 20480 kB
The amount is doubled because hugetlb_usage is copied from the parent and then increased again when we copy the page tables from parent to child, so the child ends up with 2x the actual usage.
Fix this by adding hugetlb_count_init in mm_init.
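A toy reproduction of the accounting pattern (nothing here is kernel code): the counter travels with the copied struct and is then bumped again while duplicating the resources it counts, so the copy must start from zero.

    #include <stdio.h>

    struct mm { long hugetlb_usage; };

    static void dup_mm(struct mm *child, const struct mm *parent, int init_count)
    {
        *child = *parent;                   /* copies hugetlb_usage too */
        if (init_count)
            child->hugetlb_usage = 0;       /* the fix: reset during init */
        /* "copying page tables" re-accounts every mapped huge page: */
        child->hugetlb_usage += parent->hugetlb_usage;
    }

    int main(void)
    {
        struct mm parent = { .hugetlb_usage = 5 }, child;

        dup_mm(&child, &parent, 0);
        printf("buggy: %ld\n", child.hugetlb_usage);   /* 10 */
        dup_mm(&child, &parent, 1);
        printf("fixed: %ld\n", child.hugetlb_usage);   /* 5  */
        return 0;
    }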
Link: https://lkml.kernel.org/r/20210826071742.877-1-liuzixian4@huawei.com Fixes: 5d317b2b6536 ("mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status") Signed-off-by: Liu Zixian liuzixian4@huawei.com Reviewed-by: Naoya Horiguchi naoya.horiguchi@nec.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/hugetlb.h | 9 +++++++++ kernel/fork.c | 1 + 2 files changed, 10 insertions(+)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5e7cc7bd616a7..561e7679fadc4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -549,6 +549,11 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);
+static inline void hugetlb_count_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->hugetlb_usage, 0); +} + static inline void hugetlb_count_add(long l, struct mm_struct *mm) { atomic_long_add(l, &mm->hugetlb_usage); @@ -629,6 +634,10 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, return &mm->page_table_lock; }
+static inline void hugetlb_count_init(struct mm_struct *mm) +{ +} + static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) { } diff --git a/kernel/fork.c b/kernel/fork.c index d091b680e5e62..88463fd569308 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1012,6 +1012,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); + hugetlb_count_init(mm);
if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK;
From: Mark Brown broonie@kernel.org
stable inclusion from linux-4.19.207 commit aadf3115f86c6df4308e2734b5c39eadbe9573e0
--------------------------------
commit e35ac9d0b56e9efefaeeb84b635ea26c2839ea86 upstream.
When we need a buffer for SVE register state we call sve_alloc() to make sure that one is there. In order to avoid repeated allocations and frees we keep the buffer around unless we change vector length and just memset() it to ensure a clean register state. The function that deals with this takes the task to operate on as an argument, however in the case where we do a memset() we initialise using the SVE state size for the current task rather than the task passed as an argument.
This is only an issue in the case where we are setting the register state for a task via ptrace and the task being configured has a different vector length to the task tracing it. In the case where the buffer is larger in the traced process we will leak old state from the traced process to itself, in the case where the buffer is smaller in the traced process we will overflow the buffer and corrupt memory.
Fixes: bc0ee4760364 ("arm64/sve: Core task context handling") Cc: stable@vger.kernel.org # 4.15.x Signed-off-by: Mark Brown broonie@kernel.org Link: https://lore.kernel.org/r/20210909165356.10675-1-broonie@kernel.org Signed-off-by: Catalin Marinas catalin.marinas@arm.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 14d4009f57ac4..bb048144c3bd1 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -456,7 +456,7 @@ size_t sve_state_size(struct task_struct const *task) void sve_alloc(struct task_struct *task) { if (task->thread.sve_state) { - memset(task->thread.sve_state, 0, sve_state_size(current)); + memset(task->thread.sve_state, 0, sve_state_size(task)); return; }
From: Mike Rapoport rppt@linux.ibm.com
stable inclusion from linux-4.19.207 commit 2717db72f74c8e51068801b6327559241c54b86e
--------------------------------
commit 34b1999da935a33be6239226bfa6cd4f704c5c88 upstream.
Jiri Olsa reported a fault when running:
# cat /proc/kallsyms | grep ksys_read
ffffffff8136d580 T ksys_read
# objdump -d --start-address=0xffffffff8136d580 --stop-address=0xffffffff8136d590 /proc/kcore
/proc/kcore: file format elf64-x86-64
Segmentation fault
general protection fault, probably for non-canonical address 0xf887ffcbff000: 0000 [#1] SMP PTI
CPU: 12 PID: 1079 Comm: objdump Not tainted 5.14.0-rc5qemu+ #508
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-4.fc34 04/01/2014
RIP: 0010:kern_addr_valid
Call Trace:
 read_kcore
 ? rcu_read_lock_sched_held
 ? rcu_read_lock_sched_held
 ? rcu_read_lock_sched_held
 ? trace_hardirqs_on
 ? rcu_read_lock_sched_held
 ? lock_acquire
 ? lock_acquire
 ? rcu_read_lock_sched_held
 ? lock_acquire
 ? rcu_read_lock_sched_held
 ? rcu_read_lock_sched_held
 ? rcu_read_lock_sched_held
 ? lock_release
 ? _raw_spin_unlock
 ? __handle_mm_fault
 ? rcu_read_lock_sched_held
 ? lock_acquire
 ? rcu_read_lock_sched_held
 ? lock_release
 proc_reg_read
 ? vfs_read
 vfs_read
 ksys_read
 do_syscall_64
 entry_SYSCALL_64_after_hwframe
The fault happens because kern_addr_valid() dereferences existent but not present PMD in the high kernel mappings.
Such PMDs are created when free_kernel_image_pages() frees regions larger than 2Mb. In this case, a part of the freed memory is mapped with PMDs and the set_memory_np_noalias() -> ... -> __change_page_attr() sequence will mark the PMD as not present rather than wipe it completely.
Have kern_addr_valid() check whether higher level page table entries are present before trying to dereference them to fix this issue and to avoid similar issues in the future.
Stable backporting note: ------------------------
Note that the stable marking is for all active stable branches because there could be cases where pagetable entries exist but are not valid - see 9a14aefc1d28 ("x86: cpa, fix lookup_address"), for example. So make sure to be on the safe side here and use pXY_present() accessors rather than pXY_none() which could #GP when accessing pages in the direct map.
Also see:
c40a56a7818c ("x86/mm/init: Remove freed kernel image areas from alias mapping")
for more info.
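A simplified illustration of the distinction (the flag layout is a stand-in, not x86's real PTE encoding): an entry whose mapping was torn down by set_memory_np() keeps its address bits but loses the present bit, so it is neither "none" nor safe to walk.

    #include <stdint.h>
    #include <stdio.h>

    #define _PAGE_PRESENT 0x1ULL   /* illustrative flag bit */

    typedef struct { uint64_t val; } pmd_t;

    static int pmd_none(pmd_t pmd)    { return pmd.val == 0; }
    static int pmd_present(pmd_t pmd) { return (pmd.val & _PAGE_PRESENT) != 0; }

    int main(void)
    {
        /* Address bits kept, present bit cleared: existent but not present. */
        pmd_t freed = { .val = 0x1234000ULL };

        printf("pmd_none=%d pmd_present=%d\n",
               pmd_none(freed), pmd_present(freed));
        /* A !pmd_none() gate would walk this entry and fault; the
         * !pmd_present() gate correctly bails out. */
        return 0;
    }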
Reported-by: Jiri Olsa jolsa@redhat.com Signed-off-by: Mike Rapoport rppt@linux.ibm.com Signed-off-by: Borislav Petkov bp@suse.de Reviewed-by: David Hildenbrand david@redhat.com Acked-by: Dave Hansen dave.hansen@intel.com Tested-by: Jiri Olsa jolsa@redhat.com Cc: stable@vger.kernel.org # 4.4+ Link: https://lkml.kernel.org/r/20210819132717.19358-1-rppt@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 81e85a8dd3005..4b25a1ad18ffd 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1289,18 +1289,18 @@ int kern_addr_valid(unsigned long addr) return 0;
p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) + if (!p4d_present(*p4d)) return 0;
pud = pud_offset(p4d, addr); - if (pud_none(*pud)) + if (!pud_present(*pud)) return 0;
if (pud_large(*pud)) return pfn_valid(pud_pfn(*pud));
pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return 0;
if (pmd_large(*pmd))
From: Baptiste Lepers baptiste.lepers@gmail.com
stable inclusion from linux-4.19.207 commit c09a84aea0d3902f955cc6504e1c25cddb7c48c2
--------------------------------
commit b89a05b21f46150ac10a962aa50109250b56b03b upstream.
In perf_event_addr_filters_apply, the task associated with the event (event->ctx->task) is read using READ_ONCE at the beginning of the function, checked, and then re-read from event->ctx->task, voiding all guarantees of the checks. Reuse the value that was read by READ_ONCE to ensure the consistency of the task struct throughout the function.
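The general shape of the fix, sketched with C11 atomics standing in for READ_ONCE() (all names are illustrative): load the racy pointer exactly once, then use only the saved value.

    #include <stdatomic.h>
    #include <stddef.h>

    struct task { int pid; };
    struct ctx  { struct task *_Atomic task; };

    static int apply(struct ctx *ctx)
    {
        /* One marked load; 'task' cannot change under us afterwards. */
        struct task *task = atomic_load(&ctx->task);

        if (!task)
            return -1;
        /* Bug pattern: re-reading ctx->task here could observe a different
         * (even NULL) pointer than the one just checked. */
        return task->pid;
    }

    int main(void)
    {
        struct task t = { .pid = 1 };
        struct ctx  c;

        atomic_store(&c.task, &t);
        return apply(&c) == 1 ? 0 : 1;
    }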
Fixes: 375637bc52495 ("perf/core: Introduce address range filtering") Signed-off-by: Baptiste Lepers baptiste.lepers@gmail.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/20210906015310.12802-1-baptiste.lepers@gmail.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c index 09869287b7ae3..d6293a2f01e9b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8941,7 +8941,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) return;
if (ifh->nr_file_filters) { - mm = get_task_mm(event->ctx->task); + mm = get_task_mm(task); if (!mm) goto restart;
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.207 commit 44ba281510190e2915506016407ba5b374a3add2
--------------------------------
commit 04f08eb44b5011493d77b602fdec29ff0f5c6cd5 upstream.
syzbot reported another data-race in af_unix [1]
Let's change __skb_insert() to use WRITE_ONCE() when changing the skb head qlen.
Also, change unix_dgram_poll() to use the lockless version of unix_recvq_full().
It is very possible that we can switch all/most unix_recvq_full() callers to the lockless version; this will be done in a future kernel version.
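For reference, a sketch of what the lockless check amounts to, simplified with C11 atomics (the real unix_recvq_full_lockless() pairs a marked read of qlen with the WRITE_ONCE() added below): the reader accepts a possibly stale length but never a torn one, and takes no lock.

    #include <stdatomic.h>
    #include <stdbool.h>

    struct sk_buff_head { _Atomic unsigned int qlen; };
    struct sock {
        struct sk_buff_head sk_receive_queue;
        unsigned int sk_max_ack_backlog;
    };

    static bool recvq_full_lockless(struct sock *sk)
    {
        /* Single marked load instead of lock + skb_queue_len(). */
        return atomic_load_explicit(&sk->sk_receive_queue.qlen,
                                    memory_order_relaxed)
               > sk->sk_max_ack_backlog;
    }

    int main(void)
    {
        struct sock s = { .sk_max_ack_backlog = 10 };

        atomic_store(&s.sk_receive_queue.qlen, 11);
        return recvq_full_lockless(&s) ? 0 : 1;
    }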
[1] HEAD commit: 8596e589b787732c8346f0482919e83cc9362db1
BUG: KCSAN: data-race in skb_queue_tail / unix_dgram_poll
write to 0xffff88814eeb24e0 of 4 bytes by task 25815 on cpu 0:
 __skb_insert include/linux/skbuff.h:1938 [inline]
 __skb_queue_before include/linux/skbuff.h:2043 [inline]
 __skb_queue_tail include/linux/skbuff.h:2076 [inline]
 skb_queue_tail+0x80/0xa0 net/core/skbuff.c:3264
 unix_dgram_sendmsg+0xff2/0x1600 net/unix/af_unix.c:1850
 sock_sendmsg_nosec net/socket.c:703 [inline]
 sock_sendmsg net/socket.c:723 [inline]
 ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392
 ___sys_sendmsg net/socket.c:2446 [inline]
 __sys_sendmmsg+0x315/0x4b0 net/socket.c:2532
 __do_sys_sendmmsg net/socket.c:2561 [inline]
 __se_sys_sendmmsg net/socket.c:2558 [inline]
 __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2558
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
read to 0xffff88814eeb24e0 of 4 bytes by task 25834 on cpu 1:
 skb_queue_len include/linux/skbuff.h:1869 [inline]
 unix_recvq_full net/unix/af_unix.c:194 [inline]
 unix_dgram_poll+0x2bc/0x3e0 net/unix/af_unix.c:2777
 sock_poll+0x23e/0x260 net/socket.c:1288
 vfs_poll include/linux/poll.h:90 [inline]
 ep_item_poll fs/eventpoll.c:846 [inline]
 ep_send_events fs/eventpoll.c:1683 [inline]
 ep_poll fs/eventpoll.c:1798 [inline]
 do_epoll_wait+0x6ad/0xf00 fs/eventpoll.c:2226
 __do_sys_epoll_wait fs/eventpoll.c:2238 [inline]
 __se_sys_epoll_wait fs/eventpoll.c:2233 [inline]
 __x64_sys_epoll_wait+0xf6/0x120 fs/eventpoll.c:2233
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0x0000001b -> 0x00000001
Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 25834 Comm: syz-executor.1 Tainted: G W 5.14.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Fixes: 86b18aaa2b5b ("skbuff: fix a data race in skb_queue_len()") Cc: Qian Cai cai@lca.pw Signed-off-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/skbuff.h | 2 +- net/unix/af_unix.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 06176ef2a8424..e9f100bd73d98 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1761,7 +1761,7 @@ static inline void __skb_insert(struct sk_buff *newsk, WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(next->prev, newsk); WRITE_ONCE(prev->next, newsk); - list->qlen++; + WRITE_ONCE(list->qlen, list->qlen + 1); }
static inline void __skb_queue_splice(const struct sk_buff_head *list, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 98c253afa0db2..c293a558b0d4f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2739,7 +2739,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
other = unix_peer(sk); if (other && unix_peer(other) != sk && - unix_recvq_full(other) && + unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0;
From: zhenggy zhenggy@chinatelecom.cn
stable inclusion from linux-4.19.207 commit dfefcc46354530c3ec1d12db0e16c740548c6229
--------------------------------
commit 4f884f3962767877d7aabbc1ec124d2c307a4257 upstream.
Commit 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time") made it possible to retransmit a multi-segment TSO/GSO packet directly, without splitting it. Since that commit, we can no longer assume that a retransmitted packet is a single segment.
This patch fixes the tp->undo_retrans accounting in tcp_sacktag_one() to use the actual segment count (pcount) of the retransmitted packet.
Before that commit (10d3be569243), the single-segment assumption underlying the tp->undo_retrans-- decrement was correct.
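A worked toy example of the accounting (hypothetical numbers; max_t() re-declared here for user space):

    #include <stdio.h>

    #define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

    int main(void)
    {
            int undo_retrans = 3;   /* 3 segments retransmitted in one TSO skb */
            int pcount = 3;         /* ...and all 3 are now D-SACKed at once   */

            /* old code: undo_retrans--; would leave 2 and the undo never fires */
            undo_retrans = max_t(int, 0, undo_retrans - pcount);
            printf("undo_retrans=%d\n", undo_retrans);  /* 0: undo can proceed */
            return 0;
    }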
Fixes: 10d3be569243 ("tcp-tso: do not split TSO packets at retransmit time") Signed-off-by: zhenggy zhenggy@chinatelecom.cn Reviewed-by: Eric Dumazet edumazet@google.com Acked-by: Yuchung Cheng ycheng@google.com Acked-by: Neal Cardwell ncardwell@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a058fb936c2a1..ac6c1c41fbffa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1195,7 +1195,7 @@ static u8 tcp_sacktag_one(struct sock *sk, if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) - tp->undo_retrans--; + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount); if ((sacked & TCPCB_SACKED_ACKED) && before(start_seq, state->reord)) state->reord = start_seq;
From: David Hildenbrand david@redhat.com
stable inclusion from linux-4.19.207 commit c48402015e02901ff8b1fac5c112b02546364bb6
--------------------------------
commit 7cf209ba8a86410939a24cb1aeb279479a7e0ca6 upstream.
Patch series "mm/memory_hotplug: preparatory patches for new online policy and memory"
These are all cleanups and one fix previously sent as part of [1]: [PATCH v1 00/12] mm/memory_hotplug: "auto-movable" online policy and memory groups.
These patches make sense even without the other series, therefore I pulled them out to make the other series easier to digest.
[1] https://lkml.kernel.org/r/20210607195430.48228-1-david@redhat.com
This patch (of 4):
Checkpatch complained on a follow-up patch that we are using "unsigned" here, which defaults to "unsigned int"; checkpatch is correct.
Because "unsigned int" truncates start_pfn on systems whose PFNs exceed 32 bits, we would search for a fitting zone using the wrong pfn, and we might end up onlining memory to one of the special kernel zones, such as ZONE_DMA, which can end badly because the onlined memory does not satisfy the properties of those zones.
Use "unsigned long" instead, just as we do in other places when handling PFNs. This can bite us once we have physical addresses in the range of multiple TB (with 4 KiB pages, PFNs overflow 32 bits at 16 TiB).
Link: https://lkml.kernel.org/r/20210712124052.26491-2-david@redhat.com Fixes: e5e689302633 ("mm, memory_hotplug: display allowed zones in the preferred ordering") Signed-off-by: David Hildenbrand david@redhat.com Reviewed-by: Pankaj Gupta pankaj.gupta@ionos.com Reviewed-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Oscar Salvador osalvador@suse.de Cc: David Hildenbrand david@redhat.com Cc: Vitaly Kuznetsov vkuznets@redhat.com Cc: "Michael S. Tsirkin" mst@redhat.com Cc: Jason Wang jasowang@redhat.com Cc: Pankaj Gupta pankaj.gupta.linux@gmail.com Cc: Wei Yang richard.weiyang@linux.alibaba.com Cc: Michal Hocko mhocko@kernel.org Cc: Dan Williams dan.j.williams@intel.com Cc: Anshuman Khandual anshuman.khandual@arm.com Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Vlastimil Babka vbabka@suse.cz Cc: Mike Rapoport rppt@kernel.org Cc: "Rafael J. Wysocki" rjw@rjwysocki.net Cc: Len Brown lenb@kernel.org Cc: Pavel Tatashin pasha.tatashin@soleen.com Cc: Heiko Carstens hca@linux.ibm.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Catalin Marinas catalin.marinas@arm.com Cc: virtualization@lists.linux-foundation.org Cc: Andy Lutomirski luto@kernel.org Cc: "Aneesh Kumar K.V" aneesh.kumar@linux.ibm.com Cc: Anton Blanchard anton@ozlabs.org Cc: Ard Biesheuvel ardb@kernel.org Cc: Baoquan He bhe@redhat.com Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Borislav Petkov bp@alien8.de Cc: Christian Borntraeger borntraeger@de.ibm.com Cc: Christophe Leroy christophe.leroy@c-s.fr Cc: Dave Jiang dave.jiang@intel.com Cc: "H. Peter Anvin" hpa@zytor.com Cc: Ingo Molnar mingo@redhat.com Cc: Jia He justin.he@arm.com Cc: Joe Perches joe@perches.com Cc: Kefeng Wang wangkefeng.wang@huawei.com Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Michel Lespinasse michel@lespinasse.org Cc: Nathan Lynch nathanl@linux.ibm.com Cc: Nicholas Piggin npiggin@gmail.com Cc: Paul Mackerras paulus@samba.org Cc: Peter Zijlstra peterz@infradead.org Cc: Pierre Morel pmorel@linux.ibm.com Cc: "Rafael J. Wysocki" rafael.j.wysocki@intel.com Cc: Rich Felker dalias@libc.org Cc: Scott Cheloha cheloha@linux.ibm.com Cc: Sergei Trofimovich slyfox@gentoo.org Cc: Thiago Jung Bauermann bauerman@linux.ibm.com Cc: Thomas Gleixner tglx@linutronix.de Cc: Vasily Gorbik gor@linux.ibm.com Cc: Vishal Verma vishal.l.verma@intel.com Cc: Will Deacon will@kernel.org Cc: Yoshinori Sato ysato@users.sourceforge.jp Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: David Hildenbrand david@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/memory_hotplug.h | 4 ++-- mm/memory_hotplug.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index dd803e5d84e4c..8782f0e993704 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -344,6 +344,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type); -extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages); +extern struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 885dbe619930c..1e20ad038496d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -789,8 +789,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? movable_zone : kernel_zone; }
-struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages) +struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
From: Paolo Valente paolo.valente@linaro.org
stable inclusion from linux-4.19.207 commit 2c1b1848357dc69f62ce3630b850e6680b87854b
--------------------------------
[ Upstream commit 2d52c58b9c9bdae0ca3df6a1eab5745ab3f7d80b ]
The function bfq_setup_merge prepares the merging between two bfq_queues, say bfqq and new_bfqq. To this end, it assigns bfqq->new_bfqq = new_bfqq. Then, each time some I/O for bfqq arrives, the process that generated that I/O is disassociated from bfqq and associated with new_bfqq (merging is actually a redirection). In this respect, bfq_setup_merge increases new_bfqq->ref in advance, adding the number of processes that are expected to be associated with new_bfqq.
Unfortunately, the stable-merging mechanism interferes with this setup. After bfqq->new_bfqq has been set by bfq_setup_merge, and before all the expected processes have been associated with bfqq->new_bfqq, bfqq may happen to be stably merged with a different queue than the current bfqq->new_bfqq. In this case, bfqq->new_bfqq gets changed. So, some of the processes that have already been accounted for in the ref counter of the previous new_bfqq will never be associated with that queue. This creates an imbalance, because those references will never be decremented.
This commit fixes this issue by reestablishing the previous, natural behaviour: once bfqq->new_bfqq has been set, it will not be changed until all expected redirections have occurred.
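A toy model of the leak with plain counters (not the BFQ code itself):

    #include <stdio.h>

    struct bq { int ref; };

    int main(void)
    {
            struct bq q1 = { .ref = 1 }, q2 = { .ref = 1 };
            int process_refs = 3;   /* processes expected to be redirected */

            q1.ref += process_refs; /* bfq_setup_merge()-style advance reservation */

            /* A stable merge retargets the merge to q2 before any of the 3
             * redirections happened: the refs reserved on q1 above are never
             * dropped, so q1 can never reach ref == 0 and be freed. */
            q2.ref += process_refs;

            printf("q1.ref=%d (3 leaked), q2.ref=%d\n", q1.ref, q2.ref);
            return 0;
    }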
Signed-off-by: Davide Zini davidezini2@gmail.com Signed-off-by: Paolo Valente paolo.valente@linaro.org Link: https://lore.kernel.org/r/20210802141352.74353-2-paolo.valente@linaro.org Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/bfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c20e6b772b4cd..7d77de9a0f5c0 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2137,6 +2137,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; + /* + * The above assignment schedules the following redirections: + * each time some I/O for bfqq arrives, the process that + * generated that I/O is disassociated from bfqq and + * associated with new_bfqq. Here we increases new_bfqq->ref + * in advance, adding the number of processes that are + * expected to be associated with new_bfqq as they happen to + * issue I/O. + */ new_bfqq->ref += process_refs; return new_bfqq; } @@ -2196,6 +2205,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq;
+ /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + /* * Prevent bfqq from being merged if it has been created too * long ago. The idea is that true cooperating processes, and @@ -2210,9 +2223,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfq_too_late_for_merging(bfqq)) return NULL;
- if (bfqq->new_bfqq) - return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL;
From: Dan Carpenter dan.carpenter@oracle.com
stable inclusion from linux-4.19.207 commit 1a091bfd11e61032b6192cf2a1ebb259889f28b3
--------------------------------
[ Upstream commit 7eb6ea4148579b85540a41d57bcec315b8af8ff8 ]
pci_dev_str_match_path() is often called with a spinlock held, so the allocation has to be atomic. The call tree is:
  pci_specified_resource_alignment()  <-- takes spin_lock();
    pci_dev_str_match()
      pci_dev_str_match_path()
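In sketch form, the rule being honored (a kernel-context fragment, not the actual pci.c code): an allocation made while a spinlock is held must not sleep, so GFP_KERNEL is forbidden there and GFP_ATOMIC is required.

    #include <linux/slab.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(lock);

    static char *dup_under_lock(const char *s, size_t n)
    {
            char *p;

            spin_lock(&lock);
            /* GFP_KERNEL may sleep -> illegal in atomic context; GFP_ATOMIC
             * allocates without sleeping (and may fail under pressure). */
            p = kmemdup_nul(s, n, GFP_ATOMIC);
            spin_unlock(&lock);
            return p;
    }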
Fixes: 45db33709ccc ("PCI: Allow specifying devices using a base bus and path of devfns") Link: https://lore.kernel.org/r/20210812070004.GC31863@kili Signed-off-by: Dan Carpenter dan.carpenter@oracle.com Signed-off-by: Bjorn Helgaas bhelgaas@google.com Reviewed-by: Logan Gunthorpe logang@deltatee.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e43b11f539d90..e74ebff1e29f3 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -224,7 +224,7 @@ static int pci_dev_str_match_path(struct pci_dev *dev, const char *path,
*endptr = strchrnul(path, ';');
- wpath = kmemdup_nul(path, *endptr - path, GFP_KERNEL); + wpath = kmemdup_nul(path, *endptr - path, GFP_ATOMIC); if (!wpath) return -ENOMEM;
From: Andy Shevchenko andriy.shevchenko@linux.intel.com
stable inclusion from linux-4.19.207 commit 6bdadfff347e42b6da70a9c77bb443479781c1f3
--------------------------------
[ Upstream commit 817f9916a6e96ae43acdd4e75459ef4f92d96eb1 ]
The CONFIG_PCI=y case got a new parameter a long time ago. Sync the stub as well.
[bhelgaas: add parameter names] Fixes: 725522b5453d ("PCI: add the sysfs driver name to all modules") Link: https://lore.kernel.org/r/20210813153619.89574-1-andriy.shevchenko@linux.int... Reported-by: kernel test robot lkp@intel.com Signed-off-by: Andy Shevchenko andriy.shevchenko@linux.intel.com Signed-off-by: Bjorn Helgaas bhelgaas@google.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/pci.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/include/linux/pci.h b/include/linux/pci.h index de9c199ca606a..e4edd2d79d275 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1703,8 +1703,9 @@ static inline int pci_enable_device(struct pci_dev *dev) { return -EIO; } static inline void pci_disable_device(struct pci_dev *dev) { } static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY; } -static inline int __pci_register_driver(struct pci_driver *drv, - struct module *owner) +static inline int __must_check __pci_register_driver(struct pci_driver *drv, + struct module *owner, + const char *mod_name) { return 0; } static inline int pci_register_driver(struct pci_driver *drv) { return 0; }
From: Benjamin Hesmans benjamin.hesmans@tessares.net
stable inclusion from linux-4.19.207 commit d6efada330af09253b0f81a0d836cee02192bd4f
--------------------------------
[ Upstream commit 730affed24bffcd1eebd5903171960f5ff9f1f22 ]
Bug reported by KASAN:
BUG: KASAN: use-after-scope in inet6_ehashfn (net/ipv6/inet6_hashtables.c:40)
Call Trace:
 (...)
 inet6_ehashfn (net/ipv6/inet6_hashtables.c:40)
 (...)
 nf_sk_lookup_slow_v6 (net/ipv6/netfilter/nf_socket_ipv6.c:91 net/ipv6/netfilter/nf_socket_ipv6.c:146)
It seems this bug had already been fixed by Eric Dumazet in the past, in commit 78296c97ca1f ("netfilter: xt_socket: fix a stack corruption bug"), but a variant of the same issue was reintroduced by commit d64d80a2cde9 ("netfilter: x_tables: don't extract flow keys on early demuxed sks in socket match").
`daddr` and `saddr` potentially hold a reference to ipv6_var that is no longer in scope when the call to `nf_socket_get_sock_v6` is made.
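A minimal user-space reproduction of this bug class (illustrative; recent compilers can flag it with AddressSanitizer's use-after-scope checking, much as KASAN did here):

    #include <stdio.h>

    int main(void)
    {
            int *p = NULL;

            {
                    int inner = 42;

                    p = &inner;     /* pointer escapes the block... */
            }                       /* ...but inner's lifetime ends here */

            printf("%d\n", *p);     /* use-after-scope: undefined behavior */
            return 0;
    }

The fix mirrors this shape: hoist ipv6_var to the enclosing scope so the pointers into it stay valid for the later call.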
Fixes: d64d80a2cde9 ("netfilter: x_tables: don't extract flow keys on early demuxed sks in socket match") Acked-by: Matthieu Baerts matthieu.baerts@tessares.net Signed-off-by: Benjamin Hesmans benjamin.hesmans@tessares.net Reviewed-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv6/netfilter/nf_socket_ipv6.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index f14de4b6d639d..58e839e2ce1d3 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -104,7 +104,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, { __be16 uninitialized_var(dport), uninitialized_var(sport); const struct in6_addr *daddr = NULL, *saddr = NULL; - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; struct sk_buff *data_skb = NULL; int doff = 0; int thoff = 0, tproto; @@ -134,8 +134,6 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, thoff + sizeof(*hp);
} else if (tproto == IPPROTO_ICMPV6) { - struct ipv6hdr ipv6_var; - if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, &sport, &dport, &ipv6_var)) return NULL;
From: Neeraj Upadhyay neeraju@codeaurora.org
stable inclusion from linux-4.19.208 commit 3226fb90cf5dc89611f742f122a33d4598076ad5
--------------------------------
commit fd6bc19d7676a060a171d1cf3dcbf6fd797eb05f upstream.
Tasks waiting within exp_funnel_lock() for an expedited grace period to elapse can be starved due to the following sequence of events:
1. Tasks A and B both attempt to start an expedited grace period at about the same time. This grace period will have completed when the lower four bits of the rcu_state structure's ->expedited_sequence field are 0b'0100', for example, when the initial value of this counter is zero. Task A wins, and thus does the actual work of starting the grace period, including acquiring the rcu_state structure's ->exp_mutex and setting the counter to 0b'0001'.
2. Because task B lost the race to start the grace period, it waits on ->expedited_sequence to reach 0b'0100' inside of exp_funnel_lock(). This task therefore blocks on the rcu_node structure's ->exp_wq[1] field, keeping in mind that the end-of-grace-period value of ->expedited_sequence (0b'0100') is shifted down two bits before indexing the ->exp_wq[] field.
3. Task C attempts to start another expedited grace period, but blocks on ->exp_mutex, which is still held by Task A.
4. The aforementioned expedited grace period completes, so that ->expedited_sequence now has the value 0b'0100'. A kworker task therefore acquires the rcu_state structure's ->exp_wake_mutex and starts awakening any tasks waiting for this grace period.
5. One of the first tasks awakened happens to be Task A. Task A therefore releases the rcu_state structure's ->exp_mutex, which allows Task C to start the next expedited grace period, which causes the lower four bits of the rcu_state structure's ->expedited_sequence field to become 0b'0101'.
6. Task C's expedited grace period completes, so that the lower four bits of the rcu_state structure's ->expedited_sequence field now become 0b'1000'.
7. The kworker task from step 4 above continues its wakeups. Unfortunately, the wake_up_all() refetches the rcu_state structure's ->expedited_sequence field:
wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);
This results in the wakeup being applied to the rcu_node structure's ->exp_wq[2] field, which is unfortunate given that Task B is instead waiting on ->exp_wq[1].
On a busy system, no harm is done (or at least no permanent harm is done). Some later expedited grace period will redo the wakeup. But on a quiet system, such as many embedded systems, it might be a good long time before there was another expedited grace period. On such embedded systems, this situation could therefore result in a system hang.
This issue manifested as a DPM device timeout during suspend (which usually qualifies as a quiet time), due to a SCSI device being stuck in _synchronize_rcu_expedited(), with the following stack trace:
schedule()
synchronize_rcu_expedited()
synchronize_rcu()
scsi_device_quiesce()
scsi_bus_suspend()
dpm_run_callback()
__device_suspend()
This commit therefore prevents such delays, timeouts, and hangs by making rcu_exp_wait_wake() use its "s" argument consistently instead of refetching from rcu_state.expedited_sequence.
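A toy illustration of why the snapshot matters, using the sequence values from the steps above (the shift mirrors rcu_seq_ctr(); this is not the RCU code itself):

    #include <stdio.h>

    #define RCU_SEQ_CTR_SHIFT 2     /* low two bits carry grace-period state */

    int main(void)
    {
            unsigned long s = 0x4;          /* snapshot: our GP ended at 0b'0100' */
            unsigned long live = 0x8;       /* counter after Task C's GP: 0b'1000' */

            /* prints: wake exp_wq[1], buggy refetch wakes exp_wq[2] */
            printf("wake exp_wq[%lu], buggy refetch wakes exp_wq[%lu]\n",
                   (s >> RCU_SEQ_CTR_SHIFT) & 0x3,
                   (live >> RCU_SEQ_CTR_SHIFT) & 0x3);
            return 0;
    }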
Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period") Signed-off-by: Neeraj Upadhyay neeraju@codeaurora.org Signed-off-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: David Chen david.chen@nutanix.com Acked-by: Neeraj Upadhyay neeraju@codeaurora.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0b2c2ad69629c..72770a551c24d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -613,7 +613,7 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ - wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]); } trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); mutex_unlock(&rsp->exp_wake_mutex);
From: Li Huafei lihuafei1@huawei.com
stable inclusion from linux-4.19.208 commit 6cfbbb961bb94de85455fe35140b1350c7ccb76c
--------------------------------
Commit 960434acef37 ("tracing/kprobe: Fix to support kretprobe events on unloaded modules"), backported from v5.11, changed the return value of kprobe_on_func_entry(). However, create_trace_kprobe() was never adapted to the new convention, resulting in exactly the opposite behavior. We now need to return an error immediately only if kprobe_on_func_entry() returns -EINVAL.
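A user-space sketch of the intended three-way handling (the stub below is a simplified stand-in for kprobe_on_func_entry(), which after the backport returns 0 on success, -EINVAL for an invalid offset, and -ENOENT when the symbol cannot be resolved yet, e.g. because a module is not loaded):

    #include <errno.h>
    #include <stdio.h>

    static int kprobe_on_func_entry_stub(const char *sym, unsigned long offset)
    {
            (void)sym;
            return offset ? -EINVAL : 0;    /* stand-in for the real check */
    }

    int main(void)
    {
            int ret = kprobe_on_func_entry_stub("module_fn", 8);

            if (ret == -EINVAL)     /* provably bad offset: reject immediately */
                    printf("rejected: offset not valid for return probe\n");
            else                    /* 0 or -ENOENT: let register_kprobe() decide */
                    printf("deferred to kprobe registration\n");
            return 0;
    }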
Fixes: 960434acef37 ("tracing/kprobe: Fix to support kretprobe events on unloaded modules") Signed-off-by: Li Huafei lihuafei1@huawei.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace_kprobe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f389de402fe20..1b1da744a2e72 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -836,8 +836,9 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Failed to parse either an address or a symbol.\n"); return ret; } + /* Defer the ENOENT case until register kprobe */ if (offset && is_return && - !kprobe_on_func_entry(NULL, symbol, offset)) { + kprobe_on_func_entry(NULL, symbol, offset) == -EINVAL) { pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; }
From: Andy Shevchenko andriy.shevchenko@linux.intel.com
stable inclusion from linux-4.19.208 commit 523559507138ca4abcf4c2522c0061071c1d60a0
--------------------------------
commit 67db87dc8284070adb15b3c02c1c31d5cf51c5d6 upstream.
Currently the CSRT parsing relies on the fact that on most x86 devices the IRQ mapping is 1:1 with the Linux vIRQ. However, this may not be true for some. Fix this by converting the GSI to a Linux vIRQ before checking it.
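In sketch form (a kernel-context fragment with a hypothetical helper name; acpi_register_gsi() is the real API the fix uses to map a firmware GSI to a Linux vIRQ):

    #include <linux/acpi.h>

    static bool csrt_irq_matches(u32 gsi, int trigger, int polarity, int irq)
    {
            /* Translate the firmware GSI to a Linux vIRQ first, then
             * compare with the device's IRQ resource. */
            int virq = acpi_register_gsi(NULL, gsi, trigger, polarity);

            return virq >= 0 && virq == irq;
    }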
Fixes: ee8209fd026b ("dma: acpi-dma: parse CSRT to extract additional resources") Signed-off-by: Andy Shevchenko andriy.shevchenko@linux.intel.com Link: https://lore.kernel.org/r/20210730202715.24375-1-andriy.shevchenko@linux.int... Signed-off-by: Vinod Koul vkoul@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/dma/acpi-dma.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c index 4a748c3435d7d..02149742b334c 100644 --- a/drivers/dma/acpi-dma.c +++ b/drivers/dma/acpi-dma.c @@ -72,10 +72,14 @@ static int acpi_dma_parse_resource_group(const struct acpi_csrt_group *grp,
si = (const struct acpi_csrt_shared_info *)&grp[1];
- /* Match device by MMIO and IRQ */ + /* Match device by MMIO */ if (si->mmio_base_low != lower_32_bits(mem) || - si->mmio_base_high != upper_32_bits(mem) || - si->gsi_interrupt != irq) + si->mmio_base_high != upper_32_bits(mem)) + return 0; + + /* Match device by Linux vIRQ */ + ret = acpi_register_gsi(NULL, si->gsi_interrupt, si->interrupt_mode, si->interrupt_polarity); + if (ret != irq) return 0;
dev_dbg(&adev->dev, "matches with %.4s%04X (rev %u)\n",
From: Cyrill Gorcunov gorcunov@gmail.com
stable inclusion from linux-4.19.208 commit 6a96bac8ba0a5ab9c9af1ed1c77478e19ae1f0f6
--------------------------------
commit e1fbbd073137a9d63279f6bf363151a938347640 upstream.
Keno Fischer reported that when a binary is loaded via ld-linux, prctl(PR_SET_MM_MAP) doesn't allow setting up the brk value, because it lies before mm->end_data.
For example, a test program shows:
| # ~/t
|
| start_code   401000
| end_code     401a15
| start_stack  7ffce4577dd0
| start_data   403e10
| end_data     40408c
| start_brk    b5b000
| sbrk(0)      b5b000
and when executed via ld-linux
| # /lib64/ld-linux-x86-64.so.2 ~/t
|
| start_code   7fc25b0a4000
| end_code     7fc25b0c4524
| start_stack  7fffcc6b2400
| start_data   7fc25b0ce4c0
| end_data     7fc25b0cff98
| start_brk    55555710c000
| sbrk(0)      55555710c000
This of course prevents criu from restoring such programs. Looking into how the kernel operates with brk/start_brk inside the brk() syscall, I don't see any problem if we allow setting up brk/start_brk without checking against end_data. Even if someone passes some weird address here on purpose, the worst possible result will be an unexpected unmapping of an existing vma (the caller's own vma, since prctl works with the caller's memory), while the test for RLIMIT_DATA is still valid and a user won't be able to gain more memory by expanding VMAs via new values shipped with the prctl call.
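A user-space sketch of the call shape (criu-style restore code; the addresses are illustrative, most fields are elided, and CAP_SYS_RESOURCE is required, so this is a shape demo rather than a working restore):

    #include <stdio.h>
    #include <string.h>
    #include <sys/prctl.h>

    int main(void)
    {
            struct prctl_mm_map map;

            memset(&map, 0, sizeof(map));
            /* ...fill code/data/stack/arg/env ranges and auxv from the
             * checkpoint image; zeroed fields would fail validation... */
            map.start_brk = 0x55555710c000UL;   /* may now lie below end_data */
            map.brk       = 0x55555710c000UL;
            map.exe_fd    = -1;                 /* (u32)-1: keep current exe */

            if (prctl(PR_SET_MM, PR_SET_MM_MAP, (unsigned long)&map,
                      sizeof(map), 0))
                    perror("PR_SET_MM_MAP");
            return 0;
    }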
Link: https://lkml.kernel.org/r/20210121221207.GB2174@grain Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Signed-off-by: Cyrill Gorcunov gorcunov@gmail.com Reported-by: Keno Fischer keno@juliacomputing.com Acked-by: Andrey Vagin avagin@gmail.com Tested-by: Andrey Vagin avagin@gmail.com Cc: Dmitry Safonov 0x7f454c46@gmail.com Cc: Kirill Tkhai ktkhai@virtuozzo.com Cc: Eric W. Biederman ebiederm@xmission.com Cc: Pavel Tikhomirov ptikhomirov@virtuozzo.com Cc: Alexander Mikhalitsyn alexander.mikhalitsyn@virtuozzo.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/sys.c | 7 ------- 1 file changed, 7 deletions(-)
diff --git a/kernel/sys.c b/kernel/sys.c index 55e142077dcf4..faef6e6b635f1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1931,13 +1931,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
error = -EINVAL;
- /* - * @brk should be after @end_data in traditional maps. - */ - if (prctl_map->start_brk <= prctl_map->end_data || - prctl_map->brk <= prctl_map->end_data) - goto out; - /* * Neither we should allow to override limits if they set. */
From: Pavel Skripkin paskripkin@gmail.com
stable inclusion from linux-4.19.208 commit a94a60ef75d6cc2ac4c0d07cd043316e7e8b5b3b
--------------------------------
commit 2d186afd04d669fe9c48b994c41a7405a3c9f16d upstream.
Syzbot reported a shift-out-of-bounds bug in profile_init(). The problem was an incorrect prof_shift. Since the prof_shift value comes from userspace, we need to clamp it to the [0, BITS_PER_LONG - 1] range.
A second possible shift-out-of-bounds was found by Tetsuo: the sample_step local variable in read_profile() had type "unsigned int", but prof_shift allows a shift of up to BITS_PER_LONG bits. So, to prevent a possible shift-out-of-bounds, the sample_step type was changed to "unsigned long".
Also, "unsigned short int" is sufficient for storing a value in [0, BITS_PER_LONG - 1], which is why prof_shift does not need to be "unsigned long".
Link: https://lkml.kernel.org/r/20210813140022.5011-1-paskripkin@gmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+e68c89a9510c159d9684@syzkaller.appspotmail.com Suggested-by: Tetsuo Handa penguin-kernel@i-love.sakura.ne.jp Signed-off-by: Pavel Skripkin paskripkin@gmail.com Cc: Thomas Gleixner tglx@linutronix.de Cc: Steven Rostedt rostedt@goodmis.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/profile.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/kernel/profile.c b/kernel/profile.c index 9aa2a4445b0d2..efa58f63dc1bf 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -40,7 +40,8 @@ struct profile_hit { #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
static atomic_t *prof_buffer; -static unsigned long prof_len, prof_shift; +static unsigned long prof_len; +static unsigned short int prof_shift;
int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); @@ -66,8 +67,8 @@ int profile_setup(char *str) if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel sleep profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel sleep profiling enabled (shift: %u)\n", prof_shift); #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); @@ -77,21 +78,21 @@ int profile_setup(char *str) if (str[strlen(schedstr)] == ',') str += strlen(schedstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel schedule profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel schedule profiling enabled (shift: %u)\n", prof_shift); } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; if (str[strlen(kvmstr)] == ',') str += strlen(kvmstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel KVM profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel KVM profiling enabled (shift: %u)\n", prof_shift); } else if (get_option(&str, &par)) { - prof_shift = par; + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; - pr_info("kernel profiling enabled (shift: %ld)\n", + pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } return 1; @@ -467,7 +468,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long p = *ppos; ssize_t read; char *pnt; - unsigned int sample_step = 1 << prof_shift; + unsigned long sample_step = 1UL << prof_shift;
profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int))