This patchset fixes the packet stuck problem mentioned in [1].
Patch 1: Add the STATE_MISSED flag to fix the packet stuck problem.
Patch 2: Fix a tx_action rescheduling problem after the STATE_MISSED flag is added in patch 1.
V4: Change STATE_NEED_RESCHEDULE to STATE_MISSED and add patch 2.
[1]. https://lkml.org/lkml/2019/10/9/42
Yunsheng Lin (2):
  net: sched: fix packet stuck problem for lockless qdisc
  net: sched: fix endless tx action reschedule during deactivation
 include/net/pkt_sched.h   |  7 +------
 include/net/sch_generic.h | 37 ++++++++++++++++++++++++++++++++++++-
 net/core/dev.c            | 26 ++++++++++++++++++++++----
 net/sched/sch_generic.c   | 16 +++++++++++++++-
 4 files changed, 74 insertions(+), 12 deletions(-)
The lockless qdisc has the following concurrency problem:

        cpu0                      cpu1
          .                         .
     q->enqueue                     .
          .                         .
  qdisc_run_begin()                 .
          .                         .
    dequeue_skb()                   .
          .                         .
  sch_direct_xmit()                 .
          .                         .
          .                    q->enqueue
          .                 qdisc_run_begin()
          .              return and do nothing
          .                         .
   qdisc_run_end()                  .
cpu1 enqueues an skb without calling __qdisc_run() because cpu0 has not released the lock yet and spin_trylock() returns false for cpu1 in qdisc_run_begin(), and cpu0 does not see the skb enqueued by cpu1 when calling dequeue_skb(), because cpu1 may enqueue the skb after cpu0 calls dequeue_skb() and before cpu0 calls qdisc_run_end().
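For reference, the pre-patch lockless fast path looks roughly like the condensed fragments below (abridged from __dev_xmit_skb() and qdisc_run_begin() in the tree this series applies to; shown for illustration only, error handling and the locked path omitted). Once the trylock fails, nothing records that an skb was enqueued but never run:

        /* __dev_xmit_skb(), TCQ_F_NOLOCK branch (condensed): enqueue
         * first, then try to run the qdisc.
         */
        if (q->flags & TCQ_F_NOLOCK) {
                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
                qdisc_run(q);   /* may do nothing, see below */
        }

        /* qdisc_run_begin(), TCQ_F_NOLOCK branch (condensed, pre-patch):
         * a failed trylock simply returns false, so the skb enqueued
         * above may sit in the queue until the next transmit attempt.
         */
        if (qdisc->flags & TCQ_F_NOLOCK) {
                if (!spin_trylock(&qdisc->seqlock))
                        return false;   /* cpu1 gives up here */
                WRITE_ONCE(qdisc->empty, false);
        }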
The lockless qdisc also has the following concurrency problem when tx_action is involved:
cpu0(serving tx_action)         cpu1                     cpu2
          .                       .                        .
          .                   q->enqueue                   .
          .                qdisc_run_begin()               .
          .                  dequeue_skb()                 .
          .                       .                    q->enqueue
          .                       .                        .
          .                sch_direct_xmit()               .
          .                       .                qdisc_run_begin()
          .                       .              return and do nothing
          .                       .                        .
clear __QDISC_STATE_SCHED         .                        .
   qdisc_run_begin()              .                        .
 return and do nothing            .                        .
          .                       .                        .
          .                 qdisc_run_end()                .
This patch fixes the above data race by:
1. Testing STATE_MISSED before doing spin_trylock().
2. If the first spin_trylock() returns false and STATE_MISSED was not set before it, setting STATE_MISSED and retrying another spin_trylock() in case the other CPU may not see STATE_MISSED after it releases the lock.
3. Rescheduling if STATE_MISSED is set after the lock is released at the end of qdisc_run_end().
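In short, the resulting lockless qdisc_run_begin()/qdisc_run_end() pairing can be sketched as below (a condensed illustration only; the diff further down is the authoritative change):

        /* qdisc_run_begin(), TCQ_F_NOLOCK branch, condensed */
        bool missed = test_bit(__QDISC_STATE_MISSED, &qdisc->state);

        if (spin_trylock(&qdisc->seqlock))
                goto nolock_empty;      /* got the lock, go dequeue */

        /* The lock holder is guaranteed to notice an already-set flag,
         * either while dequeuing or at the end of its qdisc_run_end().
         */
        if (missed)
                return false;

        set_bit(__QDISC_STATE_MISSED, &qdisc->state);

        /* The lock holder may have released seqlock before seeing the
         * new flag, so try once more.
         */
        if (!spin_trylock(&qdisc->seqlock))
                return false;

        /* qdisc_run_end(), TCQ_F_NOLOCK branch, condensed */
        spin_unlock(&qdisc->seqlock);

        /* A concurrent sender failed to get seqlock while we held it,
         * so reschedule to make sure its skb is not left in the queue.
         */
        if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state)))
                __netif_schedule(qdisc);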
For the tx_action case, STATE_MISSED is also set when cpu1 is at the end of qdisc_run_end(), so tx_action will be rescheduled again to dequeue the skb enqueued by cpu2.
Clear STATE_MISSED before retrying a dequeue when the dequeue returns NULL, in order to reduce the overhead of the above double spin_trylock() and of calling __netif_schedule().
The performance impact of this patch, tested using pktgen and dummy netdev with pfifo_fast qdisc attached:
 threads  without+this_patch   with+this_patch   delta
    1          2.61Mpps            2.60Mpps      -0.3%
    2          3.97Mpps            3.82Mpps      -3.7%
    4          5.62Mpps            5.59Mpps      -0.5%
    8          2.78Mpps            2.77Mpps      -0.3%
   16          2.22Mpps            2.22Mpps      -0.0%
Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Tested-by: Juergen Gross <jgross@suse.com>
---
V4: Change STATE_NEED_RESCHEDULE to STATE_MISSED mirroring NAPI's
    NAPIF_STATE_MISSED, and add Juergen's "Tested-by" tag, as there is
    only renaming and typo fixing between V3 and V4.
V3: Fix a compile error and a few comment typos, remove the
    __QDISC_STATE_DEACTIVATED checking, and update the performance data.
V2: Avoid the overhead of fixing the data race as much as possible.
---
 include/net/sch_generic.h | 37 ++++++++++++++++++++++++++++++++++++-
 net/sched/sch_generic.c   | 12 ++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f7a6e14..b85b8ea 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -36,6 +36,7 @@ struct qdisc_rate_table {
 enum qdisc_state_t {
         __QDISC_STATE_SCHED,
         __QDISC_STATE_DEACTIVATED,
+        __QDISC_STATE_MISSED,
 };
 struct qdisc_size_table {
@@ -159,8 +160,37 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
         if (qdisc->flags & TCQ_F_NOLOCK) {
+                bool dont_retry = test_bit(__QDISC_STATE_MISSED,
+                                           &qdisc->state);
+
+                if (spin_trylock(&qdisc->seqlock))
+                        goto nolock_empty;
+
+                /* If the flag is set before doing the spin_trylock() and
+                 * the above spin_trylock() returns false, it means the
+                 * other cpu holding the lock will do the dequeuing for us,
+                 * or it will see the flag set after releasing the lock and
+                 * reschedule net_tx_action() to do the dequeuing.
+                 */
+                if (dont_retry)
+                        return false;
+
+                /* We could do set_bit() before the first spin_trylock()
+                 * and avoid the second spin_trylock() completely, but then
+                 * we could have multiple cpus doing the set_bit(). Here use
+                 * dont_retry to avoid doing the set_bit() and the second
+                 * spin_trylock(), which has a 5% performance improvement
+                 * over doing the set_bit() before the first spin_trylock().
+                 */
+                set_bit(__QDISC_STATE_MISSED, &qdisc->state);
+
+                /* Retry again in case the other CPU may not see the new
+                 * flag after it releases the lock at the end of
+                 * qdisc_run_end().
+                 */
                 if (!spin_trylock(&qdisc->seqlock))
                         return false;
+
+nolock_empty:
                 WRITE_ONCE(qdisc->empty, false);
         } else if (qdisc_is_running(qdisc)) {
                 return false;
@@ -176,8 +206,13 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
         write_seqcount_end(&qdisc->running);
-        if (qdisc->flags & TCQ_F_NOLOCK)
+        if (qdisc->flags & TCQ_F_NOLOCK) {
                 spin_unlock(&qdisc->seqlock);
+
+                if (unlikely(test_bit(__QDISC_STATE_MISSED,
+                                      &qdisc->state)))
+                        __netif_schedule(qdisc);
+        }
 }
 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 44991ea..9bc73ea 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -640,8 +640,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 {
         struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
         struct sk_buff *skb = NULL;
+        bool need_retry = true;
         int band;

+retry:
         for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
                 struct skb_array *q = band2list(priv, band);

@@ -652,6 +654,16 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
         }
         if (likely(skb)) {
                 qdisc_update_stats_at_dequeue(qdisc, skb);
+        } else if (need_retry &&
+                   test_and_clear_bit(__QDISC_STATE_MISSED,
+                                      &qdisc->state)) {
+                /* do another dequeuing after clearing the flag to
+                 * avoid calling __netif_schedule().
+                 */
+                smp_mb__after_atomic();
+                need_retry = false;
+
+                goto retry;
         } else {
                 WRITE_ONCE(qdisc->empty, true);
         }
On Fri, Apr 16, 2021 at 09:16:48AM +0800, Yunsheng Lin wrote:
> The lockless qdisc has the following concurrency problem:
>
>         cpu0                      cpu1
>           .                         .
>      q->enqueue                     .
>           .                         .
>   qdisc_run_begin()                 .
>           .                         .
>     dequeue_skb()                   .
>           .                         .
>   sch_direct_xmit()                 .
>           .                         .
>           .                    q->enqueue
>           .                 qdisc_run_begin()
>           .              return and do nothing
>           .                         .
>    qdisc_run_end()                  .
>
> cpu1 enqueues an skb without calling __qdisc_run() because cpu0 has not released the lock yet and spin_trylock() returns false for cpu1 in qdisc_run_begin(), and cpu0 does not see the skb enqueued by cpu1 when calling dequeue_skb(), because cpu1 may enqueue the skb after cpu0 calls dequeue_skb() and before cpu0 calls qdisc_run_end().
>
> The lockless qdisc also has the following concurrency problem when tx_action is involved:
>
> cpu0(serving tx_action)         cpu1                     cpu2
>           .                       .                        .
>           .                   q->enqueue                   .
>           .                qdisc_run_begin()               .
>           .                  dequeue_skb()                 .
>           .                       .                    q->enqueue
>           .                       .                        .
>           .                sch_direct_xmit()               .
>           .                       .                qdisc_run_begin()
>           .                       .              return and do nothing
>           .                       .                        .
> clear __QDISC_STATE_SCHED         .                        .
>    qdisc_run_begin()              .                        .
>  return and do nothing            .                        .
>           .                       .                        .
>           .                 qdisc_run_end()                .
>
> This patch fixes the above data race by:
> 1. Testing STATE_MISSED before doing spin_trylock().
> 2. If the first spin_trylock() returns false and STATE_MISSED was not set before it, setting STATE_MISSED and retrying another spin_trylock() in case the other CPU may not see STATE_MISSED after it releases the lock.
> 3. Rescheduling if STATE_MISSED is set after the lock is released at the end of qdisc_run_end().
>
> For the tx_action case, STATE_MISSED is also set when cpu1 is at the end of qdisc_run_end(), so tx_action will be rescheduled again to dequeue the skb enqueued by cpu2.
>
> Clear STATE_MISSED before retrying a dequeue when the dequeue returns NULL, in order to reduce the overhead of the above double spin_trylock() and of calling __netif_schedule().
>
> The performance impact of this patch, tested using pktgen and dummy netdev with pfifo_fast qdisc attached:
>
>  threads  without+this_patch   with+this_patch   delta
>     1          2.61Mpps            2.60Mpps      -0.3%
>     2          3.97Mpps            3.82Mpps      -3.7%
>     4          5.62Mpps            5.59Mpps      -0.5%
>     8          2.78Mpps            2.77Mpps      -0.3%
>    16          2.22Mpps            2.22Mpps      -0.0%
>
> Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> Tested-by: Juergen Gross <jgross@suse.com>
> ---
> V4: Change STATE_NEED_RESCHEDULE to STATE_MISSED mirroring NAPI's NAPIF_STATE_MISSED, and add Juergen's "Tested-by" tag, as there is only renaming and typo fixing between V3 and V4.
> V3: Fix a compile error and a few comment typos, remove the __QDISC_STATE_DEACTIVATED checking, and update the performance data.
> V2: Avoid the overhead of fixing the data race as much as possible.
As pointed out in the discussion on v3, this patch may result in significantly higher CPU consumption with multiple threads competing on a saturated outgoing device. I missed this submission so I haven't checked it yet, but given the description of the v3->v4 changes above, it's quite likely that it suffers from the same problem.
Michal
On Mon, Apr 19, 2021 at 05:29:46PM +0200, Michal Kubecek wrote:
> As pointed out in the discussion on v3, this patch may result in significantly higher CPU consumption with multiple threads competing on a saturated outgoing device. I missed this submission so I haven't checked it yet, but given the description of the v3->v4 changes above, it's quite likely that it suffers from the same problem.
And it indeed does. However, with the additional patch from the v3 discussion, the numbers are approximately the same as with an unpatched mainline kernel.
As with v4, I tried this patch on top of 5.12-rc7 with real devices. I used two machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs (2 8-core CPUs with HT disabled) and 16 Rx/Tx queues, receiver has 48 CPUs (2 12-core CPUs with HT enabled) and 48 Rx/Tx queues.
 threads    5.12-rc7    5.12-rc7 + v4    5.12-rc7 + v4 + stop
    1         25.1%         38.1%              22.9%
    8         66.2%        277.0%              74.1%
   16         90.1%        150.7%              91.0%
   32        107.2%        272.6%             108.3%
   64        116.3%        487.5%             118.1%
  128        126.1%        946.7%             126.9%
(The values are normalized to one core, i.e. 100% corresponds to one fully used logical CPU.)
So it seems that repeated scheduling while the queue was stopped is indeed the main performance issue and that the other cases of the logic being too pessimistic do not play a significant role. There is an exception with 8 connections/threads, where the result with just this series also looks abnormally high (e.g. much higher than with 16 threads). It might be worth investigating what happens there and what the results with other thread counts around 8 look like.
I'll run some more tests with other traffic patterns tomorrow and I'm also going to take a closer look at the additional patch.
Michal
On 2021/4/20 7:55, Michal Kubecek wrote:
> On Mon, Apr 19, 2021 at 05:29:46PM +0200, Michal Kubecek wrote:
>> As pointed out in the discussion on v3, this patch may result in significantly higher CPU consumption with multiple threads competing on a saturated outgoing device. I missed this submission so I haven't checked it yet, but given the description of the v3->v4 changes above, it's quite likely that it suffers from the same problem.
>
> And it indeed does. However, with the additional patch from the v3 discussion, the numbers are approximately the same as with an unpatched mainline kernel.
>
> As with v4, I tried this patch on top of 5.12-rc7 with real devices. I used two machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs (2 8-core CPUs with HT disabled) and 16 Rx/Tx queues, receiver has 48 CPUs (2 12-core CPUs with HT enabled) and 48 Rx/Tx queues.
>
>  threads    5.12-rc7    5.12-rc7 + v4    5.12-rc7 + v4 + stop
>     1         25.1%         38.1%              22.9%
>     8         66.2%        277.0%              74.1%
>    16         90.1%        150.7%              91.0%
>    32        107.2%        272.6%             108.3%
>    64        116.3%        487.5%             118.1%
>   128        126.1%        946.7%             126.9%
>
> (The values are normalized to one core, i.e. 100% corresponds to one fully used logical CPU.)
>
> So it seems that repeated scheduling while the queue was stopped is indeed the main performance issue and that the other cases of the logic being too pessimistic do not play a significant role. There is an exception with 8 connections/threads, where the result with just this series also looks abnormally high (e.g. much higher than with 16 threads). It might be worth investigating what happens there and what the results with other thread counts around 8 look like.
Will try to investigate the 8 connections/threads case.
> I'll run some more tests with other traffic patterns tomorrow and I'm also going to take a closer look at the additional patch.
Thanks for doing the detailed testing and taking a look.
> Michal
On Tue, Apr 20, 2021 at 01:55:03AM +0200, Michal Kubecek wrote:
> On Mon, Apr 19, 2021 at 05:29:46PM +0200, Michal Kubecek wrote:
>> As pointed out in the discussion on v3, this patch may result in significantly higher CPU consumption with multiple threads competing on a saturated outgoing device. I missed this submission so I haven't checked it yet, but given the description of the v3->v4 changes above, it's quite likely that it suffers from the same problem.
>
> And it indeed does. However, with the additional patch from the v3 discussion, the numbers are approximately the same as with an unpatched mainline kernel.
>
> As with v4, I tried this patch on top of 5.12-rc7 with real devices. I used two machines with 10Gb/s Intel ixgbe NICs, sender has 16 CPUs (2 8-core CPUs with HT disabled) and 16 Rx/Tx queues, receiver has 48 CPUs (2 12-core CPUs with HT enabled) and 48 Rx/Tx queues.
>
>  threads    5.12-rc7    5.12-rc7 + v4    5.12-rc7 + v4 + stop
>     1         25.1%         38.1%              22.9%
>     8         66.2%        277.0%              74.1%
>    16         90.1%        150.7%              91.0%
>    32        107.2%        272.6%             108.3%
>    64        116.3%        487.5%             118.1%
>   128        126.1%        946.7%             126.9%
>
> (The values are normalized to one core, i.e. 100% corresponds to one fully used logical CPU.)
I repeated the tests a few more times and with more iterations, and it seems the problem was rather that the CPU utilization numbers are not very stable, in particular with the number of connections/threads close to the number of CPUs and Tx queues. Refined results (and also other tests) show that the full 3-patch series performs similarly to unpatched 5.12-rc7 (within the margin of statistical error).
However, I noticed something disturbing in the results of a simple 1-thread TCP_STREAM test (the client sends data through a TCP connection to the server using long writes, and we measure the amount of data received by the server):
server: 172.17.1.1, port 12543
iterations: 20, threads: 1, test length: 30
test: TCP_STREAM, message size: 1048576
  1    927403548.4 B/s, avg  927403548.4 B/s, mdev         0.0 B/s (  0.0%)
  2   1176317172.1 B/s, avg 1051860360.2 B/s, mdev 124456811.8 B/s ( 11.8%), confid. +/- 1581348251.3 B/s (150.3%)
  3    927335837.8 B/s, avg 1010352186.1 B/s, mdev 117354970.3 B/s ( 11.6%), confid. +/-  357073677.2 B/s ( 35.3%)
  4   1176728045.1 B/s, avg 1051946150.8 B/s, mdev 124576544.7 B/s ( 11.8%), confid. +/-  228863127.8 B/s ( 21.8%)
  5   1176788216.3 B/s, avg 1076914563.9 B/s, mdev 122102985.3 B/s ( 11.3%), confid. +/-  169478943.5 B/s ( 15.7%)
  6   1158167055.1 B/s, avg 1090456645.8 B/s, mdev 115504209.5 B/s ( 10.6%), confid. +/-  132805140.8 B/s ( 12.2%)
  7   1176243474.4 B/s, avg 1102711907.0 B/s, mdev 111069717.1 B/s ( 10.1%), confid. +/-  110956822.2 B/s ( 10.1%)
  8   1176771142.8 B/s, avg 1111969311.5 B/s, mdev 106744173.5 B/s (  9.6%), confid. +/-   95417120.0 B/s (  8.6%)
  9   1176206364.6 B/s, avg 1119106761.8 B/s, mdev 102644185.2 B/s (  9.2%), confid. +/-   83685200.5 B/s (  7.5%)
 10   1175888409.4 B/s, avg 1124784926.6 B/s, mdev  98855550.5 B/s (  8.8%), confid. +/-   74537085.1 B/s (  6.6%)
 11   1176541407.6 B/s, avg 1129490061.2 B/s, mdev  95422224.8 B/s (  8.4%), confid. +/-   67230249.7 B/s (  6.0%)
 12    934185352.8 B/s, avg 1113214668.9 B/s, mdev 106114984.5 B/s (  9.5%), confid. +/-   70420712.5 B/s (  6.3%)
 13   1176550558.1 B/s, avg 1118086660.3 B/s, mdev 103339448.9 B/s (  9.2%), confid. +/-   65002902.4 B/s (  5.8%)
 14   1176521808.8 B/s, avg 1122260599.5 B/s, mdev 100711151.3 B/s (  9.0%), confid. +/-   60333655.0 B/s (  5.4%)
 15   1176744840.8 B/s, avg 1125892882.3 B/s, mdev  98240838.2 B/s (  8.7%), confid. +/-   56319052.3 B/s (  5.0%)
 16   1176593778.5 B/s, avg 1129061688.3 B/s, mdev  95909740.8 B/s (  8.5%), confid. +/-   52771633.5 B/s (  4.7%)
 17   1176583967.4 B/s, avg 1131857116.5 B/s, mdev  93715582.2 B/s (  8.3%), confid. +/-   49669258.6 B/s (  4.4%)
 18   1176853301.8 B/s, avg 1134356904.5 B/s, mdev  91656530.2 B/s (  8.1%), confid. +/-   46905244.8 B/s (  4.1%)
 19   1176592845.7 B/s, avg 1136579848.8 B/s, mdev  89709043.8 B/s (  7.9%), confid. +/-   44424855.9 B/s (  3.9%)
 20   1176608117.3 B/s, avg 1138581262.2 B/s, mdev  87871692.6 B/s (  7.7%), confid. +/-   42193098.5 B/s (  3.7%)
all                        avg 1138581262.2 B/s, mdev  87871692.6 B/s (  7.7%), confid. +/-   42193098.5 B/s (  3.7%)
Each line shows the result of one 30-second test, followed by the average, mean deviation and 99% confidence interval half-width over the iterations so far. While 17 of the iteration results are essentially wire speed minus TCP overhead, iterations 1, 3 and 12 are more than 20% lower. As the results of the same test on unpatched 5.12-rc7 are much more consistent (the lowest iteration result through the whole test was 1175939718.3 and the mean deviation only 276889.1 B/s), it doesn't seem to be just a random fluctuation.
I'll try to find out what happens in these outstanding iterations.
Michal
Currently qdisc_run() checks STATE_DEACTIVATED of a lockless qdisc before calling __qdisc_run(), which ultimately clears STATE_MISSED once all the skbs are dequeued. If STATE_DEACTIVATED is set before STATE_MISSED is cleared, net_tx_action() may be rescheduled endlessly at the end of qdisc_run_end(), see below:
CPU0(net_tx_action)           CPU1(__dev_xmit_skb)    CPU2(dev_deactivate)
          .                            .                        .
          .                     set STATE_MISSED                .
          .                    __netif_schedule()               .
          .                            .               set STATE_DEACTIVATED
          .                            .                   qdisc_reset()
          .                            .                        .
          .<---------------            .                 synchronize_net()
clear __QDISC_STATE_SCHED  |           .                        .
          .                |           .                        .
          .                |           .                        .
          .                |           .              --------->.
          .                |           .              |         .
  test STATE_DEACTIVATED   |           .              | some_qdisc_is_busy()
__qdisc_run() *not* called |           .              |-----return *true*
          .                |           .                        .
   test STATE_MISSED       |           .                        .
 __netif_schedule()--------|           .                        .
          .                            .                        .
          .                            .                        .
          .                            .
__qdisc_run() is not called by net_tx_action() on CPU0 because CPU2 has set the STATE_DEACTIVATED flag during dev_deactivate(), and since STATE_MISSED is only cleared in __qdisc_run(), __netif_schedule() is called endlessly at the end of qdisc_run_end(), causing the endless tx action rescheduling problem.
qdisc_run() called by net_tx_action() runs in softirq context, which should have the same semantics as qdisc_run() called by __dev_xmit_skb() under rcu_read_lock_bh(). And since there is a synchronize_net() between the STATE_DEACTIVATED flag being set and qdisc_reset()/some_qdisc_is_busy() in dev_deactivate(), we can safely bail out for a deactivated lockless qdisc in net_tx_action(), and qdisc_reset() will reset all skbs not dequeued yet.
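For reference, the ordering in dev_deactivate_many() that this argument relies on looks roughly as below (condensed from net/sched/sch_generic.c of the tree this applies to, not part of this patch; ingress-queue and error handling omitted, and dev_deactivate_queue() is what sets STATE_DEACTIVATED):

void dev_deactivate_many(struct list_head *head)
{
        struct net_device *dev;

        /* 1. Set __QDISC_STATE_DEACTIVATED on every tx qdisc and switch
         *    the queues over to noop_qdisc.
         */
        list_for_each_entry(dev, head, close_list) {
                netdev_for_each_tx_queue(dev, dev_deactivate_queue,
                                         &noop_qdisc);
                dev_watchdog_down(dev);
        }

        /* 2. Wait for in-flight rcu_read_lock_bh() sections, so any
         *    later reader is guaranteed to see STATE_DEACTIVATED.
         */
        synchronize_net();

        /* 3. Only now reset the qdiscs ... */
        list_for_each_entry(dev, head, close_list)
                netdev_for_each_tx_queue(dev, dev_reset_queue, NULL);

        /* 4. ... and wait until no CPU is still inside qdisc_run(). */
        list_for_each_entry(dev, head, close_list)
                while (some_qdisc_is_busy(dev))
                        schedule_timeout_uninterruptible(1);
}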
So add rcu_read_lock() explicitly to protect qdisc_run() and do the STATE_DEACTIVATED check in net_tx_action() before calling qdisc_run_begin(). Another option is to do the check in qdisc_run_end(), but that would add unnecessary overhead for the non-tx_action case, because __dev_queue_xmit() will not see a qdisc with STATE_DEACTIVATED after synchronize_net(); such a qdisc can only be seen by net_tx_action() because of __netif_schedule().
The STATE_DEACTIVATED check in qdisc_run() was there to avoid the race between net_tx_action() and qdisc_reset(), see commit d518d2ed8640 ("net/sched: fix race between deactivation and dequeue for NOLOCK qdisc"). As the bailout added above for a deactivated lockless qdisc in net_tx_action() provides better protection for that race without calling qdisc_run() at all, remove the STATE_DEACTIVATED check in qdisc_run().
After qdisc_reset(), there are no skbs left in the qdisc to be dequeued, so clear STATE_MISSED in dev_reset_queue() too.
Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 include/net/pkt_sched.h |  7 +------
 net/core/dev.c          | 26 ++++++++++++++++++++++----
 net/sched/sch_generic.c |  4 +++-
 3 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index f5c1bee..6d7b12c 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -128,12 +128,7 @@ void __qdisc_run(struct Qdisc *q);
 static inline void qdisc_run(struct Qdisc *q)
 {
         if (qdisc_run_begin(q)) {
-                /* NOLOCK qdisc must check 'state' under the qdisc seqlock
-                 * to avoid racing with dev_qdisc_reset()
-                 */
-                if (!(q->flags & TCQ_F_NOLOCK) ||
-                    likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
-                        __qdisc_run(q);
+                __qdisc_run(q);
                 qdisc_run_end(q);
         }
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index be941ed..47cefcc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4958,25 +4958,43 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
                 sd->output_queue_tailp = &sd->output_queue;
                 local_irq_enable();

+                rcu_read_lock();
+
                 while (head) {
                         struct Qdisc *q = head;
                         spinlock_t *root_lock = NULL;

                         head = head->next_sched;

-                        if (!(q->flags & TCQ_F_NOLOCK)) {
-                                root_lock = qdisc_lock(q);
-                                spin_lock(root_lock);
-                        }
                         /* We need to make sure head->next_sched is read
                          * before clearing __QDISC_STATE_SCHED
                          */
                         smp_mb__before_atomic();
+
+                        if (!(q->flags & TCQ_F_NOLOCK)) {
+                                root_lock = qdisc_lock(q);
+                                spin_lock(root_lock);
+                        } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
+                                                     &q->state))) {
+                                /* There is a synchronize_net() between
+                                 * STATE_DEACTIVATED flag being set and
+                                 * qdisc_reset()/some_qdisc_is_busy() in
+                                 * dev_deactivate(), so we can safely bail out
+                                 * early here to avoid data race between
+                                 * qdisc_deactivate() and some_qdisc_is_busy()
+                                 * for lockless qdisc.
+                                 */
+                                clear_bit(__QDISC_STATE_SCHED, &q->state);
+                                continue;
+                        }
+
                         clear_bit(__QDISC_STATE_SCHED, &q->state);
                         qdisc_run(q);
                         if (root_lock)
                                 spin_unlock(root_lock);
                 }
+
+                rcu_read_unlock();
         }
         xfrm_dev_backlog(sd);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 9bc73ea..c32ac5b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1170,8 +1170,10 @@ static void dev_reset_queue(struct net_device *dev,
         qdisc_reset(qdisc);

         spin_unlock_bh(qdisc_lock(qdisc));
-        if (nolock)
+        if (nolock) {
+                clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
                 spin_unlock_bh(&qdisc->seqlock);
+        }
 }
 static bool some_qdisc_is_busy(struct net_device *dev)