This patchset fixes the packet stuck problem mentioned in [1].
Patch 1: Add STATE_MISSED flag to fix packet stuck problem. Patch 2: Fix a tx_action rescheduling problem after STATE_MISSED flag is added in patch 1. Patch 3: Fix the significantly higher CPU consumption problem when multiple threads are competing on a saturated outgoing device.
V5: add patch 3 to fix the problem reported by Michal Kubecek. V4: Change STATE_NEED_RESCHEDULE to STATE_MISSED and add patch 2.
[1]. https://lkml.org/lkml/2019/10/9/42
Yunsheng Lin (3): net: sched: fix packet stuck problem for lockless qdisc net: sched: fix endless tx action reschedule during deactivation net: sched: fix tx action reschedule issue with stopped queue
include/net/pkt_sched.h | 7 +------ include/net/sch_generic.h | 37 ++++++++++++++++++++++++++++++++++++- net/core/dev.c | 30 +++++++++++++++++++++++++----- net/sched/sch_generic.c | 24 ++++++++++++++++++++++-- 4 files changed, 84 insertions(+), 14 deletions(-)
Lockless qdisc has below concurrent problem: cpu0 cpu1 . . q->enqueue . . . qdisc_run_begin() . . . dequeue_skb() . . . sch_direct_xmit() . . . . q->enqueue . qdisc_run_begin() . return and do nothing . . qdisc_run_end() .
cpu1 enqueue a skb without calling __qdisc_run() because cpu0 has not released the lock yet and spin_trylock() return false for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb enqueued by cpu1 when calling dequeue_skb() because cpu1 may enqueue the skb after cpu0 calling dequeue_skb() and before cpu0 calling qdisc_run_end().
Lockless qdisc has below another concurrent problem when tx_action is involved:
cpu0(serving tx_action) cpu1 cpu2 . . . . q->enqueue . . qdisc_run_begin() . . dequeue_skb() . . . q->enqueue . . . . sch_direct_xmit() . . . qdisc_run_begin() . . return and do nothing . . . clear __QDISC_STATE_SCHED . . qdisc_run_begin() . . return and do nothing . . . . . . qdisc_run_end() .
This patch fixes the above data race by: 1. Test STATE_MISSED before doing spin_trylock(). 2. If the first spin_trylock() return false and STATE_MISSED is not set before the first spin_trylock(), Set STATE_MISSED and retry another spin_trylock() in case other CPU may not see STATE_MISSED after it releases the lock. 3. reschedule if STATE_MISSED is set after the lock is released at the end of qdisc_run_end().
For tx_action case, STATE_MISSED is also set when cpu1 is at the end if qdisc_run_end(), so tx_action will be rescheduled again to dequeue the skb enqueued by cpu2.
Clear STATE_MISSED before retrying a dequeuing when dequeuing returns NULL in order to reduce the overhead of the above double spin_trylock() and __netif_schedule() calling.
The performance impact of this patch, tested using pktgen and dummy netdev with pfifo_fast qdisc attached:
threads without+this_patch with+this_patch delta 1 2.61Mpps 2.60Mpps -0.3% 2 3.97Mpps 3.82Mpps -3.7% 4 5.62Mpps 5.59Mpps -0.5% 8 2.78Mpps 2.77Mpps -0.3% 16 2.22Mpps 2.22Mpps -0.0%
Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking") Signed-off-by: Yunsheng Lin linyunsheng@huawei.com Tested-by: Juergen Gross jgross@suse.com --- V4: Change STATE_NEED_RESCHEDULE to STATE_MISSED mirroring NAPI's NAPIF_STATE_MISSED, and add Juergen's "Tested-by" tag for there is only renaming and typo fixing between V4 and V3. V3: Fix a compile error and a few comment typo, remove the __QDISC_STATE_DEACTIVATED checking, and update the performance data. V2: Avoid the overhead of fixing the data race as much as possible. --- include/net/sch_generic.h | 37 ++++++++++++++++++++++++++++++++++++- net/sched/sch_generic.c | 12 ++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f7a6e14..b85b8ea 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -36,6 +36,7 @@ struct qdisc_rate_table { enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, + __QDISC_STATE_MISSED, };
struct qdisc_size_table { @@ -159,8 +160,37 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc) static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { + bool dont_retry = test_bit(__QDISC_STATE_MISSED, + &qdisc->state); + + if (spin_trylock(&qdisc->seqlock)) + goto nolock_empty; + + /* If the flag is set before doing the spin_trylock() and + * the above spin_trylock() return false, it means other cpu + * holding the lock will do dequeuing for us, or it wil see + * the flag set after releasing lock and reschedule the + * net_tx_action() to do the dequeuing. + */ + if (dont_retry) + return false; + + /* We could do set_bit() before the first spin_trylock(), + * and avoid doing second spin_trylock() completely, then + * we could have multi cpus doing the set_bit(). Here use + * dont_retry to avoid doing the set_bit() and the second + * spin_trylock(), which has 5% performance improvement than + * doing the set_bit() before the first spin_trylock(). + */ + set_bit(__QDISC_STATE_MISSED, &qdisc->state); + + /* Retry again in case other CPU may not see the new flag + * after it releases the lock at the end of qdisc_run_end(). + */ if (!spin_trylock(&qdisc->seqlock)) return false; + +nolock_empty: WRITE_ONCE(qdisc->empty, false); } else if (qdisc_is_running(qdisc)) { return false; @@ -176,8 +206,13 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) static inline void qdisc_run_end(struct Qdisc *qdisc) { write_seqcount_end(&qdisc->running); - if (qdisc->flags & TCQ_F_NOLOCK) + if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); + + if (unlikely(test_bit(__QDISC_STATE_MISSED, + &qdisc->state))) + __netif_schedule(qdisc); + } }
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 44991ea..9bc73ea 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -640,8 +640,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; + bool need_retry = true; int band;
+retry: for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band);
@@ -652,6 +654,16 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) } if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); + } else if (need_retry && + test_and_clear_bit(__QDISC_STATE_MISSED, + &qdisc->state)) { + /* do another dequeuing after clearing the flag to + * avoid calling __netif_schedule(). + */ + smp_mb__after_atomic(); + need_retry = false; + + goto retry; } else { WRITE_ONCE(qdisc->empty, true); }
On Thu, 6 May 2021 09:57:42 +0800 Yunsheng Lin wrote:
@@ -159,8 +160,37 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc) static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) {
bool dont_retry = test_bit(__QDISC_STATE_MISSED,
&qdisc->state);
if (spin_trylock(&qdisc->seqlock))
goto nolock_empty;
/* If the flag is set before doing the spin_trylock() and
* the above spin_trylock() return false, it means other cpu
* holding the lock will do dequeuing for us, or it wil see
s/wil/will/
* the flag set after releasing lock and reschedule the
* net_tx_action() to do the dequeuing.
I don't understand why MISSED is checked before the trylock. Could you explain why it can't be tested directly here?
*/
if (dont_retry)
return false;
/* We could do set_bit() before the first spin_trylock(),
* and avoid doing second spin_trylock() completely, then
* we could have multi cpus doing the set_bit(). Here use
* dont_retry to avoid doing the set_bit() and the second
* spin_trylock(), which has 5% performance improvement than
* doing the set_bit() before the first spin_trylock().
*/
set_bit(__QDISC_STATE_MISSED, &qdisc->state);
/* Retry again in case other CPU may not see the new flag
* after it releases the lock at the end of qdisc_run_end().
if (!spin_trylock(&qdisc->seqlock)) return false;*/
+nolock_empty: WRITE_ONCE(qdisc->empty, false); } else if (qdisc_is_running(qdisc)) { return false; @@ -176,8 +206,13 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) static inline void qdisc_run_end(struct Qdisc *qdisc) { write_seqcount_end(&qdisc->running);
- if (qdisc->flags & TCQ_F_NOLOCK)
- if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock);
if (unlikely(test_bit(__QDISC_STATE_MISSED,
&qdisc->state)))
__netif_schedule(qdisc);
- }
}
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 44991ea..9bc73ea 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -640,8 +640,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL;
- bool need_retry = true; int band;
+retry: for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band);
@@ -652,6 +654,16 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) } if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb);
- } else if (need_retry &&
test_and_clear_bit(__QDISC_STATE_MISSED,
&qdisc->state)) {
Why test_and_clear_bit() here? AFAICT this is the only place the bit is cleared. So the test and clear do not have to be atomic.
To my limited understanding on x86 test_bit() is never a locked operation, while test_and_clear_bit() is always locked. So we'd save an atomic operation in un-contended case if we tested first and then cleared.
/* do another dequeuing after clearing the flag to
* avoid calling __netif_schedule().
*/
smp_mb__after_atomic();
test_and_clear_bit() which returned true implies a memory barrier, AFAIU, so the barrier is not needed with the code as is. It will be needed if we switch to test_bit() + clear_bit(), but please clarify what it is paring with.
need_retry = false;
} else { WRITE_ONCE(qdisc->empty, true); }goto retry;
Currently qdisc_run() checks the STATE_DEACTIVATED of lockless qdisc before calling __qdisc_run(), which ultimately clear the STATE_MISSED when all the skb is dequeued. If STATE_DEACTIVATED is set before clearing STATE_MISSED, there may be endless rescheduling of net_tx_action() at the end of qdisc_run_end(), see below:
CPU0(net_tx_atcion) CPU1(__dev_xmit_skb) CPU2(dev_deactivate) . . . . set STATE_MISSED . . __netif_schedule() . . . set STATE_DEACTIVATED . . qdisc_reset() . . . .<--------------- . synchronize_net() clear __QDISC_STATE_SCHED | . . . | . . . | . . . | . --------->. . | . | . test STATE_DEACTIVATED | . | some_qdisc_is_busy() __qdisc_run() *not* called | . |-----return *true* . | . . test STATE_MISS | . . __netif_schedule()--------| . . . . . . . .
__qdisc_run() is not called by net_tx_atcion() in CPU0 because CPU2 has set STATE_DEACTIVATED flag during dev_deactivate(), and STATE_MISSED is only cleared in __qdisc_run(), __netif_schedule is called endlessly at the end of qdisc_run_end(), causing endless tx action rescheduling problem.
qdisc_run() called by net_tx_action() runs in the softirq context, which should has the same semantic as the qdisc_run() called by __dev_xmit_skb() protected by rcu_read_lock_bh(). And there is a synchronize_net() between STATE_DEACTIVATED flag being set and qdisc_reset()/some_qdisc_is_busy in dev_deactivate(), we can safely bail out for the deactived lockless qdisc in net_tx_action(), and qdisc_reset() will reset all skb not dequeued yet.
So add the rcu_read_lock() explicitly to protect the qdisc_run() and do the STATE_DEACTIVATED checking in net_tx_action() before calling qdisc_run_begin(). Another option is to do the checking in the qdisc_run_end(), but it will add unnecessary overhead for non-tx_action case, because __dev_queue_xmit() will not see qdisc with STATE_DEACTIVATED after synchronize_net(), the qdisc with STATE_DEACTIVATED can only be seen by net_tx_action() because of __netif_schedule().
The STATE_DEACTIVATED checking in qdisc_run() is to avoid race between net_tx_action() and qdisc_reset(), see: commit d518d2ed8640 ("net/sched: fix race between deactivation and dequeue for NOLOCK qdisc"). As the bailout added above for deactived lockless qdisc in net_tx_action() provides better protection for the race without calling qdisc_run() at all, so remove the STATE_DEACTIVATED checking in qdisc_run().
After qdisc_reset(), there is no skb in qdisc to be dequeued, so clear the STATE_MISSED in dev_reset_queue() too.
Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking") Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- include/net/pkt_sched.h | 7 +------ net/core/dev.c | 26 ++++++++++++++++++++++---- net/sched/sch_generic.c | 4 +++- 3 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index f5c1bee..6d7b12c 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -128,12 +128,7 @@ void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { - /* NOLOCK qdisc must check 'state' under the qdisc seqlock - * to avoid racing with dev_qdisc_reset() - */ - if (!(q->flags & TCQ_F_NOLOCK) || - likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - __qdisc_run(q); + __qdisc_run(q); qdisc_run_end(q); } } diff --git a/net/core/dev.c b/net/core/dev.c index 222b1d3..d596cd7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5025,25 +5025,43 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) sd->output_queue_tailp = &sd->output_queue; local_irq_enable();
+ rcu_read_lock(); + while (head) { struct Qdisc *q = head; spinlock_t *root_lock = NULL;
head = head->next_sched;
- if (!(q->flags & TCQ_F_NOLOCK)) { - root_lock = qdisc_lock(q); - spin_lock(root_lock); - } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); + + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, + &q->state))) { + /* There is a synchronize_net() between + * STATE_DEACTIVATED flag being set and + * qdisc_reset()/some_qdisc_is_busy() in + * dev_deactivate(), so we can safely bail out + * early here to avoid data race between + * qdisc_deactivate() and some_qdisc_is_busy() + * for lockless qdisc. + */ + clear_bit(__QDISC_STATE_SCHED, &q->state); + continue; + } + clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); if (root_lock) spin_unlock(root_lock); } + + rcu_read_unlock(); }
xfrm_dev_backlog(sd); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 9bc73ea..c32ac5b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1170,8 +1170,10 @@ static void dev_reset_queue(struct net_device *dev, qdisc_reset(qdisc);
spin_unlock_bh(qdisc_lock(qdisc)); - if (nolock) + if (nolock) { + clear_bit(__QDISC_STATE_MISSED, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); + } }
static bool some_qdisc_is_busy(struct net_device *dev)
The netdev qeueue might be stopped when byte queue limit has reached or tx hw ring is full, net_tx_action() may still be rescheduled endlessly if STATE_MISSED is set, which consumes a lot of cpu without dequeuing and transmiting any skb because the netdev queue is stopped, see qdisc_run_end().
This patch fixes it by checking the netdev queue state before calling qdisc_run() and clearing STATE_MISSED if netdev queue is stopped during qdisc_run(), the net_tx_action() is recheduled again when netdev qeueue is restarted, see netif_tx_wake_queue().
As q->enqueue() may return NET_XMIT_DROP when there is no enough space, running qdisc_run() will likely consume unnecessary cpu, so avoid calling qdisc_run() when q->enqueue() returns NET_XMIT_DROP too.
Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking") Reported-by: Michal Kubecek mkubecek@suse.cz Signed-off-by: Yunsheng Lin linyunsheng@huawei.com --- net/core/dev.c | 4 +++- net/sched/sch_generic.c | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c index d596cd7..005bc3e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3853,7 +3853,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
if (q->flags & TCQ_F_NOLOCK) { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; - qdisc_run(q); + if (likely(rc != NET_XMIT_DROP && + !netif_xmit_frozen_or_stopped(txq))) + qdisc_run(q);
if (unlikely(to_free)) kfree_skb_list(to_free); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index c32ac5b..2bb829ea 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -74,6 +74,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) } } else { skb = SKB_XOFF_MAGIC; + clear_bit(__QDISC_STATE_MISSED, &q->state); } }
@@ -242,6 +243,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, } } else { skb = NULL; + clear_bit(__QDISC_STATE_MISSED, &q->state); } if (lock) spin_unlock(lock); @@ -251,8 +253,10 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, *validate = true;
if ((q->flags & TCQ_F_ONETXQUEUE) && - netif_xmit_frozen_or_stopped(txq)) + netif_xmit_frozen_or_stopped(txq)) { + clear_bit(__QDISC_STATE_MISSED, &q->state); return skb; + }
skb = qdisc_dequeue_skb_bad_txq(q); if (unlikely(skb)) { @@ -311,6 +315,8 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); + else + clear_bit(__QDISC_STATE_MISSED, &q->state);
HARD_TX_UNLOCK(dev, txq); } else {