From: Yipeng Zou <zouyipeng(a)huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I6BO2R
CVE: NA
--------------------------------
Now, There is some issues about LPI migration on ARM SMP platform.
For example, NIC device generates MSI and sends LPI to CPU0 via ITS,
meanwhile irqbalance running on CPU1 set irq affinty of NIC to CPU1,
the next interrupt will be sent to CPU2, due to the state of irq is
still in progress, kernel does not end up performing irq handler on
CPU2, which results in some userland service timeouts, the sequence
of events is shown as follows:
NIC CPU0 CPU1
Generate IRQ#1 READ_IAR
Lock irq_desc
Set IRQD_IN_PROGRESS
Unlock irq_desc
Lock irq_desc
Change LPI Affinity
Unlock irq_desc
Call irq_handler
Generate IRQ#2
READ_IAR
Lock irq_desc
Check IRQD_IN_PROGRESS
Unlock irq_desc
Return from interrupt#2
Lock irq_desc
Clear IRQD_IN_PROGRESS
Unlock irq_desc
return from interrupt#1
For this scenario, We can enable CONFIG_GENERIC_PENDING_IRQ to avoid this.
The CONFIG_GENERIC_PENDING_IRQ will delay all action that modify LPI
affinity until the next interrupt eoi handler.
Signed-off-by: Yipeng Zou <zouyipeng(a)huawei.com>
Reviewed-by: Liao Chang <liaochang1(a)huawei.com>
Reviewed-by: Zhang Jianhua <chris.zjh(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
drivers/irqchip/irq-gic-v3-its.c | 8 +++++++-
kernel/irq/Kconfig | 4 +++-
kernel/irq/migration.c | 2 ++
3 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 15a0292e8e61..7a0adf67761e 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1162,6 +1162,12 @@ static void its_unmask_irq(struct irq_data *d)
lpi_update_config(d, 0, LPI_PROP_ENABLED);
}
+static void its_irq_chip_eoi(struct irq_data *d)
+{
+ irq_move_irq(d);
+ irq_chip_eoi_parent(d);
+}
+
static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
bool force)
{
@@ -1485,7 +1491,7 @@ static struct irq_chip its_irq_chip = {
.name = "ITS",
.irq_mask = its_mask_irq,
.irq_unmask = its_unmask_irq,
- .irq_eoi = irq_chip_eoi_parent,
+ .irq_eoi = its_irq_chip_eoi,
.irq_set_affinity = its_set_affinity,
.irq_compose_msi_msg = its_irq_compose_msi_msg,
.irq_set_irqchip_state = its_irq_set_irqchip_state,
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 14a85d0161ea..eebbced84e44 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -32,7 +32,9 @@ config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
# Support for delayed migration from interrupt context
config GENERIC_PENDING_IRQ
- bool
+ bool "Support for delayed migration from interrupt context"
+ depends on SMP
+ default n
# Support for generic irq migrating off cpu before the cpu is offline.
config GENERIC_IRQ_MIGRATION
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index def48589ea48..bcb61ee69c20 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -117,3 +117,5 @@ void __irq_move_irq(struct irq_data *idata)
if (!masked)
idata->chip->irq_unmask(idata);
}
+
+void __weak irq_force_complete_move(struct irq_desc *desc) { }
--
2.25.1
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
maillist inclusion
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6C5HV
CVE: NA
Reference: https://lore.kernel.org/lkml/20230110015327.1181863-1-chengzhihao1@huawei.c…
--------------------------------
Following process will make data lost and could lead to a filesystem
corrupted problem:
1. jh(bh) is inserted into T1->t_checkpoint_list, bh is dirty, and
jh->b_transaction = NULL
2. T1 is added into journal->j_checkpoint_transactions.
3. Get bh prepare to write while doing checkpoing:
PA PB
do_get_write_access jbd2_log_do_checkpoint
spin_lock(&jh->b_state_lock)
if (buffer_dirty(bh))
clear_buffer_dirty(bh) // clear buffer dirty
set_buffer_jbddirty(bh)
transaction =
journal->j_checkpoint_transactions
jh = transaction->t_checkpoint_list
if (!buffer_dirty(bh))
__jbd2_journal_remove_checkpoint(jh)
// bh won't be flushed
jbd2_cleanup_journal_tail
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved)
4. Aborting journal/Power-cut before writing latest bh on journal area.
In this way we get a corrupted filesystem with bh's data lost.
Fix it by moving the clearing of buffer_dirty bit just before the call
to __jbd2_journal_file_buffer(), both bit clearing and jh->b_transaction
assignment are under journal->j_list_lock locked, so that
jbd2_log_do_checkpoint() will wait until jh's new transaction fininshed
even bh is currently not dirty. And journal_shrink_one_cp_list() won't
remove jh from checkpoint list if the buffer head is reused in
do_get_write_access().
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216898
Cc: <stable(a)kernel.org>
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: zhanchengbin <zhanchengbin1(a)huawei.com>
Suggested-by: Jan Kara <jack(a)suse.cz>
Conflicts:
fs/jbd2/transaction.c
[ 464170647b5648bb8("jbd2: Make state lock a spinlock") is not
applied. ]
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
fs/jbd2/transaction.c | 50 +++++++++++++++++++++++++------------------
1 file changed, 29 insertions(+), 21 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 3b31bd1f7b77..149190c2ac89 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -935,36 +935,28 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* ie. locked but not dirty) or tune2fs (which may actually have
* the buffer dirtied, ugh.) */
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) && jh->b_transaction) {
+ warn_dirty_buffer(bh);
/*
- * First question: is this buffer already part of the current
- * transaction or the existing committing transaction?
- */
- if (jh->b_transaction) {
- J_ASSERT_JH(jh,
- jh->b_transaction == transaction ||
- jh->b_transaction ==
- journal->j_committing_transaction);
- if (jh->b_next_transaction)
- J_ASSERT_JH(jh, jh->b_next_transaction ==
- transaction);
- warn_dirty_buffer(bh);
- }
- /*
- * In any case we need to clean the dirty flag and we must
- * do it under the buffer lock to be sure we don't race
- * with running write-out.
+ * We need to clean the dirty flag and we must do it under the
+ * buffer lock to be sure we don't race with running write-out.
*/
JBUFFER_TRACE(jh, "Journalling dirty buffer");
clear_buffer_dirty(bh);
+ /*
+ * The buffer is going to be added to BJ_Reserved list now and
+ * nothing guarantees jbd2_journal_dirty_metadata() will be
+ * ever called for it. So we need to set jbddirty bit here to
+ * make sure the buffer is dirtied and written out when the
+ * journaling machinery is done with it.
+ */
set_buffer_jbddirty(bh);
}
- unlock_buffer(bh);
-
error = -EROFS;
if (is_handle_aborted(handle)) {
jbd_unlock_bh_state(bh);
+ unlock_buffer(bh);
goto out;
}
error = 0;
@@ -974,8 +966,10 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* b_next_transaction points to it
*/
if (jh->b_transaction == transaction ||
- jh->b_next_transaction == transaction)
+ jh->b_next_transaction == transaction) {
+ unlock_buffer(bh);
goto done;
+ }
/*
* this is the first time this transaction is touching this buffer,
@@ -999,10 +993,24 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
*/
smp_wmb();
spin_lock(&journal->j_list_lock);
+ if (test_clear_buffer_dirty(bh)) {
+ /*
+ * Execute buffer dirty clearing and jh->b_transaction
+ * assignment under journal->j_list_lock locked to
+ * prevent bh being removed from checkpoint list if
+ * the buffer is in an intermediate state (not dirty
+ * and jh->b_transaction is NULL).
+ */
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ set_buffer_jbddirty(bh);
+ }
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
+ unlock_buffer(bh);
goto done;
}
+ unlock_buffer(bh);
+
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
--
2.25.1
From: Dave Chinner <dchinner(a)redhat.com>
mainline inclusion
from mainline-v5.19-rc1
commit c230a4a85bcdbfc1a7415deec6caf04e8fca1301
category: bugfix
bugzilla: 187372, https://gitee.com/openeuler/kernel/issues/I5K0OM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Ever since we added shadown format buffers to the log items, log
items need to handle the item being released with shadow buffers
attached. Due to the fact this requirement was added at the same
time we added new rmap/reflink intents, we missed the cleanup of
those items.
In theory, this means shadow buffers can be leaked in a very small
window when a shutdown is initiated. Testing with KASAN shows this
leak does not happen in practice - we haven't identified a single
leak in several years of shutdown testing since ~v4.8 kernels.
However, the intent whiteout cleanup mechanism results in every
cancelled intent in exactly the same state as this tiny race window
creates and so if intents down clean up shadow buffers on final
release we will leak the shadow buffer for just about every intent
we create.
Hence we start with this patch to close this condition off and
ensure that when whiteouts start to be used we don't leak lots of
memory.
Signed-off-by: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Reviewed-by: Allison Henderson <allison.henderson(a)oracle.com>
Signed-off-by: Dave Chinner <david(a)fromorbit.com>
conflicts:
fs/xfs/xfs_bmap_item.c
fs/xfs/xfs_icreate_item.c
fs/xfs/xfs_refcount_item.c
fs/xfs/xfs_rmap_item.c
Signed-off-by: Li Nan <linan122(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/xfs/xfs_bmap_item.c | 2 ++
fs/xfs/xfs_icreate_item.c | 1 +
fs/xfs/xfs_refcount_item.c | 2 ++
fs/xfs/xfs_rmap_item.c | 2 ++
4 files changed, 7 insertions(+)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 44ec0f2d5253..e6de8081451f 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -40,6 +40,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_zone, buip);
}
@@ -199,6 +200,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_zone, budp);
}
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 9b3994b9c716..aa8c7c261d24 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0dee316283a9..9f4ff45c7a93 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -204,6 +205,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_zone, cudp);
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 20905953fe76..b5447ac7cb9b 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -227,6 +228,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_zone, rudp);
}
--
2.25.1
From: Dave Chinner <dchinner(a)redhat.com>
mainline inclusion
from mainline-v5.19-rc1
commit c230a4a85bcdbfc1a7415deec6caf04e8fca1301
category: bugfix
bugzilla: 187372, https://gitee.com/openeuler/kernel/issues/I5K0OM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Ever since we added shadown format buffers to the log items, log
items need to handle the item being released with shadow buffers
attached. Due to the fact this requirement was added at the same
time we added new rmap/reflink intents, we missed the cleanup of
those items.
In theory, this means shadow buffers can be leaked in a very small
window when a shutdown is initiated. Testing with KASAN shows this
leak does not happen in practice - we haven't identified a single
leak in several years of shutdown testing since ~v4.8 kernels.
However, the intent whiteout cleanup mechanism results in every
cancelled intent in exactly the same state as this tiny race window
creates and so if intents down clean up shadow buffers on final
release we will leak the shadow buffer for just about every intent
we create.
Hence we start with this patch to close this condition off and
ensure that when whiteouts start to be used we don't leak lots of
memory.
Signed-off-by: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Reviewed-by: Allison Henderson <allison.henderson(a)oracle.com>
Signed-off-by: Dave Chinner <david(a)fromorbit.com>
conflicts:
fs/xfs/xfs_bmap_item.c
fs/xfs/xfs_icreate_item.c
fs/xfs/xfs_refcount_item.c
fs/xfs/xfs_rmap_item.c
Signed-off-by: Li Nan <linan122(a)huawei.com>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/xfs/xfs_bmap_item.c | 2 ++
fs/xfs/xfs_icreate_item.c | 1 +
fs/xfs/xfs_refcount_item.c | 2 ++
fs/xfs/xfs_rmap_item.c | 2 ++
4 files changed, 7 insertions(+)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 44ec0f2d5253..e6de8081451f 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -40,6 +40,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_zone, buip);
}
@@ -199,6 +200,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_zone, budp);
}
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 9b3994b9c716..aa8c7c261d24 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0dee316283a9..9f4ff45c7a93 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -204,6 +205,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_zone, cudp);
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 20905953fe76..b5447ac7cb9b 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -227,6 +228,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_zone, rudp);
}
--
2.25.1