[PATCH 00/10] fix compile error and implement NUMA affinity for order workqueue

newer
[PATCH 0/2] fixes CVE-2020-8648

Yang Yingliang

24 Feb 2020 24 Feb '20

9:23 p.m.

Biaoxiang Ye (2): workqueue: implement NUMA affinity for single thread workqueue iscsi: use dynamic single thread workqueue to improve performance Gao Xun (1): RDMA/hns: Compilation Configuration update Yang Yingliang (5): Revert "PCI: fix kabi change in struct pci_bus" Revert "membarrier/kabi: fix kabi for membarrier_state" Revert "bdi: fix kabi for struct backing_dev_info" Revert "debugfs: fix kabi for function debugfs_remove_recursive" iscsi: add member for NUMA aware order workqueue zhangyi (F) (2): jbd2: move the clearing of b_modified flag to the journal_unmap_buffer() jbd2: do not clear the BH_Mapped flag when forgetting a metadata buffer .../infiniband/hw/hns/roce-customer/rdfx_entry.c | 3 +- .../infiniband/hw/hns/roce-customer/rdfx_intf.c | 4 +- drivers/scsi/iscsi_tcp.c | 8 ++++ drivers/scsi/libiscsi.c | 15 +++++-- fs/debugfs/inode.c | 6 --- fs/jbd2/commit.c | 46 ++++++++++++---------- fs/jbd2/transaction.c | 10 +++-- include/linux/backing-dev-defs.h | 4 -- include/linux/debugfs.h | 2 +- include/linux/mm_types.h | 20 +++++----- include/linux/pci.h | 6 +-- include/linux/workqueue.h | 1 + include/scsi/libiscsi.h | 1 + kernel/sched/sched.h | 11 ++---- kernel/workqueue.c | 15 ++++--- 15 files changed, 84 insertions(+), 68 deletions(-) -- 1.8.3

Show replies by date

Yang Yingliang

24 Feb 24 Feb

9:23 p.m.

New subject: [PATCH 01/10] Revert "PCI: fix kabi change in struct pci_bus"

hulk inclusion category: bugfix bugzilla: 30939 CVE: NA --------------------------- The kabi can be broken before official release. This reverts commit 8664b79edac95322379eee025763ba0840d458d1. Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-By: Xie XiuQi <xiexiuqi@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- include/linux/pci.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index eb85ed6..c97de5c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -584,6 +584,7 @@ struct pci_bus { struct resource busn_res; /* Bus numbers routed to this bus */ struct pci_ops *ops; /* Configuration access functions */ + struct pci_ops *backup_ops; struct msi_controller *msi; /* MSI controller */ void *sysdata; /* Hook for sys-specific extension */ struct proc_dir_entry *procdir; /* Directory entry in /proc/bus/pci */ @@ -605,11 +606,8 @@ struct pci_bus { struct bin_attribute *legacy_io; /* Legacy I/O for this bus */ struct bin_attribute *legacy_mem; /* Legacy mem */ unsigned int is_added:1; -#ifndef __GENKSYMS__ - struct pci_ops *backup_ops; -#else + KABI_RESERVE(1) -#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) -- 1.8.3

Yang Yingliang

9:23 p.m.

New subject: [PATCH 02/10] Revert "membarrier/kabi: fix kabi for membarrier_state"

hulk inclusion category: bugfix bugzilla: 30939 CVE: NA ------------------------------------------------- The kabi can be broken before official release. This reverts commit f316812150a4fbb52720fe7fb7702c5a52c37602. Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-By: Xie XiuQi <xiexiuqi@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- include/linux/mm_types.h | 20 ++++++++++---------- kernel/sched/sched.h | 11 ++++------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb48449..178e9de 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -369,6 +369,16 @@ struct mm_struct { unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; +#ifdef CONFIG_MEMBARRIER + /** + * @membarrier_state: Flags controlling membarrier behavior. + * + * This field is close to @pgd to hopefully fit in the same + * cache-line, which needs to be touched by switch_mm(). + */ + atomic_t membarrier_state; +#endif + /** * @mm_users: The number of users including userspace. * @@ -439,16 +449,6 @@ struct mm_struct { struct core_state *core_state; /* coredumping support */ -#ifdef CONFIG_MEMBARRIER - /** - * @membarrier_state: Flags controlling membarrier behavior. - * - * This field is close to @pgd to hopefully fit in the same - * cache-line, which needs to be touched by switch_mm(). 
- */ - atomic_t membarrier_state; -#endif - #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 378bf05..2213dac 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -843,6 +843,10 @@ struct rq { atomic_t nr_iowait; +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain *sd; @@ -926,14 +930,7 @@ struct rq { struct cpuidle_state *idle_state; #endif -#if defined(CONFIG_MEMBARRIER) && !defined(__GENKSYMS__) - union { - int membarrier_state; - long membarrier_state_KABI; - }; -#else KABI_RESERVE(1) -#endif KABI_RESERVE(2) }; -- 1.8.3

Yang Yingliang

9:23 p.m.

New subject: [PATCH 03/10] Revert "bdi: fix kabi for struct backing_dev_info"

hulk inclusion category: bugfix bugzilla: 30939 CVE: NA --------------------------- The kabi can be broken before official release. This reverts commit f8589079659b51222d86a1cb8fd9129752b0d97c. Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-By: Xie XiuQi <xiexiuqi@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- include/linux/backing-dev-defs.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index f2bda82..73fb6bc 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -201,14 +201,10 @@ struct backing_dev_info { #endif wait_queue_head_t wb_waitq; -#ifndef __GENKSYMS__ union { struct rcu_device *rcu_dev; struct device *dev; }; -#else - struct device *dev; -#endif struct device *owner; struct timer_list laptop_mode_wb_timer; -- 1.8.3

Yang Yingliang

9:23 p.m.

New subject: [PATCH 04/10] Revert "debugfs: fix kabi for function debugfs_remove_recursive"

hulk inclusion category: bugfix bugzilla: 30939 CVE: NA --------------------------- The kabi can be broken before official release. This reverts commit ce620c1a6783b2341a376ef948484b5314ed064e. Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-By: Xie XiuQi <xiexiuqi@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- fs/debugfs/inode.c | 6 ------ include/linux/debugfs.h | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 59e5e49..11664fd 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -672,12 +672,6 @@ void debugfs_remove(struct dentry *dentry) } EXPORT_SYMBOL_GPL(debugfs_remove); -void debugfs_remove_recursive(struct dentry *dentry) -{ - debugfs_remove(dentry); -} -EXPORT_SYMBOL_GPL(debugfs_remove_recursive); - /** * debugfs_rename - rename a file/directory in the debugfs filesystem * @old_dir: a pointer to the parent dentry for the renamed object. This diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 3b0ba54..5fdcfef 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -82,7 +82,7 @@ struct dentry *debugfs_create_automount(const char *name, void *data); void debugfs_remove(struct dentry *dentry); -void debugfs_remove_recursive(struct dentry *dentry); +#define debugfs_remove_recursive debugfs_remove const struct file_operations *debugfs_real_fops(const struct file *filp); -- 1.8.3

Yang Yingliang

9:23 p.m.

New subject: [PATCH 05/10] iscsi: add member for NUMA aware order workqueue

euleros inclusion category: feature feature: Implement NUMA affinity for order workqueue ------------------------------------------------- Add member to struct iscsi_conn. Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-By: Xie XiuQi <xiexiuqi@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- include/linux/workqueue.h | 1 + include/scsi/libiscsi.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ac013f2..93b87cd 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -361,6 +361,7 @@ enum { __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ + __WQ_DYNAMIC = 1 << 25, /* internal: only support single work order WQ */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index c9bd935f..e03b924 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -253,6 +253,7 @@ struct iscsi_conn { /* custom statistics */ uint32_t eh_abort_cnt; uint32_t fmr_unalign_cnt; + int intimate_cpu; }; struct iscsi_pool { -- 1.8.3

Yang Yingliang

9:23 p.m.

New subject: [PATCH 06/10] workqueue: implement NUMA affinity for single thread workqueue

From: Biaoxiang Ye <yebiaoxiang@huawei.com> euleros inclusion category: feature feature: Implement NUMA affinity for order workqueue ------------------------------------------------- Currently, a single-thread workqueue has only a single pwq, so all of its works are queued to the same worker pool. This is not optimal on NUMA machines, because it causes workers to jump around across nodes. This patch adds a new wq flag, __WQ_DYNAMIC. This new kind of single-thread workqueue creates a separate pwq covering the intersecting CPUs for each NUMA node which has online CPUs in @attrs->cpumask, instead of mapping all entries of numa_pwq_tbl[] to the same pwq. After this, we can specify the @cpu of queue_work_on, so the work can be executed on the same NUMA node as the specified @cpu. This kind of wq only supports a single work; with multiple works, the order of the works cannot be guaranteed. Signed-off-by: Biaoxiang Ye <yebiaoxiang@huawei.com> Acked-by: Hanjun Guo <guohanjun@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- kernel/workqueue.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cdf9d95..118cf73 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3843,6 +3843,9 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. 
*/ copy_workqueue_attrs(new_attrs, attrs); + if (wq->flags & __WQ_DYNAMIC) + new_attrs->no_numa = false; + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); if (unlikely(cpumask_empty(new_attrs->cpumask))) cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); @@ -4092,10 +4095,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } else if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); - /* there should only be single pwq for ordering guarantee */ - WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || - wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), - "ordering guarantee broken for workqueue %s\n", wq->name); + if (!(wq->flags & __WQ_DYNAMIC)) { + /* there should only be single pwq for ordering guarantee */ + WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || + wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + "ordering guarantee broken for workqueue %s\n", wq->name); + } return ret; } else { return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); @@ -5166,7 +5171,7 @@ static int workqueue_apply_unbound_cpumask(void) if (!(wq->flags & WQ_UNBOUND)) continue; /* creating multiple pwqs breaks ordering guarantee */ - if (wq->flags & __WQ_ORDERED) + if ((wq->flags & __WQ_ORDERED) && !(wq->flags & __WQ_DYNAMIC)) continue; ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs); -- 1.8.3

Yang Yingliang

9:23 p.m.

New subject: [PATCH 07/10] iscsi: use dynamic single thread workqueue to improve performance

From: Biaoxiang Ye <yebiaoxiang@huawei.com> euleros inclusion category: feature feature: Implement NUMA affinity for order workqueue ------------------------------------------------- On aarch64 NUMA machines, the kworker created for iscsi always jumps around across node boundaries. If it runs on a different node, or even a different cpu package, from the softirq of the network interface, the memcpy within iscsi_tcp_segment_recv will be slowed down, and iscsi gets terrible performance. In this patch, we track the cpu of the softirq, and tell queue_work_on to execute iscsi_xmitworker on the same NUMA node. The performance data is as below: fio cmd: fio -filename=/dev/disk/by-id/wwn-0x6883fd3100a2ad260036281700000000 -direct=1 -iodepth=32 -rw=read -bs=64k -size=30G -ioengine=libaio -numjobs=1 -group_reporting -name=mytest -time_based -ramp_time=60 -runtime=60 before patch: Jobs: 1 (f=1): [R] [52.5% done] [852.3MB/0KB/0KB /s] [13.7K/0/0 iops] [eta 00m:57s] Jobs: 1 (f=1): [R] [53.3% done] [861.4MB/0KB/0KB /s] [13.8K/0/0 iops] [eta 00m:56s] Jobs: 1 (f=1): [R] [54.2% done] [868.2MB/0KB/0KB /s] [13.9K/0/0 iops] [eta 00m:55s] after patch: Jobs: 1 (f=1): [R] [53.3% done] [1070MB/0KB/0KB /s] [17.2K/0/0 iops] [eta 00m:56s] Jobs: 1 (f=1): [R] [55.0% done] [1064MB/0KB/0KB /s] [17.3K/0/0 iops] [eta 00m:54s] Jobs: 1 (f=1): [R] [56.7% done] [1069MB/0KB/0KB /s] [17.1K/0/0 iops] [eta 00m:52s] cpu info: Architecture: aarch64 Byte Order: Little Endian CPU(s): 128 On-line CPU(s) list: 0-127 Thread(s) per core: 1 Core(s) per socket: 64 Socket(s): 2 NUMA node(s): 4 Model: 0 CPU max MHz: 2600.0000 CPU min MHz: 200.0000 BogoMIPS: 200.00 L1d cache: 64K L1i cache: 64K L2 cache: 512K L3 cache: 32768K NUMA node0 CPU(s): 0-31 NUMA node1 CPU(s): 32-63 NUMA node2 CPU(s): 64-95 NUMA node3 CPU(s): 96-127 Signed-off-by: Biaoxiang Ye <yebiaoxiang@huawei.com> Acked-by: Hanjun Guo <guohanjun@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/iscsi_tcp.c | 8 ++++++++ 
drivers/scsi/libiscsi.c | 15 ++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 55181d2..0262158 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -132,6 +132,7 @@ static void iscsi_sw_tcp_data_ready(struct sock *sk) struct iscsi_conn *conn; struct iscsi_tcp_conn *tcp_conn; read_descriptor_t rd_desc; + int current_cpu; read_lock_bh(&sk->sk_callback_lock); conn = sk->sk_user_data; @@ -141,6 +142,13 @@ static void iscsi_sw_tcp_data_ready(struct sock *sk) } tcp_conn = conn->dd_data; + /* save intimate cpu when in softirq */ + if (!sock_owned_by_user_nocheck(sk)) { + current_cpu = smp_processor_id(); + if (conn->intimate_cpu != current_cpu) + conn->intimate_cpu = current_cpu; + } + /* * Use rd_desc to pass 'conn' to iscsi_tcp_recv. * We set count to 1 because we want the network layer to diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 214fe77..ec356e0 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -90,9 +90,15 @@ inline void iscsi_conn_queue_work(struct iscsi_conn *conn) { struct Scsi_Host *shost = conn->session->host; struct iscsi_host *ihost = shost_priv(shost); + int intimate_cpu = conn->intimate_cpu; - if (ihost->workq) - queue_work(ihost->workq, &conn->xmitwork); + if (ihost->workq) { + /* we expect it to be excuted on the same numa of the intimate cpu */ + if ((intimate_cpu >= 0) && cpu_possible(intimate_cpu)) + queue_work_on(intimate_cpu, ihost->workq, &conn->xmitwork); + else + queue_work(ihost->workq, &conn->xmitwork); + } } EXPORT_SYMBOL_GPL(iscsi_conn_queue_work); @@ -2682,7 +2688,9 @@ struct Scsi_Host *iscsi_host_alloc(struct scsi_host_template *sht, if (xmit_can_sleep) { snprintf(ihost->workq_name, sizeof(ihost->workq_name), "iscsi_q_%d", shost->host_no); - ihost->workq = create_singlethread_workqueue(ihost->workq_name); + /* this kind of workqueue only support single work */ + ihost->workq = 
alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | + __WQ_DYNAMIC, ihost->workq_name); if (!ihost->workq) goto free_host; } @@ -2959,6 +2967,7 @@ struct iscsi_cls_conn * conn->id = conn_idx; conn->exp_statsn = 0; conn->tmf_state = TMF_INITIAL; + conn->intimate_cpu = -1; timer_setup(&conn->transport_timer, iscsi_check_transport_timeouts, 0); -- 1.8.3

Yang Yingliang

9:23 p.m.

New subject: [PATCH 08/10] jbd2: move the clearing of b_modified flag to the journal_unmap_buffer()

From: "zhangyi (F)" <yi.zhang@huawei.com> [ Upstream commit 6a66a7ded12baa6ebbb2e3e82f8cb91382814839 ] There is no need to delay the clearing of b_modified flag to the transaction committing time when unmapping the journalled buffer, so just move it to the journal_unmap_buffer(). Link: https://lore.kernel.org/r/20200213063821.30455-2-yi.zhang@huawei.com Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: zhangyi (F) <yi.zhang@huawei.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Sasha Levin <sashal@kernel.org> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- fs/jbd2/commit.c | 43 +++++++++++++++---------------------------- fs/jbd2/transaction.c | 10 ++++++---- 2 files changed, 21 insertions(+), 32 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4a3300e..2b5a56c 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -972,34 +972,21 @@ void jbd2_journal_commit_transaction(journal_t *journal) * it. */ /* - * A buffer which has been freed while still being journaled by - * a previous transaction. - */ - if (buffer_freed(bh)) { - /* - * If the running transaction is the one containing - * "add to orphan" operation (b_next_transaction != - * NULL), we have to wait for that transaction to - * commit before we can really get rid of the buffer. - * So just clear b_modified to not confuse transaction - * credit accounting and refile the buffer to - * BJ_Forget of the running transaction. If the just - * committed transaction contains "add to orphan" - * operation, we can completely invalidate the buffer - * now. We are rather through in that since the - * buffer may be still accessible when blocksize < - * pagesize and it is attached to the last partial - * page. 
- */ - jh->b_modified = 0; - if (!jh->b_next_transaction) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_req(bh); - bh->b_bdev = NULL; - } + * A buffer which has been freed while still being journaled + * by a previous transaction, refile the buffer to BJ_Forget of + * the running transaction. If the just committed transaction + * contains "add to orphan" operation, we can completely + * invalidate the buffer now. We are rather through in that + * since the buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial page. + */ + if (buffer_freed(bh) && !jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; } if (buffer_jbddirty(bh)) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index fddfd6e..af31e28 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2304,14 +2304,16 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, return -EBUSY; } /* - * OK, buffer won't be reachable after truncate. We just set - * j_next_transaction to the running transaction (if there is - * one) and mark buffer as freed so that commit code knows it - * should clear dirty bits when it is done with the buffer. + * OK, buffer won't be reachable after truncate. We just clear + * b_modified to not confuse transaction credit accounting, and + * set j_next_transaction to the running transaction (if there + * is one) and mark buffer as freed so that commit code knows + * it should clear dirty bits when it is done with the buffer. */ set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; + jh->b_modified = 0; jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); -- 1.8.3

Yang Yingliang

9:23 p.m.

New subject: [PATCH 09/10] jbd2: do not clear the BH_Mapped flag when forgetting a metadata buffer

From: "zhangyi (F)" <yi.zhang@huawei.com> [ Upstream commit c96dceeabf765d0b1b1f29c3bf50a5c01315b820 ] Commit 904cdbd41d74 ("jbd2: clear dirty flag when revoking a buffer from an older transaction") set the BH_Freed flag when forgetting a metadata buffer which belongs to the committing transaction, it indicate the committing process clear dirty bits when it is done with the buffer. But it also clear the BH_Mapped flag at the same time, which may trigger below NULL pointer oops when block_size < PAGE_SIZE. rmdir 1 kjournald2 mkdir 2 jbd2_journal_commit_transaction commit transaction N jbd2_journal_forget set_buffer_freed(bh1) jbd2_journal_commit_transaction commit transaction N+1 ... clear_buffer_mapped(bh1) ext4_getblk(bh2 ummapped) ... grow_dev_page init_page_buffers bh1->b_private=NULL bh2->b_private=NULL jbd2_journal_put_journal_head(jh1) __journal_remove_journal_head(hb1) jh1 is NULL and trigger oops *) Dir entry block bh1 and bh2 belongs to one page, and the bh2 has already been unmapped. For the metadata buffer we forgetting, we should always keep the mapped flag and clear the dirty flags is enough, so this patch pick out the these buffers and keep their BH_Mapped flag. Link: https://lore.kernel.org/r/20200213063821.30455-3-yi.zhang@huawei.com Fixes: 904cdbd41d74 ("jbd2: clear dirty flag when revoking a buffer from an older transaction") Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: zhangyi (F) <yi.zhang@huawei.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org Signed-off-by: Sasha Levin <sashal@kernel.org> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- fs/jbd2/commit.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2b5a56c..e755865 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -981,12 +981,29 @@ void jbd2_journal_commit_transaction(journal_t *journal) * pagesize and it is attached to the last partial page. 
*/ if (buffer_freed(bh) && !jh->b_next_transaction) { + struct address_space *mapping; + clear_buffer_freed(bh); clear_buffer_jbddirty(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_req(bh); - bh->b_bdev = NULL; + + /* + * Block device buffers need to stay mapped all the + * time, so it is enough to clear buffer_jbddirty and + * buffer_freed bits. For the file mapping buffers (i.e. + * journalled data) we need to unmap buffer and clear + * more bits. We also need to be careful about the check + * because the data page mapping can get cleared under + * out hands, which alse need not to clear more bits + * because the page and buffers will be freed and can + * never be reused once we are done with them. + */ + mapping = READ_ONCE(bh->b_page->mapping); + if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } } if (buffer_jbddirty(bh)) { -- 1.8.3

Yang Yingliang

9:23 p.m.

New subject: [PATCH 10/10] RDMA/hns: Compilation Configuration update

From: Gao Xun <gaoxun3@huawei.com> driver inclusion category: Bugfix bugzilla: NA CVE: NA We updated the conditional compilation layout of the dfx module to ensure proper compilation when dfx is turned off in the .config file. Signed-off-by: Gao Xun <gaoxun3@huawei.com> Reviewed-by: Hu Chunzhi <huchunzhi@huawei.com> Reviewed-by: Wang Lin <wanglin137@huawei.com> Reviewed-by: Zhao Weibo <zhaoweibo3@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/infiniband/hw/hns/roce-customer/rdfx_entry.c | 3 ++- drivers/infiniband/hw/hns/roce-customer/rdfx_intf.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/roce-customer/rdfx_entry.c b/drivers/infiniband/hw/hns/roce-customer/rdfx_entry.c index 52cb9f7..8c2de43 100644 --- a/drivers/infiniband/hw/hns/roce-customer/rdfx_entry.c +++ b/drivers/infiniband/hw/hns/roce-customer/rdfx_entry.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ // Copyright (c) 2016-2017 Hisilicon Limited. - +#ifdef CONFIG_INFINIBAND_HNS_DFX #include "roce_k_compat.h" #include "rdfx_common.h" #include "rdfx_intf.h" @@ -328,3 +328,4 @@ void rdfx_set_cqe_info(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, } } EXPORT_SYMBOL_GPL(rdfx_set_cqe_info); +#endif diff --git a/drivers/infiniband/hw/hns/roce-customer/rdfx_intf.c b/drivers/infiniband/hw/hns/roce-customer/rdfx_intf.c index 7561e0ef..9a84e7c 100644 --- a/drivers/infiniband/hw/hns/roce-customer/rdfx_intf.c +++ b/drivers/infiniband/hw/hns/roce-customer/rdfx_intf.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ // Copyright (c) 2016-2017 Hisilicon Limited. - +#ifdef CONFIG_INFINIBAND_HNS_DFX #include "roce_k_compat.h" #include "hns_roce_device.h" #include "hns_roce_common.h" @@ -940,4 +940,4 @@ void rdfx_set_rdfx_cq_ci(struct hns_roce_dev *hr_dev, spin_unlock(&rdfx->cq.cq_lock); } EXPORT_SYMBOL_GPL(rdfx_set_rdfx_cq_ci); - +#endif -- 1.8.3

1957

Age (days ago)

1957

Last active (days ago)

List overview

10 comments

1 participants

participants (1)

Yang Yingliang

[PATCH 00/10] fix compile error and implement NUMA affinity for order workqueue

tags

participants (1)