From: Yanling Song songyl@ramaxel.com
Ramaxel inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4I0OZ CVE: NA
--------------------------
Fix a typo: last_cmsn should be last_pmsn.
Signed-off-by: Yanling Song songyl@ramaxel.com Reviewed-by: Zhang Lei zhanglei48@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/scsi/spfc/hw/spfc_queue.c | 28 ++++++++++++++-------------- drivers/scsi/spfc/hw/spfc_queue.h | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/drivers/scsi/spfc/hw/spfc_queue.c b/drivers/scsi/spfc/hw/spfc_queue.c index 3f73fa26aad1..abcf1ff3f49f 100644 --- a/drivers/scsi/spfc/hw/spfc_queue.c +++ b/drivers/scsi/spfc/hw/spfc_queue.c @@ -1027,7 +1027,7 @@ u32 spfc_create_ssq(void *handle) sq_ctrl->wqe_offset = 0; sq_ctrl->head_start_cmsn = 0; sq_ctrl->head_end_cmsn = SPFC_GET_WP_END_CMSN(0, sq_ctrl->wqe_num_per_buf); - sq_ctrl->last_cmsn = 0; + sq_ctrl->last_pmsn = 0; /* Linked List SQ Owner Bit 1 valid,0 invalid */ sq_ctrl->last_pi_owner = 1; atomic_set(&sq_ctrl->sq_valid, true); @@ -3127,7 +3127,7 @@ static u32 spfc_parent_sq_ring_direct_wqe_doorbell(struct spfc_parent_ssq_info * struct spfc_hba_info *hba;
hba = (struct spfc_hba_info *)sq->hba; - pmsn = sq->last_cmsn; + pmsn = sq->last_pmsn;
if (sq->cache_id == INVALID_VALUE32) { FC_DRV_PRINT(UNF_LOG_IO_ATT, UNF_ERR, @@ -3166,7 +3166,7 @@ u32 spfc_parent_sq_ring_doorbell(struct spfc_parent_ssq_info *sq, u8 qos_level, struct spfc_parent_sq_db door_bell;
hba = (struct spfc_hba_info *)sq->hba; - pmsn = sq->last_cmsn; + pmsn = sq->last_pmsn; /* Obtain the low 8 Bit of PMSN */ pmsn_lo = (u8)(pmsn & SPFC_PMSN_MASK); /* Obtain the high 8 Bit of PMSN */ @@ -3231,10 +3231,10 @@ u32 spfc_direct_sq_enqueue(struct spfc_parent_ssq_info *ssq, struct spfc_sqe *io FC_DRV_PRINT(UNF_LOG_NORMAL, UNF_INFO, "[info]Ssq(0x%x), xid(0x%x) qid(0x%x) add wqepage at Pmsn(0x%x), sqe_minus_cqe_cnt(0x%x)", ssq->sqn, ssq->context_id, ssq->sq_queue_id, - ssq->last_cmsn, + ssq->last_pmsn, atomic_read(&ssq->sqe_minus_cqe_cnt));
- link_wqe_msn = SPFC_MSN_DEC(ssq->last_cmsn); + link_wqe_msn = SPFC_MSN_DEC(ssq->last_pmsn); link_wqe = (struct spfc_linkwqe *)spfc_get_wqe_page_entry(tail_wpg, ssq->wqe_offset); msn_wd = be32_to_cpu(link_wqe->val_wd1); @@ -3250,7 +3250,7 @@ u32 spfc_direct_sq_enqueue(struct spfc_parent_ssq_info *ssq, struct spfc_sqe *io } sqe_in_wp = (struct spfc_sqe *)spfc_get_wqe_page_entry(tail_wpg, ssq->wqe_offset); - spfc_build_wqe_owner_pmsn(io_sqe, (ssq->last_pi_owner), ssq->last_cmsn); + spfc_build_wqe_owner_pmsn(io_sqe, (ssq->last_pi_owner), ssq->last_pmsn); SPFC_IO_STAT((struct spfc_hba_info *)ssq->hba, wqe_type);
wqe_gpa = tail_wpg->wpg_phy_addr + (ssq->wqe_offset * sizeof(struct spfc_sqe)); @@ -3260,11 +3260,11 @@ u32 spfc_direct_sq_enqueue(struct spfc_parent_ssq_info *ssq, struct spfc_sqe *io dre_door_bell.wd0.cos = 0; dre_door_bell.wd0.c = 0; dre_door_bell.wd0.pi_hi = - (u32)(ssq->last_cmsn >> UNF_SHIFT_12) & SPFC_DB_WD0_PI_H_MASK; + (u32)(ssq->last_pmsn >> UNF_SHIFT_12) & SPFC_DB_WD0_PI_H_MASK; dre_door_bell.wd0.cntx_size = SPFC_CNTX_SIZE_T_256B; dre_door_bell.wd0.xid = ssq->context_id; dre_door_bell.wd1.sm_data = ssq->cache_id; - dre_door_bell.wd1.pi_lo = (u32)(ssq->last_cmsn & SPFC_DB_WD0_PI_L_MASK); + dre_door_bell.wd1.pi_lo = (u32)(ssq->last_pmsn & SPFC_DB_WD0_PI_L_MASK); io_sqe->db_val = *(u64 *)&dre_door_bell;
spfc_convert_parent_wqe_to_big_endian(io_sqe); @@ -3275,7 +3275,7 @@ u32 spfc_direct_sq_enqueue(struct spfc_parent_ssq_info *ssq, struct spfc_sqe *io "[INFO]Ssq(0x%x) xid:0x%x,qid:0x%x wqegpa:0x%llx,o:0x%x,outstandind:0x%x,pmsn:0x%x,cmsn:0x%x", ssq->sqn, ssq->context_id, ssq->sq_queue_id, wqe_gpa, ssq->last_pi_owner, atomic_read(&ssq->sqe_minus_cqe_cnt), - ssq->last_cmsn, SPFC_GET_QUEUE_CMSN(ssq)); + ssq->last_pmsn, SPFC_GET_QUEUE_CMSN(ssq));
ssq->accum_wqe_cnt++; if (ssq->accum_wqe_cnt == accum_db_num) { @@ -3286,7 +3286,7 @@ u32 spfc_direct_sq_enqueue(struct spfc_parent_ssq_info *ssq, struct spfc_sqe *io }
ssq->wqe_offset += 1; - ssq->last_cmsn = SPFC_MSN_INC(ssq->last_cmsn); + ssq->last_pmsn = SPFC_MSN_INC(ssq->last_pmsn); atomic_inc(&ssq->sq_wqe_cnt); atomic_inc(&ssq->sqe_minus_cqe_cnt); SPFC_SQ_IO_STAT(ssq, wqe_type); @@ -3319,7 +3319,7 @@ u32 spfc_parent_ssq_enqueue(struct spfc_parent_ssq_info *ssq, struct spfc_sqe *i FC_DRV_PRINT(UNF_LOG_NORMAL, UNF_INFO, "[info]Ssq(0x%x), xid(0x%x) qid(0x%x) add wqepage at Pmsn(0x%x), WpgCnt(0x%x)", ssq->sqn, ssq->context_id, ssq->sq_queue_id, - ssq->last_cmsn, + ssq->last_pmsn, atomic_read(&ssq->wqe_page_cnt)); cur_cmsn = SPFC_GET_QUEUE_CMSN(ssq); spfc_free_sq_wqe_page(ssq, cur_cmsn); @@ -3335,7 +3335,7 @@ u32 spfc_parent_ssq_enqueue(struct spfc_parent_ssq_info *ssq, struct spfc_sqe *i link_wqe->next_page_addr_hi = cpu_to_be32(addr_wd); addr_wd = SPFC_LSD(new_wqe_page->wpg_phy_addr); link_wqe->next_page_addr_lo = cpu_to_be32(addr_wd); - link_wqe_msn = SPFC_MSN_DEC(ssq->last_cmsn); + link_wqe_msn = SPFC_MSN_DEC(ssq->last_pmsn); msn_wd = be32_to_cpu(link_wqe->val_wd1); msn_wd |= ((u32)(link_wqe_msn & SPFC_MSNWD_L_MASK)); msn_wd |= (((u32)(link_wqe_msn & SPFC_MSNWD_H_MASK)) << UNF_SHIFT_16); @@ -3351,7 +3351,7 @@ u32 spfc_parent_ssq_enqueue(struct spfc_parent_ssq_info *ssq, struct spfc_sqe *i atomic_inc(&ssq->wqe_page_cnt); }
- spfc_build_wqe_owner_pmsn(io_sqe, !(ssq->last_pi_owner), ssq->last_cmsn); + spfc_build_wqe_owner_pmsn(io_sqe, !(ssq->last_pi_owner), ssq->last_pmsn); SPFC_IO_STAT((struct spfc_hba_info *)ssq->hba, wqe_type); spfc_convert_parent_wqe_to_big_endian(io_sqe); sqe_in_wp = (struct spfc_sqe *)spfc_get_wqe_page_entry(tail_wpg, ssq->wqe_offset); @@ -3371,7 +3371,7 @@ u32 spfc_parent_ssq_enqueue(struct spfc_parent_ssq_info *ssq, struct spfc_sqe *i ssq->accum_wqe_cnt = 0; } ssq->wqe_offset += 1; - ssq->last_cmsn = SPFC_MSN_INC(ssq->last_cmsn); + ssq->last_pmsn = SPFC_MSN_INC(ssq->last_pmsn); atomic_inc(&ssq->sq_wqe_cnt); atomic_inc(&ssq->sqe_minus_cqe_cnt); SPFC_SQ_IO_STAT(ssq, wqe_type); diff --git a/drivers/scsi/spfc/hw/spfc_queue.h b/drivers/scsi/spfc/hw/spfc_queue.h index b1184eb17556..c09f098e7324 100644 --- a/drivers/scsi/spfc/hw/spfc_queue.h +++ b/drivers/scsi/spfc/hw/spfc_queue.h @@ -597,7 +597,7 @@ struct spfc_parent_ssq_info { u32 wqe_offset; u16 head_start_cmsn; u16 head_end_cmsn; - u16 last_cmsn; + u16 last_pmsn; u16 last_pi_owner; u32 queue_style; atomic_t sq_valid;
From: Ye Weihua yeweihua4@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4IGQ2 CVE: NA
-------------------------------
Before a module is deleted, the kernel traverses its jump_label section to release the resources requested by each jump entry. Normal modules request these resources before module initialization, but livepatch modules request them during livepatch relocation.

Therefore, when a livepatch module fails to be inserted, the resources that jump_label would have requested were never requested, and a NULL pointer is dereferenced during resource release, causing a panic.

To solve this problem, skip the resource release if the jump label has not been relocated.
Signed-off-by: Ye Weihua yeweihua4@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/jump_label.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 282d81eb5aa4..7470cdc432a0 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -695,6 +695,9 @@ static void jump_label_del_module(struct module *mod) struct static_key *key = NULL; struct static_key_mod *jlm, **prev;
+ if (unlikely(!mod_klp_rel_completed(mod))) + return; + for (iter = iter_start; iter < iter_stop; iter++) { if (jump_entry_key(iter) == key) continue;
From: Ye Weihua yeweihua4@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4IH1G CVE: NA
---------------------------
Livepatch allows users to register hook functions that perform customized operations when the .ko is insmod'ed. During testing it was found that, on the ARM64 architecture, accessing global variables defined by another .ko from these hook functions causes a crash.

Since relocation is performed during livepatch insertion, the instructions in the icache become stale. If the stale instructions are fetched directly from the icache, incorrect addresses may be used, causing a crash. Therefore, flush the icache before calling the hook functions.
Signed-off-by: Ye Weihua yeweihua4@huawei.com Reviewed-by: Jian Cheng cj.chengjian@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com
Signed-off-by: Ye Weihua yeweihua4@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/moduleloader.h | 2 ++ kernel/livepatch/core.c | 1 + kernel/module.c | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 4fa67a8b2265..2d835b7dc918 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -96,6 +96,8 @@ void module_arch_cleanup(struct module *mod); /* Any cleanup before freeing mod->module_init */ void module_arch_freeing_init(struct module *mod);
+void flush_module_icache(const struct module *mod); + #if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) #include <linux/kasan.h> #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index de077785e507..1fde6ba196a4 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1214,6 +1214,7 @@ static int klp_init_patch(struct klp_patch *patch) goto out; }
+ flush_module_icache(patch->mod); set_mod_klp_rel_state(patch->mod, MODULE_KLP_REL_DONE); module_disable_ro(patch->mod); jump_label_apply_nops(patch->mod); diff --git a/kernel/module.c b/kernel/module.c index c5af21dcb873..e7b9ecc1aa34 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3523,7 +3523,7 @@ static int check_module_license_and_versions(struct module *mod) return 0; }
-static void flush_module_icache(const struct module *mod) +void flush_module_icache(const struct module *mod) { /* * Flush the instruction cache, since we've played with text.
From: Yang Yingliang yangyingliang@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZNO CVE: NA
-------------------------------------------------
Add the intimate_cpu member to struct iscsi_conn and the internal __WQ_DYNAMIC workqueue flag, which the following patches will use.
Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-By: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: BiaoXiang Ye yebiaoxiang@huawei.com Reviewed-by: fang yi eric.fangyi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/workqueue.h | 1 + include/scsi/libiscsi.h | 1 + 2 files changed, 2 insertions(+)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 26de0cae2a0a..0c35ad697a7b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -344,6 +344,7 @@ enum { __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ + __WQ_DYNAMIC = 1 << 25, /* internal: only support single work order WQ */
WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index e265b274f793..b47428d86a4b 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -237,6 +237,7 @@ struct iscsi_conn { /* custom statistics */ uint32_t eh_abort_cnt; uint32_t fmr_unalign_cnt; + int intimate_cpu; };
struct iscsi_pool {
From: Biaoxiang Ye yebiaoxiang@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZNO CVE: NA
-------------------------------------------------
Currently, a single-thread workqueue has only a single pwq, so all work items are queued to the same worker pool. This is not optimal on NUMA machines, because it causes workers to jump around across node boundaries.

This patch adds a new wq flag, __WQ_DYNAMIC. This new kind of single-thread workqueue creates a separate pwq covering the intersecting CPUs for each NUMA node that has online CPUs in @attrs->cpumask, instead of mapping all entries of numa_pwq_tbl[] to the same pwq. After this, we can specify the @cpu of queue_work_on(), so the work is executed on the same NUMA node as the specified @cpu. This kind of wq supports only a single work item; with multiple work items the ordering cannot be guaranteed.
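A minimal usage sketch (not part of this patch; the example_* names are made up, and the flag combination mirrors how the later iSCSI patch in this series uses __WQ_DYNAMIC):

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;
static struct work_struct example_work;

static void example_work_fn(struct work_struct *work)
{
	/* do the actual processing here */
}

static int example_init(void)
{
	/* single-work ordered wq; __WQ_DYNAMIC keeps one pwq per NUMA node */
	example_wq = alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM |
					     __WQ_DYNAMIC, "example_q");
	if (!example_wq)
		return -ENOMEM;
	INIT_WORK(&example_work, example_work_fn);
	return 0;
}

static void example_kick(int producer_cpu)
{
	/* run the work near the CPU that produced the data, if it is valid */
	if (producer_cpu >= 0 && cpu_possible(producer_cpu))
		queue_work_on(producer_cpu, example_wq, &example_work);
	else
		queue_work(example_wq, &example_work);
}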
Signed-off-by: Biaoxiang Ye yebiaoxiang@huawei.com Acked-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: fang yi eric.fangyi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/workqueue.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4cb622b2661b..29a677697fd4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3938,6 +3938,9 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. */ copy_workqueue_attrs(new_attrs, attrs); + if (wq->flags & __WQ_DYNAMIC) + new_attrs->no_numa = false; + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); if (unlikely(cpumask_empty(new_attrs->cpumask))) cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); @@ -4193,10 +4196,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) get_online_cpus(); if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); - /* there should only be single pwq for ordering guarantee */ - WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || - wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), - "ordering guarantee broken for workqueue %s\n", wq->name); + if (!(wq->flags & __WQ_DYNAMIC)) { + /* there should only be single pwq for ordering guarantee */ + WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || + wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + "ordering guarantee broken for workqueue %s\n", wq->name); + } } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } @@ -5288,7 +5293,7 @@ static int workqueue_apply_unbound_cpumask(void) if (!(wq->flags & WQ_UNBOUND)) continue; /* creating multiple pwqs breaks ordering guarantee */ - if (wq->flags & __WQ_ORDERED) + if ((wq->flags & __WQ_ORDERED) && !(wq->flags & __WQ_DYNAMIC)) continue;
ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
From: Biaoxiang Ye yebiaoxiang@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZNO CVE: NA
-------------------------------------------------
On aarch64 NUMA machines, the kworker created for iscsi always jumps around across node boundaries. If it runs on a different node, or even a different CPU package, from the softirq of the network interface, the memcpy within iscsi_tcp_segment_recv() is slowed down and iscsi performance becomes terrible.

In this patch, we track the CPU of the softirq and tell queue_work_on() to execute iscsi_xmitworker on the same NUMA node.
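A minimal sketch of the two halves, condensed from the hunks in the diff below (intimate_cpu is the field added to struct iscsi_conn by the earlier patch in this series):

/* softirq side, in the sk_data_ready callback: remember the delivering CPU */
if (!sock_owned_by_user_nocheck(sk))
	conn->intimate_cpu = smp_processor_id();

/* xmit side, in iscsi_conn_queue_work(): queue the work near that CPU */
if (conn->intimate_cpu >= 0 && cpu_possible(conn->intimate_cpu))
	queue_work_on(conn->intimate_cpu, ihost->workq, &conn->xmitwork);
else
	queue_work(ihost->workq, &conn->xmitwork);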
The performance data is as below. fio cmd:
fio -filename=/dev/disk/by-id/wwn-0x6883fd3100a2ad260036281700000000 -direct=1 -iodepth=32 -rw=read -bs=64k -size=30G -ioengine=libaio -numjobs=1 -group_reporting -name=mytest -time_based -ramp_time=60 -runtime=60

before patch:
Jobs: 1 (f=1): [R] [52.5% done] [852.3MB/0KB/0KB /s] [13.7K/0/0 iops] [eta 00m:57s]
Jobs: 1 (f=1): [R] [53.3% done] [861.4MB/0KB/0KB /s] [13.8K/0/0 iops] [eta 00m:56s]
Jobs: 1 (f=1): [R] [54.2% done] [868.2MB/0KB/0KB /s] [13.9K/0/0 iops] [eta 00m:55s]

after patch:
Jobs: 1 (f=1): [R] [53.3% done] [1070MB/0KB/0KB /s] [17.2K/0/0 iops] [eta 00m:56s]
Jobs: 1 (f=1): [R] [55.0% done] [1064MB/0KB/0KB /s] [17.3K/0/0 iops] [eta 00m:54s]
Jobs: 1 (f=1): [R] [56.7% done] [1069MB/0KB/0KB /s] [17.1K/0/0 iops] [eta 00m:52s]

cpu info:
Architecture:          aarch64
Byte Order:            Little Endian
CPU(s):                128
On-line CPU(s) list:   0-127
Thread(s) per core:    1
Core(s) per socket:    64
Socket(s):             2
NUMA node(s):          4
Model:                 0
CPU max MHz:           2600.0000
CPU min MHz:           200.0000
BogoMIPS:              200.00
L1d cache:             64K
L1i cache:             64K
L2 cache:              512K
L3 cache:              32768K
NUMA node0 CPU(s):     0-31
NUMA node1 CPU(s):     32-63
NUMA node2 CPU(s):     64-95
NUMA node3 CPU(s):     96-127
Signed-off-by: Biaoxiang Ye yebiaoxiang@huawei.com Acked-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: fang yi eric.fangyi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/scsi/iscsi_tcp.c | 8 ++++++++ drivers/scsi/libiscsi.c | 17 ++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index d39f812d9b92..a226a040647a 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -127,6 +127,7 @@ static void iscsi_sw_tcp_data_ready(struct sock *sk) struct iscsi_conn *conn; struct iscsi_tcp_conn *tcp_conn; read_descriptor_t rd_desc; + int current_cpu;
read_lock_bh(&sk->sk_callback_lock); conn = sk->sk_user_data; @@ -136,6 +137,13 @@ static void iscsi_sw_tcp_data_ready(struct sock *sk) } tcp_conn = conn->dd_data;
+ /* save intimate cpu when in softirq */ + if (!sock_owned_by_user_nocheck(sk)) { + current_cpu = smp_processor_id(); + if (conn->intimate_cpu != current_cpu) + conn->intimate_cpu = current_cpu; + } + /* * Use rd_desc to pass 'conn' to iscsi_tcp_recv. * We set count to 1 because we want the network layer to diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index ed6a6bbfef23..fa1cb988dfcf 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -87,9 +87,15 @@ inline void iscsi_conn_queue_work(struct iscsi_conn *conn) { struct Scsi_Host *shost = conn->session->host; struct iscsi_host *ihost = shost_priv(shost); + int intimate_cpu = conn->intimate_cpu;
- if (ihost->workq) - queue_work(ihost->workq, &conn->xmitwork); + if (ihost->workq) { + /* we expect it to be excuted on the same numa of the intimate cpu */ + if ((intimate_cpu >= 0) && cpu_possible(intimate_cpu)) + queue_work_on(intimate_cpu, ihost->workq, &conn->xmitwork); + else + queue_work(ihost->workq, &conn->xmitwork); + } } EXPORT_SYMBOL_GPL(iscsi_conn_queue_work);
@@ -2732,9 +2738,9 @@ struct Scsi_Host *iscsi_host_alloc(struct scsi_host_template *sht, if (xmit_can_sleep) { snprintf(ihost->workq_name, sizeof(ihost->workq_name), "iscsi_q_%d", shost->host_no); - ihost->workq = alloc_workqueue("%s", - WQ_SYSFS | __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_UNBOUND, - 1, ihost->workq_name); + /* this kind of workqueue only support single work */ + ihost->workq = alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | + __WQ_DYNAMIC, ihost->workq_name); if (!ihost->workq) goto free_host; } @@ -2986,6 +2992,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, conn->c_stage = ISCSI_CONN_INITIAL_STAGE; conn->id = conn_idx; conn->exp_statsn = 0; + conn->intimate_cpu = -1;
timer_setup(&conn->transport_timer, iscsi_check_transport_timeouts, 0);
From: Yanling Song songyl@ramaxel.com
Ramaxel inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4J0GH CVE: NA
--------------------------------------------
1. Solve the problem that the indirection table cannot be obtained with the "ethtool -x ethx" command.
2. Solve the system oops caused by the "ethtool -x ethx default" command.
Signed-off-by: Yanling Song songyl@ramaxel.com Reviewed-by: Xu Yun xuyun@ramaxel.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/ramaxel/spnic/spnic_ethtool.c | 2 ++ drivers/net/ethernet/ramaxel/spnic/spnic_rss.c | 5 +++++ drivers/net/ethernet/ramaxel/spnic/spnic_rss.h | 2 ++ 3 files changed, 9 insertions(+)
diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_ethtool.c b/drivers/net/ethernet/ramaxel/spnic/spnic_ethtool.c index 3f6f69b4cb34..dc49395c47d5 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_ethtool.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_ethtool.c @@ -939,6 +939,7 @@ static const struct ethtool_ops spnic_ethtool_ops = { .get_module_info = spnic_get_module_info, .get_module_eeprom = spnic_get_module_eeprom,
+ .get_rxfh_indir_size = spnic_get_rxfh_indir_size, .get_rxfh_key_size = spnic_get_rxfh_key_size, .get_rxfh = spnic_get_rxfh, .set_rxfh = spnic_set_rxfh, @@ -972,6 +973,7 @@ static const struct ethtool_ops spnicvf_ethtool_ops = { .get_channels = spnic_get_channels, .set_channels = spnic_set_channels,
+ .get_rxfh_indir_size = spnic_get_rxfh_indir_size, .get_rxfh_key_size = spnic_get_rxfh_key_size, .get_rxfh = spnic_get_rxfh, .set_rxfh = spnic_set_rxfh, diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_rss.c b/drivers/net/ethernet/ramaxel/spnic/spnic_rss.c index a0dcc1519262..86f6f92f669b 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_rss.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_rss.c @@ -681,6 +681,11 @@ static int set_rss_rxfh(struct net_device *netdev, const u32 *indir, const u8 *k return 0; }
+u32 spnic_get_rxfh_indir_size(struct net_device *netdev) +{ + return SPNIC_RSS_INDIR_SIZE; +} + u32 spnic_get_rxfh_key_size(struct net_device *netdev) { return SPNIC_RSS_KEY_SIZE; diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_rss.h b/drivers/net/ethernet/ramaxel/spnic/spnic_rss.h index 15c930a271be..e64a4dcf39dd 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_rss.h +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_rss.h @@ -39,6 +39,8 @@ void spnic_get_channels(struct net_device *netdev, struct ethtool_channels *chan
int spnic_set_channels(struct net_device *netdev, struct ethtool_channels *channels);
+u32 spnic_get_rxfh_indir_size(struct net_device *netdev); + u32 spnic_get_rxfh_key_size(struct net_device *netdev);
int spnic_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc);
From: Yanling Song songyl@ramaxel.com
Ramaxel inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4J0GH CVE: NA
-----------------------------------------------
Support configuring the DMA attribute table through the firmware.
Signed-off-by: Yanling Song songyl@ramaxel.com Reviewed-by: Xu Yunxuyun@ramaxel.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../ramaxel/spnic/hw/sphw_comm_msg_intf.h | 2 +- .../ethernet/ramaxel/spnic/hw/sphw_hw_comm.c | 27 ++++++ .../ethernet/ramaxel/spnic/hw/sphw_hw_comm.h | 3 + .../ethernet/ramaxel/spnic/hw/sphw_hwdev.c | 82 ++++--------------- 4 files changed, 46 insertions(+), 68 deletions(-)
diff --git a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_comm_msg_intf.h b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_comm_msg_intf.h index a1abbd054d2a..fd12a47e5bb5 100644 --- a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_comm_msg_intf.h +++ b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_comm_msg_intf.h @@ -135,7 +135,7 @@ struct comm_cmd_msix_config { u8 rsvd2[5]; };
-struct comm_cmd_dma_attr { +struct comm_cmd_dma_attr_config { struct mgmt_msg_head head;
u16 func_id; diff --git a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw_comm.c b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw_comm.c index 1629d1e480e2..b868bf8ed1cb 100644 --- a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw_comm.c +++ b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw_comm.c @@ -403,6 +403,33 @@ int sphw_set_ceq_ctrl_reg(struct sphw_hwdev *hwdev, u16 q_id, u32 ctrl0, u32 ctr return 0; }
+int sphw_set_dma_attr_tbl(struct sphw_hwdev *hwdev, u8 entry_idx, u8 st, u8 at, u8 ph, + u8 no_snooping, u8 tph_en) +{ + struct comm_cmd_dma_attr_config dma_attr; + u16 out_size = sizeof(dma_attr); + int err; + + memset(&dma_attr, 0, sizeof(dma_attr)); + dma_attr.func_id = sphw_global_func_id(hwdev); + dma_attr.entry_idx = entry_idx; + dma_attr.st = st; + dma_attr.at = at; + dma_attr.ph = ph; + dma_attr.no_snooping = no_snooping; + dma_attr.tph_en = tph_en; + + err = comm_msg_to_mgmt_sync(hwdev, COMM_MGMT_CMD_SET_DMA_ATTR, &dma_attr, sizeof(dma_attr), + &dma_attr, &out_size); + if (err || !out_size || dma_attr.head.status) { + sdk_err(hwdev->dev_hdl, "Failed to set dma_attr, err: %d, status: 0x%x, out_size: 0x%x\n", + err, dma_attr.head.status, out_size); + return -EIO; + } + + return 0; +} + int sphw_set_bdf_ctxt(void *hwdev, u8 bus, u8 device, u8 function) { struct comm_cmd_bdf_info bdf_info; diff --git a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw_comm.h b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw_comm.h index da37d3ed20ad..4e0cf2dfb21e 100644 --- a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw_comm.h +++ b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hw_comm.h @@ -34,6 +34,9 @@ int sphw_ppf_ext_db_deinit(void *dev);
int sphw_set_ceq_ctrl_reg(struct sphw_hwdev *hwdev, u16 q_id, u32 ctrl0, u32 ctrl1);
+int sphw_set_dma_attr_tbl(struct sphw_hwdev *hwdevm, u8 entry_idx, u8 st, u8 at, u8 ph, + u8 no_snooping, u8 tph_en); + int sphw_get_comm_features(void *hwdev, u64 *s_feature, u16 size); int sphw_set_comm_features(void *hwdev, u64 *s_feature, u16 size);
diff --git a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.c b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.c index f58b0325bf04..c88799bcda98 100644 --- a/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.c +++ b/drivers/net/ethernet/ramaxel/spnic/hw/sphw_hwdev.c @@ -366,89 +366,37 @@ void sphw_detect_hw_present(void *hwdev) }
/** - * set_pf_dma_attr_entry - set the dma attributes for entry - * @hwif: the hardware interface of a pci function device - * @entry_idx: the entry index in the dma table - * @st: PCIE TLP steering tag - * @at: PCIE TLP AT field - * @ph: PCIE TLP Processing Hint field - * @no_snooping: PCIE TLP No snooping - * @tph_en: PCIE TLP Processing Hint Enable + * dma_attr_table_init - initialize the default dma attributes + * @hwdev: the pointer to hw device **/ -static void set_pf_dma_attr_entry(struct sphw_hwdev *hwdev, u32 entry_idx, - u8 st, u8 at, u8 ph, - enum sphw_pcie_nosnoop no_snooping, - enum sphw_pcie_tph tph_en) +static int dma_attr_table_init(struct sphw_hwdev *hwdev) { - u32 addr, val, dma_attr_entry; + u32 addr, val, dst_attr;
/* Use indirect access should set entry_idx first*/ addr = SPHW_CSR_DMA_ATTR_INDIR_IDX_ADDR; val = sphw_hwif_read_reg(hwdev->hwif, addr); val = SPHW_DMA_ATTR_INDIR_IDX_CLEAR(val, IDX);
- entry_idx = SPHW_DMA_ATTR_INDIR_IDX_SET(entry_idx, IDX); - - val |= entry_idx; + val |= SPHW_DMA_ATTR_INDIR_IDX_SET(PCIE_MSIX_ATTR_ENTRY, IDX);
sphw_hwif_write_reg(hwdev->hwif, addr, val);
wmb(); /* write index before config */
addr = SPHW_CSR_DMA_ATTR_TBL_ADDR; - val = sphw_hwif_read_reg(hwdev->hwif, addr); - val = SPHW_DMA_ATTR_ENTRY_CLEAR(val, ST) & - SPHW_DMA_ATTR_ENTRY_CLEAR(val, AT) & - SPHW_DMA_ATTR_ENTRY_CLEAR(val, PH) & - SPHW_DMA_ATTR_ENTRY_CLEAR(val, NO_SNOOPING) & - SPHW_DMA_ATTR_ENTRY_CLEAR(val, TPH_EN); - - dma_attr_entry = SPHW_DMA_ATTR_ENTRY_SET(st, ST) | - SPHW_DMA_ATTR_ENTRY_SET(at, AT) | - SPHW_DMA_ATTR_ENTRY_SET(ph, PH) | - SPHW_DMA_ATTR_ENTRY_SET(no_snooping, NO_SNOOPING) | - SPHW_DMA_ATTR_ENTRY_SET(tph_en, TPH_EN); - - val |= dma_attr_entry; - sphw_hwif_write_reg(hwdev->hwif, addr, val); -} - -static int set_vf_dma_attr_entry(struct sphw_hwdev *hwdev, u8 entry_idx, - u8 st, u8 at, u8 ph, - enum sphw_pcie_nosnoop no_snooping, - enum sphw_pcie_tph tph_en) -{ - /* SPHW_MGMT_CMD_DMA_ATTR_SET */ - /*to do vf set dma attr by mpu*/ - return 0; -} - -/** - * dma_attr_table_init - initialize the default dma attributes - * @hwif: the hardware interface of a pci function device - **/ -static int dma_attr_table_init(struct sphw_hwdev *hwdev) -{ - int err = 0; - - /* TODO: check if set pf dma attr through uP, the same as vf */ - if (SPHW_IS_VF(hwdev)) - err = set_vf_dma_attr_entry(hwdev, PCIE_MSIX_ATTR_ENTRY, - SPHW_PCIE_ST_DISABLE, - SPHW_PCIE_AT_DISABLE, - SPHW_PCIE_PH_DISABLE, - SPHW_PCIE_SNOOP, - SPHW_PCIE_TPH_DISABLE); - else - set_pf_dma_attr_entry(hwdev, PCIE_MSIX_ATTR_ENTRY, - SPHW_PCIE_ST_DISABLE, - SPHW_PCIE_AT_DISABLE, - SPHW_PCIE_PH_DISABLE, - SPHW_PCIE_SNOOP, - SPHW_PCIE_TPH_DISABLE); + dst_attr = SPHW_DMA_ATTR_ENTRY_SET(SPHW_PCIE_ST_DISABLE, ST) | + SPHW_DMA_ATTR_ENTRY_SET(SPHW_PCIE_AT_DISABLE, AT) | + SPHW_DMA_ATTR_ENTRY_SET(SPHW_PCIE_PH_DISABLE, PH) | + SPHW_DMA_ATTR_ENTRY_SET(SPHW_PCIE_SNOOP, NO_SNOOPING) | + SPHW_DMA_ATTR_ENTRY_SET(SPHW_PCIE_TPH_DISABLE, TPH_EN); + if (dst_attr == val) + return 0;
- return err; + return sphw_set_dma_attr_tbl(hwdev, PCIE_MSIX_ATTR_ENTRY, SPHW_PCIE_ST_DISABLE, + SPHW_PCIE_AT_DISABLE, SPHW_PCIE_PH_DISABLE, + SPHW_PCIE_SNOOP, SPHW_PCIE_TPH_DISABLE); }
static int init_aeqs_msix_attr(struct sphw_hwdev *hwdev)
From: Yanling Song songyl@ramaxel.com
Ramaxel inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4J0GH CVE: NA
----------------------------------------
When the firmware does not support VXLAN offload, clear the CSUM and TSO feature flags.
Signed-off-by: Yanling Song songyl@ramaxel.com Reviewed-by: Xu Yun xuyun@ramaxel.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/ramaxel/spnic/spnic_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/ramaxel/spnic/spnic_main.c b/drivers/net/ethernet/ramaxel/spnic/spnic_main.c index fa57a912cd5e..f09f11488042 100644 --- a/drivers/net/ethernet/ramaxel/spnic/spnic_main.c +++ b/drivers/net/ethernet/ramaxel/spnic/spnic_main.c @@ -235,8 +235,11 @@ static void netdev_feature_init(struct net_device *netdev)
netdev->priv_flags |= IFF_UNICAST_FLT;
- netdev->hw_enc_features |= dft_fts | cso_fts; - netdev->hw_enc_features |= tso_fts | NETIF_F_TSO_ECN; + netdev->hw_enc_features |= dft_fts; + if (SPNIC_SUPPORT_VXLAN_OFFLOAD(nic_dev->hwdev)) { + netdev->hw_enc_features |= cso_fts; + netdev->hw_enc_features |= tso_fts | NETIF_F_TSO_ECN; + } }
static void init_intr_coal_param(struct spnic_nic_dev *nic_dev)
From: Xiangyou Xie xiexiangyou@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4J0W7 CVE: NA
-------------------------------------------------
Because dist->lpi_list_lock is a per-VM lock, when a virtual machine is configured with multiple virtual NIC devices that receive network packets at the same time, dist->lpi_list_lock becomes a performance bottleneck.

This patch increases the number of lpi_translation_cache lists to eight and hashes the ID of the CPU that executes irqfd_wakeup to choose which lpi_translation_cache to use.
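The lookup path then becomes (a sketch that mirrors the vgic_its_check_cache() hunk in the diff below): hash the current CPU to pick one of the LPI_TRANS_CACHES_NUM caches and take only that cache's own lock, instead of serializing every CPU on the per-VM lpi_list_lock.

int cpu = smp_processor_id();
int cacheid = cpu % LPI_TRANS_CACHES_NUM;

raw_spin_lock(&dist->lpi_translation_cache[cacheid].lpi_cache_lock);
irq = __vgic_its_check_cache(dist, db, devid, eventid, cacheid);
raw_spin_unlock(&dist->lpi_translation_cache[cacheid].lpi_cache_lock);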
I tested the impact on the virtual interrupt injection latency. Run the iperf command to send UDP packets to the VM:
iperf -c $IP -u -b 40m -l 64 -t 6000&
The VM just receives UDP traffic. When multiple NICs are configured, each NIC receives the above iperf UDP traffic, which reflects the performance impact of shared-resource contention, such as the lock.

Observe the delay of virtual interrupt injection: the time spent in the "irqfd_wakeup" and "irqfd_inject" functions and in the kworker context switch. The less, the better.

The ITS translation cache greatly reduces the delay of interrupt injection compared to the kworker thread, because it eliminates the wakeup and the uncertain scheduling delay:

            kworker     ITS translation cache    improved
1 NIC       6.692 us    1.766 us                 73.6%
10 NICs     7.536 us    2.574 us                 65.8%

Increasing the number of lpi_translation_cache lists reduces the lock contention, so concurrent injection of multiple interrupts performs better:

            ITS translation cache    with patch    improved
1 NIC       1.766 us                 1.694 us      4.1%
10 NICs     2.574 us                 1.848 us      28.2%
Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Hailiang Zhang zhang.zhanghailiang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Chaochao Xing xingchaochao@huawei.com Reviewed-by: Xiangyou Xie xiexiangyou@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kvm/vgic/vgic-init.c | 8 +- arch/arm64/kvm/vgic/vgic-its.c | 211 +++++++++++++++++++------------- include/kvm/arm_vgic.h | 13 +- 3 files changed, 146 insertions(+), 86 deletions(-)
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 32e32d67a127..980f20418a42 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -52,9 +52,15 @@ void kvm_vgic_early_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + raw_spinlock_t *lpi_lock; + int i;
INIT_LIST_HEAD(&dist->lpi_list_head); - INIT_LIST_HEAD(&dist->lpi_translation_cache); + for (i = 0; i < LPI_TRANS_CACHES_NUM; i++) { + lpi_lock = &dist->lpi_translation_cache[i].lpi_cache_lock; + INIT_LIST_HEAD(&dist->lpi_translation_cache[i].lpi_cache); + raw_spin_lock_init(lpi_lock); + } raw_spin_lock_init(&dist->lpi_list_lock); }
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 61728c543eb9..ff5880c3dc78 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -545,13 +545,21 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, return 0; }
+/* Default is 16 cached LPIs per vcpu */ +#define LPI_DEFAULT_PCPU_CACHE_SIZE 16 + static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist, phys_addr_t db, - u32 devid, u32 eventid) + u32 devid, u32 eventid, + int cacheid) { struct vgic_translation_cache_entry *cte; + struct vgic_irq *irq = NULL; + struct list_head *cache_head; + int pos = 0;
- list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { + cache_head = &dist->lpi_translation_cache[cacheid].lpi_cache; + list_for_each_entry(cte, cache_head, entry) { /* * If we hit a NULL entry, there is nothing after this * point. @@ -559,21 +567,25 @@ static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist, if (!cte->irq) break;
- if (cte->db != db || cte->devid != devid || - cte->eventid != eventid) - continue; + pos++;
- /* - * Move this entry to the head, as it is the most - * recently used. - */ - if (!list_is_first(&cte->entry, &dist->lpi_translation_cache)) - list_move(&cte->entry, &dist->lpi_translation_cache); + if (cte->devid == devid && + cte->eventid == eventid && + cte->db == db) { + /* + * Move this entry to the head if the entry at the + * position behind the LPI_DEFAULT_PCPU_CACHE_SIZE * 2 + * of the LRU list, as it is the most recently used. + */ + if (pos > LPI_DEFAULT_PCPU_CACHE_SIZE * 2) + list_move(&cte->entry, cache_head);
- return cte->irq; + irq = cte->irq; + break; + } }
- return NULL; + return irq; }
static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, @@ -581,11 +593,15 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; - unsigned long flags; + int cpu; + int cacheid;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - irq = __vgic_its_check_cache(dist, db, devid, eventid); - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + cpu = smp_processor_id(); + cacheid = cpu % LPI_TRANS_CACHES_NUM; + + raw_spin_lock(&dist->lpi_translation_cache[cacheid].lpi_cache_lock); + irq = __vgic_its_check_cache(dist, db, devid, eventid, cacheid); + raw_spin_unlock(&dist->lpi_translation_cache[cacheid].lpi_cache_lock);
return irq; } @@ -598,49 +614,58 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, struct vgic_translation_cache_entry *cte; unsigned long flags; phys_addr_t db; + raw_spinlock_t *lpi_lock; + struct list_head *cache_head; + int cacheid;
/* Do not cache a directly injected interrupt */ if (irq->hw) return;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - - if (unlikely(list_empty(&dist->lpi_translation_cache))) - goto out; - - /* - * We could have raced with another CPU caching the same - * translation behind our back, so let's check it is not in - * already - */ - db = its->vgic_its_base + GITS_TRANSLATER; - if (__vgic_its_check_cache(dist, db, devid, eventid)) - goto out; - - /* Always reuse the last entry (LRU policy) */ - cte = list_last_entry(&dist->lpi_translation_cache, - typeof(*cte), entry); + for (cacheid = 0; cacheid < LPI_TRANS_CACHES_NUM; cacheid++) { + lpi_lock = &dist->lpi_translation_cache[cacheid].lpi_cache_lock; + cache_head = &dist->lpi_translation_cache[cacheid].lpi_cache; + raw_spin_lock_irqsave(lpi_lock, flags); + if (unlikely(list_empty(cache_head))) { + raw_spin_unlock_irqrestore(lpi_lock, flags); + break; + }
- /* - * Caching the translation implies having an extra reference - * to the interrupt, so drop the potential reference on what - * was in the cache, and increment it on the new interrupt. - */ - if (cte->irq) - __vgic_put_lpi_locked(kvm, cte->irq); + /* + * We could have raced with another CPU caching the same + * translation behind our back, so let's check it is not in + * already + */ + db = its->vgic_its_base + GITS_TRANSLATER; + if (__vgic_its_check_cache(dist, db, devid, eventid, cacheid)) { + raw_spin_unlock_irqrestore(lpi_lock, flags); + continue; + }
- vgic_get_irq_kref(irq); + /* Always reuse the last entry (LRU policy) */ + cte = list_last_entry(cache_head, typeof(*cte), entry);
- cte->db = db; - cte->devid = devid; - cte->eventid = eventid; - cte->irq = irq; + /* + * Caching the translation implies having an extra reference + * to the interrupt, so drop the potential reference on what + * was in the cache, and increment it on the new interrupt. + */ + if (cte->irq) { + raw_spin_lock(&dist->lpi_list_lock); + __vgic_put_lpi_locked(kvm, cte->irq); + raw_spin_unlock(&dist->lpi_list_lock); + } + vgic_get_irq_kref(irq);
- /* Move the new translation to the head of the list */ - list_move(&cte->entry, &dist->lpi_translation_cache); + cte->db = db; + cte->devid = devid; + cte->eventid = eventid; + cte->irq = irq;
-out: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + /* Move the new translation to the head of the list */ + list_move(&cte->entry, cache_head); + raw_spin_unlock_irqrestore(lpi_lock, flags); + } }
void vgic_its_invalidate_cache(struct kvm *kvm) @@ -648,22 +673,29 @@ void vgic_its_invalidate_cache(struct kvm *kvm) struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_translation_cache_entry *cte; unsigned long flags; + raw_spinlock_t *lpi_lock; + struct list_head *cache_head; + int i;
- raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - - list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { - /* - * If we hit a NULL entry, there is nothing after this - * point. - */ - if (!cte->irq) - break; - - __vgic_put_lpi_locked(kvm, cte->irq); - cte->irq = NULL; + for (i = 0; i < LPI_TRANS_CACHES_NUM; i++) { + lpi_lock = &dist->lpi_translation_cache[i].lpi_cache_lock; + cache_head = &dist->lpi_translation_cache[i].lpi_cache; + raw_spin_lock_irqsave(lpi_lock, flags); + list_for_each_entry(cte, cache_head, entry) { + /* + * If we hit a NULL entry, there is nothing after this + * point. + */ + if (!cte->irq) + break; + + raw_spin_lock(&dist->lpi_list_lock); + __vgic_put_lpi_locked(kvm, cte->irq); + raw_spin_unlock(&dist->lpi_list_lock); + cte->irq = NULL; + } + raw_spin_unlock_irqrestore(lpi_lock, flags); } - - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); }
int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, @@ -1829,30 +1861,34 @@ static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its, return ret; }
-/* Default is 16 cached LPIs per vcpu */ -#define LPI_DEFAULT_PCPU_CACHE_SIZE 16 - void vgic_lpi_translation_cache_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; unsigned int sz; + struct list_head *cache_head; int i; + int cacheid;
- if (!list_empty(&dist->lpi_translation_cache)) - return; + for (cacheid = 0; cacheid < LPI_TRANS_CACHES_NUM; cacheid++) { + cache_head = &dist->lpi_translation_cache[cacheid].lpi_cache; + if (!list_empty(cache_head)) + return; + }
sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
- for (i = 0; i < sz; i++) { - struct vgic_translation_cache_entry *cte; - - /* An allocation failure is not fatal */ - cte = kzalloc(sizeof(*cte), GFP_KERNEL); - if (WARN_ON(!cte)) - break; - - INIT_LIST_HEAD(&cte->entry); - list_add(&cte->entry, &dist->lpi_translation_cache); + for (cacheid = 0; cacheid < LPI_TRANS_CACHES_NUM; cacheid++) { + cache_head = &dist->lpi_translation_cache[cacheid].lpi_cache; + for (i = 0; i < sz; i++) { + struct vgic_translation_cache_entry *cte; + + /* An allocation failure is not fatal */ + cte = kzalloc(sizeof(*cte), GFP_KERNEL); + if (WARN_ON(!cte)) + break; + INIT_LIST_HEAD(&cte->entry); + list_add(&cte->entry, cache_head); + } } }
@@ -1860,13 +1896,22 @@ void vgic_lpi_translation_cache_destroy(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_translation_cache_entry *cte, *tmp; + unsigned long flags; + raw_spinlock_t *lpi_lock; + struct list_head *cache_head; + int cacheid;
vgic_its_invalidate_cache(kvm);
- list_for_each_entry_safe(cte, tmp, - &dist->lpi_translation_cache, entry) { - list_del(&cte->entry); - kfree(cte); + for (cacheid = 0; cacheid < LPI_TRANS_CACHES_NUM; cacheid++) { + lpi_lock = &dist->lpi_translation_cache[cacheid].lpi_cache_lock; + cache_head = &dist->lpi_translation_cache[cacheid].lpi_cache; + raw_spin_lock_irqsave(lpi_lock, flags); + list_for_each_entry_safe(cte, tmp, cache_head, entry) { + list_del(&cte->entry); + kfree(cte); + } + raw_spin_unlock_irqrestore(lpi_lock, flags); } }
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 3d74f1060bd1..c82c9c76e000 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -33,6 +33,9 @@ #define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \ (irq) <= VGIC_MAX_SPI)
+/*The number of lpi translation cache lists*/ +#define LPI_TRANS_CACHES_NUM 8 + enum vgic_type { VGIC_V2, /* Good ol' GICv2 */ VGIC_V3, /* New fancy GICv3 */ @@ -163,6 +166,12 @@ struct vgic_io_device { struct kvm_io_device dev; };
+struct its_trans_cache { + /* LPI translation cache */ + struct list_head lpi_cache; + raw_spinlock_t lpi_cache_lock; +}; + struct vgic_its { /* The base address of the ITS control register frame */ gpa_t vgic_its_base; @@ -253,8 +262,8 @@ struct vgic_dist { struct list_head lpi_list_head; int lpi_list_count;
- /* LPI translation cache */ - struct list_head lpi_translation_cache; + /* LPI translation cache array*/ + struct its_trans_cache lpi_translation_cache[LPI_TRANS_CACHES_NUM];
/* used by vgic-debug */ struct vgic_state_iter *iter;
From: Xiangyou Xie xiexiangyou@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4J0W7 CVE: NA
-------------------------------------------------
It is not necessary to invalidate the LPI translation cache when the virtual machine executes the MOVI command to adjust the affinity of an interrupt. Irqbalance adjusts the interrupt affinity at short intervals to balance the interrupt load, but this does not affect the contents of the LPI translation cache.
Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Hailiang Zhang zhang.zhanghailiang@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Chaochao Xing xingchaochao@huawei.com Reviewed-by: Xiangyou Xie xiexiangyou@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kvm/vgic/vgic-its.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index ff5880c3dc78..ec6d5b9fc57c 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -921,7 +921,8 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, ite->collection = collection; vcpu = kvm_get_vcpu(kvm, collection->target_addr);
- vgic_its_invalidate_cache(kvm); + if (!vcpu->arch.vgic_cpu.lpis_enabled) + vgic_its_invalidate_cache(kvm);
return update_affinity(ite->irq, vcpu); }
From: Yiwen Jiang jiangyiwen@huawei.com
euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4J0W7 CVE: NA
-------------------------------------------------
lpi_cache_lock can be taken in irq context, so the irqsave spinlock variants should be used.
Signed-off-by: Yiwen Jiang jiangyiwen@huawei.com Reviewed-by: Hailiang Zhang zhang.zhanghailiang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Chaochao Xing xingchaochao@huawei.com Reviewed-by: Xiangyou Xie xiexiangyou@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kvm/vgic/vgic-its.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index ec6d5b9fc57c..4f31880803d4 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -593,15 +593,16 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; + unsigned long flags; int cpu; int cacheid;
cpu = smp_processor_id(); cacheid = cpu % LPI_TRANS_CACHES_NUM;
- raw_spin_lock(&dist->lpi_translation_cache[cacheid].lpi_cache_lock); + raw_spin_lock_irqsave(&dist->lpi_translation_cache[cacheid].lpi_cache_lock, flags); irq = __vgic_its_check_cache(dist, db, devid, eventid, cacheid); - raw_spin_unlock(&dist->lpi_translation_cache[cacheid].lpi_cache_lock); + raw_spin_unlock_irqrestore(&dist->lpi_translation_cache[cacheid].lpi_cache_lock, flags);
return irq; }
From: Yicong Yang yangyicong@hisilicon.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4J9B0?from=project-issue
------------------------------------------------------------------------
Build HiSilicon I2C/SPI/SFC driver as module.
Signed-off-by: Yicong Yang yangyicong@hisilicon.com Reviewed-by: Jay Fang f.fangjian@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 76d6a118330d..34aa2da5bf15 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -3378,6 +3378,7 @@ CONFIG_I2C_DESIGNWARE_PLATFORM=m # CONFIG_I2C_EMEV2 is not set CONFIG_I2C_GPIO=m # CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_HISI=m # CONFIG_I2C_NOMADIK is not set # CONFIG_I2C_OCORES is not set CONFIG_I2C_PCA_PLATFORM=m @@ -3431,7 +3432,8 @@ CONFIG_SPI_DESIGNWARE=y # CONFIG_SPI_DW_DMA is not set CONFIG_SPI_DW_PCI=m CONFIG_SPI_DW_MMIO=y -# CONFIG_SPI_HISI_SFC_V3XX is not set +CONFIG_SPI_HISI_KUNPENG=m +CONFIG_SPI_HISI_SFC_V3XX=m # CONFIG_SPI_NXP_FLEXSPI is not set # CONFIG_SPI_GPIO is not set # CONFIG_SPI_FSL_SPI is not set
From: Zenghui Yu yuzenghui@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZPY CVE: NA
-------------------------------------------------
Currently, we use trace_kvm_exit() to report the exception type (e.g., "IRQ", "TRAP") and the exception class (ESR_ELx's bits[31:26]) together. But the hardware only saves the exit class to ESR_ELx on synchronous exceptions, not on asynchronous exceptions. When the guest exits due to external interrupts, we get tracing output like:
"kvm_exit: IRQ: HSR_EC: 0x0000 (UNKNOWN), PC: 0xffff87259e30"
Obviously, "HSR_EC" here is meaningless.
This patch splits "exit" and "trap" events by adding two tracepoints explicitly in handle_trap_exceptions(). Let trace_kvm_exit() report VM exit events, and trace_kvm_trap_exit() report VM trap events.
These tracepoints are also adjusted in preparation for supporting 'perf kvm stat' on arm64.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Hailiang Zhang zhang.zhanghailiang@huawei.com Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Link: https://lore.kernel.org/r/1560330526-15468-3-git-send-email-yuzenghui@huawei... Link: https://gitee.com/openeuler/kernel/commit/14b85d8d7d2d Reviewed-by: Yanan Wang wangyanan55@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kvm/arm.c | 4 ++-- arch/arm64/kvm/handle_exit.c | 3 +++ arch/arm64/kvm/trace_arm.h | 21 +++++++++--------- arch/arm64/kvm/trace_handle_exit.h | 35 ++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 12 deletions(-)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1dc68282c840..d27aa51a163b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -836,7 +836,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /************************************************************** * Enter the guest */ - trace_kvm_entry(*vcpu_pc(vcpu)); + trace_kvm_entry(vcpu->vcpu_id, *vcpu_pc(vcpu)); guest_enter_irqoff();
ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); @@ -894,7 +894,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * guest time. */ guest_exit(); - trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + trace_kvm_exit(vcpu->vcpu_id, ret, *vcpu_pc(vcpu));
/* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 00d7ece1727d..a3837c9f3521 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -232,7 +232,10 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu) exit_handle_fn exit_handler;
exit_handler = kvm_get_exit_handler(vcpu); + trace_kvm_trap_enter(vcpu->vcpu_id, + kvm_vcpu_trap_get_class(vcpu)); handled = exit_handler(vcpu); + trace_kvm_trap_exit(vcpu->vcpu_id); }
return handled; diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index 6fb8b11abdc2..d5edb4cb217b 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -12,40 +12,41 @@ * Tracepoints for entry/exit to guest */ TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned long vcpu_pc), - TP_ARGS(vcpu_pc), + TP_PROTO(unsigned int vcpu_id, unsigned long vcpu_pc), + TP_ARGS(vcpu_id, vcpu_pc),
TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) __field( unsigned long, vcpu_pc ) ),
TP_fast_assign( + __entry->vcpu_id = vcpu_id; __entry->vcpu_pc = vcpu_pc; ),
- TP_printk("PC: 0x%016lx", __entry->vcpu_pc) + TP_printk("VCPU %u: PC=0x%016lx", __entry->vcpu_id, __entry->vcpu_pc) );
TRACE_EVENT(kvm_exit, - TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), - TP_ARGS(ret, esr_ec, vcpu_pc), + TP_PROTO(unsigned int vcpu_id, int ret, unsigned long vcpu_pc), + TP_ARGS(vcpu_id, ret, vcpu_pc),
TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) __field( int, ret ) - __field( unsigned int, esr_ec ) __field( unsigned long, vcpu_pc ) ),
TP_fast_assign( + __entry->vcpu_id = vcpu_id; __entry->ret = ARM_EXCEPTION_CODE(ret); - __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0; __entry->vcpu_pc = vcpu_pc; ),
- TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%016lx", + TP_printk("VCPU %u: exit_type=%s, PC=0x%016lx", + __entry->vcpu_id, __print_symbolic(__entry->ret, kvm_arm_exception_type), - __entry->esr_ec, - __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), __entry->vcpu_pc) );
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 8d78acc4fba7..486721fb6bda 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -8,6 +8,41 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm
+TRACE_EVENT(kvm_trap_enter, + TP_PROTO(unsigned int vcpu_id, unsigned int esr_ec), + TP_ARGS(vcpu_id, esr_ec), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(unsigned int, esr_ec) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->esr_ec = esr_ec; + ), + + TP_printk("VCPU %u: HSR_EC=0x%04x (%s)", + __entry->vcpu_id, + __entry->esr_ec, + __print_symbolic(__entry->esr_ec, kvm_arm_exception_class)) +); + +TRACE_EVENT(kvm_trap_exit, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("VCPU %u", __entry->vcpu_id) +); + TRACE_EVENT(kvm_wfx_arm64, TP_PROTO(unsigned long vcpu_pc, bool is_wfe), TP_ARGS(vcpu_pc, is_wfe),
From: Zenghui Yu yuzenghui@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZPY CVE: NA
-------------------------------------------------
'perf kvm stat report/record' generates a statistical analysis of KVM events and can be used to analyze guest exit reasons. This patch tries to add stat support on arm64.
We have a mapping between the guest's "exit_code" and "exit_reason" that already exists under arch/arm64/include/asm/ (kvm_arm_exception_type), and we have used it to report the guest's exit type through trace_kvm_exit(). Copy kvm_arm_exception_type into aarch64_guest_exits.h, thus exporting it to userspace.
It records on the two available KVM tracepoints for arm64, "kvm:kvm_entry" and "kvm:kvm_exit", and reports statistical data that includes event handling time, samples, and so on.
A simple test is shown below:
# pgrep qemu
6039
9937
# ./tools/perf/perf kvm stat record -p 6039
[ perf record: Woken up 3 times to write data ]
[ perf record: Captured and wrote 15.629 MB perf.data.guest (199063 samples) ]
# ./tools/perf/perf kvm stat report --event=vmexit
Analyze events for all VMs, all VCPUs:
             VM-EXIT    Samples  Samples%     Time%    Min Time    Max Time         Avg time

                TRAP      49040    97.15%   100.00%      2.60us   4072.98us   3431.60us ( +- 0.17% )
                 IRQ       1437     2.85%     0.00%      0.90us     24.56us      2.06us ( +- 1.37% )
Total Samples:50477, Total events handled time:168288630.04us.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Hailiang Zhang zhang.zhanghailiang@huawei.com Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Link: https://lore.kernel.org/r/1560330526-15468-5-git-send-email-yuzenghui@huawei... Link: https://gitee.com/openeuler/kernel/commit/15db05576381 Reviewed-by: Yanan Wang wangyanan55@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/perf/arch/arm64/Makefile | 2 + tools/perf/arch/arm64/util/Build | 1 + .../arch/arm64/util/aarch64_guest_exits.h | 27 +++++++++++ tools/perf/arch/arm64/util/kvm-stat.c | 46 +++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 tools/perf/arch/arm64/util/aarch64_guest_exits.h create mode 100644 tools/perf/arch/arm64/util/kvm-stat.c
diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile index dbef716a1913..172146e95dbc 100644 --- a/tools/perf/arch/arm64/Makefile +++ b/tools/perf/arch/arm64/Makefile @@ -2,6 +2,8 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 endif + +HAVE_KVM_STAT_SUPPORT := 1 PERF_HAVE_JITDUMP := 1 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build index b53294d74b01..8d2b9bcfffca 100644 --- a/tools/perf/arch/arm64/util/Build +++ b/tools/perf/arch/arm64/util/Build @@ -2,6 +2,7 @@ perf-y += header.o perf-y += machine.o perf-y += perf_regs.o perf-y += tsc.o +perf-y += kvm-stat.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/arm64/util/aarch64_guest_exits.h b/tools/perf/arch/arm64/util/aarch64_guest_exits.h new file mode 100644 index 000000000000..aec2e6e012d3 --- /dev/null +++ b/tools/perf/arch/arm64/util/aarch64_guest_exits.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright(c) 2019 Huawei Technologies Co., Ltd + */ + +#ifndef ARCH_PERF_AARCH64_GUEST_EXITS_H +#define ARCH_PERF_AARCH64_GUEST_EXITS_H + +/* virt.h */ +/* Error returned when an invalid stub number is passed into x0 */ +#define HVC_STUB_ERR 0xbadca11 + +/* kvm_asm.h */ +#define ARM_EXCEPTION_IRQ 0 +#define ARM_EXCEPTION_EL1_SERROR 1 +#define ARM_EXCEPTION_TRAP 2 +#define ARM_EXCEPTION_IL 3 +/* The hyp-stub will return this for any kvm_call_hyp() call */ +#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR + +#define kvm_arm_exception_type \ + {ARM_EXCEPTION_IRQ, "IRQ" }, \ + {ARM_EXCEPTION_EL1_SERROR, "SERROR" }, \ + {ARM_EXCEPTION_TRAP, "TRAP" }, \ + {ARM_EXCEPTION_HYP_GONE, "HYP_GONE" } + +#endif /* ARCH_PERF_AARCH64_GUEST_EXITS_H */ diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c new file mode 100644 index 000000000000..2fed20370829 --- /dev/null +++ b/tools/perf/arch/arm64/util/kvm-stat.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Arch specific functions for perf kvm stat. + * Copyright(c) 2019 Huawei Technologies Co., Ltd + */ + +#include "../../../util/kvm-stat.h" +#include "aarch64_guest_exits.h" + +define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type); + +static struct kvm_events_ops exit_events = { + .is_begin_event = exit_event_begin, + .is_end_event = exit_event_end, + .decode_key = exit_event_decode_key, + .name = "VM-EXIT" +}; + +const char *vcpu_id_str = "vcpu_id"; +const int decode_str_len = 20; +const char *kvm_exit_reason = "ret"; +const char *kvm_entry_trace = "kvm:kvm_entry"; +const char *kvm_exit_trace = "kvm:kvm_exit"; + +const char *kvm_events_tp[] = { + "kvm:kvm_entry", + "kvm:kvm_exit", + NULL, +}; + +struct kvm_reg_events_ops kvm_reg_events_ops[] = { + { .name = "vmexit", .ops = &exit_events }, + { NULL, NULL }, +}; + +const char * const kvm_skip_events[] = { + NULL, +}; + +int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) +{ + kvm->exit_reasons = arm64_exit_reasons; + kvm->exit_reasons_isa = "aarch64"; + + return 0; +}
From: Zenghui Yu yuzenghui@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZPY CVE: NA
-------------------------------------------------
When a guest exits due to "TRAP", we can analyze the guest exit reasons in more depth. Enhance perf kvm stat to record and analyze VM TRAP events.
There is a mapping between the guest's "trap_code" (ESR_ELx bits [31:26]) and the "trap_reason" - kvm_arm_exception_class. Copy it from the kernel into aarch64_guest_exits.h to export it to userspace.
This patch records two new KVM tracepoints: "kvm:kvm_trap_enter" and "kvm:kvm_trap_exit", and reports statistical data between these two tracepoints.
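For illustration, the mapping can be thought of as a small lookup from the trapped exception class to a printable name. Below is a standalone sketch (the EC values mirror a few ESR_ELx_EC_* definitions added in the header further down; the trap_name() helper is a hypothetical stand-in, not the actual perf code):

    #include <stdio.h>

    struct trap_reason { unsigned long ec; const char *name; };

    /* A few exception classes, matching ESR_ELx_EC_* below. */
    static const struct trap_reason reasons[] = {
            { 0x01, "WFx" },
            { 0x18, "SYS64" },
            { 0x20, "IABT_LOW" },
            { 0x24, "DABT_LOW" },
    };

    static const char *trap_name(unsigned long esr_ec)
    {
            unsigned int i;

            for (i = 0; i < sizeof(reasons) / sizeof(reasons[0]); i++)
                    if (reasons[i].ec == esr_ec)
                            return reasons[i].name;
            return "UNKNOWN";
    }

    int main(void)
    {
            printf("%s\n", trap_name(0x01)); /* prints "WFx" */
            return 0;
    }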
A simple test goes as follows:
# ./tools/perf/perf kvm stat record -p 20763
[ perf record: Woken up 92 times to write data ]
[ perf record: Captured and wrote 203.727 MB perf.data.guest (2601786 samples) ]
# ./tools/perf/perf kvm stat report --event=vmexit
Analyze events for all VMs, all VCPUs:
VM-EXIT Samples Samples% Time% Min Time Max Time Avg time
TRAP      640931    97.12%    100.00%      2.44us   14683.86us    3446.49us ( +- 0.05% )
IRQ        19019     2.88%      0.00%      0.90us     461.94us       2.12us ( +- 2.09% )
Total Samples:659950, Total events handled time:2209005391.30us.
# ./tools/perf/perf kvm stat report --event=trap
Analyze events for all VMs, all VCPUs:
TRAP-EVENT Samples Samples% Time% Min Time Max Time Avg time
WFx       601194    93.80%     99.98%      0.90us    4294.04us    3671.01us ( +- 0.03% )
SYS64      33714     5.26%      0.01%      1.10us      41.34us       5.68us ( +- 0.18% )
DABT_LOW    6014     0.94%      0.00%      1.12us      18.04us       2.57us ( +- 0.91% )
IABT_LOW      12     0.00%      0.01%  12597.76us   14679.96us   12893.61us ( +- 1.34% )
Total Samples:640934, Total events handled time:2207353434.56us.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Hailiang Zhang zhang.zhanghailiang@huawei.com Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Link: https://lore.kernel.org/r/1560330526-15468-6-git-send-email-yuzenghui@huawei... Link: https://gitee.com/openeuler/kernel/commit/59634497418b Reviewed-by: Yanan Wang wangyanan55@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../arch/arm64/util/aarch64_guest_exits.h | 72 +++++++++++++++++++ tools/perf/arch/arm64/util/kvm-stat.c | 68 ++++++++++++++++++ 2 files changed, 140 insertions(+)
diff --git a/tools/perf/arch/arm64/util/aarch64_guest_exits.h b/tools/perf/arch/arm64/util/aarch64_guest_exits.h index aec2e6e012d3..76e8f0358182 100644 --- a/tools/perf/arch/arm64/util/aarch64_guest_exits.h +++ b/tools/perf/arch/arm64/util/aarch64_guest_exits.h @@ -24,4 +24,76 @@ {ARM_EXCEPTION_TRAP, "TRAP" }, \ {ARM_EXCEPTION_HYP_GONE, "HYP_GONE" }
+/* esr.h */ +#define ESR_ELx_EC_UNKNOWN (0x00) +#define ESR_ELx_EC_WFx (0x01) +/* Unallocated EC: 0x02 */ +#define ESR_ELx_EC_CP15_32 (0x03) +#define ESR_ELx_EC_CP15_64 (0x04) +#define ESR_ELx_EC_CP14_MR (0x05) +#define ESR_ELx_EC_CP14_LS (0x06) +#define ESR_ELx_EC_FP_ASIMD (0x07) +#define ESR_ELx_EC_CP10_ID (0x08) /* EL2 only */ +#define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ +/* Unallocated EC: 0x0A - 0x0B */ +#define ESR_ELx_EC_CP14_64 (0x0C) +#define ESR_ELx_EC_BTI (0x0D) +#define ESR_ELx_EC_ILL (0x0E) +/* Unallocated EC: 0x0F - 0x10 */ +#define ESR_ELx_EC_SVC32 (0x11) +#define ESR_ELx_EC_HVC32 (0x12) /* EL2 only */ +#define ESR_ELx_EC_SMC32 (0x13) /* EL2 and above */ +/* Unallocated EC: 0x14 */ +#define ESR_ELx_EC_SVC64 (0x15) +#define ESR_ELx_EC_HVC64 (0x16) /* EL2 and above */ +#define ESR_ELx_EC_SMC64 (0x17) /* EL2 and above */ +#define ESR_ELx_EC_SYS64 (0x18) +#define ESR_ELx_EC_SVE (0x19) +#define ESR_ELx_EC_ERET (0x1a) /* EL2 only */ +/* Unallocated EC: 0x1B */ +#define ESR_ELx_EC_FPAC (0x1C) /* EL1 and above */ +/* Unallocated EC: 0x1D - 0x1E */ +#define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ +#define ESR_ELx_EC_IABT_LOW (0x20) +#define ESR_ELx_EC_IABT_CUR (0x21) +#define ESR_ELx_EC_PC_ALIGN (0x22) +/* Unallocated EC: 0x23 */ +#define ESR_ELx_EC_DABT_LOW (0x24) +#define ESR_ELx_EC_DABT_CUR (0x25) +#define ESR_ELx_EC_SP_ALIGN (0x26) +/* Unallocated EC: 0x27 */ +#define ESR_ELx_EC_FP_EXC32 (0x28) +/* Unallocated EC: 0x29 - 0x2B */ +#define ESR_ELx_EC_FP_EXC64 (0x2C) +/* Unallocated EC: 0x2D - 0x2E */ +#define ESR_ELx_EC_SERROR (0x2F) +#define ESR_ELx_EC_BREAKPT_LOW (0x30) +#define ESR_ELx_EC_BREAKPT_CUR (0x31) +#define ESR_ELx_EC_SOFTSTP_LOW (0x32) +#define ESR_ELx_EC_SOFTSTP_CUR (0x33) +#define ESR_ELx_EC_WATCHPT_LOW (0x34) +#define ESR_ELx_EC_WATCHPT_CUR (0x35) +/* Unallocated EC: 0x36 - 0x37 */ +#define ESR_ELx_EC_BKPT32 (0x38) +/* Unallocated EC: 0x39 */ +#define ESR_ELx_EC_VECTOR32 (0x3A) /* EL2 only */ +/* Unallocated EC: 0x3B */ +#define ESR_ELx_EC_BRK64 (0x3C) +/* Unallocated EC: 0x3D - 0x3F */ +#define ESR_ELx_EC_MAX (0x3F) + +/* kvm_arm.h */ +#define ECN(x) { ESR_ELx_EC_##x, #x } + +#define kvm_arm_exception_class \ + ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \ + ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(PAC), ECN(CP14_64), \ + ECN(SVC64), ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(SVE), \ + ECN(IMP_DEF), ECN(IABT_LOW), ECN(IABT_CUR), \ + ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \ + ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \ + ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \ + ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \ + ECN(BKPT32), ECN(VECTOR32), ECN(BRK64) + #endif /* ARCH_PERF_AARCH64_GUEST_EXITS_H */ diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c index 2fed20370829..a0a97073d2d1 100644 --- a/tools/perf/arch/arm64/util/kvm-stat.c +++ b/tools/perf/arch/arm64/util/kvm-stat.c @@ -4,10 +4,14 @@ * Copyright(c) 2019 Huawei Technologies Co., Ltd */
+#include <string.h> +#include "../../../util/debug.h" +#include "../../../util/evsel.h" #include "../../../util/kvm-stat.h" #include "aarch64_guest_exits.h"
define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type); +define_exit_reasons_table(arm64_trap_reasons, kvm_arm_exception_class);
static struct kvm_events_ops exit_events = { .is_begin_event = exit_event_begin, @@ -22,14 +26,78 @@ const char *kvm_exit_reason = "ret"; const char *kvm_entry_trace = "kvm:kvm_entry"; const char *kvm_exit_trace = "kvm:kvm_exit";
+const char *kvm_trap_reason = "esr_ec"; +const char *kvm_trap_enter_trace = "kvm:kvm_trap_enter"; +const char *kvm_trap_exit_trace = "kvm:kvm_trap_exit"; + +static void trap_event_get_key(struct evsel *evsel, + struct perf_sample *sample, + struct event_key *key) +{ + key->info = 0; + key->key = evsel__intval(evsel, sample, kvm_trap_reason); +} + +static const char *get_trap_reason(u64 exit_code) +{ + struct exit_reasons_table *tbl = arm64_trap_reasons; + + while (tbl->reason != NULL) { + if (tbl->exit_code == exit_code) + return tbl->reason; + tbl++; + } + + pr_err("Unknown kvm trap exit code: %lld on aarch64\n", + (unsigned long long)exit_code); + return "UNKNOWN"; +} + +static bool trap_event_end(struct evsel *evsel, + struct perf_sample *sample __maybe_unused, + struct event_key *key __maybe_unused) +{ + return (!strcmp(evsel->name, kvm_trap_exit_trace)); +} + +static bool trap_event_begin(struct evsel *evsel, + struct perf_sample *sample, struct event_key *key) +{ + if (!strcmp(evsel->name, kvm_trap_enter_trace)) { + trap_event_get_key(evsel, sample, key); + return true; + } + + return false; +} + +static void trap_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused, + struct event_key *key, + char *decode) +{ + const char *trap_reason = get_trap_reason(key->key); + + scnprintf(decode, decode_str_len, "%s", trap_reason); +} + +static struct kvm_events_ops trap_events = { + .is_begin_event = trap_event_begin, + .is_end_event = trap_event_end, + .decode_key = trap_event_decode_key, + .name = "TRAP-EVENT", +}; + const char *kvm_events_tp[] = { "kvm:kvm_entry", "kvm:kvm_exit", + "kvm:kvm_trap_enter", + "kvm:kvm_trap_exit", NULL, };
struct kvm_reg_events_ops kvm_reg_events_ops[] = { { .name = "vmexit", .ops = &exit_events }, + { .name = "trap", .ops = &trap_events }, { NULL, NULL }, };
From: Zenghui Yu yuzenghui@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZOS CVE: NA
----------------------------------------------------
Parse ACPI/DTB to determine which HiSilicon platform the hypervisor is running on.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Yanan Wang wangyanan55@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/hisi_cpu_model.h | 19 ++++++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/Makefile | 1 + arch/arm64/kvm/arm.c | 6 ++ arch/arm64/kvm/hisi_cpu_model.c | 83 +++++++++++++++++++++++++ 5 files changed, 110 insertions(+) create mode 100644 arch/arm64/include/asm/hisi_cpu_model.h create mode 100644 arch/arm64/kvm/hisi_cpu_model.c
diff --git a/arch/arm64/include/asm/hisi_cpu_model.h b/arch/arm64/include/asm/hisi_cpu_model.h new file mode 100644 index 000000000000..f686a7591e8f --- /dev/null +++ b/arch/arm64/include/asm/hisi_cpu_model.h @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright(c) 2019 Huawei Technologies Co., Ltd + */ + +#ifndef __HISI_CPU_MODEL_H__ +#define __HISI_CPU_MODEL_H__ + +enum hisi_cpu_type { + HI_1612, + HI_1616, + HI_1620, + UNKNOWN_HI_TYPE +}; + +extern enum hisi_cpu_type hi_cpu_type; + +void probe_hisi_cpu_type(void); +#endif /* __HISI_CPU_MODEL_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a2a61bb37b22..37aba086c179 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -26,6 +26,7 @@ #include <asm/kvm.h> #include <asm/kvm_asm.h> #include <asm/thread_info.h> +#include <asm/hisi_cpu_model.h>
#define __KVM_HAVE_ARCH_INTC_INITIALIZED
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 02f25d63a6f9..928065a7bae9 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -17,6 +17,7 @@ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ guest.o debug.o reset.o sys_regs.o \ vgic-sys-reg-v3.o fpsimd.o pmu.o \ aarch32.o arch_timer.o \ + hisi_cpu_model.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index d27aa51a163b..901691d5d24d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -58,6 +58,9 @@ static DEFINE_SPINLOCK(kvm_vmid_lock);
static bool vgic_present;
+/* Hisi cpu type enum */ +enum hisi_cpu_type hi_cpu_type = UNKNOWN_HI_TYPE; + static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
@@ -1833,6 +1836,9 @@ int kvm_arch_init(void *opaque) return -ENODEV; }
+ /* Probe the Hisi CPU type */ + probe_hisi_cpu_type(); + in_hyp_mode = is_kernel_in_hyp_mode();
if (!in_hyp_mode && kvm_arch_requires_vhe()) { diff --git a/arch/arm64/kvm/hisi_cpu_model.c b/arch/arm64/kvm/hisi_cpu_model.c new file mode 100644 index 000000000000..4d5a099bc27a --- /dev/null +++ b/arch/arm64/kvm/hisi_cpu_model.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright(c) 2019 Huawei Technologies Co., Ltd + */ + +#include <linux/acpi.h> +#include <linux/of.h> +#include <linux/init.h> +#include <linux/kvm_host.h> + +#ifdef CONFIG_ACPI + +/* ACPI Hisi oem table id str */ +const char *oem_str[] = { + "HIP06", /* Hisi 1612 */ + "HIP07", /* Hisi 1616 */ + "HIP08" /* Hisi 1620 */ +}; + +/* + * Get Hisi oem table id. + */ +static void acpi_get_hw_cpu_type(void) +{ + struct acpi_table_header *table; + acpi_status status; + int i, str_size = ARRAY_SIZE(oem_str); + + /* Get oem table id from ACPI table header */ + status = acpi_get_table(ACPI_SIG_DSDT, 0, &table); + if (ACPI_FAILURE(status)) { + pr_err("Failed to get ACPI table: %s\n", + acpi_format_exception(status)); + return; + } + + for (i = 0; i < str_size; ++i) { + if (!strncmp(oem_str[i], table->oem_table_id, 5)) { + hi_cpu_type = i; + return; + } + } +} + +#else +static void acpi_get_hw_cpu_type(void) {} +#endif + +/* of Hisi cpu model str */ +const char *of_model_str[] = { + "Hi1612", + "Hi1616" +}; + +static void of_get_hw_cpu_type(void) +{ + const char *cpu_type; + int ret, i, str_size = ARRAY_SIZE(of_model_str); + + ret = of_property_read_string(of_root, "model", &cpu_type); + if (ret < 0) { + pr_err("Failed to get Hisi cpu model by OF.\n"); + return; + } + + for (i = 0; i < str_size; ++i) { + if (strstr(cpu_type, of_model_str[i])) { + hi_cpu_type = i; + return; + } + } +} + +void probe_hisi_cpu_type(void) +{ + if (!acpi_disabled) + acpi_get_hw_cpu_type(); + else + of_get_hw_cpu_type(); + + if (hi_cpu_type == UNKNOWN_HI_TYPE) + pr_warn("UNKNOWN Hisi cpu type.\n"); +}
From: Zenghui Yu yuzenghui@huawei.com
virt inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZOS CVE: NA
----------------------------------------------------
Kunpeng 920 offers the HHA ncsnp capability, with which the hypervisor no longer needs to perform a lot of cache maintenance as before (in case the guest has some non-cacheable Stage-1 mappings). Currently we apply this hardware capability when:
- vCPU switching MMU+caches on/off
- creating Stage-2 mappings for Daborts
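For background, a minimal standalone sketch of how such a capability flag short-circuits guest D-cache maintenance (kvm_ncsnp_support is the flag introduced below; the other names and the printf stand-in are illustrative only):

    #include <stdio.h>
    #include <stdbool.h>

    static bool kvm_ncsnp_support;  /* probed once at init time */
    static bool has_stage2_fwb;     /* stands in for ARM64_HAS_STAGE2_FWB */

    static void clean_dcache_guest_page(unsigned long va, unsigned long size)
    {
            if (kvm_ncsnp_support || has_stage2_fwb)
                    return;         /* hardware keeps the caches coherent */
            printf("clean [%#lx, %#lx)\n", va, va + size); /* stand-in for the CMO */
    }

    int main(void)
    {
            clean_dcache_guest_page(0x80000000UL, 4096);    /* performs the clean */
            kvm_ncsnp_support = true;
            clean_dcache_guest_page(0x80000000UL, 4096);    /* skipped */
            return 0;
    }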
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Yanan Wang wangyanan55@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/hisi_cpu_model.h | 2 ++ arch/arm64/include/asm/kvm_mmu.h | 2 +- arch/arm64/kvm/arm.c | 2 ++ arch/arm64/kvm/hisi_cpu_model.c | 34 +++++++++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 4 +-- 5 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/include/asm/hisi_cpu_model.h b/arch/arm64/include/asm/hisi_cpu_model.h index f686a7591e8f..e0da0ef61613 100644 --- a/arch/arm64/include/asm/hisi_cpu_model.h +++ b/arch/arm64/include/asm/hisi_cpu_model.h @@ -14,6 +14,8 @@ enum hisi_cpu_type { };
extern enum hisi_cpu_type hi_cpu_type; +extern bool kvm_ncsnp_support;
void probe_hisi_cpu_type(void); +void probe_hisi_ncsnp_support(void); #endif /* __HISI_CPU_MODEL_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 331394306cce..da041664602b 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -150,7 +150,7 @@ static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) * faulting in pages. Furthermore, FWB implies IDC, so cleaning to * PoU is not required either in this case. */ - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + if (kvm_ncsnp_support || cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) return;
kvm_flush_dcache_to_poc(va, size); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 901691d5d24d..7f64131641a4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -60,6 +60,7 @@ static bool vgic_present;
/* Hisi cpu type enum */ enum hisi_cpu_type hi_cpu_type = UNKNOWN_HI_TYPE; +bool kvm_ncsnp_support;
static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); @@ -1838,6 +1839,7 @@ int kvm_arch_init(void *opaque)
/* Probe the Hisi CPU type */ probe_hisi_cpu_type(); + probe_hisi_ncsnp_support();
in_hyp_mode = is_kernel_in_hyp_mode();
diff --git a/arch/arm64/kvm/hisi_cpu_model.c b/arch/arm64/kvm/hisi_cpu_model.c index 4d5a099bc27a..52eecf1ba1cf 100644 --- a/arch/arm64/kvm/hisi_cpu_model.c +++ b/arch/arm64/kvm/hisi_cpu_model.c @@ -81,3 +81,37 @@ void probe_hisi_cpu_type(void) if (hi_cpu_type == UNKNOWN_HI_TYPE) pr_warn("UNKNOWN Hisi cpu type.\n"); } + +#define NCSNP_MMIO_BASE 0x20107E238 + +/* + * We have the fantastic HHA ncsnp capability on Kunpeng 920, + * with which hypervisor doesn't need to perform a lot of cache + * maintenance like before (in case the guest has non-cacheable + * Stage-1 mappings). + */ +void probe_hisi_ncsnp_support(void) +{ + void __iomem *base; + unsigned int high; + + kvm_ncsnp_support = false; + + if (hi_cpu_type != HI_1620) + goto out; + + base = ioremap(NCSNP_MMIO_BASE, 4); + if (!base) { + pr_err("Unable to map MMIO region when probing ncsnp!\n"); + goto out; + } + + high = readl_relaxed(base) >> 28; + iounmap(base); + if (high != 0x1) + kvm_ncsnp_support = true; + +out: + kvm_info("Hisi ncsnp: %s\n", kvm_ncsnp_support ? "enabled" : + "disabled"); +} diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4d99d07c610c..87bced701c8a 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -643,7 +643,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
static void stage2_flush_dcache(void *addr, u64 size) { - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + if (kvm_ncsnp_support || cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) return;
__flush_dcache_area(addr, size); @@ -847,7 +847,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) .flags = KVM_PGTABLE_WALK_LEAF, };
- if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + if (kvm_ncsnp_support || cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) return 0;
return kvm_pgtable_walk(pgt, addr, size, &walker);
From: Yanan Wang wangyanan55@huawei.com
mainline inclusion from mainline-v5.12-rc1-dontuse commit 8ed80051c8c3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZOS CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Procedures of hyp stage-1 map and guest stage-2 map are quite different, but they are tied closely together by the function kvm_set_valid_leaf_pte(). So adjust the relevant code for ease of maintenance in the future.
Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Yanan Wang wangyanan55@huawei.com Signed-off-by: Marc Zyngier maz@kernel.org Link: https://lore.kernel.org/r/20210114121350.123684-2-wangyanan55@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kvm/hyp/pgtable.c | 55 ++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 27 deletions(-)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 87bced701c8a..87e46b36e1b5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -170,10 +170,9 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp) smp_store_release(ptep, pte); }
-static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr, - u32 level) +static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) { - kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa); + kvm_pte_t pte = kvm_phys_to_pte(pa); u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : KVM_PTE_TYPE_BLOCK;
@@ -181,12 +180,7 @@ static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr, pte |= FIELD_PREP(KVM_PTE_TYPE, type); pte |= KVM_PTE_VALID;
- /* Tolerate KVM recreating the exact same mapping. */ - if (kvm_pte_valid(old)) - return old == pte; - - smp_store_release(ptep, pte); - return true; + return pte; }
static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, @@ -342,12 +336,17 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot, static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct hyp_map_data *data) { + kvm_pte_t new, old = *ptep; u64 granule = kvm_granule_size(level), phys = data->phys;
if (!kvm_block_mapping_supported(addr, end, phys, level)) return false;
- WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)); + /* Tolerate KVM recreating the exact same mapping */ + new = kvm_init_valid_leaf_pte(phys, data->attr, level); + if (old != new && !WARN_ON(kvm_pte_valid(old))) + smp_store_release(ptep, new); + data->phys += granule; return true; } @@ -466,27 +465,30 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) { + kvm_pte_t new, old = *ptep; u64 granule = kvm_granule_size(level), phys = data->phys; + struct page *page = virt_to_page(ptep);
if (!kvm_block_mapping_supported(addr, end, phys, level)) return false;
- /* - * If the PTE was already valid, drop the refcount on the table - * early, as it will be bumped-up again in stage2_map_walk_leaf(). - * This ensures that the refcount stays constant across a valid to - * valid PTE update. - */ - if (kvm_pte_valid(*ptep)) - put_page(virt_to_page(ptep)); + new = kvm_init_valid_leaf_pte(phys, data->attr, level); + if (kvm_pte_valid(old)) { + /* Tolerate KVM recreating the exact same mapping */ + if (old == new) + goto out;
- if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)) - goto out; + /* + * There's an existing different valid leaf entry, so perform + * break-before-make. + */ + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); + put_page(page); + }
- /* There's an existing valid leaf entry, so perform break-before-make */ - kvm_set_invalid_pte(ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); - kvm_set_valid_leaf_pte(ptep, phys, data->attr, level); + smp_store_release(ptep, new); + get_page(page); out: data->phys += granule; return true; @@ -528,7 +530,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, }
if (stage2_map_walker_try_leaf(addr, end, level, ptep, data)) - goto out_get_page; + return 0;
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; @@ -552,9 +554,8 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, }
kvm_set_table_pte(ptep, childp); - -out_get_page: get_page(page); + return 0; }
From: Yanan Wang wangyanan55@huawei.com
mainline inclusion from mainline-v5.12-rc1-dontuse commit 694d071f8d85 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZOS CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------------------------------------------------
(1) While a VM with a number of vCPUs is running, if several vCPUs access the same GPA at almost the same time and the stage-2 mapping of that GPA has not been built yet, they will all cause translation faults. The first vCPU builds the mapping, and the following ones end up updating the already-valid leaf PTE. Note that these vCPUs might want different access permissions (RO, RW, RX, RWX, etc.).
(2) It is inevitable that we will sometimes update an existing valid leaf PTE in the map path, and we perform break-before-make in this case. More unnecessary translation faults can then be caused if other vCPUs happen to catch the *break stage* of BBM.
With (1) and (2), something unsatisfactory can happen: vCPU A causes a translation fault and builds the mapping with RW permissions; vCPU B then updates the valid leaf PTE with break-before-make and the permissions are downgraded back to RO. Besides, the *break stage* of BBM may trigger more translation faults. The end result is pointless fault-and-remap loops.
We can optimize to solve the above problems: when we need to update a valid leaf PTE in the map path, filter out the case where the update only changes the access permissions, and do not update the valid leaf PTE here. Instead, let the vCPU re-enter the guest; if it still wants more permissions, it will exit next time and go through the relax_perms path, which does not need break-before-make.
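As a rough illustration of the filtering idea (standalone C with made-up permission bits; the real mask, KVM_PTE_LEAF_ATTR_S2_PERMS, and the PTE layout are in the patch below):

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical permission bits, for illustration only. */
    #define PTE_PERM_R  (1ull << 6)
    #define PTE_PERM_W  (1ull << 7)
    #define PTE_PERM_XN (1ull << 54)
    #define PTE_PERMS   (PTE_PERM_R | PTE_PERM_W | PTE_PERM_XN)

    /* Return 1 if old and new differ only in permission bits (or not at all). */
    static int only_perms_changed(uint64_t old, uint64_t new)
    {
            return !((old ^ new) & ~PTE_PERMS);
    }

    int main(void)
    {
            uint64_t old = 0x40000000ull | PTE_PERM_R | PTE_PERM_W;
            uint64_t ro  = 0x40000000ull | PTE_PERM_R;
            uint64_t mov = 0x80000000ull | PTE_PERM_R | PTE_PERM_W;

            printf("%d %d\n", only_perms_changed(old, ro),   /* 1: skip, no BBM */
                              only_perms_changed(old, mov)); /* 0: real remap, do BBM */
            return 0;
    }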
Signed-off-by: Yanan Wang wangyanan55@huawei.com Signed-off-by: Marc Zyngier maz@kernel.org Link: https://lore.kernel.org/r/20210114121350.123684-3-wangyanan55@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/kvm_pgtable.h | 5 +++++ arch/arm64/kvm/hyp/pgtable.c | 32 ++++++++++++++++++---------- 2 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 52ab38db04c7..8886d43cfb11 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -157,6 +157,11 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); * If device attributes are not explicitly requested in @prot, then the * mapping will be normal, cacheable. * + * Note that the update of a valid leaf PTE in this function will be aborted, + * if it's trying to recreate the exact same mapping or only change the access + * permissions. Instead, the vCPU will exit one more time from guest if still + * needed and then go through the path of relaxing permissions. + * * Note that this function will both coalesce existing table entries and split * existing block mappings, relying on page-faults to fault back areas outside * of the new mapping lazily. diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 87e46b36e1b5..a78928951fbd 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -45,6 +45,10 @@
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_HI_S2_XN) + struct kvm_pgtable_walk_data { struct kvm_pgtable *pgt; struct kvm_pgtable_walker *walker; @@ -461,22 +465,27 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, return 0; }
-static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - struct stage2_map_data *data) +static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) { kvm_pte_t new, old = *ptep; u64 granule = kvm_granule_size(level), phys = data->phys; struct page *page = virt_to_page(ptep);
if (!kvm_block_mapping_supported(addr, end, phys, level)) - return false; + return -E2BIG;
new = kvm_init_valid_leaf_pte(phys, data->attr, level); if (kvm_pte_valid(old)) { - /* Tolerate KVM recreating the exact same mapping */ - if (old == new) - goto out; + /* + * Skip updating the PTE if we are trying to recreate the exact + * same mapping or only change the access permissions. Instead, + * the vCPU will exit one more time from guest if still needed + * and then go through the path of relaxing permissions. + */ + if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS))) + return -EAGAIN;
/* * There's an existing different valid leaf entry, so perform @@ -489,9 +498,8 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
smp_store_release(ptep, new); get_page(page); -out: data->phys += granule; - return true; + return 0; }
static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, @@ -519,6 +527,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) { + int ret; kvm_pte_t *childp, pte = *ptep; struct page *page = virt_to_page(ptep);
@@ -529,8 +538,9 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return 0; }
- if (stage2_map_walker_try_leaf(addr, end, level, ptep, data)) - return 0; + ret = stage2_map_walker_try_leaf(addr, end, level, ptep, data); + if (ret != -E2BIG) + return ret;
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL;
From: Yanan Wang wangyanan55@huawei.com
mainline inclusion from mainline-v5.12-rc1-dontuse commit 509552e65ae8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZOS CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------------------------------------------------
We now set the pfn dirty and mark the page dirty before calling the fault handlers in user_mem_abort(), so we might end up with spurious dirty pages if the update of permissions or the mapping fails. Let's move these two operations after the fault handlers, so that they are done only if the fault has been handled successfully.
When -EAGAIN is returned from the map handler, we want the vCPU to re-enter the guest directly instead of exiting back to userspace, so adjust the return value at the end of the function.
Signed-off-by: Yanan Wang wangyanan55@huawei.com Signed-off-by: Marc Zyngier maz@kernel.org Link: https://lore.kernel.org/r/20210114121350.123684-4-wangyanan55@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kvm/mmu.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0e4491b81cb3..bbc4cc26c92a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -880,11 +880,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (vma_pagesize == PAGE_SIZE && !force_pte) vma_pagesize = transparent_hugepage_adjust(memslot, hva, &pfn, &fault_ipa); - if (writable) { + if (writable) prot |= KVM_PGTABLE_PROT_W; - kvm_set_pfn_dirty(pfn); - mark_page_dirty(kvm, gfn); - }
if (fault_status != FSC_PERM && !device) clean_dcache_guest_page(pfn, vma_pagesize); @@ -912,11 +909,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, memcache); }
+ /* Mark the page dirty only if the fault is handled successfully */ + if (writable && !ret) { + kvm_set_pfn_dirty(pfn); + mark_page_dirty(kvm, gfn); + } + out_unlock: spin_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(pfn); kvm_release_pfn_clean(pfn); - return ret; + return ret != -EAGAIN ? ret : 0; }
/* Resolve the access fault by making the page young again. */
From: Heyi Guo guoheyi@huawei.com
virt inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4IZPY CVE: NA
-------------------------------------------------
Pending LPIs may block newly allocated LPIs in the kdump secondary kernel. Flush the pending LPIs when the guest clears the GITS_BASER valid bit for the device table; we only do that for guest kernel accesses (not userspace accesses) to limit the impact of the change.
Signed-off-by: Heyi Guo guoheyi@huawei.com Signed-off-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Hailiang Zhang zhang.zhanghailiang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Link: https://gitee.com/openeuler/kernel/commit/5b574685b743 Reviewed-by: Yanan Wang wangyanan55@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kvm/vgic/vgic-its.c | 41 +++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 4f31880803d4..42bcce7fec86 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -1674,10 +1674,10 @@ static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, }
#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) -static void vgic_mmio_write_its_baser(struct kvm *kvm, - struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) +static void vgic_mmio_write_its_baser_common(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val, bool uaccess) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 entry_size, table_type; @@ -1714,10 +1714,21 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, *regptr = reg;
if (!(reg & GITS_BASER_VALID)) { + struct kvm_vcpu *vcpu; + int c; + /* Take the its_lock to prevent a race with a save/restore */ mutex_lock(&its->its_lock); switch (table_type) { case GITS_BASER_TYPE_DEVICE: + if (!uaccess) { + /* Fix kdump irq missing issue */ + pr_debug("%s: flush pending LPIs for all VCPUs.\n", + __func__); + kvm_for_each_vcpu(c, vcpu, kvm) + vgic_flush_pending_lpis(vcpu); + } + vgic_its_free_device_list(kvm, its); break; case GITS_BASER_TYPE_COLLECTION: @@ -1728,6 +1739,23 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, } }
+static void vgic_mmio_write_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + vgic_mmio_write_its_baser_common(kvm, its, addr, len, val, false); +} + +static int vgic_mmio_uaccess_write_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + vgic_mmio_write_its_baser_common(kvm, its, addr, len, val, true); + return 0; +} + static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, struct vgic_its *its, gpa_t addr, unsigned int len) @@ -1820,8 +1848,9 @@ static struct vgic_register_region its_registers[] = { vgic_mmio_read_its_creadr, its_mmio_write_wi, vgic_mmio_uaccess_write_its_creadr, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_BASER, - vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, + REGISTER_ITS_DESC_UACCESS(GITS_BASER, + vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, + vgic_mmio_uaccess_write_its_baser, 0x40, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_IDREGS_BASE, vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
From: Dong Kai dongkai11@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4F3V1 CVE: NA
--------------------------------
The softlockup and hardlockup detectors only check the status of the CPU on which they reside. If a certain CPU core is suspended, neither of them works. There is no valid log at all, yet the CPU is already abnormal and causes a lot of problems for the system. To detect this case, add a corelockup detector.
First, we use whether a CPU core can respond to NMI as the criterion for determining whether it is suspended. Then things are simple: each CPU core maintains its own NMI interrupt count and monitors the nmi_counts of the next CPU core. If that NMI interrupt count no longer changes, which means the core cannot respond to NMI normally, we regard it as suspended.
To ensure robustness, the warning is only triggered after the NMI has been missed more than two consecutive times.
The detection chain is as follows: cpu0->cpu1->...->cpuN->cpu0
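A minimal user-space sketch of the neighbour check described above (the counter names mirror the patch, the ring layout and the more-than-two-misses threshold follow this description; everything else is illustrative):

    #include <stdio.h>

    #define NR_CPUS 4

    static unsigned long nmi_interrupts[NR_CPUS]; /* bumped from each core's NMI */
    static unsigned long nmi_cnt_saved[NR_CPUS];  /* last seen count of the neighbour */
    static unsigned long nmi_cnt_missed[NR_CPUS]; /* consecutive misses of the neighbour */

    /* Core 'cpu' watches core (cpu + 1) % NR_CPUS: cpu0->cpu1->...->cpuN->cpu0. */
    static int neighbour_locked_up(int cpu)
    {
            int target = (cpu + 1) % NR_CPUS;
            unsigned long cur = nmi_interrupts[target];

            if (nmi_cnt_saved[cpu] != cur) {
                    nmi_cnt_saved[cpu] = cur;    /* the neighbour made progress */
                    nmi_cnt_missed[cpu] = 0;
                    return 0;
            }
            return ++nmi_cnt_missed[cpu] > 2;    /* warn only after >2 misses */
    }

    int main(void)
    {
            int round;

            /* cpu1 never bumps its NMI count, so cpu0 eventually flags it. */
            for (round = 0; round < 4; round++)
                    if (neighbour_locked_up(0))
                            printf("round %d: core LOCKUP suspected on cpu1\n", round);
            return 0;
    }

(The kernel additionally initialises the saved count to ULONG_MAX at init/hotplug time to avoid a false positive before the target core has run its first NMI; see the patch below.)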
Signed-off-by: Dong Kai dongkai11@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/nmi.h | 6 ++ kernel/watchdog.c | 15 +++- kernel/watchdog_hld.c | 165 ++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 8 ++ 4 files changed, 192 insertions(+), 2 deletions(-)
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 8c9b857e7f62..79c1b60bc7cc 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -125,6 +125,12 @@ static inline int hardlockup_detector_perf_init(void) { return 0; } # endif #endif
+#ifdef CONFIG_CORELOCKUP_DETECTOR +extern void corelockup_detector_init(void); +extern void corelockup_detector_online_cpu(unsigned int cpu); +extern void corelockup_detector_offline_cpu(unsigned int cpu); +#endif + void watchdog_nmi_stop(void); void watchdog_nmi_start(void); int watchdog_nmi_probe(void); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 947c12790d73..58dd6777b079 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -516,15 +516,23 @@ static void softlockup_start_all(void)
int lockup_detector_online_cpu(unsigned int cpu) { - if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) + if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) { watchdog_enable(cpu); +#ifdef CONFIG_CORELOCKUP_DETECTOR + corelockup_detector_online_cpu(cpu); +#endif + } return 0; }
int lockup_detector_offline_cpu(unsigned int cpu) { - if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) + if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) { watchdog_disable(cpu); +#ifdef CONFIG_CORELOCKUP_DETECTOR + corelockup_detector_offline_cpu(cpu); +#endif + } return 0; }
@@ -754,4 +762,7 @@ void __init lockup_detector_init(void) if (!nmi_watchdog_ops.watchdog_nmi_probe()) nmi_watchdog_available = true; lockup_detector_setup(); +#ifdef CONFIG_CORELOCKUP_DETECTOR + corelockup_detector_init(); +#endif } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index a5716ef008d2..3812184d2657 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -41,6 +41,163 @@ notrace void arch_touch_nmi_watchdog(void) EXPORT_SYMBOL(arch_touch_nmi_watchdog); #endif
+#ifdef CONFIG_CORELOCKUP_DETECTOR +/* + * The softlockup and hardlockup detector only check the status + * of the cpu which it resides. If certain cpu core suspends, + * they are both not works. There is no any valid log but the + * cpu already abnormal and brings a lot of problems of system. + * To detect this case, we add the corelockup detector. + * + * First we use whether cpu core can responds to nmi as a sectence + * to determine if it is suspended. Then things is simple. Per cpu + * core maintains it's nmi interrupt counts and detector the + * nmi_counts of next cpu core. If the nmi interrupt counts not + * changed any more which means it can't respond nmi normally, we + * regard it as suspend. + * + * To ensure robustness, only consecutive lost nmi more than two + * times then trigger the warn. + * + * The detection chain is as following: + * cpu0->cpu1->...->cpuN->cpu0 + * + * detector_cpu: the target cpu to detector of current cpu + * nmi_interrupts: the nmi counts of current cpu + * nmi_cnt_saved: saved nmi counts of detector_cpu + * nmi_cnt_missed: the nmi consecutive miss counts of detector_cpu + */ +static DEFINE_PER_CPU(unsigned int, detector_cpu); +static DEFINE_PER_CPU(unsigned long, nmi_interrupts); +static DEFINE_PER_CPU(unsigned long, nmi_cnt_saved); +static DEFINE_PER_CPU(unsigned long, nmi_cnt_missed); +static DEFINE_PER_CPU(bool, core_watchdog_warn); + +static void watchdog_nmi_interrupts(void) +{ + __this_cpu_inc(nmi_interrupts); +} + +static void corelockup_status_copy(unsigned int from, unsigned int to) +{ + per_cpu(nmi_cnt_saved, to) = per_cpu(nmi_cnt_saved, from); + per_cpu(nmi_cnt_missed, to) = per_cpu(nmi_cnt_missed, from); + + /* always update detector cpu at the end */ + per_cpu(detector_cpu, to) = per_cpu(detector_cpu, from); +} + +static void corelockup_status_init(unsigned int cpu, unsigned int target) +{ + /* + * initialize saved count to max to avoid unnecessary misjudge + * caused by delay running of nmi on target cpu + */ + per_cpu(nmi_cnt_saved, cpu) = ULONG_MAX; + per_cpu(nmi_cnt_missed, cpu) = 0; + + /* always update detector cpu at the end */ + per_cpu(detector_cpu, cpu) = target; +} + +void __init corelockup_detector_init(void) +{ + unsigned int cpu, next; + + /* detector cpu is set to the next valid logically one */ + for_each_cpu_and(cpu, &watchdog_cpumask, cpu_online_mask) { + next = cpumask_next_and(cpu, &watchdog_cpumask, + cpu_online_mask); + if (next >= nr_cpu_ids) + next = cpumask_first_and(&watchdog_cpumask, + cpu_online_mask); + corelockup_status_init(cpu, next); + } +} + +/* + * Before: first->next + * After: first->[new]->next + */ +void corelockup_detector_online_cpu(unsigned int cpu) +{ + unsigned int first = cpumask_first_and(&watchdog_cpumask, + cpu_online_mask); + + if (WARN_ON(first >= nr_cpu_ids)) + return; + + /* cpu->next */ + corelockup_status_copy(first, cpu); + + /* first->cpu */ + corelockup_status_init(first, cpu); +} + +/* + * Before: prev->cpu->next + * After: prev->next + */ +void corelockup_detector_offline_cpu(unsigned int cpu) +{ + unsigned int prev = nr_cpu_ids; + unsigned int i; + + /* found prev cpu */ + for_each_cpu_and(i, &watchdog_cpumask, cpu_online_mask) { + if (per_cpu(detector_cpu, i) == cpu) { + prev = i; + break; + } + } + + if (WARN_ON(prev == nr_cpu_ids)) + return; + + /* prev->next */ + corelockup_status_copy(cpu, prev); +} + +static bool is_corelockup(unsigned int cpu) +{ + unsigned long nmi_int = per_cpu(nmi_interrupts, cpu); + + /* skip check if only one cpu online */ + if (cpu == 
smp_processor_id()) + return false; + + if (__this_cpu_read(nmi_cnt_saved) != nmi_int) { + __this_cpu_write(nmi_cnt_saved, nmi_int); + __this_cpu_write(nmi_cnt_missed, 0); + per_cpu(core_watchdog_warn, cpu) = false; + return false; + } + + __this_cpu_inc(nmi_cnt_missed); + if (__this_cpu_read(nmi_cnt_missed) > 2) + return true; + + return false; +} +NOKPROBE_SYMBOL(is_corelockup); + +static void watchdog_corelockup_check(struct pt_regs *regs) +{ + unsigned int cpu = __this_cpu_read(detector_cpu); + + if (is_corelockup(cpu)) { + if (per_cpu(core_watchdog_warn, cpu) == true) + return; + pr_emerg("Watchdog detected core LOCKUP on cpu %d\n", cpu); + + if (hardlockup_panic) + nmi_panic(regs, "Core LOCKUP"); + + per_cpu(core_watchdog_warn, cpu) = true; + } +} +#endif + #ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP static DEFINE_PER_CPU(ktime_t, last_timestamp); static DEFINE_PER_CPU(unsigned int, nmi_rearmed); @@ -108,6 +265,14 @@ static inline bool watchdog_check_timestamp(void)
void watchdog_hardlockup_check(struct pt_regs *regs) { +#ifdef CONFIG_CORELOCKUP_DETECTOR + /* Kick nmi interrupts */ + watchdog_nmi_interrupts(); + + /* corelockup check */ + watchdog_corelockup_check(regs); +#endif + if (__this_cpu_read(watchdog_nmi_touch) == true) { __this_cpu_write(watchdog_nmi_touch, false); return; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 46bf9e84e9a8..f906df9db2e2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1020,6 +1020,14 @@ config HARDLOCKUP_DETECTOR chance to run. The current stack trace is displayed upon detection and the system will stay locked up.
+config CORELOCKUP_DETECTOR + bool "Detect Core Lockups" + depends on HARDLOCKUP_DETECTOR && SOFTLOCKUP_DETECTOR + depends on ARM64 + default n + help + Corelockups is used to check whether cpu core hungup or not. + config BOOTPARAM_HARDLOCKUP_PANIC bool "Panic (Reboot) On Hard Lockups" depends on HARDLOCKUP_DETECTOR
From: Dong Kai dongkai11@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4F3V1 CVE: NA
--------------------------------
When using PMU events as the NMI source, the PMU clock is disabled in wfi/wfe mode, so the NMI cannot fire periodically. To minimize misjudgment caused by wfi/wfe, we adopt a simple method: disable wfi/wfe at the right time, using the watchdog hrtimer as the baseline.
The watchdog hrtimer is based on the generic timer and fires at a higher frequency than the NMI. If the watchdog hrtimer stops working, we disable wfi/wfe mode; the PMU NMI should then always respond as long as the CPU core is not suspended.
Signed-off-by: Dong Kai dongkai11@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/barrier.h | 15 ++++++++ include/linux/nmi.h | 2 + kernel/watchdog.c | 12 ++++++ kernel/watchdog_hld.c | 63 ++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+)
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 37d891af8ea5..448c14392e81 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -15,8 +15,23 @@ #define nops(n) asm volatile(__nops(n))
#define sev() asm volatile("sev" : : : "memory") +#ifdef CONFIG_CORELOCKUP_DETECTOR +extern unsigned int close_wfi_wfe; +#define wfe() \ + do { \ + if (likely(close_wfi_wfe == 0)) \ + asm volatile("wfe" : : : "memory"); \ + } while (0) +#define wfi() \ + do { \ + if (likely(close_wfi_wfe == 0)) \ + asm volatile("wfi" : : : "memory"); \ + } while (0) + +#else #define wfe() asm volatile("wfe" : : : "memory") #define wfi() asm volatile("wfi" : : : "memory") +#endif
#define isb() asm volatile("isb" : : : "memory") #define dmb(opt) asm volatile("dmb " #opt : : : "memory") diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 79c1b60bc7cc..12570dadd5e9 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -129,6 +129,8 @@ static inline int hardlockup_detector_perf_init(void) { return 0; } extern void corelockup_detector_init(void); extern void corelockup_detector_online_cpu(unsigned int cpu); extern void corelockup_detector_offline_cpu(unsigned int cpu); +extern void watchdog_check_hrtimer(void); +extern unsigned long watchdog_hrtimer_interrupts(unsigned int cpu); #endif
void watchdog_nmi_stop(void); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 58dd6777b079..4fb4be9a1d0b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -346,6 +346,13 @@ static int softlockup_fn(void *data) return 0; }
+#ifdef CONFIG_CORELOCKUP_DETECTOR +unsigned long watchdog_hrtimer_interrupts(unsigned int cpu) +{ + return per_cpu(hrtimer_interrupts, cpu); +} +#endif + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -357,6 +364,11 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (!watchdog_enabled) return HRTIMER_NORESTART;
+#ifdef CONFIG_CORELOCKUP_DETECTOR + /* check hrtimer of detector cpu */ + watchdog_check_hrtimer(); +#endif + /* kick the hardlockup detector */ watchdog_interrupt_count();
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 3812184d2657..5e58293e241a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -62,16 +62,37 @@ EXPORT_SYMBOL(arch_touch_nmi_watchdog); * The detection chain is as following: * cpu0->cpu1->...->cpuN->cpu0 * + * When using pmu events as nmi source, the pmu clock is disabled + * under wfi/wfe mode. And the nmi can't respond periodically. + * To minimize the misjudgment by wfi/wfe, we adopt a simple method + * which to disable wfi/wfe at the right time and the watchdog hrtimer + * is a good baseline. + * + * The watchdog hrtimer is based on generate timer and has high freq + * than nmi. If watchdog hrtimer not works we disable wfi/wfe mode + * then the pmu nmi should always responds as long as the cpu core + * not suspend. + * * detector_cpu: the target cpu to detector of current cpu * nmi_interrupts: the nmi counts of current cpu * nmi_cnt_saved: saved nmi counts of detector_cpu * nmi_cnt_missed: the nmi consecutive miss counts of detector_cpu + * hrint_saved: saved hrtimer interrupts of detector_cpu + * hrint_missed: the hrtimer consecutive miss counts of detector_cpu + * corelockup_cpumask/close_wfi_wfe: + * the cpu mask is set if certain cpu maybe fall in suspend and close + * wfi/wfe mode if any bit is set */ static DEFINE_PER_CPU(unsigned int, detector_cpu); static DEFINE_PER_CPU(unsigned long, nmi_interrupts); static DEFINE_PER_CPU(unsigned long, nmi_cnt_saved); static DEFINE_PER_CPU(unsigned long, nmi_cnt_missed); static DEFINE_PER_CPU(bool, core_watchdog_warn); +static DEFINE_PER_CPU(unsigned long, hrint_saved); +static DEFINE_PER_CPU(unsigned long, hrint_missed); +struct cpumask corelockup_cpumask __read_mostly; +unsigned int close_wfi_wfe; +static bool pmu_based_nmi;
static void watchdog_nmi_interrupts(void) { @@ -82,6 +103,8 @@ static void corelockup_status_copy(unsigned int from, unsigned int to) { per_cpu(nmi_cnt_saved, to) = per_cpu(nmi_cnt_saved, from); per_cpu(nmi_cnt_missed, to) = per_cpu(nmi_cnt_missed, from); + per_cpu(hrint_saved, to) = per_cpu(hrint_saved, from); + per_cpu(hrint_missed, to) = per_cpu(hrint_missed, from);
/* always update detector cpu at the end */ per_cpu(detector_cpu, to) = per_cpu(detector_cpu, from); @@ -95,6 +118,8 @@ static void corelockup_status_init(unsigned int cpu, unsigned int target) */ per_cpu(nmi_cnt_saved, cpu) = ULONG_MAX; per_cpu(nmi_cnt_missed, cpu) = 0; + per_cpu(hrint_saved, cpu) = ULONG_MAX; + per_cpu(hrint_missed, cpu) = 0;
/* always update detector cpu at the end */ per_cpu(detector_cpu, cpu) = target; @@ -115,6 +140,38 @@ void __init corelockup_detector_init(void) } }
+void watchdog_check_hrtimer(void) +{ + unsigned int cpu = __this_cpu_read(detector_cpu); + unsigned long hrint = watchdog_hrtimer_interrupts(cpu); + + /* + * The freq of hrtimer is fast than nmi interrupts and + * the core mustn't hangs if hrtimer still working. + * So update the nmi interrupts in hrtimer either to + * improved robustness of nmi counts check. + */ + watchdog_nmi_interrupts(); + + if (!pmu_based_nmi) + return; + + if (__this_cpu_read(hrint_saved) != hrint) { + __this_cpu_write(hrint_saved, hrint); + __this_cpu_write(hrint_missed, 0); + cpumask_clear_cpu(cpu, &corelockup_cpumask); + } else { + __this_cpu_inc(hrint_missed); + if (__this_cpu_read(hrint_missed) > 2) + cpumask_set_cpu(cpu, &corelockup_cpumask); + } + + if (likely(cpumask_empty(&corelockup_cpumask))) + close_wfi_wfe = 0; + else + close_wfi_wfe = 1; +} + /* * Before: first->next * After: first->[new]->next @@ -143,6 +200,9 @@ void corelockup_detector_offline_cpu(unsigned int cpu) unsigned int prev = nr_cpu_ids; unsigned int i;
+ /* clear bitmap */ + cpumask_clear_cpu(cpu, &corelockup_cpumask); + /* found prev cpu */ for_each_cpu_and(i, &watchdog_cpumask, cpu_online_mask) { if (per_cpu(detector_cpu, i) == cpu) { @@ -479,6 +539,9 @@ int __init hardlockup_detector_perf_init(void) perf_event_release_kernel(this_cpu_read(watchdog_ev)); this_cpu_write(watchdog_ev, NULL); } +#ifdef CONFIG_CORELOCKUP_DETECTOR + pmu_based_nmi = true; +#endif return ret; } #endif /* CONFIG_HARDLOCKUP_DETECTOR_PERF */
From: Dong Kai dongkai11@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4F3V1 CVE: NA
--------------------------------
Add the cmdline parameter "enable_corelockup_detector" to support enabling the core suspend detector. It is enabled by default as part of the ascend features.
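As a usage illustration, the parameter is simply appended to the kernel boot command line; the rest of this line is a made-up example:

    console=ttyAMA0 root=/dev/vda1 nmi_watchdog=1 enable_corelockup_detector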
Signed-off-by: Dong Kai dongkai11@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/nmi.h | 1 + kernel/watchdog.c | 12 ++++++++---- kernel/watchdog_hld.c | 18 ++++++++++++++---- 3 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 12570dadd5e9..6f13b1d7d61b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -131,6 +131,7 @@ extern void corelockup_detector_online_cpu(unsigned int cpu); extern void corelockup_detector_offline_cpu(unsigned int cpu); extern void watchdog_check_hrtimer(void); extern unsigned long watchdog_hrtimer_interrupts(unsigned int cpu); +extern bool enable_corelockup_detector; #endif
void watchdog_nmi_stop(void); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4fb4be9a1d0b..b5011d0a1174 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -366,7 +366,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
#ifdef CONFIG_CORELOCKUP_DETECTOR /* check hrtimer of detector cpu */ - watchdog_check_hrtimer(); + if (enable_corelockup_detector) + watchdog_check_hrtimer(); #endif
/* kick the hardlockup detector */ @@ -531,7 +532,8 @@ int lockup_detector_online_cpu(unsigned int cpu) if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) { watchdog_enable(cpu); #ifdef CONFIG_CORELOCKUP_DETECTOR - corelockup_detector_online_cpu(cpu); + if (enable_corelockup_detector) + corelockup_detector_online_cpu(cpu); #endif } return 0; @@ -542,7 +544,8 @@ int lockup_detector_offline_cpu(unsigned int cpu) if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) { watchdog_disable(cpu); #ifdef CONFIG_CORELOCKUP_DETECTOR - corelockup_detector_offline_cpu(cpu); + if (enable_corelockup_detector) + corelockup_detector_offline_cpu(cpu); #endif } return 0; @@ -775,6 +778,7 @@ void __init lockup_detector_init(void) nmi_watchdog_available = true; lockup_detector_setup(); #ifdef CONFIG_CORELOCKUP_DETECTOR - corelockup_detector_init(); + if (enable_corelockup_detector) + corelockup_detector_init(); #endif } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 5e58293e241a..665358df5172 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -93,6 +93,14 @@ static DEFINE_PER_CPU(unsigned long, hrint_missed); struct cpumask corelockup_cpumask __read_mostly; unsigned int close_wfi_wfe; static bool pmu_based_nmi; +bool enable_corelockup_detector; + +static int __init enable_corelockup_detector_setup(char *str) +{ + enable_corelockup_detector = true; + return 1; +} +__setup("enable_corelockup_detector", enable_corelockup_detector_setup);
static void watchdog_nmi_interrupts(void) { @@ -326,11 +334,13 @@ static inline bool watchdog_check_timestamp(void) void watchdog_hardlockup_check(struct pt_regs *regs) { #ifdef CONFIG_CORELOCKUP_DETECTOR - /* Kick nmi interrupts */ - watchdog_nmi_interrupts(); + if (enable_corelockup_detector) { + /* Kick nmi interrupts */ + watchdog_nmi_interrupts();
- /* corelockup check */ - watchdog_corelockup_check(regs); + /* corelockup check */ + watchdog_corelockup_check(regs); + } #endif
if (__this_cpu_read(watchdog_nmi_touch) == true) {
From: Xu Qiang xuqiang36@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4F3V1 CVE: NA
--------------------------------
Optimize the core lockup detection judgment rules to make them easier to understand.
Core suspension detection is performed in the hrtimer interrupt handler. The detection condition is that neither the hrtimer interrupt count nor the NMI interrupt count of the monitored core has been updated for multiple consecutive checks.
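A compact illustration of the combined rule (the more-than-five-checks thresholds are taken from the patch below; the helper itself is a hypothetical stand-in):

    #include <assert.h>

    /* A core is reported only when both its hrtimer count and its NMI count
     * have stalled for more than five consecutive checks. */
    static int core_suspected(unsigned long hrint_missed, unsigned long nmi_cnt_missed)
    {
            return hrint_missed > 5 && nmi_cnt_missed > 5;
    }

    int main(void)
    {
            assert(!core_suspected(6, 3)); /* NMIs still arriving: no report */
            assert(core_suspected(6, 6));  /* both stalled: report a core lockup */
            return 0;
    }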
Signed-off-by: Xu Qiang xuqiang36@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/barrier.h | 15 ----- kernel/watchdog_hld.c | 104 +++++++++---------------------- 2 files changed, 29 insertions(+), 90 deletions(-)
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 448c14392e81..37d891af8ea5 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -15,23 +15,8 @@ #define nops(n) asm volatile(__nops(n))
#define sev() asm volatile("sev" : : : "memory") -#ifdef CONFIG_CORELOCKUP_DETECTOR -extern unsigned int close_wfi_wfe; -#define wfe() \ - do { \ - if (likely(close_wfi_wfe == 0)) \ - asm volatile("wfe" : : : "memory"); \ - } while (0) -#define wfi() \ - do { \ - if (likely(close_wfi_wfe == 0)) \ - asm volatile("wfi" : : : "memory"); \ - } while (0) - -#else #define wfe() asm volatile("wfe" : : : "memory") #define wfi() asm volatile("wfi" : : : "memory") -#endif
#define isb() asm volatile("isb" : : : "memory") #define dmb(opt) asm volatile("dmb " #opt : : : "memory") diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 665358df5172..ee1fce089593 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -64,14 +64,9 @@ EXPORT_SYMBOL(arch_touch_nmi_watchdog); * * When using pmu events as nmi source, the pmu clock is disabled * under wfi/wfe mode. And the nmi can't respond periodically. - * To minimize the misjudgment by wfi/wfe, we adopt a simple method - * which to disable wfi/wfe at the right time and the watchdog hrtimer - * is a good baseline. - * - * The watchdog hrtimer is based on generate timer and has high freq - * than nmi. If watchdog hrtimer not works we disable wfi/wfe mode - * then the pmu nmi should always responds as long as the cpu core - * not suspend. + * However, when the core is suspended, the hrtimer interrupt and + * NMI interrupt cannot be received. This can be used as the basis + * for determining whether the core is suspended. * * detector_cpu: the target cpu to detector of current cpu * nmi_interrupts: the nmi counts of current cpu @@ -79,20 +74,14 @@ EXPORT_SYMBOL(arch_touch_nmi_watchdog); * nmi_cnt_missed: the nmi consecutive miss counts of detector_cpu * hrint_saved: saved hrtimer interrupts of detector_cpu * hrint_missed: the hrtimer consecutive miss counts of detector_cpu - * corelockup_cpumask/close_wfi_wfe: - * the cpu mask is set if certain cpu maybe fall in suspend and close - * wfi/wfe mode if any bit is set */ static DEFINE_PER_CPU(unsigned int, detector_cpu); static DEFINE_PER_CPU(unsigned long, nmi_interrupts); static DEFINE_PER_CPU(unsigned long, nmi_cnt_saved); static DEFINE_PER_CPU(unsigned long, nmi_cnt_missed); -static DEFINE_PER_CPU(bool, core_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrint_saved); static DEFINE_PER_CPU(unsigned long, hrint_missed); -struct cpumask corelockup_cpumask __read_mostly; -unsigned int close_wfi_wfe; -static bool pmu_based_nmi; +static unsigned long corelockup_allcpu_dumped; bool enable_corelockup_detector;
static int __init enable_corelockup_detector_setup(char *str) @@ -152,6 +141,11 @@ void watchdog_check_hrtimer(void) { unsigned int cpu = __this_cpu_read(detector_cpu); unsigned long hrint = watchdog_hrtimer_interrupts(cpu); + unsigned long nmi_int = per_cpu(nmi_interrupts, cpu); + + /* skip check if only one cpu online */ + if (cpu == smp_processor_id()) + return;
/* * The freq of hrtimer is fast than nmi interrupts and @@ -161,23 +155,31 @@ void watchdog_check_hrtimer(void) */ watchdog_nmi_interrupts();
- if (!pmu_based_nmi) - return; - if (__this_cpu_read(hrint_saved) != hrint) { __this_cpu_write(hrint_saved, hrint); __this_cpu_write(hrint_missed, 0); - cpumask_clear_cpu(cpu, &corelockup_cpumask); - } else { - __this_cpu_inc(hrint_missed); - if (__this_cpu_read(hrint_missed) > 2) - cpumask_set_cpu(cpu, &corelockup_cpumask); + return; + } + __this_cpu_inc(hrint_missed); + + if (__this_cpu_read(nmi_cnt_saved) != nmi_int) { + __this_cpu_write(nmi_cnt_saved, nmi_int); + __this_cpu_write(nmi_cnt_missed, 0); + return; } + __this_cpu_inc(nmi_cnt_missed);
- if (likely(cpumask_empty(&corelockup_cpumask))) - close_wfi_wfe = 0; - else - close_wfi_wfe = 1; + if ((__this_cpu_read(hrint_missed) > 5) && (__this_cpu_read(nmi_cnt_missed) > 5)) { + pr_emerg("Watchdog detected core LOCKUP on cpu %d\n", cpu); + + if (!test_and_set_bit(0, &corelockup_allcpu_dumped)) { + trigger_allbutself_cpu_backtrace(); + panic("Core LOCKUP"); + } else { + while (1) + cpu_relax(); + } + } }
/* @@ -208,9 +210,6 @@ void corelockup_detector_offline_cpu(unsigned int cpu) unsigned int prev = nr_cpu_ids; unsigned int i;
- /* clear bitmap */ - cpumask_clear_cpu(cpu, &corelockup_cpumask); - /* found prev cpu */ for_each_cpu_and(i, &watchdog_cpumask, cpu_online_mask) { if (per_cpu(detector_cpu, i) == cpu) { @@ -225,45 +224,6 @@ void corelockup_detector_offline_cpu(unsigned int cpu) /* prev->next */ corelockup_status_copy(cpu, prev); } - -static bool is_corelockup(unsigned int cpu) -{ - unsigned long nmi_int = per_cpu(nmi_interrupts, cpu); - - /* skip check if only one cpu online */ - if (cpu == smp_processor_id()) - return false; - - if (__this_cpu_read(nmi_cnt_saved) != nmi_int) { - __this_cpu_write(nmi_cnt_saved, nmi_int); - __this_cpu_write(nmi_cnt_missed, 0); - per_cpu(core_watchdog_warn, cpu) = false; - return false; - } - - __this_cpu_inc(nmi_cnt_missed); - if (__this_cpu_read(nmi_cnt_missed) > 2) - return true; - - return false; -} -NOKPROBE_SYMBOL(is_corelockup); - -static void watchdog_corelockup_check(struct pt_regs *regs) -{ - unsigned int cpu = __this_cpu_read(detector_cpu); - - if (is_corelockup(cpu)) { - if (per_cpu(core_watchdog_warn, cpu) == true) - return; - pr_emerg("Watchdog detected core LOCKUP on cpu %d\n", cpu); - - if (hardlockup_panic) - nmi_panic(regs, "Core LOCKUP"); - - per_cpu(core_watchdog_warn, cpu) = true; - } -} #endif
#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP @@ -337,9 +297,6 @@ void watchdog_hardlockup_check(struct pt_regs *regs) if (enable_corelockup_detector) { /* Kick nmi interrupts */ watchdog_nmi_interrupts(); - - /* corelockup check */ - watchdog_corelockup_check(regs); } #endif
@@ -549,9 +506,6 @@ int __init hardlockup_detector_perf_init(void) perf_event_release_kernel(this_cpu_read(watchdog_ev)); this_cpu_write(watchdog_ev, NULL); } -#ifdef CONFIG_CORELOCKUP_DETECTOR - pmu_based_nmi = true; -#endif return ret; } #endif /* CONFIG_HARDLOCKUP_DETECTOR_PERF */
From: Xu Qiang xuqiang36@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4F3V1 CVE: NA
--------------------------------
A user-mode interface is added to control the core lockup detection sensitivity.
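For illustration only (a minimal user-space sketch, not part of the patch): assuming the kern_table entry added below is exposed as /proc/sys/kernel/corelockup_thresh, the threshold could be tuned like this; values outside 3..5 are rejected by proc_dointvec_minmax.

    /* hypothetical user-space sketch: set the miss threshold to 4 */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/corelockup_thresh", "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            fprintf(f, "%d\n", 4);
            fclose(f);
            return 0;
    }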
Signed-off-by: Xu Qiang xuqiang36@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/nmi.h | 1 + kernel/sysctl.c | 13 +++++++++++++ kernel/watchdog_hld.c | 4 +++- 3 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 6f13b1d7d61b..0cc36b799df6 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -132,6 +132,7 @@ extern void corelockup_detector_offline_cpu(unsigned int cpu); extern void watchdog_check_hrtimer(void); extern unsigned long watchdog_hrtimer_interrupts(unsigned int cpu); extern bool enable_corelockup_detector; +extern int corelockup_miss_thresh; #endif
void watchdog_nmi_stop(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c7ca58de3b1b..3ab6ea7853ba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -113,7 +113,9 @@ static int sixty = 60;
static int __maybe_unused neg_one = -1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static int __maybe_unused four = 4; +static int __maybe_unused five = 5; static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; @@ -2405,6 +2407,17 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #endif +#ifdef CONFIG_CORELOCKUP_DETECTOR + { + .procname = "corelockup_thresh", + .data = &corelockup_miss_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &three, + .extra2 = &five, + }, +#endif #endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index ee1fce089593..8af4ab738ce9 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -83,6 +83,7 @@ static DEFINE_PER_CPU(unsigned long, hrint_saved); static DEFINE_PER_CPU(unsigned long, hrint_missed); static unsigned long corelockup_allcpu_dumped; bool enable_corelockup_detector; +int __read_mostly corelockup_miss_thresh = 5;
static int __init enable_corelockup_detector_setup(char *str) { @@ -169,7 +170,8 @@ void watchdog_check_hrtimer(void) } __this_cpu_inc(nmi_cnt_missed);
- if ((__this_cpu_read(hrint_missed) > 5) && (__this_cpu_read(nmi_cnt_missed) > 5)) { + if ((__this_cpu_read(hrint_missed) > corelockup_miss_thresh) + && (__this_cpu_read(nmi_cnt_missed) > corelockup_miss_thresh)) { pr_emerg("Watchdog detected core LOCKUP on cpu %d\n", cpu);
if (!test_and_set_bit(0, &corelockup_allcpu_dumped)) {
From: Xu Qiang xuqiang36@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4F3V1 CVE: NA
--------------------------------
When hard lockup detection is disabled, core lockup detection is not performed.
Signed-off-by: Xu Qiang xuqiang36@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/watchdog_hld.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 8af4ab738ce9..060873ff8a6d 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -148,6 +148,10 @@ void watchdog_check_hrtimer(void) if (cpu == smp_processor_id()) return;
+ /* return if hard lockup detector is disable */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return; + /* * The freq of hrtimer is fast than nmi interrupts and * the core mustn't hangs if hrtimer still working.
From: Xu Qiang xuqiang36@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4F3V1 CVE: NA
--------------------------------
Signed-off-by: Xu Qiang xuqiang36@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 34aa2da5bf15..863c72ad49a2 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6982,6 +6982,7 @@ CONFIG_PMU_WATCHDOG=y # end of ARM64 NMI watchdog configuration
CONFIG_HARDLOCKUP_DETECTOR=y +CONFIG_CORELOCKUP_DETECTOR=y # CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 CONFIG_DETECT_HUNG_TASK=y
From: yangerkun yangerkun@huawei.com
hulk inclusion category: bugfix bugzilla: 185798 https://gitee.com/openeuler/kernel/issues/I4JWYM CVE: NA
---------------------------
luojiajun reported a problem[1] two years ago which still seems to exist in mainline. vfs_fallocate() already guards against 'offset + len' overflowing, but 'offset + len + hpage_size - 1' may still overflow and produce a wrong 'end'. luojiajun's proposed fix corrects the wrong 'end' but leaves the overflow itself in place. Fix it with DIV_ROUND_UP_ULL.
[1] https://patchwork.kernel.org/project/linux-mm/patch/1554775226-67213-1-git-s...
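For illustration (a standalone sketch with hypothetical numbers, not kernel code): when 'offset + len' sits at the loff_t limit, the old signed round-up wraps, while the unsigned 64-bit round-up that DIV_ROUND_UP_ULL performs stays correct.

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
            long long hpage_size = 1LL << 21;          /* 2 MiB huge pages */
            int hpage_shift = 21;
            /* vfs_fallocate() only guarantees that offset + len fits in loff_t */
            unsigned long long off_plus_len = LLONG_MAX;

            /*
             * Old computation: "+ hpage_size - 1" pushes the signed sum past
             * LLONG_MAX, so 'end' goes negative (the wrap is reproduced here
             * via an explicit cast instead of signed-overflow UB).
             */
            long long bad_end = (long long)(off_plus_len + hpage_size - 1) >> hpage_shift;

            /* Fixed computation: round up in unsigned 64-bit, as DIV_ROUND_UP_ULL does */
            unsigned long long good_end = (off_plus_len + hpage_size - 1) / hpage_size;

            printf("buggy end: %lld\nfixed end: %llu\n", bad_end, good_end);
            return 0;
    }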
Signed-off-by: yangerkun yangerkun@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 56776b726d33..96c5f4c5ee6e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -655,7 +655,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as well as being converted to page offsets. */ start = offset >> hpage_shift; - end = (offset + len + hpage_size - 1) >> hpage_shift; + end = DIV_ROUND_UP_ULL(offset + len, hpage_size);
inode_lock(inode);
From: Luo Jiaxing luojiaxing@huawei.com
mainline inclusion from mainline-v5.11-rc1 commit 356b01a986a5550ee16dd0b85306c6741f2d02d5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4HX08 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------
This GPIO driver is for HiSilicon's ARM SoC.
HiSilicon's GPIO controller supports double-edge interrupts and multi-core concurrent access.
ACPI table example for this GPIO controller:

    Device (GPO0)
    {
        Name (_HID, "HISI0184")
        Device (PRTA)
        {
            Name (_ADR, Zero)
            Name (_UID, Zero)
            Name (_DSD, Package (0x01)
            {
                Package (0x02)
                {
                    "ngpios",
                    0x20
                }
            })
        }
    }
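As a hedged illustration of the double-edge support (not part of the patch; the "key" consumer name and wiring are hypothetical), a consumer driver could request both edges on a line from this controller roughly as follows; IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING maps to IRQ_TYPE_EDGE_BOTH, which this driver programs via the HISI_GPIO_INT_DEDGE_* registers.

    #include <linux/device.h>
    #include <linux/err.h>
    #include <linux/gpio/consumer.h>
    #include <linux/interrupt.h>

    static irqreturn_t key_isr(int irq, void *data)
    {
            /* called on both rising and falling edges */
            return IRQ_HANDLED;
    }

    static int key_setup(struct device *dev)
    {
            struct gpio_desc *desc;
            int irq;

            desc = devm_gpiod_get(dev, "key", GPIOD_IN);
            if (IS_ERR(desc))
                    return PTR_ERR(desc);

            irq = gpiod_to_irq(desc);
            if (irq < 0)
                    return irq;

            return devm_request_irq(dev, irq, key_isr,
                                    IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
                                    "key", NULL);
    }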
Signed-off-by: Luo Jiaxing luojiaxing@huawei.com Link: https://lore.kernel.org/r/1607934255-52544-2-git-send-email-luojiaxing@huawe... Signed-off-by: Linus Walleij linus.walleij@linaro.org Signed-off-by: Yihang Li liyihang6@hisilicon.com Reviewed-by: Qi Liu liuqi115@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + drivers/gpio/Kconfig | 11 + drivers/gpio/Makefile | 1 + drivers/gpio/gpio-hisi.c | 328 +++++++++++++++++++++++++ 4 files changed, 341 insertions(+) create mode 100644 drivers/gpio/gpio-hisi.c
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 863c72ad49a2..fe7123dd102a 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7142,3 +7142,4 @@ CONFIG_ETMEM_SWAP=m CONFIG_NET_VENDOR_RAMAXEL=y CONFIG_SPNIC=m CONFIG_SPFC=m +CONFIG_GPIO_HISI=y diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index d1300fc003ed..b6240ea130b9 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -296,6 +296,17 @@ config GPIO_GRGPIO Select this to support Aeroflex Gaisler GRGPIO cores from the GRLIB VHDL IP core library.
+config GPIO_HISI + tristate "HiSilicon GPIO controller driver" + depends on (ARM64 || COMPILE_TEST) && ACPI + select GPIO_GENERIC + select GPIOLIB_IRQCHIP + help + Say Y or M here to build support for the HiSilicon GPIO controller + driver GPIO block. + This GPIO controller support double-edge interrupt and multi-core + concurrent access. + config GPIO_HLWD tristate "Nintendo Wii (Hollywood) GPIO" depends on OF_GPIO diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 09dada80ac34..260ae251317c 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_GPIO_GE_FPGA) += gpio-ge.o obj-$(CONFIG_GPIO_GPIO_MM) += gpio-gpio-mm.o obj-$(CONFIG_GPIO_GRGPIO) += gpio-grgpio.o obj-$(CONFIG_GPIO_GW_PLD) += gpio-gw-pld.o +obj-$(CONFIG_GPIO_HISI) += gpio-hisi.o obj-$(CONFIG_GPIO_HLWD) += gpio-hlwd.o obj-$(CONFIG_HTC_EGPIO) += gpio-htc-egpio.o obj-$(CONFIG_GPIO_ICH) += gpio-ich.o diff --git a/drivers/gpio/gpio-hisi.c b/drivers/gpio/gpio-hisi.c new file mode 100644 index 000000000000..a3897800f811 --- /dev/null +++ b/drivers/gpio/gpio-hisi.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 HiSilicon Limited. */ +#include <linux/gpio/driver.h> +#include <linux/module.h> +#include <linux/mod_devicetable.h> +#include <linux/platform_device.h> +#include <linux/property.h> + +#define HISI_GPIO_SWPORT_DR_SET_WX 0x000 +#define HISI_GPIO_SWPORT_DR_CLR_WX 0x004 +#define HISI_GPIO_SWPORT_DDR_SET_WX 0x010 +#define HISI_GPIO_SWPORT_DDR_CLR_WX 0x014 +#define HISI_GPIO_SWPORT_DDR_ST_WX 0x018 +#define HISI_GPIO_INTEN_SET_WX 0x020 +#define HISI_GPIO_INTEN_CLR_WX 0x024 +#define HISI_GPIO_INTMASK_SET_WX 0x030 +#define HISI_GPIO_INTMASK_CLR_WX 0x034 +#define HISI_GPIO_INTTYPE_EDGE_SET_WX 0x040 +#define HISI_GPIO_INTTYPE_EDGE_CLR_WX 0x044 +#define HISI_GPIO_INT_POLARITY_SET_WX 0x050 +#define HISI_GPIO_INT_POLARITY_CLR_WX 0x054 +#define HISI_GPIO_DEBOUNCE_SET_WX 0x060 +#define HISI_GPIO_DEBOUNCE_CLR_WX 0x064 +#define HISI_GPIO_INTSTATUS_WX 0x070 +#define HISI_GPIO_PORTA_EOI_WX 0x078 +#define HISI_GPIO_EXT_PORT_WX 0x080 +#define HISI_GPIO_INTCOMB_MASK_WX 0x0a0 +#define HISI_GPIO_INT_DEDGE_SET 0x0b0 +#define HISI_GPIO_INT_DEDGE_CLR 0x0b4 +#define HISI_GPIO_INT_DEDGE_ST 0x0b8 + +#define HISI_GPIO_LINE_NUM_MAX 32 +#define HISI_GPIO_DRIVER_NAME "gpio-hisi" + +struct hisi_gpio { + struct gpio_chip chip; + struct device *dev; + void __iomem *reg_base; + unsigned int line_num; + struct irq_chip irq_chip; + int irq; +}; + +static inline u32 hisi_gpio_read_reg(struct gpio_chip *chip, + unsigned int off) +{ + struct hisi_gpio *hisi_gpio = + container_of(chip, struct hisi_gpio, chip); + void __iomem *reg = hisi_gpio->reg_base + off; + + return readl(reg); +} + +static inline void hisi_gpio_write_reg(struct gpio_chip *chip, + unsigned int off, u32 val) +{ + struct hisi_gpio *hisi_gpio = + container_of(chip, struct hisi_gpio, chip); + void __iomem *reg = hisi_gpio->reg_base + off; + + writel(val, reg); +} + +static void hisi_gpio_set_debounce(struct gpio_chip *chip, unsigned int off, + u32 debounce) +{ + if (debounce) + hisi_gpio_write_reg(chip, HISI_GPIO_DEBOUNCE_SET_WX, BIT(off)); + else + hisi_gpio_write_reg(chip, HISI_GPIO_DEBOUNCE_CLR_WX, BIT(off)); +} + +static int hisi_gpio_set_config(struct gpio_chip *chip, unsigned int offset, + unsigned long config) +{ + u32 config_para = pinconf_to_config_param(config); + u32 config_arg; + + switch (config_para) { + case PIN_CONFIG_INPUT_DEBOUNCE: + config_arg = pinconf_to_config_argument(config); + 
hisi_gpio_set_debounce(chip, offset, config_arg); + break; + default: + return -ENOTSUPP; + } + + return 0; +} + +static void hisi_gpio_set_ack(struct irq_data *d) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(d); + + hisi_gpio_write_reg(chip, HISI_GPIO_PORTA_EOI_WX, BIT(irqd_to_hwirq(d))); +} + +static void hisi_gpio_irq_set_mask(struct irq_data *d) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(d); + + hisi_gpio_write_reg(chip, HISI_GPIO_INTMASK_SET_WX, BIT(irqd_to_hwirq(d))); +} + +static void hisi_gpio_irq_clr_mask(struct irq_data *d) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(d); + + hisi_gpio_write_reg(chip, HISI_GPIO_INTMASK_CLR_WX, BIT(irqd_to_hwirq(d))); +} + +static int hisi_gpio_irq_set_type(struct irq_data *d, u32 type) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(d); + unsigned int mask = BIT(irqd_to_hwirq(d)); + + switch (type) { + case IRQ_TYPE_EDGE_BOTH: + hisi_gpio_write_reg(chip, HISI_GPIO_INT_DEDGE_SET, mask); + break; + case IRQ_TYPE_EDGE_RISING: + hisi_gpio_write_reg(chip, HISI_GPIO_INTTYPE_EDGE_SET_WX, mask); + hisi_gpio_write_reg(chip, HISI_GPIO_INT_POLARITY_SET_WX, mask); + break; + case IRQ_TYPE_EDGE_FALLING: + hisi_gpio_write_reg(chip, HISI_GPIO_INTTYPE_EDGE_SET_WX, mask); + hisi_gpio_write_reg(chip, HISI_GPIO_INT_POLARITY_CLR_WX, mask); + break; + case IRQ_TYPE_LEVEL_HIGH: + hisi_gpio_write_reg(chip, HISI_GPIO_INTTYPE_EDGE_CLR_WX, mask); + hisi_gpio_write_reg(chip, HISI_GPIO_INT_POLARITY_SET_WX, mask); + break; + case IRQ_TYPE_LEVEL_LOW: + hisi_gpio_write_reg(chip, HISI_GPIO_INTTYPE_EDGE_CLR_WX, mask); + hisi_gpio_write_reg(chip, HISI_GPIO_INT_POLARITY_CLR_WX, mask); + break; + default: + return -EINVAL; + } + + /* + * The dual-edge interrupt and other interrupt's registers do not + * take effect at the same time. The registers of the two-edge + * interrupts have higher priorities, the configuration of + * the dual-edge interrupts must be disabled before the configuration + * of other kind of interrupts. 
+ */ + if (type != IRQ_TYPE_EDGE_BOTH) { + unsigned int both = hisi_gpio_read_reg(chip, HISI_GPIO_INT_DEDGE_ST); + + if (both & mask) + hisi_gpio_write_reg(chip, HISI_GPIO_INT_DEDGE_CLR, mask); + } + + if (type & IRQ_TYPE_LEVEL_MASK) + irq_set_handler_locked(d, handle_level_irq); + else if (type & IRQ_TYPE_EDGE_BOTH) + irq_set_handler_locked(d, handle_edge_irq); + + return 0; +} + +static void hisi_gpio_irq_enable(struct irq_data *d) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(d); + + hisi_gpio_irq_clr_mask(d); + hisi_gpio_write_reg(chip, HISI_GPIO_INTEN_SET_WX, BIT(irqd_to_hwirq(d))); +} + +static void hisi_gpio_irq_disable(struct irq_data *d) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(d); + + hisi_gpio_irq_set_mask(d); + hisi_gpio_write_reg(chip, HISI_GPIO_INTEN_CLR_WX, BIT(irqd_to_hwirq(d))); +} + +static void hisi_gpio_irq_handler(struct irq_desc *desc) +{ + struct hisi_gpio *hisi_gpio = irq_desc_get_handler_data(desc); + unsigned long irq_msk = hisi_gpio_read_reg(&hisi_gpio->chip, + HISI_GPIO_INTSTATUS_WX); + struct irq_chip *irq_c = irq_desc_get_chip(desc); + int hwirq; + + chained_irq_enter(irq_c, desc); + for_each_set_bit(hwirq, &irq_msk, HISI_GPIO_LINE_NUM_MAX) + generic_handle_irq(irq_find_mapping(hisi_gpio->chip.irq.domain, + hwirq)); + chained_irq_exit(irq_c, desc); +} + +static void hisi_gpio_init_irq(struct hisi_gpio *hisi_gpio) +{ + struct gpio_chip *chip = &hisi_gpio->chip; + struct gpio_irq_chip *girq_chip = &chip->irq; + + /* Set hooks for irq_chip */ + hisi_gpio->irq_chip.irq_ack = hisi_gpio_set_ack; + hisi_gpio->irq_chip.irq_mask = hisi_gpio_irq_set_mask; + hisi_gpio->irq_chip.irq_unmask = hisi_gpio_irq_clr_mask; + hisi_gpio->irq_chip.irq_set_type = hisi_gpio_irq_set_type; + hisi_gpio->irq_chip.irq_enable = hisi_gpio_irq_enable; + hisi_gpio->irq_chip.irq_disable = hisi_gpio_irq_disable; + + girq_chip->chip = &hisi_gpio->irq_chip; + girq_chip->default_type = IRQ_TYPE_NONE; + girq_chip->num_parents = 1; + girq_chip->parents = &hisi_gpio->irq; + girq_chip->parent_handler = hisi_gpio_irq_handler; + girq_chip->parent_handler_data = hisi_gpio; + + /* Clear Mask of GPIO controller combine IRQ */ + hisi_gpio_write_reg(chip, HISI_GPIO_INTCOMB_MASK_WX, 1); +} + +static const struct acpi_device_id hisi_gpio_acpi_match[] = { + {"HISI0184", 0}, + {} +}; +MODULE_DEVICE_TABLE(acpi, hisi_gpio_acpi_match); + +static void hisi_gpio_get_pdata(struct device *dev, + struct hisi_gpio *hisi_gpio) +{ + struct platform_device *pdev = to_platform_device(dev); + struct fwnode_handle *fwnode; + int idx = 0; + + device_for_each_child_node(dev, fwnode) { + /* Cycle for once, no need for an array to save line_num */ + if (fwnode_property_read_u32(fwnode, "ngpios", + &hisi_gpio->line_num)) { + dev_err(dev, + "failed to get number of lines for port%d and use default value instead\n", + idx); + hisi_gpio->line_num = HISI_GPIO_LINE_NUM_MAX; + } + + if (WARN_ON(hisi_gpio->line_num > HISI_GPIO_LINE_NUM_MAX)) + hisi_gpio->line_num = HISI_GPIO_LINE_NUM_MAX; + + hisi_gpio->irq = platform_get_irq(pdev, idx); + + dev_info(dev, + "get hisi_gpio[%d] with %d lines\n", idx, + hisi_gpio->line_num); + + idx++; + } +} + +static int hisi_gpio_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + void __iomem *dat, *set, *clr; + struct hisi_gpio *hisi_gpio; + int port_num; + int ret; + + /* + * One GPIO controller own one port currently, + * if we get more from ACPI table, return error. 
+ */ + port_num = device_get_child_node_count(dev); + if (WARN_ON(port_num != 1)) + return -ENODEV; + + hisi_gpio = devm_kzalloc(dev, sizeof(*hisi_gpio), GFP_KERNEL); + if (!hisi_gpio) + return -ENOMEM; + + hisi_gpio->reg_base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(hisi_gpio->reg_base)) + return PTR_ERR(hisi_gpio->reg_base); + + hisi_gpio_get_pdata(dev, hisi_gpio); + + hisi_gpio->dev = dev; + + dat = hisi_gpio->reg_base + HISI_GPIO_EXT_PORT_WX; + set = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_SET_WX; + clr = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX; + + ret = bgpio_init(&hisi_gpio->chip, hisi_gpio->dev, 0x4, + hisi_gpio->reg_base + HISI_GPIO_EXT_PORT_WX, + hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_SET_WX, + hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX, + hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_SET_WX, + hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_CLR_WX, + BGPIOF_NO_SET_ON_INPUT); + if (ret) { + dev_err(dev, "failed to init, ret = %d\n", ret); + return ret; + } + + hisi_gpio->chip.set_config = hisi_gpio_set_config; + hisi_gpio->chip.ngpio = hisi_gpio->line_num; + hisi_gpio->chip.bgpio_dir_unreadable = 1; + hisi_gpio->chip.base = -1; + + if (hisi_gpio->irq > 0) + hisi_gpio_init_irq(hisi_gpio); + + ret = devm_gpiochip_add_data(dev, &hisi_gpio->chip, hisi_gpio); + if (ret) { + dev_err(dev, "failed to register gpiochip, ret = %d\n", ret); + return ret; + } + + return 0; +} + +static struct platform_driver hisi_gpio_driver = { + .driver = { + .name = HISI_GPIO_DRIVER_NAME, + .acpi_match_table = hisi_gpio_acpi_match, + }, + .probe = hisi_gpio_probe, +}; + +module_platform_driver(hisi_gpio_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Luo Jiaxing luojiaxing@huawei.com"); +MODULE_DESCRIPTION("HiSilicon GPIO controller driver"); +MODULE_ALIAS("platform:" HISI_GPIO_DRIVER_NAME);