[Linuxarm] Re: [PATCH v6 1/1] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode

7 Jun 2021

Hi Tanxiaofei,
Please find one comment inline below.
...
-----Original Message-----
From: tanxiaofei
Sent: 05 June 2021 11:16
To: Jonathan Cameron jonathan.cameron@huawei.com; Guohanjun
(Hanjun Guo) guohanjun@huawei.com; Shiju Jose
shiju.jose@huawei.com
Cc: linuxarm@openeuler.org; tanxiaofei tanxiaofei@huawei.com
Subject: [PATCH v6 1/1] ACPI / APEI: fix the regression of synchronous
external aborts occur in user-mode
Before commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
synchronise with APEI's irq work"), do_sea() would unconditionally signal the
affected task from the arch code. Since that change, the GHES driver sends
the signals.
This exposes a problem as errors the GHES driver doesn't understand or
doesn't handle effectively are silently ignored. This causes the errors to be
taken again and to circulate endlessly, so the user-space task gets stuck in this loop.
Existing firmware on Kunpeng9xx systems reports cache errors with the 'ARM
Processor Error' CPER records.
Do memory failure handling for ARM Processor Error Section just like for
Memory Error Section.
Signed-off-by: Xiaofei Tan tanxiaofei@huawei.com
Reviewed-by: James Morse james.morse@arm.com

Changes since v5:

Do some changes following James's suggestions: 1) optimize commit log;
2) check error info length; 3) some coding style advice.

Changes since v4:


Change the patch name from " ACPI / APEI: do memory failure on the



physical address reported by ARM processor error section" to this more
proper one.


Add a comment in the code to tell why not filter out corrected error in an



uncorrected section.
Changes since v3:

Print unhandled error following James Morse's advice.

Changes since v2:

Updated commit log


drivers/acpi/apei/ghes.c | 85
++++++++++++++++++++++++++++++++++++++----------
1 file changed, 68 insertions(+), 17 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index
fce7ade..da25870 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct
callback_head *head)
   gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
node_len);  }
-static bool ghes_handle_memory_failure(struct acpi_hest_generic_data
*gdata,

		       int sev)



+static bool ghes_do_memory_failure(u64 physical_addr, int flags)
{
   unsigned long pfn;

int flags = -1;
int sec_sev = ghes_severity(gdata->error_severity);
struct cper_sec_mem_err *mem_err =

acpi_hest_get_payload(gdata);
if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
   	return false;

if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return false;



pfn = mem_err->physical_addr >> PAGE_SHIFT;


pfn = PHYS_PFN(physical_addr);
if (!pfn_valid(pfn)) {
pr_warn_ratelimited(FW_WARN GHES_PFX
"Invalid address in generic error data: %#llx\n",


mem_err->physical_addr);




physical_addr);

return false;
}

memory_failure_queue(pfn, flags);

return true;


+}



+static bool ghes_handle_memory_failure(struct acpi_hest_generic_data
*gdata,

		       int sev)



+{

int flags = -1;
int sec_sev = ghes_severity(gdata->error_severity);
struct cper_sec_mem_err *mem_err =

acpi_hest_get_payload(gdata);


if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return false;


/* iff following two events can be handled properly by now */
 if (sec_sev == GHES_SEV_CORRECTED &&
     (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))

@@ -470,14 +477,60 @@ static bool ghes_handle_memory_failure(struct
acpi_hest_generic_data *gdata,
   if (sev == GHES_SEV_RECOVERABLE && sec_sev ==
GHES_SEV_RECOVERABLE)
   	flags = 0;

if (flags != -1) {
memory_failure_queue(pfn, flags);


return true;


}


if (flags != -1)
return ghes_do_memory_failure(mem_err->physical_addr,



flags);
return false;
}
+static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data
+*gdata, int sev) {

struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
struct cper_arm_err_info *err_info;
bool queued = false;
int sec_sev, i;

log_arm_hw_error(err);

sec_sev = ghes_severity(gdata->error_severity);
if (sev != GHES_SEV_RECOVERABLE || sec_sev !=

GHES_SEV_RECOVERABLE)

return false;



err_info = (struct cper_arm_err_info *)(err + 1);
for (i = 0; i < err->err_info_num; i++, err_info++) {
bool is_cache = (err_info->type ==



CPER_ARM_CACHE_ERROR);

bool has_pa = (err_info->validation_bits &



CPER_ARM_INFO_VALID_PHYSICAL_ADDR);

const char *error_type = "unknown error";



if (err_info->length != sizeof(struct cper_arm_err_info)) {


	pr_warn_ratelimited(FW_WARN GHES_PFX


			    "Error info length %d is invalid\n",


			    err_info->length);


	break;


}



I think there is a problem when an old firmware fills err_info->length according to the previous
"ARM Processor Error Information Structure" while the latest kernel uses the newly expanded
"ARM Processor Error Information Structure". In that case the check
"if (err_info->length != sizeof(struct cper_arm_err_info))" would evaluate to true and break out of the loop,
and then ghes_do_memory_failure() would not be called to handle the error.
So would "if (err_info->length > sizeof(struct cper_arm_err_info))" be the right check instead?
...


/*


 * The field (err_info->error_info & BIT(26)) is fixed to set to


 * 1 in some old firmware of HiSilicon Kunpeng920. We



assume that

 * firmware won't mix corrected errors in an uncorrected



section,

 * and don't filter out 'corrected' error here.


 */


if (is_cache && has_pa) {


	queued = ghes_do_memory_failure(err_info-



...
physical_fault_addr, 0);

	continue;


}



if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))


	error_type = cper_proc_error_type_strs[err_info-



...
type];


pr_warn_ratelimited(FW_WARN GHES_PFX


		    "Unhandled processor error type: %s\n",


		    error_type);


}

return queued;

+}



/*

PCIe AER errors need to be sent to the AER driver for reporting and
recovery. The GHES severities map to the following AER severities and @@

-605,9 +658,7 @@ static bool ghes_do_proc(struct ghes *ghes,
   		ghes_handle_aer(gdata);
   	}
   	else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {

	struct cper_sec_proc_arm *err =



acpi_hest_get_payload(gdata);


	log_arm_hw_error(err);




	queued = ghes_handle_arm_hw_error(gdata, sev);

} else {
	void *err = acpi_hest_get_payload(gdata);

--
2.8.1
Thanks,
Shiju

    

2025

2024

2023

2022

2021

2020

[Linuxarm] Re: [PATCH v6 1/1] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode