v5 - 增加了两行注释 /* AREP will return a positive error code to represent error status */ /* AQRY will return a positive error code to represent error status */
v4 - remove "Offering HULK" to pass CI patch format check
v3
- Add #include <linux/mm.h> to avoid compile failure in arch x86_64
v2: - Change "u64 pfn" to "unsigned long pfn" to avoid a compile error - Add two stubs to prettify the code
在 2023/11/6 11:10, Zhang Zekun 写道:
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8CCP5
Add support for ACLS hot repair to the HBM memory device. The patch adds two methods for userspace:
- query whether a paddr supports ACLS repair
- repair a paddr in the HBM memory device
The feature of ACLS hot repair can help to fix a memory error from userspace by passing through the error physical address to HBM hardware.
Signed-off-by: Zhang Zekun <zhangzekun11@huawei.com>
drivers/soc/hisilicon/Kconfig | 10 +++ drivers/soc/hisilicon/hisi_hbmdev.c | 134 ++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+)
diff --git a/drivers/soc/hisilicon/Kconfig b/drivers/soc/hisilicon/Kconfig index 383375f20cac..45754528873e 100644 --- a/drivers/soc/hisilicon/Kconfig +++ b/drivers/soc/hisilicon/Kconfig @@ -44,4 +44,14 @@ config HISI_HBMCACHE To compile the driver as a module, choose M here: the module will be called hisi_hbmcache.
+config HISI_HBMDEV_ACLS
- bool "Add support for HISI ACLS repair"
- depends on HISI_HBMDEV
- help
Add ACLS support for the HBM device, which can be used to query and
repair hardware errors in HBM devices. This feature needs to work with
hardware firmware.
If not sure say no.
- endmenu
diff --git a/drivers/soc/hisilicon/hisi_hbmdev.c b/drivers/soc/hisilicon/hisi_hbmdev.c index 5b6b1618148c..a9cc78bde81b 100644 --- a/drivers/soc/hisilicon/hisi_hbmdev.c +++ b/drivers/soc/hisilicon/hisi_hbmdev.c @@ -11,6 +11,7 @@ #include <linux/node.h> #include <linux/arch_topology.h> #include <linux/memory_hotplug.h> +#include <linux/mm.h>
#include "hisi_internal.h"
@@ -25,6 +26,9 @@ struct cdev_node { struct memory_dev { struct kobject *memdev_kobj; struct kobject *topo_kobj; +#ifdef CONFIG_HISI_HBMDEV_ACLS
- struct kobject *acls_kobj;
+#endif struct cdev_node cdev_list; nodemask_t cluster_cpumask[MAX_NUMNODES]; }; @@ -85,6 +89,134 @@ static void memory_topo_init(void) kobject_put(mdev->topo_kobj); }
+#ifdef CONFIG_HISI_HBMDEV_ACLS
+/*
+ * Look up the ACPI device associated with a physical address.
+ *
+ * The address is converted to a page frame number, the frame to a node id,
+ * and the node id is used as the index into hotplug_mdev[].
+ * NOTE(review): hotplug_mdev[] is defined outside this hunk; it is assumed
+ * to hold at least MAX_NUMNODES entries - confirm.
+ *
+ * Returns hotplug_mdev[nid], or NULL when the page frame is not valid or the
+ * node id is out of range.
+ */
+static struct acpi_device *paddr_to_acpi_device(u64 paddr)
+{
+	unsigned long pfn;
+	int nid;
+
+	pfn = __phys_to_pfn(paddr);
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	nid = pfn_to_nid(pfn);
+	/*
+	 * Reject nid when either bound is violated. Joining the two
+	 * comparisons with && can never be true and would let an
+	 * out-of-range nid index the array.
+	 */
+	if (nid < 0 || nid >= MAX_NUMNODES)
+		return NULL;
+
+	return hotplug_mdev[nid];
+}
+/*
+ * Parse a hexadecimal physical address from @buf and evaluate the ACPI
+ * method @method ("AQRY" or "AREP") on the device that
+ * paddr_to_acpi_device() returns for it, passing the address as the single
+ * integer argument.
+ *
+ * Returns 0 when the method reports 0, otherwise a negative errno:
+ *   -EINVAL  @buf is not a valid hex number, or no device was found
+ *   -ENODEV  the ACPI evaluation failed, or the method returned a non-zero
+ *            value outside the errno range
+ *   -res     the method returned a positive value inside the errno range
+ */
+static int acls_evaluate(const char *buf, acpi_string method)
+{
+	struct acpi_object_list arg_list;
+	struct acpi_device *adev;
+	union acpi_object obj;
+	acpi_status status;
+	u64 paddr, res;
+
+	if (kstrtoull(buf, 16, &paddr))
+		return -EINVAL;
+
+	adev = paddr_to_acpi_device(paddr);
+	if (!adev)
+		return -EINVAL;
+
+	obj.type = ACPI_TYPE_INTEGER;
+	obj.integer.value = paddr;
+	arg_list.count = 1;
+	arg_list.pointer = &obj;
+
+	status = acpi_evaluate_integer(adev->handle, method, &arg_list, &res);
+	if (ACPI_FAILURE(status))
+		return -ENODEV;
+
+	/* AQRY and AREP return a positive error code to represent error status */
+	if (IS_ERR_VALUE(-res))
+		return -res;
+	else if (res)
+		return -ENODEV;
+
+	return 0;
+}
+
+/*
+ * sysfs store handler for "acls_query": evaluates the AQRY method for the
+ * hexadecimal physical address written by userspace.
+ * Returns @count on success or a negative errno.
+ */
+static ssize_t acls_query_store(struct kobject *kobj,
+				struct kobj_attribute *attr,
+				const char *buf, size_t count)
+{
+	int ret;
+
+	ret = acls_evaluate(buf, "AQRY");
+	if (ret)
+		return ret;
+
+	return count;
+}
+static struct kobj_attribute acls_query_store_attribute =
+	__ATTR(acls_query, 0200, NULL, acls_query_store);
+
+/*
+ * sysfs store handler for "acls_repair": evaluates the AREP method for the
+ * hexadecimal physical address written by userspace.
+ * Returns @count on success or a negative errno.
+ */
+static ssize_t acls_repair_store(struct kobject *kobj,
+				 struct kobj_attribute *attr,
+				 const char *buf, size_t count)
+{
+	int ret;
+
+	ret = acls_evaluate(buf, "AREP");
+	if (ret)
+		return ret;
+
+	return count;
+}
+static struct kobj_attribute acls_repair_store_attribute =
+	__ATTR(acls_repair, 0200, NULL, acls_repair_store);
+/* NULL-terminated list of the two write-only (0200) ACLS sysfs attributes. */
+static struct attribute *acls_attrs[] = {
- &acls_query_store_attribute.attr,
- &acls_repair_store_attribute.attr,
- NULL,
+};
+/* Attribute group wrapping acls_attrs; acls_init() registers it on the "acls" kobject. */
+static struct attribute_group acls_attr_group = {
- .attrs = acls_attrs,
+};
+/*
+ * Create the "acls" kobject under mdev->memdev_kobj and register the ACLS
+ * attribute group on it. Nothing is returned to the caller: on any failure
+ * an error is logged and mdev->acls_kobj is left NULL.
+ */
+static void acls_init(void)
+{
+	int ret = -ENOMEM;
+
+	mdev->acls_kobj = kobject_create_and_add("acls", mdev->memdev_kobj);
+	if (!mdev->acls_kobj)
+		goto out;
+
+	ret = sysfs_create_group(mdev->acls_kobj, &acls_attr_group);
+	if (ret) {
+		kobject_put(mdev->acls_kobj);
+		/*
+		 * Clear the pointer so that acls_remove() does not drop the
+		 * reference a second time at module exit.
+		 */
+		mdev->acls_kobj = NULL;
+	}
+
+out:
+	if (ret)
+		pr_err("ACLS hot repair is not enabled\n");
+}
+
+/*
+ * Drop the reference on the "acls" kobject. mdev->acls_kobj is NULL when
+ * acls_init() failed; kobject_put() ignores a NULL kobject.
+ */
+static void acls_remove(void)
+{
+	kobject_put(mdev->acls_kobj);
+}
+#else
+static void acls_init(void) {}
+static void acls_remove(void) {}
+#endif
- static int get_pxm(struct acpi_device *acpi_device, void *arg) { acpi_handle handle = acpi_device->handle;
@@ -284,6 +416,7 @@ static int __init mdev_init(void) }
memory_topo_init();
- acls_init(); return ret; } module_init(mdev_init);
@@ -293,6 +426,7 @@ static void __exit mdev_exit(void) container_remove(); kobject_put(mdev->memdev_kobj); kobject_put(mdev->topo_kobj);
- acls_remove(); kfree(mdev); } module_exit(mdev_exit);