Kernel
Threads by month
- ----- 2025 -----
- July
- June
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- 57 participants
- 19203 discussions
From: zhuling <zhuling8(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: NA
Enabled e820_pmem in arm64:
Use memmap=nn[KMG]!ss[KMG] to reserve memory for persistent storage.
When the kernel is restarted or updated, the data in PMEM will not be lost
and can be loaded faster. This is a general feature.
To use this feature, do the following:
1. Reserve memory: add a memmap option to grub.cfg,
memmap=nn[KMG]!ss[KMG], e.g. memmap=100K!0x1a0000000.
2. Load the nd_e820 module: modprobe nd_e820.
3. Check for the pmem device in /dev, e.g. /dev/pmem0.
Signed-off-by: zhuling <zhuling8(a)huawei.com>
---
arch/arm64/Kconfig | 24 ++++++++++
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/pmem.c | 35 ++++++++++++++
arch/arm64/kernel/setup.c | 6 +++
arch/arm64/mm/init.c | 98 ++++++++++++++++++++++++++++++++++++++
drivers/nvdimm/Makefile | 1 +
include/linux/mm.h | 4 ++
7 files changed, 169 insertions(+)
create mode 100644 arch/arm64/kernel/pmem.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b9c56543c..f1e05d9d2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1141,6 +1141,30 @@ config XEN_DOM0
def_bool y
depends on XEN
+config ARM64_PMEM_LEGACY_DEVICE
+ bool
+
+config ARM64_PMEM_RESERVE
+ bool "reserve memory for persistent storage"
+ default y
+ help
+ Use memmap=nn[KMG]!ss[KMG] (e.g. memmap=100K!0x1a0000000) to reserve
+ memory for persistent storage.
+
+ Say y here to enable this feature.
+
+config ARM64_PMEM_LEGACY
+ tristate "create persistent storage"
+ depends on ARM64_PMEM_RESERVE
+ depends on BLK_DEV
+ select ARM64_PMEM_LEGACY_DEVICE
+ select LIBNVDIMM
+ help
+ Use reserved memory for persistent storage. When the kernel is restarted
+ or updated, the data in PMEM will not be lost and can be loaded faster.
+
+ Say y if unsure.
+
config XEN
bool "Xen guest support on ARM64"
depends on ARM64 && OF
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2621d5c2b..c363639b8 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o
obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-$(CONFIG_ARM64_MTE) += mte.o
+obj-$(CONFIG_ARM64_PMEM_LEGACY_DEVICE) += pmem.o
obj-y += vdso/ probes/
obj-$(CONFIG_COMPAT_VDSO) += vdso32/
diff --git a/arch/arm64/kernel/pmem.c b/arch/arm64/kernel/pmem.c
new file mode 100644
index 000000000..16eaf706f
--- /dev/null
+++ b/arch/arm64/kernel/pmem.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright(c) 2021 Huawei Technologies Co., Ltd
+ *
+ * Derived from the x86 implementation; implements legacy PMEM on arm64.
+ */
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+
+/*
+ * Callback handed to walk_iomem_res_desc() by register_e820_pmem().
+ * It ignores both arguments and always returns 1.
+ */
+static int found(struct resource *res, void *data)
+{
+ return 1;
+}
+
+/*
+ * Register the "e820_pmem" platform device when the iomem resource tree
+ * contains an IORES_DESC_PERSISTENT_MEMORY_LEGACY range.
+ *
+ * Returns 0 when the resource walk yields a value <= 0 (nothing to
+ * register), -ENOMEM when the device cannot be allocated, otherwise the
+ * result of platform_device_add().
+ */
+static int __init register_e820_pmem(void)
+{
+	struct platform_device *pdev;
+	int rc;
+
+	rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+				 IORESOURCE_MEM, 0, -1, NULL, found);
+	if (rc <= 0)
+		return 0;
+
+	/*
+	 * See drivers/nvdimm/e820.c for the implementation, this is
+	 * simply here to trigger the module to load on demand.
+	 */
+	pdev = platform_device_alloc("e820_pmem", -1);
+	if (!pdev)
+		return -ENOMEM;
+
+	rc = platform_device_add(pdev);
+	if (rc)
+		/* adding failed: drop the reference obtained from the alloc */
+		platform_device_put(pdev);
+
+	return rc;
+}
+device_initcall(register_e820_pmem);
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 6aff30de8..7f506036d 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -255,6 +255,12 @@ static void __init request_standard_resources(void)
request_resource(res, &crashk_res);
#endif
}
+
+#ifdef CONFIG_ARM64_PMEM_RESERVE
+ if (pmem_res.end && pmem_res.start)
+ request_resource(&iomem_resource, &pmem_res);
+#endif
+
}
static int __init reserve_memblock_reserved_regions(void)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 794f992cb..e4dc19145 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -63,6 +63,18 @@ EXPORT_SYMBOL(memstart_addr);
phys_addr_t arm64_dma_phys_limit __ro_after_init;
phys_addr_t arm64_dma32_phys_limit __ro_after_init;
+static unsigned long long pmem_size, pmem_start;
+
+#ifdef CONFIG_ARM64_PMEM_RESERVE
+/*
+ * Resource describing the legacy persistent-memory range.
+ * start and end are zero here; reserve_pmem() fills them in from the
+ * values parsed out of the memmap= option.
+ */
+struct resource pmem_res = {
+ .name = "Persistent Memory (legacy)",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_MEM,
+ .desc = IORES_DESC_PERSISTENT_MEMORY_LEGACY
+};
+#endif
+
#ifndef CONFIG_KEXEC_CORE
static void __init reserve_crashkernel(void)
{
@@ -236,6 +248,88 @@ static void __init fdt_enforce_memory_region(void)
memblock_add(usable_rgns[1].base, usable_rgns[1].size);
}
+/*
+ * Validate a candidate range before it is recorded for reservation.
+ *
+ * The checks run in this order: the range must be memblock memory, must not
+ * overlap a reserved memblock region, and mem_start must be 2MB aligned.
+ * The first failing check logs a warning.
+ *
+ * Returns 0 when every check passes, -EINVAL otherwise.
+ */
+static int __init is_mem_valid(unsigned long long mem_size, unsigned long long mem_start)
+{
+	int err = -EINVAL;
+
+	if (!memblock_is_region_memory(mem_start, mem_size))
+		pr_warn("cannot reserve mem: region is not memory!\n");
+	else if (memblock_is_region_reserved(mem_start, mem_size))
+		pr_warn("cannot reserve mem: region overlaps reserved memory!\n");
+	else if (!IS_ALIGNED(mem_start, SZ_2M))
+		pr_warn("cannot reserve mem: base address is not 2MB aligned!\n");
+	else
+		err = 0;
+
+	return err;
+}
+
+/*
+ * Parse one comma-separated element of the memmap= option.
+ *
+ * Only the "nn[KMG]!ss[KMG]" form is acted on: the size is page aligned,
+ * the range is checked with is_mem_valid() and, if accepted, stored in
+ * pmem_start/pmem_size. Any other form only logs a message.
+ *
+ * Returns 0 when the element was consumed completely, -EINVAL when @p is
+ * NULL, the size or start address is missing, the size is zero, the range
+ * is rejected, or trailing characters remain.
+ */
+static int __init parse_memmap_one(char *p)
+{
+	char *oldp;
+	phys_addr_t start_at, mem_size;
+
+	if (!p)
+		return -EINVAL;
+
+	oldp = p;
+	mem_size = memparse(p, &p);
+	if (p == oldp)
+		return -EINVAL;
+
+	if (!mem_size)
+		return -EINVAL;
+
+	mem_size = PAGE_ALIGN(mem_size);
+
+	if (*p == '!') {
+		oldp = p + 1;
+		start_at = memparse(oldp, &p);
+		/* "nn!" without an address must not be taken as start 0 */
+		if (p == oldp)
+			return -EINVAL;
+
+		if (is_mem_valid(mem_size, start_at) != 0)
+			return -EINVAL;
+
+		pr_info("pmem reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+			start_at, start_at + mem_size, mem_size >> 20);
+		pmem_start = start_at;
+		pmem_size = mem_size;
+	} else {
+		pr_info("Unrecognized memmap option, please check the parameter.\n");
+	}
+
+	return *p == '\0' ? 0 : -EINVAL;
+}
+
+/*
+ * early_param handler for "memmap": split @str at each ',' (overwriting the
+ * comma with a terminator) and pass every piece to parse_memmap_one().
+ * Results of the individual pieces are ignored; always returns 0.
+ */
+static int __init parse_memmap_opt(char *str)
+{
+	char *next;
+
+	for (; str; str = next) {
+		next = strchr(str, ',');
+		if (next)
+			*next++ = 0;
+
+		parse_memmap_one(str);
+	}
+
+	return 0;
+}
+early_param("memmap", parse_memmap_opt);
+
+#ifdef CONFIG_ARM64_PMEM_RESERVE
+/*
+ * Remove the range recorded by the memmap= parser from memblock and
+ * publish it through pmem_res. Does nothing when no range was recorded.
+ */
+static void __init reserve_pmem(void)
+{
+	/*
+	 * Without an accepted memmap=nn!ss option pmem_size is still 0;
+	 * bail out instead of logging an empty range and setting
+	 * pmem_res.end to 0 - 1.
+	 */
+	if (!pmem_size)
+		return;
+
+	memblock_remove(pmem_start, pmem_size);
+	pr_info("pmem reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+		pmem_start, pmem_start + pmem_size, pmem_size >> 20);
+	pmem_res.start = pmem_start;
+	pmem_res.end = pmem_start + pmem_size - 1;
+}
+#endif
+
void __init arm64_memblock_init(void)
{
const s64 linear_region_size = BIT(vabits_actual - 1);
@@ -359,6 +453,10 @@ void __init arm64_memblock_init(void)
reserve_elfcorehdr();
+#ifdef CONFIG_ARM64_PMEM_RESERVE
+ reserve_pmem();
+#endif
+
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
dma_contiguous_reserve(arm64_dma32_phys_limit);
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 29203f3d3..b97760e9f 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
obj-$(CONFIG_ND_BTT) += nd_btt.o
obj-$(CONFIG_ND_BLK) += nd_blk.o
obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
+obj-$(CONFIG_ARM64_PMEM_LEGACY) += nd_e820.o
obj-$(CONFIG_OF_PMEM) += of_pmem.o
obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd5c31372..a5e50495e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -45,6 +45,10 @@ extern int sysctl_page_lock_unfairness;
void init_mm_internals(void);
+#ifdef CONFIG_ARM64_PMEM_RESERVE
+extern struct resource pmem_res;
+#endif
+
#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;
--
2.19.1
2
1

[PATCH openEuler-21.03 2/2] pid: add pid reserve method for checkpoint and recover
by hejingxian 02 Mar '21
by hejingxian 02 Mar '21
02 Mar '21
From: Jingxian He <hejingxian(a)huawei.com>
Date: Mon, 1 Mar 2021 17:44:59 +0800
Subject: [PATCH openEuler-21.03 2/2] pid: add pid reserve method for checkpoint and recover
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
We record the pids of dumped tasks in the reserved memory,
and reserve those pids before the init task starts.
In the recover process, the reserved pids are freed and reallocated for use.
Signed-off-by: Jingxian He <hejingxian(a)huawei.com>
Reviewed-by: Wenliang He <hewenliang4(a)huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng(a)huawei.com>
---
arch/arm64/configs/openeuler_defconfig | 1 +
include/linux/pin_mem.h | 6 ++++
kernel/pid.c | 10 +++++++
mm/Kconfig | 10 +++++++
mm/pin_mem.c | 51 ++++++++++++++++++++++++++++++++++
5 files changed, 78 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 76fda68..de6db02 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1037,6 +1037,7 @@ CONFIG_FRAME_VECTOR=y
# CONFIG_READ_ONLY_THP_FOR_FS is not set
CONFIG_ARCH_HAS_PTE_SPECIAL=y
CONFIG_PIN_MEMORY=y
+CONFIG_PID_RESERVE=y
# end of Memory Management options
CONFIG_NET=y
diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h
index bc8b03e..a9fe2ef 100644
--- a/include/linux/pin_mem.h
+++ b/include/linux/pin_mem.h
@@ -74,5 +74,11 @@ extern struct resource pin_memory_resource;
#endif
extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size);
+#ifdef CONFIG_PID_RESERVE
+extern bool is_need_reserve_pids(void);
+extern void free_reserved_pid(struct idr *idr, int pid);
+extern void reserve_pids(struct idr *idr, int pid_max);
+#endif
+
#endif /* CONFIG_PIN_MEMORY */
#endif /* _LINUX_PIN_MEMORY_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 4856818..32ab9ef 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -44,6 +44,9 @@
#include <linux/idr.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
+#ifdef CONFIG_PID_RESERVE
+#include <linux/pin_mem.h>
+#endif
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
@@ -209,6 +212,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
spin_lock_irq(&pidmap_lock);
if (tid) {
+#ifdef CONFIG_PID_RESERVE
+ free_reserved_pid(&tmp->idr, tid);
+#endif
nr = idr_alloc(&tmp->idr, NULL, tid,
tid + 1, GFP_ATOMIC);
/*
@@ -621,6 +627,10 @@ void __init pid_idr_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+#ifdef CONFIG_PID_RESERVE
+ if (is_need_reserve_pids())
+ reserve_pids(&init_pid_ns.idr, pid_max);
+#endif
}
static struct file *__pidfd_fget(struct task_struct *task, int fd)
diff --git a/mm/Kconfig b/mm/Kconfig
index 930dc13..e27d2c6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -868,3 +868,13 @@ config PIN_MEMORY
the corresponding physical pages mapping info in checkpoint,
and remap the physical pages to restore tasks in restore.
endmenu
+
+config PID_RESERVE
+ bool "Support for reserve pid"
+ depends on PIN_MEMORY
+ help
+ Say y here to enable the pid reserve feature for checkpoint
+ and restore.
+ We record the pids of dumped tasks in the reserved memory,
+ and reserve the pids before the init task starts. In the restore
+ process, the reserved pids are freed and reallocated for use.
diff --git a/mm/pin_mem.c b/mm/pin_mem.c
index 0a143b6..a040853 100644
--- a/mm/pin_mem.c
+++ b/mm/pin_mem.c
@@ -947,4 +947,55 @@ void clear_pin_memory_record(void)
}
EXPORT_SYMBOL_GPL(clear_pin_memory_record);
+#ifdef CONFIG_PID_RESERVE
+struct idr *reserve_idr;
+
+/* Report whether any pin memory pids are recorded (pin_pid_num > 0). */
+bool is_need_reserve_pids(void)
+{
+	if (pin_pid_num > 0)
+		return true;
+	return false;
+}
+
+/*
+ * Release @pid from @idr if it was reserved earlier.
+ *
+ * Only acts when @idr is the idr remembered by reserve_pids() and
+ * max_pin_pid_num is non-zero. The first recorded entry whose pid matches
+ * and whose pid_reserved flag is set causes the id to be removed.
+ */
+void free_reserved_pid(struct idr *idr, int pid)
+{
+	struct page_map_info *entry;
+	unsigned int i;
+
+	if (!max_pin_pid_num || idr != reserve_idr)
+		return;
+
+	for (i = 0; i < pin_pid_num; i++) {
+		entry = &user_space_reserve_start[i];
+		if (entry->pid != pid || !entry->pid_reserved)
+			continue;
+
+		idr_remove(idr, pid);
+		break;
+	}
+}
+
+/*
+ * Reserve pids for checkpointed tasks which pinned memory.
+ *
+ * Remembers @idr in reserve_idr, then tries to allocate each recorded pid
+ * in @idr. An entry keeps pid_reserved set only when exactly its own pid
+ * was obtained; any other id that was handed out is released again and a
+ * warning is logged.
+ */
+void reserve_pids(struct idr *idr, int pid_max)
+{
+	int alloc_pid;
+	unsigned int index;
+	struct page_map_info *pmi;
+
+	if (!max_pin_pid_num)
+		return;
+	reserve_idr = idr;
+	for (index = 0; index < pin_pid_num; index++) {
+		pmi = &(user_space_reserve_start[index]);
+		pmi->pid_reserved = true;
+		alloc_pid = idr_alloc(idr, NULL, pmi->pid, pid_max, GFP_ATOMIC);
+		if (alloc_pid != pmi->pid) {
+			if (alloc_pid > 0)
+				idr_remove(idr, alloc_pid);
+			/* wanted pid first, then what idr_alloc() returned */
+			pr_warn("Reserve pid (%d) fail, real pid is %d.\n", pmi->pid, alloc_pid);
+			pmi->pid_reserved = false;
+		}
+	}
+}
+#endif /* CONFIG_PID_RESERVE */
+
#endif /* CONFIG_PIN_MEMORY */
--
2.9.5
1
0

[PATCH openEuler-21.03 1/2] mm: add pin memory method for checkpoint add restore
by hejingxian@huawei.com 02 Mar '21
by hejingxian@huawei.com 02 Mar '21
02 Mar '21
From: Jingxian He <hejingxian(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
We can use the checkpoint and restore in userspace (CRIU) method to
dump and restore tasks when updating the kernel.
Currently, CRIU needs to dump all memory data of tasks to files.
When the memory size is very large (larger than 1G),
dumping the data takes a very long time (more than 1 min).
By pinning the memory data of tasks and collecting the corresponding
physical page mapping info in the checkpoint process,
we can remap the physical pages to the restored tasks after
upgrading the kernel. This pin memory method can
restore the task data within one second.
The pin memory area info is saved in the reserved memblock,
which remains usable during the kernel update process.
The pin memory driver provides the following ioctl commands for CRIU:
1) SET_PIN_MEM_AREA:
Set the pin memory area, which can be remapped to the restore task.
2) CLEAR_PIN_MEM_AREA:
Clear the pin memory area info,
which enables the user to reset the pin data.
3) REMAP_PIN_MEM_AREA:
Remap the pages of the pin memory to the restore task.
Signed-off-by: Jingxian He <hejingxian(a)huawei.com>
Reviewed-by: Wenliang He <hewenliang4(a)huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng(a)huawei.com>
---
arch/arm64/configs/openeuler_defconfig | 2 +
arch/arm64/kernel/setup.c | 9 +
arch/arm64/mm/init.c | 60 +++
drivers/char/Kconfig | 6 +
drivers/char/Makefile | 1 +
drivers/char/pin_memory.c | 208 ++++++++
include/linux/crash_core.h | 5 +
include/linux/pin_mem.h | 78 +++
kernel/crash_core.c | 11 +
mm/Kconfig | 8 +
mm/Makefile | 1 +
mm/huge_memory.c | 61 +++
mm/memory.c | 59 ++
mm/pin_mem.c | 950 +++++++++++++++++++++++++++++++++
14 files changed, 1459 insertions(+)
create mode 100644 drivers/char/pin_memory.c
create mode 100644 include/linux/pin_mem.h
create mode 100644 mm/pin_mem.c
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index c5271e7..76fda68 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1036,6 +1036,7 @@ CONFIG_FRAME_VECTOR=y
# CONFIG_GUP_BENCHMARK is not set
# CONFIG_READ_ONLY_THP_FOR_FS is not set
CONFIG_ARCH_HAS_PTE_SPECIAL=y
+CONFIG_PIN_MEMORY=y
# end of Memory Management options
CONFIG_NET=y
@@ -3282,6 +3283,7 @@ CONFIG_TCG_TIS_ST33ZP24_SPI=y
# CONFIG_RANDOM_TRUST_CPU is not set
# CONFIG_RANDOM_TRUST_BOOTLOADER is not set
+CONFIG_PIN_MEMORY_DEV=m
#
# I2C support
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index c1f1fb9..5e282d3 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -50,6 +50,9 @@
#include <asm/efi.h>
#include <asm/xen/hypervisor.h>
#include <asm/mmu_context.h>
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_mem.h>
+#endif
static int num_standard_resources;
static struct resource *standard_resources;
@@ -260,6 +263,12 @@ static void __init request_standard_resources(void)
quick_kexec_res.end <= res->end)
request_resource(res, &quick_kexec_res);
#endif
+#ifdef CONFIG_PIN_MEMORY
+ if (pin_memory_resource.end &&
+ pin_memory_resource.start >= res->start &&
+ pin_memory_resource.end <= res->end)
+ request_resource(res, &pin_memory_resource);
+#endif
}
}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index f3e5a66..8ab5aac 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -42,6 +42,9 @@
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/alternative.h>
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_mem.h>
+#endif
#define ARM64_ZONE_DMA_BITS 30
@@ -78,6 +81,55 @@ static void __init reserve_crashkernel(void)
*/
#define MAX_USABLE_RANGES 2
+#ifdef CONFIG_PIN_MEMORY
+/*
+ * Resource describing the reserved pin memory range.
+ * start and end are zero here; reserve_pin_memory_res() fills them in once
+ * a range from the command line has been accepted and reserved.
+ */
+struct resource pin_memory_resource = {
+ .name = "Pin memory",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_MEM,
+ .desc = IORES_DESC_RESERVED
+};
+
+/*
+ * Reserve the pin memory range requested on the kernel command line.
+ *
+ * The range comes from parse_pin_memory(); its length is page aligned.
+ * It is reserved in memblock and recorded in pin_memory_resource only if
+ * it is memblock memory, does not overlap a reserved region and its base
+ * is 2MB aligned. Otherwise a warning is logged and nothing is reserved.
+ */
+static void __init reserve_pin_memory_res(void)
+{
+	unsigned long long base, len;
+
+	if (parse_pin_memory(boot_command_line, memblock_phys_mem_size(),
+			     &len, &base) || !len)
+		return;
+
+	len = PAGE_ALIGN(len);
+
+	if (!memblock_is_region_memory(base, len)) {
+		pr_warn("cannot reserve for pin memory: region is not memory!\n");
+	} else if (memblock_is_region_reserved(base, len)) {
+		pr_warn("cannot reserve for pin memory: region overlaps reserved memory!\n");
+	} else if (!IS_ALIGNED(base, SZ_2M)) {
+		pr_warn("cannot reserve for pin memory: base address is not 2MB aligned\n");
+	} else {
+		memblock_reserve(base, len);
+		pr_debug("pin memory resource reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+			 base, base + len, len >> 20);
+
+		pin_memory_resource.start = base;
+		pin_memory_resource.end = base + len - 1;
+	}
+}
+#else
+static void __init reserve_pin_memory_res(void)
+{
+}
+#endif /* CONFIG_PIN_MEMORY */
+
#ifdef CONFIG_CRASH_DUMP
static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
const char *uname, int depth, void *data)
@@ -455,6 +507,8 @@ void __init arm64_memblock_init(void)
reserve_park_mem();
#endif
+ reserve_pin_memory_res();
+
reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
@@ -583,6 +637,12 @@ void __init mem_init(void)
/* this will put all unused low memory onto the freelists */
memblock_free_all();
+#ifdef CONFIG_PIN_MEMORY
+ /* pre alloc the pages for pin memory */
+ init_reserve_page_map((unsigned long)pin_memory_resource.start,
+ (unsigned long)(pin_memory_resource.end - pin_memory_resource.start + 1));
+#endif
+
mem_init_print_info(NULL);
/*
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index d229a2d..fbb94b8 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -496,3 +496,9 @@ config RANDOM_TRUST_BOOTLOADER
booloader is trustworthy so it will be added to the kernel's entropy
pool. Otherwise, say N here so it will be regarded as device input that
only mixes the entropy pool.
+
+config PIN_MEMORY_DEV
+	tristate "/dev/pinmem character device"
+	depends on PIN_MEMORY
+	default m
+	help
+	  Pin memory driver. Provides the /dev/pinmem misc device whose
+	  ioctl interface is used to set, clear and remap pin memory areas.
+
+	  The symbol is tristate so that it can be built as a module, which
+	  is what "default m" and the openEuler defconfig ask for.
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index ffce287..71d76fd 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -47,3 +47,4 @@ obj-$(CONFIG_PS3_FLASH) += ps3flash.o
obj-$(CONFIG_XILLYBUS) += xillybus/
obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o
obj-$(CONFIG_ADI) += adi.o
+obj-$(CONFIG_PIN_MEMORY_DEV) += pin_memory.o
diff --git a/drivers/char/pin_memory.c b/drivers/char/pin_memory.c
new file mode 100644
index 0000000..f46e056
--- /dev/null
+++ b/drivers/char/pin_memory.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright @ Huawei Technologies Co., Ltd. 2020-2020. ALL rights reserved.
+ * Description: Euler pin memory driver
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm_types.h>
+#include <linux/processor.h>
+#include <uapi/asm-generic/ioctl.h>
+#include <uapi/asm-generic/mman-common.h>
+#include <uapi/asm/setup.h>
+#include <linux/pin_mem.h>
+#include <linux/sched/mm.h>
+
+#define MAX_PIN_MEM_AREA_NUM 16
+/*
+ * One virtual address range to pin. set_pin_mem() passes virt_start and
+ * virt_end to pin_mem_area() as the start and end address.
+ */
+struct _pin_mem_area {
+ unsigned long virt_start;
+ unsigned long virt_end;
+};
+
+/*
+ * Argument of the SET_PIN_MEM_AREA ioctl, copied in from user space by
+ * set_pin_mem_area().
+ */
+struct pin_mem_area_set {
+ /* pid of the task whose memory is to be pinned */
+ unsigned int pid;
+ /* number of valid entries in mem_area; at most MAX_PIN_MEM_AREA_NUM */
+ unsigned int area_num;
+ struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM];
+};
+
+/* ioctl type value; pin_memory_ioctl() rejects commands with another type */
+#define PIN_MEM_MAGIC 0x59
+/* ioctl command numbers */
+#define _SET_PIN_MEM_AREA 1
+#define _CLEAR_PIN_MEM_AREA 2
+#define _REMAP_PIN_MEM_AREA 3
+#define _FINISH_PIN_MEM_DUMP 4
+/* highest command number accepted by pin_memory_ioctl() */
+#define _PIN_MEM_IOC_MAX_NR 4
+/* encoded ioctl commands dispatched by pin_memory_ioctl() */
+#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set)
+#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int)
+#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int)
+#define FINISH_PIN_MEM_DUMP _IOW(PIN_MEM_MAGIC, _FINISH_PIN_MEM_DUMP, int)
+/*
+ * Pin every area listed in @pmas for the task identified by pmas->pid.
+ *
+ * Returns 0 on success. Returns -EFAULT when the pid or task cannot be
+ * found, the task has no mm, or pin_mem_area() fails for any area.
+ *
+ * NOTE(review): mmput() and possibly pin_mem_area() may sleep, yet both
+ * run inside the rcu_read_lock() section - confirm this is safe.
+ */
+static int set_pin_mem(struct pin_mem_area_set *pmas)
+{
+	unsigned int i;
+	int ret = 0;
+	struct _pin_mem_area *pma;
+	struct mm_struct *mm;
+	struct task_struct *task;
+	struct pid *pid_s;
+
+	pid_s = find_get_pid(pmas->pid);
+	if (!pid_s) {
+		pr_warn("Get pid struct fail:%d.\n", pmas->pid);
+		return -EFAULT;
+	}
+	rcu_read_lock();
+	task = pid_task(pid_s, PIDTYPE_PID);
+	if (!task) {
+		pr_warn("Get task struct fail:%d.\n", pmas->pid);
+		goto fail;
+	}
+	mm = get_task_mm(task);
+	/* get_task_mm() may return NULL; do not hand that to pin_mem_area()/mmput() */
+	if (!mm) {
+		pr_warn("Get mm struct fail:%d.\n", pmas->pid);
+		goto fail;
+	}
+	for (i = 0; i < pmas->area_num; i++) {
+		pma = &(pmas->mem_area[i]);
+		ret = pin_mem_area(task, mm, pma->virt_start, pma->virt_end);
+		if (ret) {
+			mmput(mm);
+			goto fail;
+		}
+	}
+	mmput(mm);
+	rcu_read_unlock();
+	put_pid(pid_s);
+	return ret;
+
+fail:
+	rcu_read_unlock();
+	put_pid(pid_s);
+	return -EFAULT;
+}
+
+/*
+ * Handle SET_PIN_MEM_AREA: copy a struct pin_mem_area_set from the user
+ * pointer in @arg, bound-check area_num and hand it to set_pin_mem().
+ *
+ * Returns -EFAULT when access_ok() rejects the pointer, -EINVAL when the
+ * copy fails or area_num exceeds MAX_PIN_MEM_AREA_NUM, otherwise the
+ * result of set_pin_mem().
+ */
+static int set_pin_mem_area(unsigned long arg)
+{
+	void __user *uptr = (void __user *)arg;
+	struct pin_mem_area_set req;
+
+	if (!access_ok(uptr, sizeof(req)))
+		return -EFAULT;
+	if (copy_from_user(&req, uptr, sizeof(req)) != 0)
+		return -EINVAL;
+	if (req.area_num <= MAX_PIN_MEM_AREA_NUM)
+		return set_pin_mem(&req);
+
+	pr_warn("Input area_num is too large.\n");
+	return -EINVAL;
+}
+
+/*
+ * Handle REMAP_PIN_MEM_AREA: read a pid from the user pointer in @arg and
+ * call do_mem_remap() for that task's mm.
+ *
+ * Returns 0 on success, -EINVAL when the argument cannot be read or the
+ * pid is unknown, -EFAULT when the task cannot be found, the task has no
+ * mm, or do_mem_remap() fails.
+ *
+ * NOTE(review): mmput() and possibly do_mem_remap() may sleep, yet both
+ * run inside the rcu_read_lock() section - confirm this is safe.
+ */
+static int pin_mem_remap(unsigned long arg)
+{
+	int pid;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	vm_fault_t ret;
+	void __user *buf = (void __user *)arg;
+	struct pid *pid_s;
+
+	if (!access_ok(buf, sizeof(int)))
+		return -EINVAL;
+	if (copy_from_user(&pid, buf, sizeof(int)))
+		return -EINVAL;
+
+	pid_s = find_get_pid(pid);
+	if (!pid_s) {
+		pr_warn("Get pid struct fail:%d.\n", pid);
+		return -EINVAL;
+	}
+	rcu_read_lock();
+	task = pid_task(pid_s, PIDTYPE_PID);
+	if (!task) {
+		pr_warn("Get task struct fail:%d.\n", pid);
+		goto fault;
+	}
+	mm = get_task_mm(task);
+	/* get_task_mm() may return NULL; do not hand that to do_mem_remap()/mmput() */
+	if (!mm) {
+		pr_warn("Get mm struct fail:%d.\n", pid);
+		goto fault;
+	}
+	ret = do_mem_remap(pid, mm);
+	if (ret) {
+		pr_warn("Handle pin memory remap fail.\n");
+		mmput(mm);
+		goto fault;
+	}
+	mmput(mm);
+	rcu_read_unlock();
+	put_pid(pid_s);
+	return 0;
+
+fault:
+	rcu_read_unlock();
+	put_pid(pid_s);
+	return -EFAULT;
+}
+
+/*
+ * ioctl entry point of the pinmem device.
+ *
+ * Commands whose type is not PIN_MEM_MAGIC or whose number exceeds
+ * _PIN_MEM_IOC_MAX_NR are rejected with -EINVAL, as is any command not
+ * listed below. CLEAR_PIN_MEM_AREA always returns 0; the other commands
+ * return the result of their handler.
+ */
+static long pin_memory_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	if (_IOC_TYPE(cmd) != PIN_MEM_MAGIC || _IOC_NR(cmd) > _PIN_MEM_IOC_MAX_NR)
+		return -EINVAL;
+
+	switch (cmd) {
+	case SET_PIN_MEM_AREA:
+		return set_pin_mem_area(arg);
+	case CLEAR_PIN_MEM_AREA:
+		clear_pin_memory_record();
+		return 0;
+	case REMAP_PIN_MEM_AREA:
+		return pin_mem_remap(arg);
+	case FINISH_PIN_MEM_DUMP:
+		return finish_pin_mem_dump();
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct file_operations pin_memory_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = pin_memory_ioctl,
+ .compat_ioctl = pin_memory_ioctl,
+};
+
+static struct miscdevice pin_memory_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "pinmem",
+ .fops = &pin_memory_fops,
+};
+
+/*
+ * Module init: register the pinmem misc device and log the outcome.
+ * Returns the result of misc_register().
+ */
+static int __init pin_memory_init(void)
+{
+	int err = misc_register(&pin_memory_miscdev);
+
+	if (!err)
+		pr_info("pin_memory init\n");
+	else
+		pr_warn("pin_memory init failed!\n");
+	return err;
+}
+
+/* Module exit: unregister the pinmem misc device. */
+static void __exit pin_memory_exit(void)
+{
+	misc_deregister(&pin_memory_miscdev);
+	pr_info("pin_memory ko exits!\n");
+}
+
+module_init(pin_memory_init);
+module_exit(pin_memory_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Euler");
+MODULE_DESCRIPTION("pin memory");
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index fc0ef33..30f0df3 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -87,4 +87,9 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
unsigned long long *crash_size, unsigned long long *crash_base);
+#ifdef CONFIG_PIN_MEMORY
+int __init parse_pin_memory(char *cmdline, unsigned long long system_ram,
+ unsigned long long *pin_size, unsigned long long *pin_base);
+#endif
+
#endif /* LINUX_CRASH_CORE_H */
diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h
new file mode 100644
index 0000000..bc8b03e
--- /dev/null
+++ b/include/linux/pin_mem.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+ * Provide the pin memory method for check point and restore task.
+ */
+#ifndef _LINUX_PIN_MEMORY_H
+#define _LINUX_PIN_MEMORY_H
+
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/errno.h>
+#include <linux/mm_types.h>
+#include <linux/err.h>
+#ifdef CONFIG_ARM64
+#include <linux/ioport.h>
+#endif
+
+#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
+
+#define COLLECT_PAGES_FINISH 0
+#define COLLECT_PAGES_NEED_CONTINUE 1
+#define COLLECT_PAGES_FAIL -1
+
+#define COMPOUND_PAD_MASK 0xffffffff
+#define COMPOUND_PAD_START 0x88
+#define COMPOUND_PAD_DELTA 0x40
+#define LIST_POISON4 0xdead000000000400
+#define PAGE_FLAGS_CHECK_RESERVED (1UL << PG_reserved)
+#define SHA256_DIGEST_SIZE 32
+/*
+ * Address just past @pme: the entry header followed by pme->nr_pages
+ * unsigned long slots. The argument is parenthesized so that an expression
+ * argument expands correctly.
+ */
+#define next_pme(pme) ((unsigned long *)((pme) + 1) + (pme)->nr_pages)
+#define PIN_MEM_DUMP_MAGIC 0xfeab000000001acd
+/*
+ * Variable-length record with a trailing array. next_pme() steps over the
+ * header plus nr_pages unsigned long slots to reach whatever follows.
+ */
+struct page_map_entry {
+ unsigned long virt_addr;
+ /* number of unsigned long slots next_pme() skips after the header */
+ unsigned int nr_pages;
+ unsigned int is_huge_page;
+ unsigned long redirect_start;
+ /* presumably one physical address per page - TODO confirm */
+ unsigned long phy_addr_array[0];
+};
+
+/* Per-pid record; pme points at this pid's page_map_entry data. */
+struct page_map_info {
+ int pid;
+ /* NOTE(review): looks like a boolean "pid is reserved" flag - confirm */
+ int pid_reserved;
+ /* presumably the number of entries reachable through pme - TODO confirm */
+ unsigned int entry_num;
+ int disable_free_page;
+ struct page_map_entry *pme;
+};
+
+/* Header followed by a trailing array of page_map_info records. */
+struct pin_mem_dump_info {
+ /* SHA256_DIGEST_SIZE bytes; what is hashed is not visible here */
+ char sha_digest[SHA256_DIGEST_SIZE];
+ /* presumably compared against PIN_MEM_DUMP_MAGIC - TODO confirm */
+ unsigned long magic;
+ /* presumably the number of entries in pmi_array - TODO confirm */
+ unsigned int pin_pid_num;
+ struct page_map_info pmi_array[0];
+};
+
+/* Count followed by a trailing array of unsigned int indices. */
+struct redirect_info {
+ /* presumably the number of entries in redirect_index - TODO confirm */
+ unsigned int redirect_pages;
+ unsigned int redirect_index[0];
+};
+
+extern struct page_map_info *get_page_map_info(int pid);
+extern struct page_map_info *create_page_map_info(int pid);
+extern vm_fault_t do_mem_remap(int pid, struct mm_struct *mm);
+extern vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page);
+extern void clear_pin_memory_record(void);
+extern int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
+ unsigned long start_addr, unsigned long end_addr);
+extern vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page);
+extern int finish_pin_mem_dump(void);
+
+/* reserve space for pin memory*/
+#ifdef CONFIG_ARM64
+extern struct resource pin_memory_resource;
+#endif
+extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size);
+
+#endif /* CONFIG_PIN_MEMORY */
+#endif /* _LINUX_PIN_MEMORY_H */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index bfed474..2407de3 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -450,6 +450,17 @@ void __init reserve_crashkernel(void)
}
#endif /* CONFIG_ARCH_WANT_RESERVE_CRASH_KERNEL */
+#ifdef CONFIG_PIN_MEMORY
+/*
+ * Parse the "pinmemory=" option out of @cmdline by delegating to
+ * __parse_crashkernel() with that option name and no suffix.
+ * Returns whatever __parse_crashkernel() returns; @pin_size and @pin_base
+ * are passed straight through to it.
+ */
+int __init parse_pin_memory(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *pin_size,
+			     unsigned long long *pin_base)
+{
+	int rc;
+
+	rc = __parse_crashkernel(cmdline, system_ram, pin_size, pin_base,
+				 "pinmemory=", NULL);
+	return rc;
+}
+#endif
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
diff --git a/mm/Kconfig b/mm/Kconfig
index 390165f..930dc13 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -859,4 +859,12 @@ config ARCH_HAS_HUGEPD
config MAPPING_DIRTY_HELPERS
bool
+config PIN_MEMORY
+ bool "Support for pin memory"
+ depends on CHECKPOINT_RESTORE
+ help
+ Say y here to enable the pin memory feature for checkpoint
+ and restore. We can pin the memory data of tasks and collect
+ the corresponding physical pages mapping info in checkpoint,
+ and remap the physical pages to restore tasks in restore.
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index d73aed0..4963827 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -120,3 +120,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_PIN_MEMORY) += pin_mem.o
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0bc4a2c..8a11d30 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2996,3 +2996,64 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
update_mmu_cache_pmd(vma, address, pvmw->pmd);
}
#endif
+
+#ifdef CONFIG_PIN_MEMORY
+/*
+ * Install the caller-supplied @page as an anonymous huge pmd mapping for
+ * @address in @vma.
+ *
+ * Returns 0 when the mapping was installed, VM_FAULT_OOM when
+ * anon_vma_prepare(), khugepaged_enter() or pte_alloc_one() fails,
+ * VM_FAULT_FALLBACK when the memcg charge fails, or the non-zero result of
+ * check_stable_address_space().
+ *
+ * NOTE(review): the first two VM_FAULT_OOM returns do not put_page(),
+ * while every later failure path does - confirm the caller expects this.
+ * NOTE(review): when *pmd is already populated the page is released and
+ * ret is still 0, so 0 is returned - confirm this is intended.
+ */
+vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page)
+{
+ gfp_t gfp;
+ pgtable_t pgtable;
+ spinlock_t *ptl;
+ pmd_t entry;
+ vm_fault_t ret = 0;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
+ return VM_FAULT_OOM;
+ gfp = alloc_hugepage_direct_gfpmask(vma);
+ prep_transhuge_page(page);
+ if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
+ /* charge failed: drop the page and account a THP fallback */
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_vm_event(THP_FAULT_FALLBACK_CHARGE);
+ return VM_FAULT_FALLBACK;
+ }
+ cgroup_throttle_swaprate(page, gfp);
+
+ /* page table that is deposited alongside the huge pmd further down */
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable)) {
+ ret = VM_FAULT_OOM;
+ goto release;
+ }
+ __SetPageUptodate(page);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (unlikely(!pmd_none(*pmd))) {
+ /* something is already mapped here; ret is still 0 at this point */
+ goto unlock_release;
+ } else {
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock_release;
+ entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, address, true);
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable);
+ set_pmd_at(vma->vm_mm, address, pmd, entry);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ mm_inc_nr_ptes(vma->vm_mm);
+ spin_unlock(ptl);
+ count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+ }
+
+ return 0;
+unlock_release:
+ spin_unlock(ptl);
+release:
+ /* pgtable is NULL when we arrive here from the pte_alloc_one() failure */
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
+ put_page(page);
+ return ret;
+}
+#endif
diff --git a/mm/memory.c b/mm/memory.c
index 50632c4..7b7f1a7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5248,3 +5248,62 @@ void ptlock_free(struct page *page)
kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
+
+#ifdef CONFIG_PIN_MEMORY
+vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page)
+{
+ pte_t entry;
+ spinlock_t *ptl;
+ pte_t *pte;
+ vm_fault_t ret = 0;
+
+ if (pte_alloc(vma->vm_mm, pmd))
+ return VM_FAULT_OOM;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmd)))
+ return 0;
+
+ /* Allocate our own private page. */
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ goto oom_free_page;
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
+
+ __SetPageUptodate(page);
+
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address,
+ &ptl);
+ if (!pte_none(*pte)) {
+ ret = VM_FAULT_FALLBACK;
+ goto release;
+ }
+
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address, false);
+ lru_cache_add_inactive_or_unevictable(page, vma);
+
+ set_pte_at(vma->vm_mm, address, pte, entry);
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, pte);
+unlock:
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+release:
+ put_page(page);
+ goto unlock;
+oom_free_page:
+ put_page(page);
+oom:
+ return VM_FAULT_OOM;
+}
+#endif
diff --git a/mm/pin_mem.c b/mm/pin_mem.c
new file mode 100644
index 0000000..0a143b6
--- /dev/null
+++ b/mm/pin_mem.c
@@ -0,0 +1,950 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+ * Provide the pin memory method for check point and restore task.
+ */
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sched/cputime.h>
+#include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/pin_mem.h>
+#include <linux/idr.h>
+#include <linux/page-isolation.h>
+#include <linux/sched/mm.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <crypto/sha.h>
+
+#define MAX_PIN_PID_NUM 128
+static DEFINE_SPINLOCK(page_map_entry_lock);
+
+struct pin_mem_dump_info *pin_mem_dump_start;
+unsigned int pin_pid_num;
+static unsigned int *pin_pid_num_addr;
+static unsigned long __page_map_entry_start;
+static unsigned long page_map_entry_end;
+static struct page_map_info *user_space_reserve_start;
+static struct page_map_entry *page_map_entry_start;
+unsigned int max_pin_pid_num __read_mostly;
+unsigned long redirect_space_size;
+unsigned long redirect_space_start;
+#define DEFAULT_REDIRECT_SPACE_SIZE 0x100000
+
+static int __init setup_max_pin_pid_num(char *str)
+{
+ int ret = 0;
+
+ if (!str)
+ goto out;
+
+ ret = kstrtouint(str, 10, &max_pin_pid_num);
+out:
+ if (ret) {
+ pr_warn("Unable to parse max pin pid num.\n");
+ } else {
+ if (max_pin_pid_num > MAX_PIN_PID_NUM) {
+ max_pin_pid_num = 0;
+ pr_warn("Input max_pin_pid_num is too large.\n");
+ }
+ }
+ return ret;
+}
+early_param("max_pin_pid_num", setup_max_pin_pid_num);
+
+static int __init setup_redirect_space_size(char *str)
+{
+ if (!str)
+ goto out;
+
+ redirect_space_size = memparse(str, NULL);
+out:
+ if (!redirect_space_size) {
+ pr_warn("Unable to parse redirect space size, use the default value.\n");
+ redirect_space_size = DEFAULT_REDIRECT_SPACE_SIZE;
+ }
+ return 0;
+}
+early_param("redirect_space_size", setup_redirect_space_size);
+
+/*
+ * Claim the next unused page_map_info slot of the reserved area for @pid.
+ *
+ * Returns the initialised slot, or NULL when the reserved area has not been
+ * set up or when max_pin_pid_num slots are already in use. On success both
+ * pin_pid_num and the counter stored behind pin_pid_num_addr are incremented.
+ */
+struct page_map_info *create_page_map_info(int pid)
+{
+	struct page_map_info *new;
+
+	if (!user_space_reserve_start)
+		return NULL;
+
+	if (pin_pid_num >= max_pin_pid_num) {
+		/* Terminate the message so it is not merged with the next log line. */
+		pr_warn("Pin pid num too large than max_pin_pid_num, fail create: %d!\n", pid);
+		return NULL;
+	}
+	new = (struct page_map_info *)(user_space_reserve_start + pin_pid_num);
+	new->pid = pid;
+	new->pme = NULL;
+	new->entry_num = 0;
+	new->pid_reserved = false;
+	new->disable_free_page = false;
+	(*pin_pid_num_addr)++;
+	pin_pid_num++;
+	return new;
+}
+EXPORT_SYMBOL_GPL(create_page_map_info);
+
+struct page_map_info *get_page_map_info(int pid)
+{
+ int i;
+
+ if (!user_space_reserve_start)
+ return NULL;
+
+ for (i = 0; i < pin_pid_num; i++) {
+ if (user_space_reserve_start[i].pid == pid)
+ return &(user_space_reserve_start[i]);
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(get_page_map_info);
+
+static struct page *find_head_page(struct page *page)
+{
+ struct page *p = page;
+
+ while (!PageBuddy(p)) {
+ if (PageLRU(p))
+ return NULL;
+ p--;
+ }
+ return p;
+}
+
+static void spilt_page_area_left(struct zone *zone, struct free_area *area, struct page *page,
+ unsigned long size, int order)
+{
+ unsigned long cur_size = 1 << order;
+ unsigned long total_size = 0;
+
+ while (size && cur_size > size) {
+ cur_size >>= 1;
+ order--;
+ area--;
+ if (cur_size <= size) {
+ list_add(&page[total_size].lru, &area->free_list[MIGRATE_MOVABLE]);
+ atomic_set(&(page[total_size]._mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
+ set_page_private(&page[total_size], order);
+ set_pageblock_migratetype(&page[total_size], MIGRATE_MOVABLE);
+ area->nr_free++;
+ total_size += cur_size;
+ size -= cur_size;
+ }
+ }
+}
+
+static void spilt_page_area_right(struct zone *zone, struct free_area *area, struct page *page,
+ unsigned long size, int order)
+{
+ unsigned long cur_size = 1 << order;
+ struct page *right_page, *head_page;
+
+ right_page = page + size;
+ while (size && cur_size > size) {
+ cur_size >>= 1;
+ order--;
+ area--;
+ if (cur_size <= size) {
+ head_page = right_page - cur_size;
+ list_add(&head_page->lru, &area->free_list[MIGRATE_MOVABLE]);
+ atomic_set(&(head_page->_mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
+ set_page_private(head_page, order);
+ set_pageblock_migratetype(head_page, MIGRATE_MOVABLE);
+ area->nr_free++;
+ size -= cur_size;
+ right_page = head_page;
+ }
+ }
+}
+
+/*
+ * Take @nr_pages pages starting at @page out of the buddy free lists.
+ *
+ * The free block that contains @page is located with find_head_page() and
+ * unlinked from its free_area. The part of that block in front of @page and
+ * the part behind @page + @nr_pages are handed to spilt_page_area_left() and
+ * spilt_page_area_right(), which put them back on the free lists.
+ * Nothing is done (apart from a warning) when no head page is found.
+ */
+void reserve_page_from_buddy(unsigned long nr_pages, struct page *page)
+{
+	unsigned int current_order;
+	struct page *page_end;
+	struct free_area *area;
+	struct zone *zone;
+	struct page *head_page;
+
+	head_page = find_head_page(page);
+	if (!head_page) {
+		pr_warn("Find page head fail.\n");
+		return;
+	}
+	/* The order of the free block is kept in the head page's private field. */
+	current_order = head_page->private;
+	page_end = head_page + (1 << current_order);
+	zone = page_zone(head_page);
+	area = &(zone->free_area[current_order]);
+	list_del(&head_page->lru);
+	atomic_set(&head_page->_mapcount, -1);
+	set_page_private(head_page, 0);
+	area->nr_free--;
+	if (head_page != page)
+		spilt_page_area_left(zone, area, head_page,
+			(unsigned long)(page - head_page), current_order);
+	page = page + nr_pages;
+	if (page < page_end) {
+		spilt_page_area_right(zone, area, page,
+			(unsigned long)(page_end - page), current_order);
+	} else if (page > page_end) {
+		/* The requested range runs past the end of the free block. */
+		pr_warn("Find page end smaller than page.\n");
+	}
+}
+
+static inline void reserve_user_normal_pages(struct page *page)
+{
+ atomic_inc(&page->_refcount);
+ reserve_page_from_buddy(1, page);
+}
+
+static void init_huge_pmd_pages(struct page *head_page)
+{
+ int i = 0;
+ struct page *page = head_page;
+
+ __set_bit(PG_head, &page->flags);
+ __set_bit(PG_active, &page->flags);
+ atomic_set(&page->_refcount, 1);
+ page++;
+ i++;
+ page->compound_head = (unsigned long)head_page + 1;
+ page->compound_dtor = HUGETLB_PAGE_DTOR + 1;
+ page->compound_order = HPAGE_PMD_ORDER;
+ page++;
+ i++;
+ page->compound_head = (unsigned long)head_page + 1;
+ i++;
+ INIT_LIST_HEAD(&(page->deferred_list));
+ for (; i < HPAGE_PMD_NR; i++) {
+ page = head_page + i;
+ page->compound_head = (unsigned long)head_page + 1;
+ }
+}
+
+static inline void reserve_user_huge_pmd_pages(struct page *page)
+{
+ atomic_inc(&page->_refcount);
+ reserve_page_from_buddy((1 << HPAGE_PMD_ORDER), page);
+ init_huge_pmd_pages(page);
+}
+
+int reserve_user_map_pages_fail;
+
+void free_user_map_pages(unsigned int pid_index, unsigned int entry_index, unsigned int page_index)
+{
+ unsigned int i, j, index, order;
+ struct page_map_info *pmi;
+ struct page_map_entry *pme;
+ struct page *page;
+ unsigned long phy_addr;
+
+ for (index = 0; index < pid_index; index++) {
+ pmi = &(user_space_reserve_start[index]);
+ pme = pmi->pme;
+ for (i = 0; i < pmi->entry_num; i++) {
+ for (j = 0; j < pme->nr_pages; j++) {
+ order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+ phy_addr = pme->phy_addr_array[j];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, order);
+ pme->phy_addr_array[j] = 0;
+ }
+ }
+ }
+ pme = (struct page_map_entry *)next_pme(pme);
+ }
+ }
+ pmi = &(user_space_reserve_start[index]);
+ pme = pmi->pme;
+ for (i = 0; i < entry_index; i++) {
+ for (j = 0; j < pme->nr_pages; j++) {
+ order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+ phy_addr = pme->phy_addr_array[j];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, order);
+ pme->phy_addr_array[j] = 0;
+ }
+ }
+ }
+ pme = (struct page_map_entry *)next_pme(pme);
+ }
+ for (j = 0; j < page_index; j++) {
+ order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+ phy_addr = pme->phy_addr_array[j];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, order);
+ pme->phy_addr_array[j] = 0;
+ }
+ }
+ }
+}
+
+/*
+ * Check that a redirect_info record starting at @redirect_start and holding
+ * up to @max_redirect_page_num index slots still fits inside the redirect
+ * space, i.e. ends at or before redirect_space_start + redirect_space_size.
+ *
+ * Returns true when the record fits, false when it would overrun the space.
+ */
+bool check_redirect_end_valid(struct redirect_info *redirect_start,
+		unsigned long max_redirect_page_num)
+{
+	unsigned long redirect_end;
+
+	/* End address: the record itself plus one unsigned int per index slot. */
+	redirect_end = ((unsigned long)(redirect_start + 1) +
+		max_redirect_page_num * sizeof(unsigned int));
+	if (redirect_end > redirect_space_start + redirect_space_size)
+		return false;
+	return true;
+}
+
+/*
+ * Walk every page map entry of every recorded pid and reserve the recorded
+ * physical pages via reserve_user_normal_pages() or
+ * reserve_user_huge_pmd_pages().
+ *
+ * A page whose refcount is already non-zero is handled as follows:
+ *  - if it carries PAGE_FLAGS_CHECK_RESERVED and redirect space is available,
+ *    its index inside the entry is stored in the current redirect_info record;
+ *  - otherwise reserve_user_map_pages_fail is set and the pages handled so far
+ *    are released through free_user_map_pages().
+ *
+ * The whole walk runs under page_map_entry_lock with interrupts disabled.
+ */
+static void reserve_user_space_map_pages(void)
+{
+	struct page_map_info *pmi;
+	struct page_map_entry *pme;
+	unsigned int i, j, index;
+	struct page *page;
+	unsigned long flags;
+	unsigned long phy_addr;
+	unsigned long redirect_pages = 0;
+	struct redirect_info *redirect_start = (struct redirect_info *)redirect_space_start;
+
+	if (!user_space_reserve_start || !redirect_start)
+		return;
+	spin_lock_irqsave(&page_map_entry_lock, flags);
+	for (index = 0; index < pin_pid_num; index++) {
+		pmi = &(user_space_reserve_start[index]);
+		pme = pmi->pme;
+		for (i = 0; i < pmi->entry_num; i++) {
+			redirect_pages = 0;
+			/* Stop recording redirects once the redirect space is exhausted. */
+			if (!check_redirect_end_valid(redirect_start, pme->nr_pages))
+				redirect_start = NULL;
+			for (j = 0; j < pme->nr_pages; j++) {
+				phy_addr = pme->phy_addr_array[j];
+				if (!phy_addr)
+					continue;
+				page = phys_to_page(phy_addr);
+				if (atomic_read(&page->_refcount)) {
+					if ((page->flags & PAGE_FLAGS_CHECK_RESERVED)
+						&& !pme->redirect_start)
+						pme->redirect_start =
+							(unsigned long)redirect_start;
+					if (redirect_start &&
+						(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+						redirect_start->redirect_index[redirect_pages] = j;
+						redirect_pages++;
+						continue;
+					} else {
+						reserve_user_map_pages_fail = 1;
+						pr_warn("Page %pK refcount %d large than zero, no need reserve.\n",
+							page, atomic_read(&page->_refcount));
+						goto free_pages;
+					}
+				}
+				if (!pme->is_huge_page)
+					reserve_user_normal_pages(page);
+				else
+					reserve_user_huge_pmd_pages(page);
+			}
+			pme = (struct page_map_entry *)next_pme(pme);
+			/* Close the current redirect record and move to the next one. */
+			if (redirect_pages && redirect_start) {
+				redirect_start->redirect_pages = redirect_pages;
+				redirect_start = (struct redirect_info *)(
+					(unsigned long)(redirect_start + 1) +
+					redirect_start->redirect_pages * sizeof(unsigned int));
+			}
+		}
+	}
+	/* Pair with spin_lock_irqsave() so the saved IRQ state is restored. */
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+	return;
+free_pages:
+	free_user_map_pages(index, i, j);
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+}
+
+
+int calculate_pin_mem_digest(struct pin_mem_dump_info *pmdi, char *digest)
+{
+ int i;
+ struct sha256_state sctx;
+
+ if (!digest)
+ digest = pmdi->sha_digest;
+ sha256_init(&sctx);
+ sha256_update(&sctx, (unsigned char *)(&(pmdi->magic)),
+ sizeof(struct pin_mem_dump_info) - SHA256_DIGEST_SIZE);
+ for (i = 0; i < pmdi->pin_pid_num; i++) {
+ sha256_update(&sctx, (unsigned char *)(&(pmdi->pmi_array[i])),
+ sizeof(struct page_map_info));
+ }
+ sha256_final(&sctx, digest);
+ return 0;
+}
+
+static int check_sha_digest(struct pin_mem_dump_info *pmdi)
+{
+ int ret = 0;
+ char digest[SHA256_DIGEST_SIZE] = {0};
+
+ ret = calculate_pin_mem_digest(pmdi, digest);
+ if (ret) {
+ pr_warn("calculate pin mem digest fail:%d\n", ret);
+ return ret;
+ }
+ if (memcmp(pmdi->sha_digest, digest, SHA256_DIGEST_SIZE)) {
+ pr_warn("pin mem dump info sha256 digest match error!\n");
+ return -EFAULT;
+ }
+ return ret;
+}
+
+/*
+ * Set up the bookkeeping pointers for the reserved pin memory area that
+ * starts at @pmdi and is @map_len bytes long.
+ *
+ * The whole page map entry collection process must run sequentially.
+ * user_space_reserve_start points to the first page map info of the first
+ * dumped task, and page_map_entry_start points to the first page map entry
+ * of the first dumped vma. The last redirect_space_size bytes of the area
+ * are used as redirect space.
+ *
+ * If the header found in the area is not valid (wrong magic, too many pids
+ * or digest mismatch) the header is zeroed and the area is treated as empty.
+ * If it already describes pinned pids, their pages are reserved right away.
+ */
+static void init_page_map_info(struct pin_mem_dump_info *pmdi, unsigned long map_len)
+{
+	if (pin_mem_dump_start || !max_pin_pid_num) {
+		pr_warn("pin page map already init or max_pin_pid_num not set.\n");
+		return;
+	}
+	if (map_len < sizeof(struct pin_mem_dump_info) +
+		max_pin_pid_num * sizeof(struct page_map_info) + redirect_space_size) {
+		pr_warn("pin memory reserved memblock too small.\n");
+		return;
+	}
+	if ((pmdi->magic != PIN_MEM_DUMP_MAGIC) || (pmdi->pin_pid_num > max_pin_pid_num) ||
+		check_sha_digest(pmdi))
+		memset(pmdi, 0, sizeof(struct pin_mem_dump_info));
+	pin_mem_dump_start = pmdi;
+	pin_pid_num = pmdi->pin_pid_num;
+	pr_info("pin_pid_num: %d\n", pin_pid_num);
+	pin_pid_num_addr = &(pmdi->pin_pid_num);
+	user_space_reserve_start =
+		(struct page_map_info *)pmdi->pmi_array;
+	page_map_entry_start =
+		(struct page_map_entry *)(user_space_reserve_start + max_pin_pid_num);
+	/*
+	 * Remember the initial entry position: clear_pin_memory_record() resets
+	 * page_map_entry_start from this value, so it must not stay zero.
+	 */
+	__page_map_entry_start = (unsigned long)page_map_entry_start;
+	page_map_entry_end = (unsigned long)pmdi + map_len - redirect_space_size;
+	redirect_space_start = page_map_entry_end;
+	if (pin_pid_num > 0)
+		reserve_user_space_map_pages();
+}
+
+/*
+ * Seal the pin memory dump: write the magic value into the dump header,
+ * clear the stored digest and recompute it with calculate_pin_mem_digest().
+ *
+ * Returns 0 on success, -EFAULT when the dump area was never initialised,
+ * or the error reported by calculate_pin_mem_digest().
+ */
+int finish_pin_mem_dump(void)
+{
+	int ret;
+
+	/* init_page_map_info() may have bailed out and left this unset. */
+	if (!pin_mem_dump_start)
+		return -EFAULT;
+
+	pin_mem_dump_start->magic = PIN_MEM_DUMP_MAGIC;
+	memset(pin_mem_dump_start->sha_digest, 0, SHA256_DIGEST_SIZE);
+	ret = calculate_pin_mem_digest(pin_mem_dump_start, NULL);
+	if (ret)
+		pr_warn("calculate pin mem digest fail:%d\n", ret);
+	return ret;
+}
+
+/*
+ * Record the physical addresses of the PMD-sized huge pages that back
+ * [@start_addr, @end_addr) of @task into @pme->phy_addr_array.
+ *
+ * Pages are looked up one at a time with get_user_pages_remote(); for every
+ * head page found the walk advances by HPAGE_PMD_SIZE.
+ *
+ * Returns:
+ *  COLLECT_PAGES_FINISH        - the whole range was collected;
+ *  COLLECT_PAGES_NEED_CONTINUE - a page that is not a head page was hit;
+ *                                pme->nr_pages holds what was collected;
+ *  COLLECT_PAGES_FAIL          - the page lookup failed.
+ */
+int collect_pmd_huge_pages(struct task_struct *task,
+	unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
+{
+	long res;
+	int index = 0;
+	unsigned long start = start_addr;
+	struct page *temp_page;
+
+	while (start < end_addr) {
+		temp_page = NULL;
+		res = get_user_pages_remote(task->mm, start, 1,
+			FOLL_TOUCH | FOLL_GET, &temp_page, NULL, NULL);
+		/*
+		 * A negative return is an error as well; in that case temp_page
+		 * is still NULL and must not be inspected.
+		 */
+		if (res <= 0) {
+			pr_warn("Get huge page for addr(%lx) fail.\n", start);
+			return COLLECT_PAGES_FAIL;
+		}
+		if (PageHead(temp_page)) {
+			start += HPAGE_PMD_SIZE;
+			pme->phy_addr_array[index] = page_to_phys(temp_page);
+			index++;
+		} else {
+			pme->nr_pages = index;
+			/* Drop the reference taken by FOLL_GET for the rejected page. */
+			atomic_dec(&((temp_page)->_refcount));
+			return COLLECT_PAGES_NEED_CONTINUE;
+		}
+	}
+	pme->nr_pages = index;
+	return COLLECT_PAGES_FINISH;
+}
+
+/*
+ * Record the physical addresses of the normal (non-huge) pages that back
+ * [@start_addr, @end_addr) of @task into @pme->phy_addr_array.
+ *
+ * The range is processed in chunks that end at HPAGE_PMD_SIZE boundaries.
+ * The first page of each chunk is probed on its own:
+ *  - a head page stops the walk with COLLECT_PAGES_NEED_CONTINUE;
+ *  - a tail page makes the walk skip the chunk and moves pme->virt_addr
+ *    to the start of the next chunk;
+ *  - otherwise the whole chunk is fetched and converted in place (the
+ *    struct page pointers are written into the same array that afterwards
+ *    holds the physical addresses).
+ *
+ * Returns COLLECT_PAGES_FINISH, COLLECT_PAGES_NEED_CONTINUE or
+ * COLLECT_PAGES_FAIL; pme->nr_pages holds the number of collected pages.
+ */
+int collect_normal_pages(struct task_struct *task,
+	unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
+{
+	long res;
+	unsigned long next;
+	unsigned long i, nr_pages;
+	struct page *tmp_page;
+	unsigned long *phy_addr_array = pme->phy_addr_array;
+	struct page **page_array = (struct page **)pme->phy_addr_array;
+
+	next = (start_addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+	next = (next > end_addr) ? end_addr : next;
+	pme->nr_pages = 0;
+	while (start_addr < next) {
+		nr_pages = (PAGE_ALIGN(next) - start_addr) / PAGE_SIZE;
+		tmp_page = NULL;
+		res = get_user_pages_remote(task->mm, start_addr, 1,
+			FOLL_TOUCH | FOLL_GET, &tmp_page, NULL, NULL);
+		/* Negative returns are errors too; tmp_page is not valid then. */
+		if (res <= 0) {
+			pr_warn("Get user page of %lx fail.\n", start_addr);
+			return COLLECT_PAGES_FAIL;
+		}
+		if (PageHead(tmp_page)) {
+			atomic_dec(&(tmp_page->_refcount));
+			return COLLECT_PAGES_NEED_CONTINUE;
+		}
+		/* The probe reference is not needed any more. */
+		atomic_dec(&(tmp_page->_refcount));
+		if (PageTail(tmp_page)) {
+			start_addr = next;
+			pme->virt_addr = start_addr;
+			next = (next + HPAGE_PMD_SIZE) > end_addr ?
+				end_addr : (next + HPAGE_PMD_SIZE);
+			continue;
+		}
+		res = get_user_pages_remote(task->mm, start_addr, nr_pages,
+			FOLL_TOUCH | FOLL_GET, page_array, NULL, NULL);
+		if (res < 0 || (unsigned long)res != nr_pages) {
+			pr_warn("Get user pages of %lx fail.\n", start_addr);
+			/*
+			 * Short batch: drop the references that were taken and
+			 * clear the slots so that no page pointer is left behind
+			 * in the physical address array.
+			 */
+			for (i = 0; res > 0 && i < (unsigned long)res; i++) {
+				put_page(page_array[i]);
+				page_array[i] = NULL;
+			}
+			return COLLECT_PAGES_FAIL;
+		}
+		for (i = 0; i < nr_pages; i++)
+			phy_addr_array[i] = page_to_phys(page_array[i]);
+		pme->nr_pages += nr_pages;
+		page_array += nr_pages;
+		phy_addr_array += nr_pages;
+		start_addr = next;
+		next = (next + HPAGE_PMD_SIZE) > end_addr ? end_addr : (next + HPAGE_PMD_SIZE);
+	}
+	return COLLECT_PAGES_FINISH;
+}
+
+/* Users make sure that the pin memory belongs to anonymous vma. */
+int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ int pid, ret;
+ int is_huge_page = false;
+ unsigned int page_size;
+ unsigned long nr_pages, flags;
+ struct page_map_entry *pme;
+ struct page_map_info *pmi;
+ struct vm_area_struct *vma;
+ unsigned long i;
+ struct page *tmp_page;
+
+ if (!page_map_entry_start
+ || !task || !mm
+ || start_addr >= end_addr)
+ return -EFAULT;
+
+ pid = task->pid;
+ spin_lock_irqsave(&page_map_entry_lock, flags);
+ nr_pages = ((end_addr - start_addr) / PAGE_SIZE);
+ if ((unsigned long)page_map_entry_start + nr_pages * sizeof(struct page *) >=
+ page_map_entry_end) {
+ pr_warn("Page map entry use up!\n");
+ ret = -EFAULT;
+ goto finish;
+ }
+ vma = find_extend_vma(mm, start_addr);
+ if (!vma) {
+ pr_warn("Find no match vma!\n");
+ ret = -EFAULT;
+ goto finish;
+ }
+ if (start_addr == (start_addr & HPAGE_PMD_MASK) &&
+ transparent_hugepage_enabled(vma)) {
+ page_size = HPAGE_PMD_SIZE;
+ is_huge_page = true;
+ } else {
+ page_size = PAGE_SIZE;
+ }
+ pme = page_map_entry_start;
+ pme->virt_addr = start_addr;
+ pme->redirect_start = 0;
+ pme->is_huge_page = is_huge_page;
+ memset(pme->phy_addr_array, 0, nr_pages * sizeof(unsigned long));
+ down_write(&mm->mmap_lock);
+ if (!is_huge_page) {
+ ret = collect_normal_pages(task, start_addr, end_addr, pme);
+ if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
+ if (ret == COLLECT_PAGES_FINISH) {
+ ret = 0;
+ up_write(&mm->mmap_lock);
+ goto finish;
+ }
+ pme->is_huge_page = true;
+ page_size = HPAGE_PMD_SIZE;
+ ret = collect_pmd_huge_pages(task, pme->virt_addr, end_addr, pme);
+ }
+ } else {
+ ret = collect_pmd_huge_pages(task, start_addr, end_addr, pme);
+ if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
+ if (ret == COLLECT_PAGES_FINISH) {
+ ret = 0;
+ up_write(&mm->mmap_lock);
+ goto finish;
+ }
+ pme->is_huge_page = false;
+ page_size = PAGE_SIZE;
+ ret = collect_normal_pages(task, pme->virt_addr, end_addr, pme);
+ }
+ }
+ up_write(&mm->mmap_lock);
+ if (ret == COLLECT_PAGES_FAIL) {
+ ret = -EFAULT;
+ goto finish;
+ }
+
+ /* check for zero pages */
+ for (i = 0; i < pme->nr_pages; i++) {
+ tmp_page = phys_to_page(pme->phy_addr_array[i]);
+ if (!pme->is_huge_page) {
+ if (page_to_pfn(tmp_page) == my_zero_pfn(pme->virt_addr + i * PAGE_SIZE))
+ pme->phy_addr_array[i] = 0;
+ } else if (is_huge_zero_page(tmp_page))
+ pme->phy_addr_array[i] = 0;
+ }
+
+ page_map_entry_start = (struct page_map_entry *)(next_pme(pme));
+ pmi = get_page_map_info(pid);
+ if (!pmi)
+ pmi = create_page_map_info(pid);
+ if (!pmi) {
+ pr_warn("Create page map info fail for pid: %d!\n", pid);
+ ret = -EFAULT;
+ goto finish;
+ }
+ if (!pmi->pme)
+ pmi->pme = pme;
+ pmi->entry_num++;
+ spin_unlock_irqrestore(&page_map_entry_lock, flags);
+ if (ret == COLLECT_PAGES_NEED_CONTINUE)
+ ret = pin_mem_area(task, mm, pme->virt_addr + pme->nr_pages * page_size, end_addr);
+ return ret;
+finish:
+ spin_unlock_irqrestore(&page_map_entry_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pin_mem_area);
+
+vm_fault_t remap_normal_pages(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page_map_entry *pme)
+{
+ int ret;
+ unsigned int j, i;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pmd_t *pmd;
+ pud_t *pud;
+ struct page *page, *new;
+ unsigned long address;
+ unsigned long phy_addr;
+ unsigned int redirect_pages = 0;
+ struct redirect_info *redirect_start;
+
+ redirect_start = (struct redirect_info *)pme->redirect_start;
+ for (j = 0; j < pme->nr_pages; j++) {
+ address = pme->virt_addr + j * PAGE_SIZE;
+ phy_addr = pme->phy_addr_array[j];
+ if (!phy_addr)
+ continue;
+ page = phys_to_page(phy_addr);
+ if (page_to_pfn(page) == my_zero_pfn(address)) {
+ pme->phy_addr_array[j] = 0;
+ continue;
+ }
+ pme->phy_addr_array[j] = 0;
+ if (redirect_start && (redirect_pages < redirect_start->redirect_pages) &&
+ (j == redirect_start->redirect_index[redirect_pages])) {
+ new = alloc_zeroed_user_highpage_movable(vma, address);
+ if (!new) {
+ pr_warn("Redirect alloc page fail\n");
+ continue;
+ }
+ copy_page(page_to_virt(new), phys_to_virt(phy_addr));
+ page = new;
+ redirect_pages++;
+ }
+ page->mapping = NULL;
+ pgd = pgd_offset(mm, address);
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d) {
+ ret = VM_FAULT_OOM;
+ goto free;
+ }
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud) {
+ ret = VM_FAULT_OOM;
+ goto free;
+ }
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd) {
+ ret = VM_FAULT_OOM;
+ goto free;
+ }
+ ret = do_anon_page_remap(vma, address, pmd, page);
+ if (ret)
+ goto free;
+ }
+ return 0;
+free:
+ for (i = j; i < pme->nr_pages; i++) {
+ phy_addr = pme->phy_addr_array[i];
+ if (phy_addr) {
+ __free_page(phys_to_page(phy_addr));
+ pme->phy_addr_array[i] = 0;
+ }
+ }
+ return ret;
+}
+
+static inline gfp_t get_hugepage_gfpmask(struct vm_area_struct *vma)
+{
+ const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ 0);
+ return GFP_TRANSHUGE_LIGHT;
+}
+
+vm_fault_t remap_huge_pmd_pages(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page_map_entry *pme)
+{
+ int ret;
+ unsigned int j, i;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pmd_t *pmd;
+ pud_t *pud;
+ gfp_t gfp;
+ struct page *page, *new;
+ unsigned long address;
+ unsigned long phy_addr;
+ unsigned int redirect_pages = 0;
+ struct redirect_info *redirect_start;
+
+ redirect_start = (struct redirect_info *)pme->redirect_start;
+ for (j = 0; j < pme->nr_pages; j++) {
+ address = pme->virt_addr + j * HPAGE_PMD_SIZE;
+ phy_addr = pme->phy_addr_array[j];
+ if (!phy_addr)
+ continue;
+ page = phys_to_page(phy_addr);
+ if (is_huge_zero_page(page)) {
+ pme->phy_addr_array[j] = 0;
+ continue;
+ }
+ pme->phy_addr_array[j] = 0;
+ if (redirect_start && (redirect_pages < redirect_start->redirect_pages) &&
+ (j == redirect_start->redirect_index[redirect_pages])) {
+ gfp = get_hugepage_gfpmask(vma);
+ new = alloc_hugepage_vma(gfp, vma, address, HPAGE_PMD_ORDER);
+ if (!new) {
+ pr_warn("Redirect alloc huge page fail\n");
+ continue;
+ }
+ memcpy(page_to_virt(new), phys_to_virt(phy_addr), HPAGE_PMD_SIZE);
+ page = new;
+ redirect_pages++;
+ }
+ pgd = pgd_offset(mm, address);
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d) {
+ ret = VM_FAULT_OOM;
+ goto free;
+ }
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud) {
+ ret = VM_FAULT_OOM;
+ goto free;
+ }
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd) {
+ ret = VM_FAULT_OOM;
+ goto free;
+ }
+ ret = do_anon_huge_page_remap(vma, address, pmd, page);
+ if (ret)
+ goto free;
+ }
+ return 0;
+free:
+ for (i = j; i < pme->nr_pages; i++) {
+ phy_addr = pme->phy_addr_array[i];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, HPAGE_PMD_ORDER);
+ pme->phy_addr_array[i] = 0;
+ }
+ }
+ }
+ return ret;
+}
+
+/*
+ * Free the recorded pages of the page map entries of @pmi that follow @pme.
+ *
+ * @index is the position of the first entry to process, i.e. the entry right
+ * after @pme. Pages flagged with PAGE_FLAGS_CHECK_RESERVED are left alone;
+ * every freed page has its slot in phy_addr_array cleared.
+ */
+static void free_unmap_pages(struct page_map_info *pmi,
+	struct page_map_entry *pme,
+	unsigned int index)
+{
+	unsigned int i, j;
+	unsigned long phy_addr;
+	unsigned int order;
+	struct page *page;
+
+	pme = (struct page_map_entry *)(next_pme(pme));
+	for (i = index; i < pmi->entry_num; i++) {
+		order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+		for (j = 0; j < pme->nr_pages; j++) {
+			/* Index by page (j), not by entry (i). */
+			phy_addr = pme->phy_addr_array[j];
+			if (!phy_addr)
+				continue;
+			page = phys_to_page(phy_addr);
+			if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+				__free_pages(page, order);
+				pme->phy_addr_array[j] = 0;
+			}
+		}
+		pme = (struct page_map_entry *)(next_pme(pme));
+	}
+}
+
+/*
+ * Map the pages recorded for @pid back into the address space @mm.
+ *
+ * The page map entries of the pid are matched against the vma list of @mm
+ * in order. Entries that fall into a non-anonymous vma are skipped; the
+ * others are remapped with remap_normal_pages() or remap_huge_pmd_pages().
+ * pmi->disable_free_page is set first so that free_all_reserved_pages()
+ * skips this pid from then on.
+ *
+ * Returns 0 on success. -EFAULT is returned when the page reservation failed
+ * earlier or no record exists for @pid.
+ */
+vm_fault_t do_mem_remap(int pid, struct mm_struct *mm)
+{
+	unsigned int i = 0;
+	vm_fault_t ret = 0;
+	struct vm_area_struct *vma;
+	struct page_map_info *pmi;
+	struct page_map_entry *pme;
+	unsigned long flags;
+
+	if (reserve_user_map_pages_fail)
+		return -EFAULT;
+	pmi = get_page_map_info(pid);
+	if (!pmi)
+		return -EFAULT;
+
+	spin_lock_irqsave(&page_map_entry_lock, flags);
+	pmi->disable_free_page = true;
+	/* Pair with spin_lock_irqsave() so the saved IRQ state is restored. */
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+	down_write(&mm->mmap_lock);
+	pme = pmi->pme;
+	vma = mm->mmap;
+	while ((i < pmi->entry_num) && (vma != NULL)) {
+		if (pme->virt_addr >= vma->vm_start && pme->virt_addr < vma->vm_end) {
+			i++;
+			if (!vma_is_anonymous(vma)) {
+				pme = (struct page_map_entry *)(next_pme(pme));
+				continue;
+			}
+			/*
+			 * NOTE(review): if vm_fault_t is an unsigned type these
+			 * "ret < 0" checks can never fire and remap errors are
+			 * silently ignored - confirm and consider "if (ret)".
+			 */
+			if (!pme->is_huge_page) {
+				ret = remap_normal_pages(mm, vma, pme);
+				if (ret < 0)
+					goto free;
+			} else {
+				ret = remap_huge_pmd_pages(mm, vma, pme);
+				if (ret < 0)
+					goto free;
+			}
+			pme = (struct page_map_entry *)(next_pme(pme));
+		} else {
+			vma = vma->vm_next;
+		}
+	}
+	up_write(&mm->mmap_lock);
+	return 0;
+free:
+	/* i already counts the failed entry, so freeing starts at the next one. */
+	free_unmap_pages(pmi, pme, i);
+	up_write(&mm->mmap_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(do_mem_remap);
+
+#if defined(CONFIG_ARM64)
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
+ void *addr;
+
+ if (!map_addr || !map_size)
+ return;
+ addr = phys_to_virt(map_addr);
+ init_page_map_info((struct pin_mem_dump_info *)addr, map_size);
+}
+#else
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
+}
+#endif
+
+static void free_all_reserved_pages(void)
+{
+ unsigned int i, j, index, order;
+ struct page_map_info *pmi;
+ struct page_map_entry *pme;
+ struct page *page;
+ unsigned long phy_addr;
+
+ if (!user_space_reserve_start || reserve_user_map_pages_fail)
+ return;
+
+ for (index = 0; index < pin_pid_num; index++) {
+ pmi = &(user_space_reserve_start[index]);
+ if (pmi->disable_free_page)
+ continue;
+ pme = pmi->pme;
+ for (i = 0; i < pmi->entry_num; i++) {
+ for (j = 0; j < pme->nr_pages; j++) {
+ order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+ phy_addr = pme->phy_addr_array[j];
+ if (phy_addr) {
+ page = phys_to_page(phy_addr);
+ if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+ __free_pages(page, order);
+ pme->phy_addr_array[j] = 0;
+ }
+ }
+ }
+ pme = (struct page_map_entry *)next_pme(pme);
+ }
+ }
+}
+
+/*
+ * Clear all pin memory records.
+ *
+ * Frees the reserved pages through free_all_reserved_pages() and, when the
+ * reserved area has been initialised (pin_pid_num_addr is set), resets both
+ * pid counters and rewinds page_map_entry_start to __page_map_entry_start.
+ */
+void clear_pin_memory_record(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&page_map_entry_lock, flags);
+	free_all_reserved_pages();
+	if (pin_pid_num_addr) {
+		*pin_pid_num_addr = 0;
+		pin_pid_num = 0;
+		page_map_entry_start = (struct page_map_entry *)__page_map_entry_start;
+	}
+	/* Pair with spin_lock_irqsave() so the saved IRQ state is restored. */
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+}
+EXPORT_SYMBOL_GPL(clear_pin_memory_record);
+
+#endif /* CONFIG_PIN_MEMORY */
--
2.9.5
1
1

[PATCH OLK-5.10 v1] arm64: Declare var of local_cpu_stop only on PARK
by sangyan@huawei.com 01 Mar '21
by sangyan@huawei.com 01 Mar '21
01 Mar '21
From: Sang Yan <sangyan(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
Fix compile warning: unused variable 'ops' 'cpu'
while CONFIG_ARM64_CPU_PARK=n.
Put declaration of 'ops' and 'cpu' under
CONFIG_ARM64_CPU_PARK in local_cpu_stop.
Signed-off-by: Sang Yan <sangyan(a)huawei.com>
---
arch/arm64/kernel/smp.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 644bbd7..d7b750a 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1024,8 +1024,10 @@ void arch_irq_work_raise(void)
static void local_cpu_stop(void)
{
+#ifdef CONFIG_ARM64_CPU_PARK
int cpu;
const struct cpu_operations *ops = NULL;
+#endif
set_cpu_online(smp_processor_id(), false);
--
2.9.5
1
0

26 Feb '21
From: Sang Yan <sangyan(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
Introduce the CPU PARK feature in order to save the time spent
taking CPUs down and bringing them up during kexec, which may cost
250ms per CPU for going down and 30ms per CPU for coming up.
As a result, for 128 cores, it costs more than 30 seconds
to down and up cpus during kexec. Think about 256 cores and more.
CPU PARK is a state in which a CPU stays powered on and spins in a loop,
polling for an exit condition, such as an exit address being written.
A block of memory is reserved and filled with the CPU park text section,
the exit address and the park magic flag of each CPU. In this
implementation, one page is reserved for each CPU core.
CPUs enter the park state instead of going down in machine_shutdown().
CPUs leave the park state in smp_init instead of being brought up.
Layout of one CPU park section in the pre-reserved memory block:
+--------------+
+ exit address +
+--------------+
+ park magic +
+--------------+
+ park codes +
+ . +
+ . +
+ . +
+--------------+
Signed-off-by: Sang Yan <sangyan(a)huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng(a)huawei.com>
---
arch/arm64/Kconfig | 12 ++
arch/arm64/include/asm/kexec.h | 6 +
arch/arm64/include/asm/smp.h | 15 +++
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/cpu-park.S | 59 ++++++++++
arch/arm64/kernel/machine_kexec.c | 2 +-
arch/arm64/kernel/process.c | 4 +
arch/arm64/kernel/smp.c | 230 ++++++++++++++++++++++++++++++++++++++
arch/arm64/mm/init.c | 55 +++++++++
9 files changed, 383 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/kernel/cpu-park.S
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b9c5654..0885668 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -345,6 +345,18 @@ config KASAN_SHADOW_OFFSET
default 0xeffffff900000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS
default 0xffffffffffffffff
+config ARM64_CPU_PARK
+ bool "Support CPU PARK on kexec"
+ depends on SMP
+ depends on KEXEC_CORE
+ help
+ This enables support for CPU PARK feature in
+ order to save time of cpu down to up.
+ CPU park is a state through kexec, spin loop
+ instead of cpu die before jumping to new kernel,
+ jumping out from loop to new kernel entry in
+ smp_init.
+
source "arch/arm64/Kconfig.platforms"
menu "Kernel Features"
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index 79909ae..a133889 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -36,6 +36,11 @@
#define CRASH_ADDR_HIGH_MAX MEMBLOCK_ALLOC_ACCESSIBLE
+#ifdef CONFIG_ARM64_CPU_PARK
+/* CPU park state flag: "park" */
+#define PARK_MAGIC 0x7061726b
+#endif
+
#ifndef __ASSEMBLY__
/**
@@ -104,6 +109,7 @@ static inline void crash_post_resume(void) {}
#ifdef CONFIG_KEXEC_CORE
extern void __init reserve_crashkernel(void);
#endif
+void machine_kexec_mask_interrupts(void);
#ifdef CONFIG_KEXEC_FILE
#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 2e7f529..8c5d2d6 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -145,6 +145,21 @@ bool cpus_are_stuck_in_kernel(void);
extern void crash_smp_send_stop(void);
extern bool smp_crash_stop_failed(void);
+#ifdef CONFIG_ARM64_CPU_PARK
+#define PARK_SECTION_SIZE 1024
+struct cpu_park_info {
+ /* Physical address of reserved park memory. */
+ unsigned long start;
+ /* park reserve mem len should be PARK_SECTION_SIZE * NR_CPUS */
+ unsigned long len;
+ /* Virtual address of reserved park memory. */
+ unsigned long start_v;
+};
+extern struct cpu_park_info park_info;
+extern void enter_cpu_park(unsigned long text, unsigned long exit);
+extern void do_cpu_park(unsigned long exit);
+extern int kexec_smp_send_park(void);
+#endif
#endif /* ifndef __ASSEMBLY__ */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2621d5c..60478d2 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \
cpu-reset.o
+obj-$(CONFIG_ARM64_CPU_PARK) += cpu-park.o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o
obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o
arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
diff --git a/arch/arm64/kernel/cpu-park.S b/arch/arm64/kernel/cpu-park.S
new file mode 100644
index 0000000..10c685c
--- /dev/null
+++ b/arch/arm64/kernel/cpu-park.S
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CPU park routines
+ *
+ * Copyright (C) 2020 Huawei Technologies., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/kexec.h>
+#include <asm/sysreg.h>
+#include <asm/virt.h>
+
+.text
+.pushsection .idmap.text, "awx"
+
+/*
+ * cpu park helper in idmap section
+ *
+ * x0: physical address of this cpu's copy of do_cpu_park
+ * x1: physical address of this cpu's park section (its exit slot)
+ *
+ * Clears SCTLR_ELx_FLAGS in sctlr_el1, then branches to the address
+ * passed in x0 with the value of x1 moved into x0.
+ */
+SYM_CODE_START(enter_cpu_park)
+ /* Clear sctlr_el1 flags. */
+ mrs x12, sctlr_el1
+ mov_q x13, SCTLR_ELx_FLAGS
+ bic x12, x12, x13
+ pre_disable_mmu_workaround
+ msr sctlr_el1, x12 /* disable mmu */
+ isb
+
+ mov x18, x0 /* x18 = branch target (park code copy) */
+ mov x0, x1 /* x0 = park section address; the exit slot is at offset 0 */
+ br x18 /* call do_cpu_park of each cpu */
+SYM_CODE_END(enter_cpu_park)
+
+.popsection
+
+SYM_CODE_START(do_cpu_park) /* x0 = address of this cpu's park section */
+ ldr x18, =PARK_MAGIC /* magic number "park" */
+ add x1, x0, #8 /* x1 = address of the magic word, 8 bytes after the exit slot */
+ str x18, [x1] /* set on-park flag */
+ dc civac, x1 /* flush cache of "park" */
+ dsb nsh
+ isb
+
+.Lloop:
+ wfe /* sleep until an event is signalled */
+ isb
+ ldr x19, [x0] /* x19 = current content of the exit slot */
+ cmp x19, #0 /* test secondary_entry */
+ b.eq .Lloop /* exit slot still zero: keep waiting */
+
+ ic iallu /* invalidate the local I-cache */
+ dsb nsh
+ isb
+
+ br x19 /* jump to secondary_entry */
+SYM_CODE_END(do_cpu_park)
+
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index a0b144c..f47ce96 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -213,7 +213,7 @@ void machine_kexec(struct kimage *kimage)
BUG(); /* Should never get here. */
}
-static void machine_kexec_mask_interrupts(void)
+void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 73e3b32..10cffee 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -146,6 +146,10 @@ void arch_cpu_idle_dead(void)
*/
void machine_shutdown(void)
{
+#ifdef CONFIG_ARM64_CPU_PARK
+ if (kexec_smp_send_park() == 0)
+ return;
+#endif
smp_shutdown_nonboot_cpus(reboot_cpu);
}
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 18e9727..dea67d0 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -32,6 +32,7 @@
#include <linux/irq_work.h>
#include <linux/kernel_stat.h>
#include <linux/kexec.h>
+
#include <linux/kvm_host.h>
#include <asm/alternative.h>
@@ -93,6 +94,167 @@ static inline int op_cpu_kill(unsigned int cpu)
}
#endif
+#ifdef CONFIG_ARM64_CPU_PARK
+/*
+ * Layout of one per-cpu park section inside the reserved park memory.
+ * Each section is PARK_SECTION_SIZE bytes: this header followed by the
+ * park code that install_cpu_park() copies in.
+ */
+struct cpu_park_section {
+	unsigned long exit;	/* exit address of the park loop */
+	unsigned long magic;	/* magic value representing the park state */
+	char text[];		/* text section of park */
+};
+
+/*
+ * Map the reserved park memory into the kernel address space, once.
+ *
+ * Returns 0 when a mapping is available (already present or newly
+ * created), -ENOMEM when no park memory address is set or the mapping
+ * fails.
+ */
+static int mmap_cpu_park_mem(void)
+{
+	if (!park_info.start)
+		return -ENOMEM;
+
+	/* Already mapped by an earlier call. */
+	if (park_info.start_v)
+		return 0;
+
+	park_info.start_v = (unsigned long)__ioremap(park_info.start,
+						     park_info.len,
+						     PAGE_KERNEL_EXEC);
+	if (!park_info.start_v) {
+		pr_warn("map park memory failed.\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Kernel virtual address of the park section that belongs to @cpu. */
+static inline unsigned long cpu_park_section_v(unsigned int cpu)
+{
+	unsigned int slot = cpu - 1;	/* sections are indexed from cpu 1 */
+
+	return park_info.start_v + PARK_SECTION_SIZE * slot;
+}
+
+/* Physical address of the park section that belongs to @cpu. */
+static inline unsigned long cpu_park_section_p(unsigned int cpu)
+{
+	unsigned int slot = cpu - 1;	/* sections are indexed from cpu 1 */
+
+	return park_info.start + PARK_SECTION_SIZE * slot;
+}
+
+/*
+ * Write the secondary_entry to exit section of park state.
+ * Then the secondary cpu will jump straight into the kernel
+ * by the secondary_entry.
+ *
+ * Returns 0 when the exit address was written for @cpu, -EPERM when the
+ * park memory cannot be mapped or the section does not hold the park
+ * code.
+ */
+static int write_park_exit(unsigned int cpu)
+{
+	struct cpu_park_section *park_section;
+	unsigned long *park_exit;
+	unsigned long *park_text;
+
+	if (mmap_cpu_park_mem() != 0)
+		return -EPERM;
+
+	park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+	park_exit = &park_section->exit;
+	park_text = (unsigned long *)park_section->text;
+	pr_debug("park_text 0x%lx : 0x%lx, do_cpu_park text 0x%lx : 0x%lx\n",
+		 (unsigned long)park_text, *park_text,
+		 (unsigned long)do_cpu_park,
+		 *(unsigned long *)do_cpu_park);
+
+	/*
+	 * Test first 8 bytes to determine
+	 * whether needs to write cpu park exit.
+	 */
+	if (*park_text != *(unsigned long *)do_cpu_park)
+		return -EPERM;
+
+	writeq_relaxed(__pa_symbol(secondary_entry), park_exit);
+	__flush_dcache_area((__force void *)park_exit,
+			    sizeof(unsigned long));
+	flush_icache_range((unsigned long)park_exit,
+			   (unsigned long)(park_exit + 1));
+	/*
+	 * Complete the store and the cache maintenance before signalling
+	 * the event, so that a cpu woken from wfe reads the new exit
+	 * address.
+	 */
+	dsb(sy);
+	sev();
+	isb();
+
+	pr_debug("Write cpu %u secondary entry 0x%lx to 0x%lx.\n",
+		 cpu, *park_exit, (unsigned long)park_exit);
+	pr_info("Boot cpu %u from PARK state.\n", cpu);
+	return 0;
+}
+
+/*
+ * Install the park section for the specific cpu: zero the exit address
+ * and the magic word, copy the park code from do_cpu_park into the text
+ * area and call __flush_dcache_area() over the whole section.
+ * Always returns 0.
+ */
+static int install_cpu_park(unsigned int cpu)
+{
+	struct cpu_park_section *section =
+		(struct cpu_park_section *)cpu_park_section_v(cpu);
+	/* Everything after the header is available for the park code. */
+	unsigned long text_len = PARK_SECTION_SIZE - sizeof(*section);
+
+	pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx",
+		 cpu, (unsigned long)section,
+		 (unsigned long)(section->text));
+
+	section->exit = 0UL;
+	section->magic = 0UL;
+	memcpy((void *)section->text, do_cpu_park, text_len);
+	__flush_dcache_area((void *)section, PARK_SECTION_SIZE);
+
+	return 0;
+}
+
+/*
+ * Wipe the park section of @cpu (zero fill, then __flush_dcache_area()).
+ * Returns 0 on success, -EPERM when the park memory cannot be mapped.
+ */
+static int uninstall_cpu_park(unsigned int cpu)
+{
+	void *section;
+
+	if (mmap_cpu_park_mem())
+		return -EPERM;
+
+	section = (void *)cpu_park_section_v(cpu);
+	memset(section, 0, PARK_SECTION_SIZE);
+	__flush_dcache_area(section, PARK_SECTION_SIZE);
+
+	return 0;
+}
+
+/*
+ * Poll the magic word of @cpu's park section until it reads PARK_MAGIC,
+ * delaying 1us between polls for at most USEC_PER_SEC polls.
+ *
+ * Returns non-zero when the cpu reported the park state, 0 otherwise.
+ */
+static int cpu_wait_park(unsigned int cpu)
+{
+	long timeout = USEC_PER_SEC;
+	struct cpu_park_section *park_section;
+	volatile unsigned long *park_magic;
+	int parked;
+
+	park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+	park_magic = &park_section->magic;
+
+	while (*park_magic != PARK_MAGIC && timeout-- > 0)
+		udelay(1);
+
+	/*
+	 * Judge by the magic word itself, not by the remaining budget:
+	 * the cpu may have parked on the very last poll.
+	 */
+	parked = (*park_magic == PARK_MAGIC);
+	if (parked)
+		pr_debug("cpu %u park done.\n", cpu);
+	else
+		pr_err("cpu %u park failed.\n", cpu);
+
+	return parked;
+}
+
+/*
+ * Park the calling cpu: install the idmap and jump, through the physical
+ * address of enter_cpu_park, to the park code copied into the park
+ * section of @cpu. Does not return.
+ */
+static void cpu_park(unsigned int cpu)
+{
+	/* The park section starts with the exit slot. */
+	unsigned long exit_pa = cpu_park_section_p(cpu);
+	/* The copied park code follows the section header. */
+	unsigned long text_pa = exit_pa + sizeof(struct cpu_park_section);
+	typeof(enter_cpu_park) *enter = (void *)__pa_symbol(enter_cpu_park);
+
+	pr_debug("Go to park cpu %u exit address 0x%lx", cpu, exit_pa);
+
+	cpu_install_idmap();
+	enter(text_pa, exit_pa);
+	unreachable();
+}
+#endif
/*
* Boot a secondary CPU, and assign it the specified idle task.
@@ -102,6 +264,10 @@ static int boot_secondary(unsigned int cpu, struct task_struct *idle)
{
const struct cpu_operations *ops = get_cpu_ops(cpu);
+#ifdef CONFIG_ARM64_CPU_PARK
+ if (write_park_exit(cpu) == 0)
+ return 0;
+#endif
if (ops->cpu_boot)
return ops->cpu_boot(cpu);
@@ -131,6 +297,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
return ret;
}
+#ifdef CONFIG_ARM64_CPU_PARK
+ uninstall_cpu_park(cpu);
+#endif
/*
* CPU was successfully started, wait for it to come online or
* time out.
@@ -844,10 +1013,32 @@ void arch_irq_work_raise(void)
static void local_cpu_stop(void)
{
set_cpu_online(smp_processor_id(), false);
local_daif_mask();
sdei_mask_local_cpu();
+
+#ifdef CONFIG_ARM64_CPU_PARK
+	/*
+	 * During kexec with the park memory mapped, go to the cpu park
+	 * state. cpu_park() does not return, so nothing may follow it;
+	 * in every other case fall through to cpu_park_loop().
+	 */
+	if (kexec_in_progress && park_info.start_v) {
+		machine_kexec_mask_interrupts();
+		cpu_park(smp_processor_id());
+	}
+#endif
+
cpu_park_loop();
}
@@ -1053,6 +1244,45 @@ void smp_send_stop(void)
sdei_mask_local_cpu();
}
+#ifdef CONFIG_ARM64_CPU_PARK
+/*
+ * Ask every other online cpu to enter the park state ahead of a kexec.
+ *
+ * Returns 0 once the stop IPI has been sent and the wait for the other
+ * cpus has finished, -EPERM when called outside of kexec or when no park
+ * memory is available (the caller then uses the normal shutdown path).
+ */
+int kexec_smp_send_park(void)
+{
+	unsigned int cpu;
+	unsigned int failed = 0;
+
+	if (WARN_ON(!kexec_in_progress)) {
+		pr_crit("%s called not in kexec progress.\n", __func__);
+		return -EPERM;
+	}
+
+	if (mmap_cpu_park_mem() != 0) {
+		pr_info("no cpuparkmem, goto normal way.\n");
+		return -EPERM;
+	}
+
+	local_irq_disable();
+
+	if (num_online_cpus() > 1) {
+		cpumask_t mask;
+
+		cpumask_copy(&mask, cpu_online_mask);
+		cpumask_clear_cpu(smp_processor_id(), &mask);
+
+		/* Sections must be ready before the cpus are told to stop. */
+		for_each_cpu(cpu, &mask)
+			install_cpu_park(cpu);
+		smp_cross_call(&mask, IPI_CPU_STOP);
+
+		/* Wait for other CPUs to park */
+		for_each_cpu(cpu, &mask) {
+			if (!cpu_wait_park(cpu))
+				failed++;
+		}
+
+		if (failed)
+			pr_err("smp park: %u cpus failed to park\n", failed);
+		else
+			pr_info("smp park other cpus done\n");
+	}
+
+	sdei_mask_local_cpu();
+
+	return 0;
+}
+#endif
+
#ifdef CONFIG_KEXEC_CORE
void crash_smp_send_stop(void)
{
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 794f992..d01259c 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -236,6 +236,57 @@ static void __init fdt_enforce_memory_region(void)
memblock_add(usable_rgns[1].base, usable_rgns[1].size);
}
+#ifdef CONFIG_ARM64_CPU_PARK
+/*
+ * Global cpu park bookkeeping. Only the length has a non-zero initial
+ * value: one PARK_SECTION_SIZE section per possible cpu. start and
+ * start_v begin as 0.
+ */
+struct cpu_park_info park_info = {
+	.len = PARK_SECTION_SIZE * NR_CPUS,
+};
+
+/*
+ * Parse the "cpuparkmem=" early parameter: the value is read with
+ * memparse(), rounded up to a page boundary and stored as the physical
+ * start of the park memory. A value that yields 0 leaves park_info.start
+ * at 0. Always returns 0.
+ */
+static int __init parse_park_mem(char *p)
+{
+	if (!p)
+		return 0;
+
+	park_info.start = PAGE_ALIGN(memparse(p, NULL));
+	if (park_info.start == 0)
+		pr_warn("invalid cpu park mem params[%s]\n", p);
+
+	return 0;
+}
+early_param("cpuparkmem", parse_park_mem);
+
+/*
+ * Take the park memory out of memblock so it is not used as ordinary
+ * memory.
+ *
+ * Returns 0 when nothing was requested or the region was removed from
+ * memblock, -EINVAL (after clearing park_info.start and park_info.len)
+ * when the region is not memory or overlaps a reserved region.
+ */
+static int __init reserve_park_mem(void)
+{
+	if (park_info.start == 0 || park_info.len == 0)
+		return 0;
+
+	park_info.start = PAGE_ALIGN(park_info.start);
+	park_info.len = PAGE_ALIGN(park_info.len);
+
+	if (!memblock_is_region_memory(park_info.start, park_info.len)) {
+		pr_warn("cannot reserve park mem: region is not memory!\n");
+		goto out;
+	}
+
+	if (memblock_is_region_reserved(park_info.start, park_info.len)) {
+		pr_warn("cannot reserve park mem: region overlaps reserved memory!\n");
+		goto out;
+	}
+
+	memblock_remove(park_info.start, park_info.len);
+	/* Report in KB: the region is usually far smaller than 1 MB. */
+	pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%lu KB)\n",
+		park_info.start, park_info.start + park_info.len,
+		park_info.len >> 10);
+
+	return 0;
+out:
+	park_info.start = 0;
+	park_info.len = 0;
+	return -EINVAL;
+}
+#endif
+
void __init arm64_memblock_init(void)
{
const s64 linear_region_size = BIT(vabits_actual - 1);
@@ -357,6 +408,10 @@ void __init arm64_memblock_init(void)
reserve_crashkernel();
+#ifdef CONFIG_ARM64_CPU_PARK
+ reserve_park_mem();
+#endif
+
reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
--
2.9.5
1
1
MPAM bugfix @ 20210224
James Morse (10):
arm64/mpam: Add mpam driver discovery phase and kbuild boiler plate
cacheinfo: Provide a helper to find a cacheinfo leaf
arm64/mpam: Probe supported partid/pmg ranges from devices
arm64/mpam: Supplement MPAM MSC register layout definitions
arm64/mpam: Probe the features resctrl supports
arm64/mpam: Reset controls when CPUs come online
arm64/mpam: Summarize feature support during mpam_enable()
arm64/mpam: resctrl: Re-synchronise resctrl's view of online CPUs
drivers: base: cacheinfo: Add helper to search cacheinfo by of_node
arm64/mpam: Enabling registering and logging error interrupts
Wang ShaoBo (55):
arm64/mpam: Preparing for MPAM refactoring
arm64/mpam: Add helper for getting mpam sysprops
arm64/mpam: Allocate mpam component configuration arrays
arm64/mpam: Pick MPAM resources and events for resctrl_res exported
arm64/mpam: Init resctrl resources' info from resctrl_res selected
arm64/mpam: resctrl: Handle cpuhp and resctrl_dom allocation
arm64/mpam: Implement helpers for handling configuration and
monitoring
arm64/mpam: Migrate old MSCs' discovery process to new branch
arm64/mpam: Add helper for getting MSCs' configuration
arm64/mpam: Probe partid,pmg and feature capabilities' ranges from
classes
arm64/mpam: resctrl: Rebuild configuration and monitoring pipeline
arm64/mpam: resctrl: Append schemata CDP definitions
arm64/mpam: resctrl: Supplement cdpl2,cdpl3 for mount options
arm64/mpam: resctrl: Add helpers for init and destroy schemata list
arm64/mpam: resctrl: Use resctrl_group_init_alloc() to init schema
list
arm64/mpam: resctrl: Write and read schemata by schema_list
arm64/mpam: Support cdp in mpam_sched_in()
arm64/mpam: resctrl: Update resources reset process
arm64/mpam: resctrl: Update closid alloc and free process with bitmap
arm64/mpam: resctrl: Move ctrlmon sysfile write/read function to
mpam_ctrlmon.c
arm64/mpam: Support cdp on allocating monitors
arm64/mpam: resctrl: Support cdp on monitoring data
arm64/mpam: Clean up header files and rearrange declarations
arm64/mpam: resctrl: Remove ctrlmon sysfile
arm64/mpam: resctrl: Remove unnecessary CONFIG_ARM64
arm64/mpam: Implement intpartid narrowing process
arm64/mpam: Using software-defined id for rdtgroup instead of 32-bit
integer
arm64/mpam: resctrl: collect child mon group's monitor data
arm64/mpam: resctrl: Support cpus' monitoring for mon group
arm64/mpam: resctrl: Support priority and hardlimit(Memory bandwidth)
configuration
arm64/mpam: Store intpri and dspri for mpam device reset
arm64/mpam: Squash default priority from mpam device to class
arm64/mpam: Restore extend ctrls' max width for checking schemata
input
arm64/mpam: Re-plan intpartid narrowing process
arm64/mpam: Add hook-events id for ctrl features
arm64/mpam: Integrate monitor data for Memory Bandwidth if cdp enabled
arm64/mpam: Fix MPAM_ESR intPARTID_range error
arm64/mpam: Separate internal and downstream priority event
arm64/mpam: Remap reqpartid,pmg to rmid and intpartid to closid
arm64/mpam: Add wait queue for monitor alloc and free
arm64/mpam: Add resctrl_ctrl_feature structure to manage ctrl features
arm64/mpam: resctrl: Export resource's properties to info directory
arm64/mpam: Split header files into suitable location
arm64/mpam: resctrl: Add rmid file in resctrl sysfs
arm64/mpam: Filter schema control type with ctrl features
arm64/mpam: Simplify mpamid cdp mapping process
arm64/mpam: Set per-cpu's closid to none zero for cdp
ACPI/MPAM: Use acpi_map_pxm_to_node() to get node id for memory node
arm64/mpam: Supplement additional useful ctrl features for mount
options
arm64/mpam: resctrl: Add proper error handling to resctrl_mount()
arm64/mpam: resctrl: Use resctrl_group_init_alloc() for default group
arm64/mpam: resctrl: Allow setting register MPAMCFG_MBW_MIN to 0
arm64/mpam: resctrl: Refresh cpu mask for handling cpuhp
arm64/mpam: Sort domains when cpu online
arm64/mpam: Fix compile warning
arch/arm64/include/asm/mpam.h | 324 +---
arch/arm64/include/asm/mpam_resource.h | 129 --
arch/arm64/include/asm/mpam_sched.h | 8 -
arch/arm64/include/asm/resctrl.h | 514 +++++-
arch/arm64/kernel/Makefile | 2 +-
arch/arm64/kernel/mpam.c | 1499 ----------------
arch/arm64/kernel/mpam/Makefile | 3 +
arch/arm64/kernel/mpam/mpam_ctrlmon.c | 961 ++++++++++
arch/arm64/kernel/mpam/mpam_device.c | 1706 ++++++++++++++++++
arch/arm64/kernel/mpam/mpam_device.h | 140 ++
arch/arm64/kernel/mpam/mpam_internal.h | 345 ++++
arch/arm64/kernel/mpam/mpam_mon.c | 334 ++++
arch/arm64/kernel/mpam/mpam_resctrl.c | 2240 ++++++++++++++++++++++++
arch/arm64/kernel/mpam/mpam_resource.h | 228 +++
arch/arm64/kernel/mpam/mpam_setup.c | 608 +++++++
arch/arm64/kernel/mpam_ctrlmon.c | 623 -------
arch/arm64/kernel/mpam_mon.c | 124 --
drivers/acpi/arm64/mpam.c | 87 +-
drivers/base/cacheinfo.c | 38 +
fs/resctrlfs.c | 396 +++--
include/linux/arm_mpam.h | 118 ++
include/linux/cacheinfo.h | 36 +
include/linux/resctrlfs.h | 30 -
23 files changed, 7521 insertions(+), 2972 deletions(-)
delete mode 100644 arch/arm64/include/asm/mpam_resource.h
delete mode 100644 arch/arm64/kernel/mpam.c
create mode 100644 arch/arm64/kernel/mpam/Makefile
create mode 100644 arch/arm64/kernel/mpam/mpam_ctrlmon.c
create mode 100644 arch/arm64/kernel/mpam/mpam_device.c
create mode 100644 arch/arm64/kernel/mpam/mpam_device.h
create mode 100644 arch/arm64/kernel/mpam/mpam_internal.h
create mode 100644 arch/arm64/kernel/mpam/mpam_mon.c
create mode 100644 arch/arm64/kernel/mpam/mpam_resctrl.c
create mode 100644 arch/arm64/kernel/mpam/mpam_resource.h
create mode 100644 arch/arm64/kernel/mpam/mpam_setup.c
delete mode 100644 arch/arm64/kernel/mpam_ctrlmon.c
delete mode 100644 arch/arm64/kernel/mpam_mon.c
create mode 100644 include/linux/arm_mpam.h
--
2.25.1
1
65

23 Feb '21
From: Li ZhiGang <lizhigang(a)kylinos.cn>
Nationz Tech TCM chips are used for trusted computing; the chip is attached via SPI or LPC.
We briefly verified and tested this driver on a KunPeng920 + openEuler system, using an externally compiled module.
Signed-off-by: Li ZhiGang <lizhigang(a)kylinos.cn>
---
drivers/staging/Kconfig | 2 +
drivers/staging/Makefile | 2 +
drivers/staging/gmjstcm/Kconfig | 21 +
drivers/staging/gmjstcm/Makefile | 5 +
drivers/staging/gmjstcm/tcm.c | 949 ++++++++++++++++++++++++++
drivers/staging/gmjstcm/tcm.h | 123 ++++
drivers/staging/gmjstcm/tcm_tis_spi.c | 868 +++++++++++++++++++++++
7 files changed, 1970 insertions(+)
create mode 100644 drivers/staging/gmjstcm/Kconfig
create mode 100644 drivers/staging/gmjstcm/Makefile
create mode 100644 drivers/staging/gmjstcm/tcm.c
create mode 100644 drivers/staging/gmjstcm/tcm.h
create mode 100644 drivers/staging/gmjstcm/tcm_tis_spi.c
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 1abf76be2aa8..d51fa4f4e7ca 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -126,4 +126,6 @@ source "drivers/staging/axis-fifo/Kconfig"
source "drivers/staging/erofs/Kconfig"
+source "drivers/staging/gmjstcm/Kconfig"
+
endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index ab0cbe8815b1..6d41915dad5b 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -53,3 +53,5 @@ obj-$(CONFIG_SOC_MT7621) += mt7621-dts/
obj-$(CONFIG_STAGING_GASKET_FRAMEWORK) += gasket/
obj-$(CONFIG_XIL_AXIS_FIFO) += axis-fifo/
obj-$(CONFIG_EROFS_FS) += erofs/
+obj-$(CONFIG_GMJS_TCM) += gmjstcm/
+
diff --git a/drivers/staging/gmjstcm/Kconfig b/drivers/staging/gmjstcm/Kconfig
new file mode 100644
index 000000000000..5b5397ae1832
--- /dev/null
+++ b/drivers/staging/gmjstcm/Kconfig
@@ -0,0 +1,21 @@
+menu "GMJS TCM support"
+
+config GMJS_TCM
+ bool
+
+config GMJS_TCM_CORE
+ tristate "GMJS TCM core support"
+ depends on ARM64 || MIPS
+ default m
+ select GMJS_TCM
+ help
+ GMJS TCM core support.
+
+config GMJS_TCM_SPI
+ tristate "GMJS TCM support on SPI interface"
+ depends on GMJS_TCM_CORE && SPI_MASTER
+ default m
+ help
+ GMJS TCM support on SPI interface.
+
+endmenu
diff --git a/drivers/staging/gmjstcm/Makefile b/drivers/staging/gmjstcm/Makefile
new file mode 100644
index 000000000000..601c78e44793
--- /dev/null
+++ b/drivers/staging/gmjstcm/Makefile
@@ -0,0 +1,5 @@
+
+obj-$(CONFIG_GMJS_TCM_CORE) += tcm_core.o
+tcm_core-objs := tcm.o
+obj-$(CONFIG_GMJS_TCM_SPI) += tcm_tis_spi.o
+
diff --git a/drivers/staging/gmjstcm/tcm.c b/drivers/staging/gmjstcm/tcm.c
new file mode 100644
index 000000000000..5c41bfa8b423
--- /dev/null
+++ b/drivers/staging/gmjstcm/tcm.c
@@ -0,0 +1,949 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2009 Nationz Technologies Inc.
+ *
+ * Description: Export symbols for the tcm_tis module
+ *
+ * Major Function: public write read register function etc.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/poll.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include "tcm.h"
+
+/*
+ * const var
+ */
+enum tcm_const {
+ TCM_MINOR = 224, /* officially assigned */
+ TCM_BUFSIZE = 2048, /* Buffer Size */
+ TCM_NUM_DEVICES = 256, /* Max supporting tcm device number */
+};
+
+/*
+ * CMD duration
+ */
+enum tcm_duration {
+ TCM_SHORT = 0,
+ TCM_MEDIUM = 1,
+ TCM_LONG = 2,
+ TCM_UNDEFINED,
+};
+
+/* Max Total of Command Number */
+#define TCM_MAX_ORDINAL 88 /*243*/
+
+static LIST_HEAD(tcm_chip_list);
+static DEFINE_SPINLOCK(driver_lock); /* spin lock */
+static DECLARE_BITMAP(dev_mask, TCM_NUM_DEVICES);
+
+typedef struct tagTCM_Command {
+ u8 ordinal;
+ u8 DURATION;
+} TCM_Command;
+
+static const TCM_Command TCM_Command_List[TCM_MAX_ORDINAL + 1] = {
+ {/*TCM_ORD_ActivateIdentity, */122, 1},
+ {/*TCM_ORD_CertifyKey, */50, 1},
+ {/*TCM_ORD_CertifyKeyM, */51, 1},
+ {/*TCM_ORD_ChangeAuth, */12, 1},
+ {/*TCM_ORD_ChangeAuthOwner, */16, 0},
+ {/*TCM_ORD_ContinueSelfTeSt, */83, 2},
+ {/*TCM_ORD_CreateCounter, */220, 0},
+ {/*TCM_ORD_CreateWrapKey, */31, 2},
+ {/*TCM_ORD_DiSableForceClear, */94, 0},
+ {/*TCM_ORD_DiSableOwnerClear, */92, 0},
+ {/*TCM_ORD_EStabliShTranSport, */230, 0},
+ {/*TCM_ORD_ExecuteTranSport, */231, 2},
+ {/*TCM_ORD_Extend, */20, 0},
+ {/*TCM_ORD_FieldUpgrade, */170, 2},
+ {/*TCM_ORD_FluShSpecific, */186, 0},
+ {/*TCM_ORD_ForceClear, */93, 0},
+ {/*TCM_ORD_GetAuditDigeSt, */133, 0},
+ {/*TCM_ORD_GetAuditDigeStSigned, */134, 1},
+ {/*TCM_ORD_GetCapability, */101, 0},
+ {/*TCM_ORD_GetPubKey, */33, 0},
+ {/*TCM_ORD_GetRandoM, */70, 0},
+ {/*TCM_ORD_GetTeStReSult, */84, 0},
+ {/*TCM_ORD_GetTickS, */241, 0},
+ {/*TCM_ORD_IncreMentCounter, */221, 0},
+ {/*TCM_ORD_LoadContext, */185, 1},
+ {/*TCM_ORD_MakeIdentity, */121, 2},
+ {/*TCM_ORD_NV_DefineSpace, */204, 0},
+ {/*TCM_ORD_NV_ReadValue, */207, 0},
+ {/*TCM_ORD_NV_ReadValueAuth, */208, 0},
+ {/*TCM_ORD_NV_WriteValue, */205, 0},
+ {/*TCM_ORD_NV_WriteValueAuth, */206, 0},
+ {/*TCM_ORD_OwnerClear, */91, 0},
+ {/*TCM_ORD_OwnerReadInternalPub, */129, 0},
+ {/*TCM_ORD_OwnerSetDiSable, */110, 0},
+ {/*TCM_ORD_PCR_ReSet, */200, 0},
+ {/*TCM_ORD_PcrRead, */21, 0},
+ {/*TCM_ORD_PhySicalDiSable, */112, 0},
+ {/*TCM_ORD_PhySicalEnable, */111, 0},
+ {/*TCM_ORD_PhySicalSetDeactivated, */114, 0},
+ {/*TCM_ORD_Quote, */22, 1},
+ {/*TCM_ORD_QuoteM, */62, 1},
+ {/*TCM_ORD_ReadCounter, */222, 0},
+ {/*TCM_ORD_ReadPubek, */124, 0},
+ {/*TCM_ORD_ReleaSeCounter, */223, 0},
+ {/*TCM_ORD_ReleaSeCounterOwner, */224, 0},
+ {/*TCM_ORD_ReleaSeTranSportSigned, */232, 1},
+ {/*TCM_ORD_ReSetLockValue, */64, 0},
+ {/*TCM_ORD_RevokeTruSt, */128, 0},
+ {/*TCM_ORD_SaveContext, */184, 1},
+ {/*TCM_ORD_SaveState, */152, 1},
+ {/*TCM_ORD_Seal, */23, 1},
+ {/*TCM_ORD_Sealx, */61, 1},
+ {/*TCM_ORD_SelfTeStFull, */80, 2},
+ {/*TCM_ORD_SetCapability, */63, 0},
+ {/*TCM_ORD_SetOperatorAuth, */116, 0},
+ {/*TCM_ORD_SetOrdinalAuditStatuS, */141, 0},
+ {/*TCM_ORD_SetOwnerInStall, */113, 0},
+ {/*TCM_ORD_SetTeMpDeactivated, */115, 0},
+ {/*TCM_ORD_Sign, */60, 1},
+ {/*TCM_ORD_Startup, */153, 0},
+ {/*TCM_ORD_TakeOwnerShip, */13, 1},
+ {/*TCM_ORD_TickStaMpBlob, */242, 1},
+ {/*TCM_ORD_UnSeal, */24, 1},
+ {/*TSC_ORD_PhySicalPreSence, */10, 0},
+ {/*TSC_ORD_ReSetEStabliShMentBit, */11, 0},
+ {/*TCM_ORD_WrapKey, */189, 2},
+ {/*TCM_ORD_APcreate, */191, 0},
+ {/*TCM_ORD_APTerMinate, */192, 0},
+ {/*TCM_ORD_CreateMigratedBlob, */193, 1},
+ {/*TCM_ORD_ConvertMigratedBlob, */194, 1},
+ {/*TCM_ORD_AuthorizeMigrationKey, */195, 0},
+ {/*TCM_ORD_SMS4Encrypt, */197, 1},
+ {/*TCM_ORD_SMS4Decrypt, */198, 1},
+ {/*TCM_ORD_ReadEKCert, */199, 1},
+ {/*TCM_ORD_WriteEKCert, */233, 1},
+ {/*TCM_ORD_SCHStart, */234, 0},
+ {/*TCM_ORD_SCHUpdata, */235, 0},
+ {/*TCM_ORD_SCHCoMplete, */236, 0},
+ {/*TCM_ORD_SCHCoMpleteExtend, */237, 0},
+ {/*TCM_ORD_ECCDecrypt, */238, 1},
+ {/*TCM_ORD_LoadKey, */239, 1},
+ {/*TCM_ORD_CreateEndorSeMentKeyPair, */120, 2},
+ {/*TCM_ORD_CreateRevocableEK, */127, 2},
+ {/*TCM_ORD_ReleaSeECCExchangeSeSSion, */174, 1},
+ {/*TCM_ORD_CreateECCExchangeSeSSion, */175, 1},
+ {/*TCM_ORD_GetKeyECCExchangeSeSSion, */176, 1},
+ {/*TCM_ORD_ActivatePEK, */217, 1},
+ {/*TCM_ORD_ActivatePEKCert, */218, 1},
+ {0, 0}
+};
+
+/* Timer callback of user_read_timer: hand the work over to chip->work. */
+static void user_reader_timeout(struct timer_list *t)
+{
+	struct tcm_chip *owner = from_timer(owner, t, user_read_timer);
+
+	schedule_work(&owner->work);
+}
+
+/*
+ * Work handler: under buffer_mutex, mark that no data is pending and
+ * zero the chip's data buffer.
+ */
+static void timeout_work(struct work_struct *work)
+{
+	struct tcm_chip *owner;
+
+	owner = container_of(work, struct tcm_chip, work);
+
+	mutex_lock(&owner->buffer_mutex);
+	atomic_set(&owner->data_pending, 0);
+	memset(owner->data_buffer, 0, TCM_BUFSIZE);
+	mutex_unlock(&owner->buffer_mutex);
+}
+
+/*
+ * Look up how long the command @ordinal may take.
+ *
+ * Returns the chip's duration for the command's class (short, medium or
+ * long) when the command is listed in TCM_Command_List and that duration
+ * is positive, otherwise a default of two minutes (2 * 60 * HZ).
+ */
+unsigned long tcm_calc_ordinal_duration(struct tcm_chip *chip,
+					u32 ordinal)
+{
+	int duration_idx = TCM_UNDEFINED;
+	int duration = 0;
+	/*
+	 * The table stores only the low byte of each ordinal, while the
+	 * command templates in this file carry ordinals such as 0x8065
+	 * (bytes 0, 0, 128, 101), so compare the low byte only.
+	 */
+	u8 cmd = ordinal & 0xff;
+	int i;
+
+	for (i = 0; i < TCM_MAX_ORDINAL; i++) {
+		if (cmd == TCM_Command_List[i].ordinal) {
+			duration_idx = TCM_Command_List[i].DURATION;
+			break;
+		}
+	}
+
+	if (duration_idx != TCM_UNDEFINED)
+		duration = chip->vendor.duration[duration_idx];
+	if (duration <= 0)
+		return 2 * 60 * HZ;
+
+	return duration;
+}
+EXPORT_SYMBOL_GPL(tcm_calc_ordinal_duration);
+
+/*
+ * Internal kernel interface to transmit TCM commands
+ * buff format: TAG(2 bytes) + Total Size(4 bytes) +
+ * Command Ordinal(4 bytes) + ......
+ *
+ * Sends the command held in @buf, polls the chip status until completion
+ * unless the chip has an irq, and then receives the response back into
+ * @buf (at most @bufsiz bytes).
+ *
+ * Returns the result of the vendor recv hook, or a negative errno:
+ * -EINVAL when @bufsiz cannot hold the command header, -ENODATA when the
+ * size field is 0, -E2BIG when the size field exceeds @bufsiz,
+ * -ECANCELED when the operation is canceled, -ETIME on timeout, or the
+ * error returned by the vendor send hook.
+ */
+static ssize_t tcm_transmit(struct tcm_chip *chip, const char *buf,
+			    size_t bufsiz)
+{
+	/* TAG(2) + Total Size(4) + Command Ordinal(4) */
+	const size_t hdr_len = 2 + 4 + 4;
+	ssize_t rc = 0;
+	u32 count = 0, ordinal = 0;
+	unsigned long stop = 0;
+
+	/* The header fields below must lie inside the caller's buffer. */
+	if (bufsiz < hdr_len)
+		return -EINVAL;
+
+	count = be32_to_cpu(*((__be32 *)(buf + 2))); /* buff size */
+	ordinal = be32_to_cpu(*((__be32 *)(buf + 6))); /* command ordinal */
+
+	if (count == 0)
+		return -ENODATA;
+	if (count > bufsiz) { /* buff size err ,invalid buff stream */
+		dev_err(chip->dev, "invalid count value %x, %zx\n",
+			count, bufsiz);
+		return -E2BIG;
+	}
+
+	mutex_lock(&chip->tcm_mutex); /* enter mutex */
+
+	rc = chip->vendor.send(chip, (u8 *)buf, count);
+	if (rc < 0) {
+		dev_err(chip->dev, "%s: tcm_send: error %zd\n",
+			__func__, rc);
+		goto out;
+	}
+
+	/* With an irq there is no status polling before the receive. */
+	if (chip->vendor.irq)
+		goto out_recv;
+
+	stop = jiffies + tcm_calc_ordinal_duration(chip,
+			ordinal); /* cmd duration */
+	do {
+		u8 status = chip->vendor.status(chip);
+
+		if ((status & chip->vendor.req_complete_mask) ==
+		    chip->vendor.req_complete_val)
+			goto out_recv;
+
+		if (status == chip->vendor.req_canceled) {
+			dev_err(chip->dev, "Operation Canceled\n");
+			rc = -ECANCELED;
+			goto out;
+		}
+
+		msleep(TCM_TIMEOUT); /* CHECK */
+		rmb();
+	} while (time_before(jiffies, stop));
+	/* time out */
+	chip->vendor.cancel(chip);
+	dev_err(chip->dev, "Operation Timed out\n");
+	rc = -ETIME;
+	goto out;
+
+out_recv:
+	/* The response overwrites the command in the caller's buffer. */
+	rc = chip->vendor.recv(chip, (u8 *)buf, bufsiz);
+	if (rc < 0)
+		dev_err(chip->dev, "%s: tcm_recv: error %zd\n",
+			__func__, rc);
+out:
+	mutex_unlock(&chip->tcm_mutex);
+	return rc;
+}
+
+#define TCM_DIGEST_SIZE 32
+#define TCM_ERROR_SIZE 10
+#define TCM_RET_CODE_IDX 6
+#define TCM_GET_CAP_RET_SIZE_IDX 10
+#define TCM_GET_CAP_RET_UINT32_1_IDX 14
+#define TCM_GET_CAP_RET_UINT32_2_IDX 18
+#define TCM_GET_CAP_RET_UINT32_3_IDX 22
+#define TCM_GET_CAP_RET_UINT32_4_IDX 26
+#define TCM_GET_CAP_PERM_DISABLE_IDX 16
+#define TCM_GET_CAP_PERM_INACTIVE_IDX 18
+#define TCM_GET_CAP_RET_BOOL_1_IDX 14
+#define TCM_GET_CAP_TEMP_INACTIVE_IDX 16
+
+#define TCM_CAP_IDX 13
+#define TCM_CAP_SUBCAP_IDX 21
+
+enum tcm_capabilities {
+ TCM_CAP_FLAG = 4,
+ TCM_CAP_PROP = 5,
+};
+
+enum tcm_sub_capabilities {
+ TCM_CAP_PROP_PCR = 0x1, /* tcm 0x101 */
+ TCM_CAP_PROP_MANUFACTURER = 0x3, /* tcm 0x103 */
+ TCM_CAP_FLAG_PERM = 0x8, /* tcm 0x108 */
+ TCM_CAP_FLAG_VOL = 0x9, /* tcm 0x109 */
+ TCM_CAP_PROP_OWNER = 0x11, /* tcm 0x111 */
+ TCM_CAP_PROP_TIS_TIMEOUT = 0x15, /* tcm 0x115 */
+ TCM_CAP_PROP_TIS_DURATION = 0x20, /* tcm 0x120 */
+};
+
+/*
+ * This is a semi generic GetCapability command for use
+ * with the capability type TCM_CAP_PROP or TCM_CAP_FLAG
+ * and their associated sub_capabilities.
+ */
+
+static const u8 tcm_cap[] = {
+ 0, 193, /* TCM_TAG_RQU_COMMAND 0xc1*/
+ 0, 0, 0, 22, /* length */
+ 0, 0, 128, 101, /* TCM_ORD_GetCapability */
+ 0, 0, 0, 0, /* TCM_CAP_<TYPE> */
+ 0, 0, 0, 4, /* TCM_CAP_SUB_<TYPE> size */
+ 0, 0, 1, 0 /* TCM_CAP_SUB_<TYPE> */
+};
+
+/*
+ * Send the command in @data through tcm_transmit().
+ *
+ * Returns a negative errno when the transfer fails, the return code
+ * carried in the response when the response is exactly TCM_ERROR_SIZE
+ * bytes long, and 0 otherwise. @desc only decorates the debug message.
+ */
+static ssize_t transmit_cmd(struct tcm_chip *chip, u8 *data, int len,
+			    char *desc)
+{
+	int rc = tcm_transmit(chip, data, len);
+
+	if (rc < 0)
+		return rc;
+	if (rc != TCM_ERROR_SIZE)
+		return 0;
+
+	/* An error-sized response holds the return code in its header. */
+	rc = be32_to_cpu(*((__be32 *)(data + TCM_RET_CODE_IDX)));
+	dev_dbg(chip->dev, "A TCM error (%d) occurred %s\n", rc, desc);
+	return rc;
+}
+
+/* Read the big-endian 32-bit field at byte offset @idx of @data. */
+static inline u32 tcm_resp_be32(const u8 *data, int idx)
+{
+	return be32_to_cpu(*((const __be32 *)(data + idx)));
+}
+
+/*
+ * Get the default timeout values from the tcm by GetCapability with the
+ * TCM_CAP_PROP_TIS_TIMEOUT prop, then the command durations with the
+ * TCM_CAP_PROP_TIS_DURATION prop. Values are converted from milliseconds
+ * to jiffies; a chip value is only stored when its query succeeds and
+ * the reply has the expected size.
+ */
+void tcm_get_timeouts(struct tcm_chip *chip)
+{
+	u8 data[max_t(int, ARRAY_SIZE(tcm_cap), 30)];
+	u32 ms;
+
+	memcpy(data, tcm_cap, sizeof(tcm_cap));
+	data[TCM_CAP_IDX] = TCM_CAP_PROP;
+	data[TCM_CAP_SUBCAP_IDX] = TCM_CAP_PROP_TIS_TIMEOUT;
+
+	if (transmit_cmd(chip, data, sizeof(data),
+			 "attempting to determine the timeouts") == 0 &&
+	    tcm_resp_be32(data, TCM_GET_CAP_RET_SIZE_IDX) == 4 * sizeof(u32)) {
+		/* Don't overwrite default if value is 0 */
+		ms = tcm_resp_be32(data, TCM_GET_CAP_RET_UINT32_1_IDX);
+		if (ms)
+			chip->vendor.timeout_a = msecs_to_jiffies(ms);
+		ms = tcm_resp_be32(data, TCM_GET_CAP_RET_UINT32_2_IDX);
+		if (ms)
+			chip->vendor.timeout_b = msecs_to_jiffies(ms);
+		ms = tcm_resp_be32(data, TCM_GET_CAP_RET_UINT32_3_IDX);
+		if (ms)
+			chip->vendor.timeout_c = msecs_to_jiffies(ms);
+		ms = tcm_resp_be32(data, TCM_GET_CAP_RET_UINT32_4_IDX);
+		if (ms)
+			chip->vendor.timeout_d = msecs_to_jiffies(ms);
+	}
+
+	/* The durations are queried whether or not the timeouts were read. */
+	memcpy(data, tcm_cap, sizeof(tcm_cap));
+	data[TCM_CAP_IDX] = TCM_CAP_PROP;
+	data[TCM_CAP_SUBCAP_IDX] = TCM_CAP_PROP_TIS_DURATION;
+
+	if (transmit_cmd(chip, data, sizeof(data),
+			 "attempting to determine the durations"))
+		return;
+
+	if (tcm_resp_be32(data, TCM_GET_CAP_RET_SIZE_IDX) != 3 * sizeof(u32))
+		return;
+
+	chip->vendor.duration[TCM_SHORT] = msecs_to_jiffies(
+		tcm_resp_be32(data, TCM_GET_CAP_RET_UINT32_1_IDX));
+	chip->vendor.duration[TCM_MEDIUM] = msecs_to_jiffies(
+		tcm_resp_be32(data, TCM_GET_CAP_RET_UINT32_2_IDX));
+	chip->vendor.duration[TCM_LONG] = msecs_to_jiffies(
+		tcm_resp_be32(data, TCM_GET_CAP_RET_UINT32_3_IDX));
+}
+EXPORT_SYMBOL_GPL(tcm_get_timeouts);
+
+/*
+ * sysfs show helper: query the permanent flags (TCM_CAP_FLAG_PERM) and
+ * write "disable" or "enable" to @buf depending on the disable byte of
+ * the reply.
+ *
+ * Returns the number of bytes written, 0 when the query fails, or
+ * -ENODEV when @dev has no chip attached.
+ */
+ssize_t tcm_show_enabled(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	u8 data[max_t(int, ARRAY_SIZE(tcm_cap), 35)];
+	ssize_t rc = 0;
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+
+	if (chip == NULL)
+		return -ENODEV;
+
+	memcpy(data, tcm_cap, sizeof(tcm_cap));
+	data[TCM_CAP_IDX] = TCM_CAP_FLAG;
+	data[TCM_CAP_SUBCAP_IDX] = TCM_CAP_FLAG_PERM;
+
+	rc = transmit_cmd(chip, data, sizeof(data),
+			  "attempting to determine the permanent state");
+	if (rc)
+		return 0;
+	if (data[TCM_GET_CAP_PERM_DISABLE_IDX])
+		return sprintf(buf, "disable\n");
+	else
+		return sprintf(buf, "enable\n");
+}
+EXPORT_SYMBOL_GPL(tcm_show_enabled);
+
+/*
+ * sysfs show helper: query the permanent flags (TCM_CAP_FLAG_PERM) and
+ * write "deactivated" or "activated" to @buf depending on the inactive
+ * byte of the reply.
+ *
+ * Returns the number of bytes written, 0 when the query fails, or
+ * -ENODEV when @dev has no chip attached.
+ */
+ssize_t tcm_show_active(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	u8 data[max_t(int, ARRAY_SIZE(tcm_cap), 35)];
+	ssize_t rc = 0;
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+
+	if (chip == NULL)
+		return -ENODEV;
+
+	memcpy(data, tcm_cap, sizeof(tcm_cap));
+	data[TCM_CAP_IDX] = TCM_CAP_FLAG;
+	data[TCM_CAP_SUBCAP_IDX] = TCM_CAP_FLAG_PERM;
+
+	rc = transmit_cmd(chip, data, sizeof(data),
+			  "attempting to determine the permanent state");
+	if (rc)
+		return 0;
+	if (data[TCM_GET_CAP_PERM_INACTIVE_IDX])
+		return sprintf(buf, "deactivated\n");
+	else
+		return sprintf(buf, "activated\n");
+}
+EXPORT_SYMBOL_GPL(tcm_show_active);
+
+/*
+ * sysfs show helper: query the owner property (TCM_CAP_PROP_OWNER) and
+ * report in @buf whether an owner is installed.
+ *
+ * Returns the number of bytes written, 0 when the query fails, or
+ * -ENODEV when @dev has no chip attached.
+ */
+ssize_t tcm_show_owned(struct device *dev, struct device_attribute *attr,
+		       char *buf)
+{
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+	u8 data[sizeof(tcm_cap)];
+
+	if (!chip)
+		return -ENODEV;
+
+	memcpy(data, tcm_cap, sizeof(tcm_cap));
+	data[TCM_CAP_IDX] = TCM_CAP_PROP;
+	data[TCM_CAP_SUBCAP_IDX] = TCM_CAP_PROP_OWNER;
+
+	if (transmit_cmd(chip, data, sizeof(data),
+			 "attempting to determine the owner state") != 0)
+		return 0;
+
+	return sprintf(buf, "%s\n", data[TCM_GET_CAP_RET_BOOL_1_IDX] ?
+		       "Owner installed" : "Owner have not installed");
+}
+EXPORT_SYMBOL_GPL(tcm_show_owned);
+
+/*
+ * sysfs show handler: sends the tcm_cap command template with the
+ * TCM_CAP_FLAG / TCM_CAP_FLAG_VOL selectors and reports "Temp deactivated"
+ * or "activated" depending on the byte at TCM_GET_CAP_TEMP_INACTIVE_IDX.
+ *
+ * Returns -ENODEV when no chip is bound to @dev, 0 when the command fails,
+ * otherwise the number of characters written to @buf.
+ */
+ssize_t tcm_show_temp_deactivated(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+	u8 data[sizeof(tcm_cap)];
+
+	if (!chip)
+		return -ENODEV;
+
+	/* build the request in place; the response lands in the same buffer */
+	memcpy(data, tcm_cap, sizeof(tcm_cap));
+	data[TCM_CAP_IDX] = TCM_CAP_FLAG;
+	data[TCM_CAP_SUBCAP_IDX] = TCM_CAP_FLAG_VOL;
+
+	if (transmit_cmd(chip, data, sizeof(data),
+			 "attempting to determine the temporary state"))
+		return 0;
+
+	return data[TCM_GET_CAP_TEMP_INACTIVE_IDX] ?
+		sprintf(buf, "Temp deactivated\n") :
+		sprintf(buf, "activated\n");
+}
+EXPORT_SYMBOL_GPL(tcm_show_temp_deactivated);
+
+/* Command template for reading one PCR; the index is patched in at offset 10. */
+static const u8 pcrread[] = {
+	0, 193,			/* TCM_TAG_RQU_COMMAND */
+	0, 0, 0, 14,		/* length */
+	0, 0, 128, 21,		/* TCM_ORD_PcrRead */
+	0, 0, 0, 0		/* PCR index */
+};
+
+/*
+ * sysfs show handler: asks the chip how many PCRs it has, then reads each one
+ * and prints it as "PCR-NN: " followed by TCM_DIGEST_SIZE hex bytes.
+ *
+ * The PCR count comes from the device, so all output is bounded to the
+ * PAGE_SIZE buffer that sysfs hands to show() callbacks; printing stops once
+ * the page is full instead of running past the end of @buf.
+ *
+ * Returns -ENODEV when no chip is bound to @dev, 0 when the PCR count cannot
+ * be read, otherwise the number of characters written to @buf (possibly a
+ * partial listing if a later PCR read fails).
+ */
+ssize_t tcm_show_pcrs(struct device *dev, struct device_attribute *attr,
+		      char *buf)
+{
+	u8 data[1024];
+	ssize_t rc = 0;
+	int i = 0, j = 0, num_pcrs = 0;
+	__be32 index = 0;
+	char *str = buf;
+	char *end = buf + PAGE_SIZE;
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+
+	if (chip == NULL)
+		return -ENODEV;
+
+	memcpy(data, tcm_cap, sizeof(tcm_cap));
+	data[TCM_CAP_IDX] = TCM_CAP_PROP;
+	data[TCM_CAP_SUBCAP_IDX] = TCM_CAP_PROP_PCR;
+
+	rc = transmit_cmd(chip, data, sizeof(data),
+			  "attempting to determine the number of PCRS");
+	if (rc)
+		return 0;
+
+	num_pcrs = be32_to_cpu(*((__be32 *)(data + 14)));
+	for (i = 0; i < num_pcrs; i++) {
+		/* scnprintf() keeps one byte for the NUL; stop when full */
+		if (end - str <= 1)
+			break;
+
+		memcpy(data, pcrread, sizeof(pcrread));
+		index = cpu_to_be32(i);
+		memcpy(data + 10, &index, sizeof(index));
+		rc = transmit_cmd(chip, data, sizeof(data),
+				  "attempting to read a PCR");
+		if (rc)
+			goto out;
+		str += scnprintf(str, end - str, "PCR-%02d: ", i);
+		for (j = 0; j < TCM_DIGEST_SIZE; j++)
+			str += scnprintf(str, end - str, "%02X ",
+					 *(data + 10 + j));
+		str += scnprintf(str, end - str, "\n");
+		memset(data, 0, sizeof(data));
+	}
+out:
+	return str - buf;
+}
+EXPORT_SYMBOL_GPL(tcm_show_pcrs);
+
+#define READ_PUBEK_RESULT_SIZE 128
+/* Command template for reading the PUBEK; the nonce field is left zeroed. */
+static const u8 readpubek[] = {
+	0, 193,				/* TCM_TAG_RQU_COMMAND */
+	0, 0, 0, 42,			/* length */
+	0, 0, 128, 124,			/* TCM_ORD_ReadPubek */
+	0, 0, 0, 0, 0, 0, 0, 0,		/* NONCE */
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * sysfs show handler: sends the readpubek command and hex-dumps 65 response
+ * bytes starting at offset 10, sixteen per line, under a "PUBEK:" heading.
+ *
+ * Returns -ENODEV when no chip is bound to @dev, 0 when the command fails,
+ * otherwise the number of characters written to @buf.
+ */
+ssize_t tcm_show_pubek(struct device *dev, struct device_attribute *attr,
+		       char *buf)
+{
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+	u8 data[READ_PUBEK_RESULT_SIZE] = {0};
+	char *pos = buf;
+	int n;
+
+	if (!chip)
+		return -ENODEV;
+
+	memcpy(data, readpubek, sizeof(readpubek));
+
+	if (transmit_cmd(chip, data, sizeof(data),
+			 "attempting to read the PUBEK"))
+		return 0;
+
+	pos += sprintf(pos, "PUBEK:");
+	for (n = 0; n < 65; n++) {
+		/* line break before every group of sixteen bytes */
+		if (n % 16 == 0)
+			pos += sprintf(pos, "\n");
+		pos += sprintf(pos, "%02X ", data[n + 10]);
+	}
+	pos += sprintf(pos, "\n");
+
+	return pos - buf;
+}
+EXPORT_SYMBOL_GPL(tcm_show_pubek);
+
+#define CAP_VERSION_1_1 6
+#define CAP_VERSION_1_2 0x1A
+#define CAP_VERSION_IDX 13
+/* Command template for the version query; byte CAP_VERSION_IDX is patched. */
+static const u8 cap_version[] = {
+	0, 193,			/* TCM_TAG_RQU_COMMAND */
+	0, 0, 0, 18,		/* length */
+	0, 0, 128, 101,		/* TCM_ORD_GetCapability */
+	0, 0, 0, 0,
+	0, 0, 0, 0
+};
+
+/*
+ * sysfs show handler: prints the manufacturer id and, when the second query
+ * succeeds, the four firmware version bytes found at response offsets 14..17.
+ *
+ * Returns -ENODEV when no chip is bound to @dev, 0 when the manufacturer
+ * query fails, otherwise the number of characters written to @buf.
+ */
+ssize_t tcm_show_caps(struct device *dev, struct device_attribute *attr,
+		      char *buf)
+{
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+	u8 data[max_t(int, max(ARRAY_SIZE(tcm_cap), ARRAY_SIZE(cap_version)), 30)];
+	char *pos = buf;
+
+	if (!chip)
+		return -ENODEV;
+
+	/* first query: manufacturer property */
+	memcpy(data, tcm_cap, sizeof(tcm_cap));
+	data[TCM_CAP_IDX] = TCM_CAP_PROP;
+	data[TCM_CAP_SUBCAP_IDX] = TCM_CAP_PROP_MANUFACTURER;
+
+	if (transmit_cmd(chip, data, sizeof(data),
+			 "attempting to determine the manufacturer"))
+		return 0;
+
+	pos += sprintf(pos, "Manufacturer: 0x%x\n",
+		       be32_to_cpu(*((__be32 *)(data + TCM_GET_CAP_RET_UINT32_1_IDX))));
+
+	/* second query: version; on failure only the first line is reported */
+	memcpy(data, cap_version, sizeof(cap_version));
+	data[CAP_VERSION_IDX] = CAP_VERSION_1_1;
+	if (transmit_cmd(chip, data, sizeof(data),
+			 "attempting to determine the 1.1 version"))
+		return pos - buf;
+
+	pos += sprintf(pos, "Firmware version: %02X.%02X.%02X.%02X\n",
+		       (int)data[14], (int)data[15], (int)data[16],
+		       (int)data[17]);
+
+	return pos - buf;
+}
+EXPORT_SYMBOL_GPL(tcm_show_caps);
+
+/*
+ * sysfs store handler: any write invokes the vendor cancel callback.
+ *
+ * Returns @count on success. When no chip is bound to @dev it returns
+ * -ENODEV: a store handler that returns 0 reports "nothing consumed", which
+ * leaves a userspace write() making no progress instead of failing.
+ */
+ssize_t tcm_store_cancel(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t count)
+{
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+
+	if (chip == NULL)
+		return -ENODEV;
+
+	chip->vendor.cancel(chip);
+	return count;
+}
+EXPORT_SYMBOL_GPL(tcm_store_cancel);
+
+/*
+ * Character-device open handler for the TCM misc device.
+ *
+ * Looks up the chip whose misc minor matches the inode, enforces a single
+ * opener, takes a device reference and allocates the command/response
+ * buffer used by tcm_write()/tcm_read().
+ *
+ * Returns 0 on success, -ENODEV if no chip matches the minor, -EBUSY if the
+ * chip is already open, -ENOMEM if the buffer cannot be allocated.
+ */
+int tcm_open(struct inode *inode, struct file *file)
+{
+	int rc = 0, minor = iminor(inode);
+	struct tcm_chip *chip = NULL, *pos = NULL;
+
+	spin_lock(&driver_lock);
+
+	list_for_each_entry(pos, &tcm_chip_list, list) {
+		if (pos->vendor.miscdev.minor == minor) {
+			chip = pos;
+			break;
+		}
+	}
+
+	if (chip == NULL) {
+		rc = -ENODEV;
+		goto err_out;
+	}
+
+	if (chip->num_opens) {
+		dev_dbg(chip->dev, "Another process owns this TCM\n");
+		rc = -EBUSY;
+		goto err_out;
+	}
+
+	chip->num_opens++;
+	get_device(chip->dev);
+
+	spin_unlock(&driver_lock);
+
+	/*
+	 * Zeroed on purpose: tcm_write() hands the whole TCM_BUFSIZE buffer to
+	 * tcm_transmit() even when userspace supplied fewer bytes, so stale
+	 * heap contents must not be present in the tail.
+	 */
+	chip->data_buffer = kzalloc(TCM_BUFSIZE * sizeof(u8), GFP_KERNEL);
+	if (chip->data_buffer == NULL) {
+		/* undo the open count under the same lock that guards it */
+		spin_lock(&driver_lock);
+		chip->num_opens--;
+		spin_unlock(&driver_lock);
+		put_device(chip->dev);
+		return -ENOMEM;
+	}
+
+	atomic_set(&chip->data_pending, 0);
+
+	file->private_data = chip;
+	return 0;
+
+err_out:
+	spin_unlock(&driver_lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(tcm_open);
+
+/*
+ * Character-device release handler: stops the read-timeout timer and its
+ * work item, drops any pending response, frees the transfer buffer and
+ * gives the chip back for the next opener.
+ *
+ * del_singleshot_timer_sync() and flush_work() wait for running handlers and
+ * flush_work() may sleep, so they are called before driver_lock (a spinlock)
+ * is taken; the lock only covers the open count. The buffer is freed before
+ * the count is dropped so a concurrent tcm_open() cannot have its freshly
+ * allocated buffer freed here.
+ *
+ * Always returns 0.
+ */
+int tcm_release(struct inode *inode, struct file *file)
+{
+	struct tcm_chip *chip = file->private_data;
+
+	del_singleshot_timer_sync(&chip->user_read_timer);
+	flush_work(&chip->work);
+
+	file->private_data = NULL;
+	atomic_set(&chip->data_pending, 0);
+	kfree(chip->data_buffer);
+	chip->data_buffer = NULL;
+
+	spin_lock(&driver_lock);
+	chip->num_opens--;
+	spin_unlock(&driver_lock);
+
+	put_device(chip->dev);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcm_release);
+
+/*
+ * Character-device write handler: copies one command from userspace
+ * (truncated to TCM_BUFSIZE), transmits it and leaves the response in
+ * chip->data_buffer for a following tcm_read().
+ *
+ * Returns the number of bytes accepted, -EFAULT if the user buffer cannot be
+ * read, or the negative error from tcm_transmit() so a failed command is
+ * visible to the caller instead of being reported as a successful write.
+ */
+ssize_t tcm_write(struct file *file, const char __user *buf,
+		  size_t size, loff_t *off)
+{
+	struct tcm_chip *chip = file->private_data;
+	/* clamp in size_t so a large request can never become negative */
+	size_t in_size = min_t(size_t, size, TCM_BUFSIZE);
+	int out_size;
+
+	/*
+	 * cannot perform a write until the read has cleared
+	 * either via tcm_read or a user_read_timer timeout
+	 */
+	while (atomic_read(&chip->data_pending) != 0)
+		msleep(TCM_TIMEOUT);
+
+	mutex_lock(&chip->buffer_mutex);
+
+	if (copy_from_user(chip->data_buffer, buf, in_size)) {
+		mutex_unlock(&chip->buffer_mutex);
+		return -EFAULT;
+	}
+
+	/* atomic tcm command send and result receive */
+	out_size = tcm_transmit(chip, chip->data_buffer, TCM_BUFSIZE);
+	if (out_size < 0) {
+		mutex_unlock(&chip->buffer_mutex);
+		return out_size;
+	}
+
+	atomic_set(&chip->data_pending, out_size);
+	mutex_unlock(&chip->buffer_mutex);
+
+	/* Set a timeout by which the reader must come claim the result */
+	mod_timer(&chip->user_read_timer, jiffies + (60 * HZ));
+
+	return in_size;
+}
+EXPORT_SYMBOL_GPL(tcm_write);
+
+/*
+ * Character-device read handler: hands the response left by the previous
+ * tcm_write() to userspace and clears the pending marker.
+ *
+ * Returns the number of bytes copied (at most @size), the pending count
+ * unchanged when it is not positive, or -EFAULT if the copy fails.
+ */
+ssize_t tcm_read(struct file *file, char __user *buf,
+		 size_t size, loff_t *off)
+{
+	struct tcm_chip *chip = file->private_data;
+	int pending;
+
+	/* the reader showed up: stop the claim timeout and its work item */
+	del_singleshot_timer_sync(&chip->user_read_timer);
+	flush_work(&chip->work);
+
+	pending = atomic_read(&chip->data_pending);
+	atomic_set(&chip->data_pending, 0);
+	if (pending <= 0)
+		return pending;
+
+	/* relay data, truncated to what the caller asked for */
+	if (size < pending)
+		pending = size;
+
+	mutex_lock(&chip->buffer_mutex);
+	if (copy_to_user(buf, chip->data_buffer, pending))
+		pending = -EFAULT;
+	mutex_unlock(&chip->buffer_mutex);
+
+	return pending;
+}
+EXPORT_SYMBOL_GPL(tcm_read);
+
+/*
+ * Undo tcm_register_hardware() for the chip stored as @dev's driver data:
+ * unlink it from the chip list, deregister the misc device, remove the sysfs
+ * group, release the device number and free the chip.
+ *
+ * Logs an error and returns if @dev has no driver data.
+ *
+ * NOTE(review): user_read_timer and work are not cancelled here before
+ * kfree(chip) - confirm nothing can still be pending at removal time.
+ */
+void tcm_remove_hardware(struct device *dev)
+{
+ struct tcm_chip *chip = dev_get_drvdata(dev);
+
+ if (chip == NULL) {
+ dev_err(dev, "No device data found\n");
+ return;
+ }
+
+ /* tcm_open() walks this list under driver_lock */
+ spin_lock(&driver_lock);
+ list_del(&chip->list);
+ spin_unlock(&driver_lock);
+
+ dev_set_drvdata(dev, NULL);
+ misc_deregister(&chip->vendor.miscdev);
+ /* the name was allocated by tcm_register_hardware() */
+ kfree(chip->vendor.miscdev.name);
+
+ sysfs_remove_group(&dev->kobj, chip->vendor.attr_group);
+ /* tcm_bios_log_teardown(chip->bios_dir); */
+
+ clear_bit(chip->dev_num, dev_mask);
+ kfree(chip);
+ /* drops the reference taken with get_device() at registration */
+ put_device(dev);
+}
+EXPORT_SYMBOL_GPL(tcm_remove_hardware);
+
+/* Command template for SaveState; never handed to the chip directly. */
+static const u8 savestate[] = {
+	0, 193,			/* TCM_TAG_RQU_COMMAND */
+	0, 0, 0, 10,		/* blob length (in bytes) */
+	0, 0, 128, 152		/* TCM_ORD_SaveState */
+};
+
+/*
+ * Send SaveState from a scratch copy of the template. The buffer passed to
+ * tcm_transmit() is also where the response is delivered (tcm_write() and
+ * tcm_read() rely on that), so transmitting the static template itself would
+ * overwrite it and corrupt every later suspend.
+ */
+static void tcm_send_savestate(struct tcm_chip *chip)
+{
+	u8 cmd[sizeof(savestate)];
+
+	memcpy(cmd, savestate, sizeof(cmd));
+	tcm_transmit(chip, cmd, sizeof(cmd));
+}
+
+/*
+ * We are about to suspend. Save the TCM state
+ * so that it can be restored.
+ *
+ * Returns -ENODEV when no chip is bound to @dev, otherwise 0; the result of
+ * the transmit itself is not reported.
+ */
+int tcm_pm_suspend(struct device *dev, pm_message_t pm_state)
+{
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+
+	if (chip == NULL)
+		return -ENODEV;
+
+	tcm_send_savestate(chip);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcm_pm_suspend);
+
+/*
+ * Same as tcm_pm_suspend() but without the pm_message_t argument.
+ *
+ * Returns -ENODEV when no chip is bound to @dev, otherwise 0.
+ */
+int tcm_pm_suspend_p(struct device *dev)
+{
+	struct tcm_chip *chip = dev_get_drvdata(dev);
+
+	if (chip == NULL)
+		return -ENODEV;
+
+	tcm_send_savestate(chip);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcm_pm_suspend_p);
+
+/*
+ * Send the 12-byte start-up command to @chip. Does nothing when @chip is
+ * NULL; the result of the transmit is not checked.
+ */
+void tcm_startup(struct tcm_chip *chip)
+{
+ u8 start_up[] = {
+ 0, 193, /* TCM_TAG_RQU_COMMAND */
+ 0, 0, 0, 12, /* blob length (in bytes) */
+ 0, 0, 128, 153, /* presumably TCM_ORD_Startup (SaveState above uses 152) - TODO confirm */
+ 0, 1
+ };
+ if (chip == NULL)
+ return;
+ tcm_transmit(chip, start_up, sizeof(start_up));
+}
+EXPORT_SYMBOL_GPL(tcm_startup);
+
+/*
+ * Resume from a power-save state by sending the same 12-byte command as
+ * tcm_startup().
+ *
+ * Returns -ENODEV when no chip is bound to @dev, otherwise 0; the result of
+ * the transmit is not checked.
+ */
+int tcm_pm_resume(struct device *dev)
+{
+ u8 start_up[] = {
+ 0, 193, /* TCM_TAG_RQU_COMMAND */
+ 0, 0, 0, 12, /* blob length (in bytes) */
+ 0, 0, 128, 153, /* presumably TCM_ORD_Startup (SaveState uses 152) - TODO confirm */
+ 0, 1
+ };
+ struct tcm_chip *chip = dev_get_drvdata(dev);
+ /* dev_info(chip->dev ,"--call tcm_pm_resume\n"); */
+ if (chip == NULL)
+ return -ENODEV;
+
+ tcm_transmit(chip, start_up, sizeof(start_up));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcm_pm_resume);
+
+/*
+ * Called from tcm_<specific>.c probe function only for devices
+ * the driver has determined it should claim. Prior to calling
+ * this function the specific probe function has called pci_enable_device;
+ * upon errant exit from this function the specific probe function should
+ * call pci_disable_device.
+ *
+ * Allocates a tcm_chip, copies @entry into it, reserves a device number,
+ * registers the misc device "tcm<N>", stores the chip as @dev's driver data,
+ * links it into tcm_chip_list and creates the vendor sysfs group.
+ *
+ * Returns the new chip, or NULL on any failure (allocation, no free device
+ * number, misc_register() or sysfs_create_group() error). Everything set up
+ * before the failure is undone.
+ */
+struct tcm_chip *tcm_register_hardware(struct device *dev,
+				       const struct tcm_vendor_specific *entry)
+{
+#define DEVNAME_SIZE 7
+
+	char *devname = NULL;
+	struct tcm_chip *chip = NULL;
+
+	/* Driver specific per-device data */
+	chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+	if (chip == NULL) {
+		dev_err(dev, "chip kzalloc err\n");
+		return NULL;
+	}
+
+	mutex_init(&chip->buffer_mutex);
+	mutex_init(&chip->tcm_mutex);
+	INIT_LIST_HEAD(&chip->list);
+
+	INIT_WORK(&chip->work, timeout_work);
+	timer_setup(&chip->user_read_timer, user_reader_timeout, 0);
+
+	memcpy(&chip->vendor, entry, sizeof(struct tcm_vendor_specific));
+
+	chip->dev_num = find_first_zero_bit(dev_mask, TCM_NUM_DEVICES);
+
+	if (chip->dev_num >= TCM_NUM_DEVICES) {
+		dev_err(dev, "No available tcm device numbers\n");
+		kfree(chip);
+		return NULL;
+	} else if (chip->dev_num == 0)
+		chip->vendor.miscdev.minor = TCM_MINOR;
+	else
+		chip->vendor.miscdev.minor = MISC_DYNAMIC_MINOR;
+
+	set_bit(chip->dev_num, dev_mask);
+
+	devname = kmalloc(DEVNAME_SIZE, GFP_KERNEL);
+	if (devname == NULL) {
+		dev_err(dev, "devname kmalloc err\n");
+		goto err_clear_bit;
+	}
+	scnprintf(devname, DEVNAME_SIZE, "%s%d", "tcm", chip->dev_num);
+	chip->vendor.miscdev.name = devname;
+
+	/* chip->vendor.miscdev.dev = dev; */
+
+	chip->dev = get_device(dev);
+
+	if (misc_register(&chip->vendor.miscdev)) {
+		dev_err(chip->dev,
+			"unable to misc_register %s, minor %d\n",
+			chip->vendor.miscdev.name,
+			chip->vendor.miscdev.minor);
+		goto err_put_device;
+	}
+
+	spin_lock(&driver_lock);
+	dev_set_drvdata(dev, chip);
+	list_add(&chip->list, &tcm_chip_list);
+	spin_unlock(&driver_lock);
+
+	/*
+	 * tcm_remove_hardware() removes this group unconditionally, so a chip
+	 * without it must not be handed back to the caller.
+	 */
+	if (sysfs_create_group(&dev->kobj, chip->vendor.attr_group)) {
+		dev_err(dev, "unable to create sysfs attribute group\n");
+		spin_lock(&driver_lock);
+		list_del(&chip->list);
+		dev_set_drvdata(dev, NULL);
+		spin_unlock(&driver_lock);
+		misc_deregister(&chip->vendor.miscdev);
+		goto err_put_device;
+	}
+	/* chip->bios_dir = tcm_bios_log_setup(devname); */
+
+	return chip;
+
+err_put_device:
+	put_device(dev);
+	kfree(devname);
+err_clear_bit:
+	clear_bit(chip->dev_num, dev_mask);
+	kfree(chip);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(tcm_register_hardware);
+
+/* Module load hook: nothing to set up, always succeeds. */
+static int __init tcm_init_module(void)
+{
+ return 0;
+}
+
+/* Module unload hook: nothing to tear down. */
+static void __exit tcm_exit_module(void)
+{
+}
+
+module_init(tcm_init_module);
+module_exit(tcm_exit_module);
+
+MODULE_AUTHOR("Nationz Technologies Inc.");
+MODULE_DESCRIPTION("TCM Driver");
+MODULE_VERSION("1.1.1.0");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/gmjstcm/tcm.h b/drivers/staging/gmjstcm/tcm.h
new file mode 100644
index 000000000000..b8cafe78d590
--- /dev/null
+++ b/drivers/staging/gmjstcm/tcm.h
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2009 Nationz Technologies Inc.
+ *
+ */
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+
+struct device;
+struct tcm_chip;
+
+/* Polling interval handed to msleep() by the driver's wait loops. */
+enum tcm_timeout {
+ TCM_TIMEOUT = 5,
+};
+
+/*
+ * TCM addresses.
+ * NOTE(review): presumably legacy I/O port numbers for the index/data helpers
+ * in this header; no user is visible in this file - confirm.
+ */
+enum tcm_addr {
+ TCM_SUPERIO_ADDR = 0x2E,
+ TCM_ADDR = 0x4E,
+};
+
+extern ssize_t tcm_show_pubek(struct device *, struct device_attribute *attr,
+ char *);
+extern ssize_t tcm_show_pcrs(struct device *, struct device_attribute *attr,
+ char *);
+extern ssize_t tcm_show_caps(struct device *, struct device_attribute *attr,
+ char *);
+extern ssize_t tcm_store_cancel(struct device *, struct device_attribute *attr,
+ const char *, size_t);
+extern ssize_t tcm_show_enabled(struct device *, struct device_attribute *attr,
+ char *);
+extern ssize_t tcm_show_active(struct device *, struct device_attribute *attr,
+ char *);
+extern ssize_t tcm_show_owned(struct device *, struct device_attribute *attr,
+ char *);
+extern ssize_t tcm_show_temp_deactivated(struct device *,
+ struct device_attribute *attr, char *);
+
+/*
+ * Per-transport description of a TCM chip: status masks, register/IRQ
+ * resources, the low-level transfer callbacks and the timeouts used by them.
+ * tcm_register_hardware() copies a template of this struct into each
+ * tcm_chip.
+ */
+struct tcm_vendor_specific {
+ /* status bits/values describing request completion and cancellation */
+ const u8 req_complete_mask;
+ const u8 req_complete_val;
+ const u8 req_canceled;
+ void __iomem *iobase; /* ioremapped address */
+ void __iomem *iolbc;
+ unsigned long base; /* TCM base address */
+
+ int irq;
+
+ int region_size;
+ int have_region;
+
+ /* transport callbacks; the buffer/length arguments are raw command bytes */
+ int (*recv)(struct tcm_chip *, u8 *, size_t);
+ int (*send)(struct tcm_chip *, u8 *, size_t);
+ void (*cancel)(struct tcm_chip *);
+ u8 (*status)(struct tcm_chip *);
+ struct miscdevice miscdev; /* character device registered per chip */
+ struct attribute_group *attr_group; /* sysfs group created at registration */
+ struct list_head list;
+ int locality; /* locality currently in use for register access */
+ unsigned long timeout_a, timeout_b, timeout_c, timeout_d; /* jiffies */
+ unsigned long duration[3]; /* jiffies */
+
+ wait_queue_head_t read_queue;
+ wait_queue_head_t int_queue;
+};
+
+/* Driver state for one registered TCM device. */
+struct tcm_chip {
+ struct device *dev; /* Device stuff */
+
+ int dev_num; /* /dev/tcm# */
+ int num_opens; /* only one allowed */
+ int time_expired;
+
+ /* Data passed to and from the tcm via the read/write calls */
+ u8 *data_buffer; /* allocated in tcm_open(), freed in tcm_release() */
+ atomic_t data_pending; /* response bytes waiting for tcm_read(), 0 if none */
+ struct mutex buffer_mutex; /* serialises access to data_buffer */
+
+ struct timer_list user_read_timer; /* user needs to claim result */
+ struct work_struct work; /* work item tied to the read timeout */
+ struct mutex tcm_mutex; /* tcm is processing */
+
+ struct tcm_vendor_specific vendor; /* per-chip copy of the transport template */
+
+ struct dentry **bios_dir;
+
+ struct list_head list; /* entry in the global chip list */
+};
+
+#define to_tcm_chip(n) container_of(n, struct tcm_chip, vendor)
+
+/*
+ * Write @index to I/O port @base, then read one byte back from port
+ * @base + 1 and return it (0..255).
+ */
+static inline int tcm_read_index(int base, int index)
+{
+	int value;
+
+	outb(index, base);
+	value = inb(base + 1);
+
+	return value & 0xFF;
+}
+
+/*
+ * Write @index to I/O port @base, then write the low byte of @value to
+ * port @base + 1.
+ */
+static inline void tcm_write_index(int base, int index, int value)
+{
+	int low_byte = value & 0xFF;
+
+	outb(index, base);
+	outb(low_byte, base + 1);
+}
+extern void tcm_startup(struct tcm_chip *);
+extern void tcm_get_timeouts(struct tcm_chip *);
+extern unsigned long tcm_calc_ordinal_duration(struct tcm_chip *, u32);
+extern struct tcm_chip *tcm_register_hardware(struct device *,
+ const struct tcm_vendor_specific *);
+extern int tcm_open(struct inode *, struct file *);
+extern int tcm_release(struct inode *, struct file *);
+extern ssize_t tcm_write(struct file *, const char __user *, size_t,
+ loff_t *);
+extern ssize_t tcm_read(struct file *, char __user *, size_t, loff_t *);
+extern void tcm_remove_hardware(struct device *);
+extern int tcm_pm_suspend(struct device *, pm_message_t);
+extern int tcm_pm_suspend_p(struct device *);
+extern int tcm_pm_resume(struct device *);
+
diff --git a/drivers/staging/gmjstcm/tcm_tis_spi.c b/drivers/staging/gmjstcm/tcm_tis_spi.c
new file mode 100644
index 000000000000..db30a5b4c47d
--- /dev/null
+++ b/drivers/staging/gmjstcm/tcm_tis_spi.c
@@ -0,0 +1,868 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Kylin Tech. Co., Ltd.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h>
+#include <linux/wait.h>
+#include <linux/acpi.h>
+#include <linux/spi/spi.h>
+
+#include "tcm.h"
+
+#if !defined(CONFIG_KYLINOS_SERVER) && !defined(CONFIG_KYLINOS_DESKTOP)
+/*
+ * Fallback used when neither CONFIG_KYLINOS_SERVER nor CONFIG_KYLINOS_DESKTOP
+ * is set: always reports 0, so the GPIO handling guarded by is_ft_all() in
+ * this file is skipped.
+ */
+static int is_ft_all(void)
+{
+	return 0;
+}
+#endif
+
+#define TCM_HEADER_SIZE 10
+
+static bool tcm_debug;
+module_param_named(debug, tcm_debug, bool, 0600);
+MODULE_PARM_DESC(debug, "Turn TCM debugging mode on and off");
+
+#define tcm_dbg(fmt, args...) \
+{ \
+ if (tcm_debug) \
+ pr_err(fmt, ## args); \
+}
+
+/* Bit masks for the byte read from / written to TCM_ACCESS(l). */
+enum tis_access {
+ TCM_ACCESS_VALID = 0x80,
+ TCM_ACCESS_ACTIVE_LOCALITY = 0x20,
+ TCM_ACCESS_REQUEST_PENDING = 0x04,
+ TCM_ACCESS_REQUEST_USE = 0x02,
+};
+
+/* Bit masks for the status byte at TCM_STS(l). */
+enum tis_status {
+ TCM_STS_VALID = 0x80,
+ TCM_STS_COMMAND_READY = 0x40,
+ TCM_STS_GO = 0x20, /* written to start execution of a queued command */
+ TCM_STS_DATA_AVAIL = 0x10,
+ TCM_STS_DATA_EXPECT = 0x08,
+};
+
+/*
+ * Interrupt-related flag bits. Only TCM_GLOBAL_INT_ENABLE is used in the
+ * visible code (it is cleared in the TCM_INT_ENABLE register by
+ * cleanup_tis()).
+ * NOTE(review): 0x80000000 does not fit in a signed int enumerator; this
+ * relies on the compiler accepting a wider enum constant.
+ */
+enum tis_int_flags {
+ TCM_GLOBAL_INT_ENABLE = 0x80000000,
+ TCM_INTF_BURST_COUNT_STATIC = 0x100,
+ TCM_INTF_CMD_READY_INT = 0x080,
+ TCM_INTF_INT_EDGE_FALLING = 0x040,
+ TCM_INTF_INT_EDGE_RISING = 0x020,
+ TCM_INTF_INT_LEVEL_LOW = 0x010,
+ TCM_INTF_INT_LEVEL_HIGH = 0x008,
+ TCM_INTF_LOCALITY_CHANGE_INT = 0x004,
+ TCM_INTF_STS_VALID_INT = 0x002,
+ TCM_INTF_DATA_AVAIL_INT = 0x001,
+};
+
+/* Default timeout values in milliseconds. */
+enum tis_defaults {
+ TIS_SHORT_TIMEOUT = 750, /* ms */
+ TIS_LONG_TIMEOUT = 2000, /* 2 sec */
+};
+
+/* Register offsets; locality @l selects a 4 KiB window (l << 12). */
+#define TCM_ACCESS(l) (0x0000 | ((l) << 12))
+#define TCM_INT_ENABLE(l) (0x0008 | ((l) << 12)) /* interrupt enable */
+#define TCM_INT_VECTOR(l) (0x000C | ((l) << 12))
+#define TCM_INT_STATUS(l) (0x0010 | ((l) << 12))
+#define TCM_INTF_CAPS(l) (0x0014 | ((l) << 12))
+#define TCM_STS(l) (0x0018 | ((l) << 12))
+#define TCM_DATA_FIFO(l) (0x0024 | ((l) << 12))
+
+#define TCM_DID_VID(l) (0x0F00 | ((l) << 12))
+#define TCM_RID(l) (0x0F04 | ((l) << 12))
+
+#define TIS_MEM_BASE_huawei 0x3fed40000LL
+
+#define MAX_SPI_FRAMESIZE 64
+
+//
+#define _CPU_FT2000A4
+#define REUSE_CONF_REG_BASE 0x28180208
+#define REUSE_GPIO1_A5_BASE 0x28005000
+
+static void *__iomem reuse_conf_reg;
+static void *__iomem gpio1_a5;
+
+//
+static LIST_HEAD(tis_chips);
+static DEFINE_SPINLOCK(tis_lock);
+
+/*
+ * Layout this driver expects for the SPI controller data returned by
+ * spi_get_ctldata(); the probe routine sets poll_mode through it.
+ * NOTE(review): must match the SPI master driver's own definition - confirm.
+ */
+struct chip_data {
+ u8 cs;
+ u8 tmode;
+ u8 type;
+ u8 poll_mode;
+ u16 clk_div;
+ u32 speed_hz;
+ void (*cs_control)(u32 command);
+};
+
+/* Per-device SPI transport state used by tcm_tis_spi_transfer(). */
+struct tcm_tis_spi_phy {
+ struct spi_device *spi_device; /* SPI device all transfers go through */
+ struct completion ready; /* re-initialised before each payload transfer */
+ u8 *iobuf; /* bounce buffer of MAX_SPI_FRAMESIZE bytes for header and payload */
+};
+
+/*
+ * Read or write @len bytes at register address @addr over SPI.
+ *
+ * @dev:  device whose driver data is a struct tcm_tis_spi_phy
+ * @addr: register address; only the low 16 bits are put on the wire
+ * @len:  number of payload bytes
+ * @in:   destination for a read, or NULL
+ * @out:  source for a write, or NULL
+ *
+ * The payload is moved in chunks of at most MAX_SPI_FRAMESIZE bytes. Each
+ * chunk is two SPI messages: a 4-byte header, then the payload, both using
+ * phy->iobuf as bounce buffer. The same @addr is sent for every chunk.
+ * The SPI bus is locked for the whole operation.
+ *
+ * Returns 0 on success or the negative error from spi_sync_locked().
+ */
+int tcm_tis_spi_transfer(struct device *dev, u32 addr, u16 len,
+ u8 *in, const u8 *out)
+{
+ struct tcm_tis_spi_phy *phy = dev_get_drvdata(dev);
+ int ret = 0;
+ struct spi_message m;
+ struct spi_transfer spi_xfer;
+ u8 transfer_len;
+
+ tcm_dbg("TCM-dbg: %s, addr: 0x%x, len: %x, %s\n",
+ __func__, addr, len, (in) ? "in" : "out");
+
+ spi_bus_lock(phy->spi_device->master);
+
+ /* set gpio1_a5 to LOW */
+ if (is_ft_all() && (phy->spi_device->chip_select == 0)) {
+ iowrite32(0x0, gpio1_a5);
+ }
+
+ while (len) {
+ transfer_len = min_t(u16, len, MAX_SPI_FRAMESIZE);
+
+ /* header: bit 7 = read, low bits = chunk length - 1, then 0xd4 and addr[15:0] */
+ phy->iobuf[0] = (in ? 0x80 : 0) | (transfer_len - 1);
+ phy->iobuf[1] = 0xd4;
+ phy->iobuf[2] = addr >> 8;
+ phy->iobuf[3] = addr;
+
+ /* first message: the 4-byte header, with cs_change set */
+ memset(&spi_xfer, 0, sizeof(spi_xfer));
+ spi_xfer.tx_buf = phy->iobuf;
+ spi_xfer.rx_buf = phy->iobuf;
+ spi_xfer.len = 4;
+ spi_xfer.cs_change = 1;
+
+ spi_message_init(&m);
+ spi_message_add_tail(&spi_xfer, &m);
+ ret = spi_sync_locked(phy->spi_device, &m);
+ if (ret < 0)
+ goto exit;
+
+ /* second message: the payload chunk, reusing the same transfer struct */
+ spi_xfer.cs_change = 0;
+ spi_xfer.len = transfer_len;
+ spi_xfer.delay_usecs = 5;
+
+ if (in) {
+ /* read: receive only */
+ spi_xfer.tx_buf = NULL;
+ } else if (out) {
+ /* write: transmit only, staged through the bounce buffer */
+ spi_xfer.rx_buf = NULL;
+ memcpy(phy->iobuf, out, transfer_len);
+ out += transfer_len;
+ }
+
+ spi_message_init(&m);
+ spi_message_add_tail(&spi_xfer, &m);
+ /* NOTE(review): no waiter on phy->ready is visible in this file */
+ reinit_completion(&phy->ready);
+ ret = spi_sync_locked(phy->spi_device, &m);
+ if (ret < 0)
+ goto exit;
+
+ if (in) {
+ memcpy(in, phy->iobuf, transfer_len);
+ in += transfer_len;
+ }
+
+ len -= transfer_len;
+ }
+
+exit:
+ /* set gpio1_a5 to HIGH */
+ if (is_ft_all() && (phy->spi_device->chip_select == 0)) {
+ iowrite32(0x20, gpio1_a5);
+ }
+
+ spi_bus_unlock(phy->spi_device->master);
+ tcm_dbg("TCM-dbg: ret: %d\n", ret);
+ return ret;
+}
+
+static int tcm_tis_read8(struct device *dev,
+ u32 addr, u16 len, u8 *result)
+{
+ return tcm_tis_spi_transfer(dev, addr, len, result, NULL);
+}
+
+static int tcm_tis_write8(struct device *dev,
+ u32 addr, u16 len, u8 *value)
+{
+ return tcm_tis_spi_transfer(dev, addr, len, NULL, value);
+}
+
+static int tcm_tis_readb(struct device *dev, u32 addr, u8 *value)
+{
+ return tcm_tis_read8(dev, addr, sizeof(u8), value);
+}
+
+static int tcm_tis_writeb(struct device *dev, u32 addr, u8 value)
+{
+ return tcm_tis_write8(dev, addr, sizeof(u8), &value);
+}
+
+/*
+ * Read a 32-bit little-endian register at @addr and store it in CPU byte
+ * order in @result. @result is left untouched when the transfer fails.
+ *
+ * Returns 0 on success or the negative error from the transfer.
+ */
+static int tcm_tis_readl(struct device *dev, u32 addr, u32 *result)
+{
+	int rc;
+	/* initialised so the debug print below never shows stack garbage */
+	__le32 result_le = 0;
+
+	rc = tcm_tis_read8(dev, addr, sizeof(u32), (u8 *)&result_le);
+	tcm_dbg("TCM-dbg: result_le: 0x%x\n", result_le);
+	if (!rc)
+		*result = le32_to_cpu(result_le);
+
+	return rc;
+}
+
+/*
+ * Write @value to the 32-bit register at @addr in little-endian byte order.
+ * Returns 0 on success or the negative error from the transfer.
+ */
+static int tcm_tis_writel(struct device *dev, u32 addr, u32 value)
+{
+	__le32 wire = cpu_to_le32(value);
+
+	return tcm_tis_write8(dev, addr, sizeof(u32), (u8 *)&wire);
+}
+
+static int request_locality(struct tcm_chip *chip, int l);
+static void release_locality(struct tcm_chip *chip, int l, int force);
+/*
+ * For every chip on tis_chips: clear TCM_GLOBAL_INT_ENABLE in the chip's
+ * interrupt-enable register and force-release its locality.
+ *
+ * Stops at the first chip whose register cannot be read. The loop is left
+ * with break rather than return so tis_lock is always released.
+ *
+ * NOTE(review): the register accessors go through spi_sync_locked(), which
+ * may sleep, while tis_lock (a spinlock) is held - consider a mutex here.
+ */
+static void cleanup_tis(void)
+{
+	int ret;
+	u32 inten;
+	struct tcm_vendor_specific *i, *j;
+	struct tcm_chip *chip;
+
+	spin_lock(&tis_lock);
+	list_for_each_entry_safe(i, j, &tis_chips, list) {
+		chip = to_tcm_chip(i);
+		ret = tcm_tis_readl(chip->dev,
+				    TCM_INT_ENABLE(chip->vendor.locality), &inten);
+		if (ret < 0)
+			break;
+
+		tcm_tis_writel(chip->dev, TCM_INT_ENABLE(chip->vendor.locality),
+			       ~TCM_GLOBAL_INT_ENABLE & inten);
+		release_locality(chip, chip->vendor.locality, 1);
+	}
+	spin_unlock(&tis_lock);
+}
+
+/*
+ * (Re)initialise the chip after an error: read and log the vendor/device and
+ * revision ids, read the interface capabilities, request locality 0 and
+ * clear any pending response.
+ *
+ * Returns early, without touching the chip state further, as soon as one of
+ * the register reads fails - including the first one, so the vendor id is
+ * never evaluated uninitialised.
+ */
+static void tcm_tis_init(struct tcm_chip *chip)
+{
+	int ret;
+	u8 rid;
+	u32 vendor, intfcaps;
+
+	ret = tcm_tis_readl(chip->dev, TCM_DID_VID(0), &vendor);
+	if (ret < 0)
+		return;
+
+	if ((vendor & 0xffff) != 0x19f5 && (vendor & 0xffff) != 0x1B4E)
+		pr_info("there is no Nationz TCM on you computer\n");
+
+	ret = tcm_tis_readb(chip->dev, TCM_RID(0), &rid);
+	if (ret < 0)
+		return;
+	pr_info("kylin: 2019-09-21 1.2 TCM (device-id 0x%X, rev-id %d)\n",
+		vendor >> 16, rid);
+
+	/* Figure out the capabilities (value currently unused) */
+	ret = tcm_tis_readl(chip->dev,
+			    TCM_INTF_CAPS(chip->vendor.locality), &intfcaps);
+	if (ret < 0)
+		return;
+
+	if (request_locality(chip, 0) != 0)
+		pr_err("tcm request_locality err\n");
+
+	atomic_set(&chip->data_pending, 0);
+}
+
+/*
+ * Error recovery used by the send/receive paths: run cleanup_tis() over all
+ * chips on tis_chips, then re-initialise @chip with tcm_tis_init().
+ */
+static void tcm_handle_err(struct tcm_chip *chip)
+{
+ cleanup_tis();
+ tcm_tis_init(chip);
+}
+
+/*
+ * Check whether locality @l is active and valid according to its access
+ * register. On success the locality is recorded in chip->vendor.locality.
+ *
+ * Returns true if the locality is held, false otherwise or on read error.
+ */
+static bool check_locality(struct tcm_chip *chip, int l)
+{
+	int ret;
+	/* initialised so the debug print is defined even if the read fails */
+	u8 access = 0;
+
+	ret = tcm_tis_readb(chip->dev, TCM_ACCESS(l), &access);
+	tcm_dbg("TCM-dbg: access: 0x%x\n", access);
+	if (ret < 0)
+		return false;
+
+	if ((access & (TCM_ACCESS_ACTIVE_LOCALITY | TCM_ACCESS_VALID)) ==
+	    (TCM_ACCESS_ACTIVE_LOCALITY | TCM_ACCESS_VALID)) {
+		chip->vendor.locality = l;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Acquire locality @l: if it is not already held, write the request-use bit
+ * and poll every TCM_TIMEOUT ms until it is granted or timeout_a elapses.
+ *
+ * Returns @l on success, -1 on timeout.
+ */
+static int request_locality(struct tcm_chip *chip, int l)
+{
+	unsigned long deadline;
+
+	if (check_locality(chip, l))
+		return l;
+
+	tcm_tis_writeb(chip->dev, TCM_ACCESS(l), TCM_ACCESS_REQUEST_USE);
+
+	/* wait for the locality to be granted */
+	deadline = jiffies + chip->vendor.timeout_a;
+	for (;;) {
+		if (check_locality(chip, l))
+			return l;
+		msleep(TCM_TIMEOUT);
+		if (!time_before(jiffies, deadline))
+			break;
+	}
+
+	return -1;
+}
+
+/*
+ * Give up locality @l by writing the active-locality bit to its access
+ * register. Without @force this is only done when the register shows a
+ * valid pending request; nothing is written if the register cannot be read.
+ */
+static void release_locality(struct tcm_chip *chip, int l, int force)
+{
+	const u8 pending = TCM_ACCESS_REQUEST_PENDING | TCM_ACCESS_VALID;
+	u8 access;
+
+	if (tcm_tis_readb(chip->dev, TCM_ACCESS(l), &access) < 0)
+		return;
+
+	if (!force && (access & pending) != pending)
+		return;
+
+	tcm_tis_writeb(chip->dev, TCM_ACCESS(l), TCM_ACCESS_ACTIVE_LOCALITY);
+}
+
+/*
+ * Read the status byte of the chip's current locality.
+ * Returns the status byte, or 0 when the register cannot be read.
+ */
+static u8 tcm_tis_status(struct tcm_chip *chip)
+{
+	int ret;
+	/* initialised so the debug print is defined even if the read fails */
+	u8 status = 0;
+
+	ret = tcm_tis_readb(chip->dev,
+			    TCM_STS(chip->vendor.locality), &status);
+	tcm_dbg("TCM-dbg: status: 0x%x\n", status);
+	if (ret < 0)
+		return 0;
+
+	return status;
+}
+
+/*
+ * Write TCM_STS_COMMAND_READY to the status register of the current
+ * locality. Also installed as the vendor cancel callback.
+ */
+static void tcm_tis_ready(struct tcm_chip *chip)
+{
+ /* this causes the current command to be aborted */
+ tcm_tis_writeb(chip->dev, TCM_STS(chip->vendor.locality),
+ TCM_STS_COMMAND_READY);
+}
+
+/*
+ * Read the 16-bit burst count (status register bytes 1 and 2, low byte
+ * first), polling every TCM_TIMEOUT ms until it is non-zero or timeout_d
+ * elapses.
+ *
+ * Returns the burst count (> 0), -EINVAL if a register read fails, or
+ * -EBUSY on timeout.
+ */
+static int get_burstcount(struct tcm_chip *chip)
+{
+	int ret;
+	unsigned long stop;
+	u8 tmp = 0, tmp1 = 0;
+	int burstcnt = 0;
+
+	/* wait for burstcount */
+	/* which timeout value, spec has 2 answers (c & d) */
+	stop = jiffies + chip->vendor.timeout_d;
+	do {
+		ret = tcm_tis_readb(chip->dev,
+				    TCM_STS(chip->vendor.locality) + 1,
+				    &tmp);
+		if (ret < 0)
+			return -EINVAL;
+		ret = tcm_tis_readb(chip->dev,
+				    (TCM_STS(chip->vendor.locality) + 2),
+				    &tmp1);
+		if (ret < 0)
+			return -EINVAL;
+
+		burstcnt = tmp | (tmp1 << 8);
+		/* log the value just read, not the one from the last round */
+		tcm_dbg("TCM-dbg: burstcnt: 0x%x\n", burstcnt);
+		if (burstcnt)
+			return burstcnt;
+		msleep(TCM_TIMEOUT);
+	} while (time_before(jiffies, stop));
+
+	return -EBUSY;
+}
+
+/*
+ * Poll the status register until all bits in @mask are set.
+ *
+ * @timeout is in jiffies; the status is sampled immediately and then every
+ * TCM_TIMEOUT ms. @queue is accepted for interface compatibility but is not
+ * used by this polling implementation.
+ *
+ * Returns 0 once the bits are set, -ETIME on timeout.
+ */
+static int wait_for_stat(struct tcm_chip *chip, u8 mask,
+			 unsigned long timeout,
+			 wait_queue_head_t *queue)
+{
+	unsigned long deadline;
+
+	/* fast path: already in the requested state */
+	if ((tcm_tis_status(chip) & mask) == mask)
+		return 0;
+
+	deadline = jiffies + timeout;
+	do {
+		msleep(TCM_TIMEOUT);
+		if ((tcm_tis_status(chip) & mask) == mask)
+			return 0;
+	} while (time_before(jiffies, deadline));
+
+	return -ETIME;
+}
+
+/*
+ * Read up to @count bytes from the data FIFO into @buf, one byte per
+ * register access, in bursts sized by get_burstcount(). Reading stops early
+ * when the chip no longer reports data available within timeout_c.
+ *
+ * Returns the number of bytes read (possibly fewer than @count), or a
+ * negative error if the burst count or a FIFO byte cannot be read. A failed
+ * FIFO read is reported instead of being counted as a received byte.
+ */
+static int recv_data(struct tcm_chip *chip, u8 *buf, size_t count)
+{
+	int ret;
+	int size = 0, burstcnt;
+
+	while ((size_t)size < count && wait_for_stat(chip,
+			TCM_STS_DATA_AVAIL | TCM_STS_VALID,
+			chip->vendor.timeout_c,
+			&chip->vendor.read_queue) == 0) {
+		burstcnt = get_burstcount(chip);
+
+		if (burstcnt < 0) {
+			dev_err(chip->dev, "Unable to read burstcount\n");
+			return burstcnt;
+		}
+
+		for (; burstcnt > 0 && (size_t)size < count; burstcnt--) {
+			ret = tcm_tis_readb(chip->dev,
+					    TCM_DATA_FIFO(chip->vendor.locality),
+					    &buf[size]);
+			if (ret < 0) {
+				dev_err(chip->dev, "Unable to read data FIFO\n");
+				return ret;
+			}
+			tcm_dbg("TCM-dbg: buf[%d]: 0x%x\n", size, buf[size]);
+			size++;
+		}
+	}
+
+	return size;
+}
+
+/*
+ * Receive one response into @buf (capacity @count bytes).
+ *
+ * Reads the TCM_HEADER_SIZE-byte header, takes the total length from the
+ * big-endian field at offset 2, reads the remainder and checks that the chip
+ * has no data left over. The chip is always returned to the ready state and
+ * the locality released; on a negative result tcm_handle_err() is run.
+ *
+ * The length announced by the chip is validated against both the header size
+ * and @count: a value below TCM_HEADER_SIZE would otherwise turn the
+ * remainder length negative, i.e. a huge size_t, and let the chip write past
+ * the end of @buf.
+ *
+ * Returns the number of bytes received, or a negative error (-EIO, -ETIME,
+ * or an error from recv_data()). A short header read is returned as-is.
+ */
+static int tcm_tis_recv(struct tcm_chip *chip, u8 *buf, size_t count)
+{
+	int size = 0;
+	int expected, status;
+	unsigned long stop;
+
+	if (count < TCM_HEADER_SIZE) {
+		dev_err(chip->dev, "read size is to small: %zu\n", count);
+		size = -EIO;
+		goto out;
+	}
+
+	/* read first 10 bytes, including tag, paramsize, and result */
+	size = recv_data(chip, buf, TCM_HEADER_SIZE);
+	if (size < TCM_HEADER_SIZE) {
+		dev_err(chip->dev, "Unable to read header\n");
+		goto out;
+	}
+
+	expected = be32_to_cpu(*(__be32 *)(buf + 2));
+	if (expected < TCM_HEADER_SIZE || (size_t)expected > count) {
+		dev_err(chip->dev, "Expected data count\n");
+		size = -EIO;
+		goto out;
+	}
+
+	size += recv_data(chip, &buf[TCM_HEADER_SIZE],
+			  expected - TCM_HEADER_SIZE);
+	if (size < expected) {
+		dev_err(chip->dev, "Unable to read remainder of result\n");
+		size = -ETIME;
+		goto out;
+	}
+
+	wait_for_stat(chip, TCM_STS_VALID, chip->vendor.timeout_c,
+		      &chip->vendor.int_queue);
+
+	/* give the chip up to timeout_c to drop the data-available flag */
+	stop = jiffies + chip->vendor.timeout_c;
+	do {
+		msleep(TCM_TIMEOUT);
+		status = tcm_tis_status(chip);
+		if ((status & TCM_STS_DATA_AVAIL) == 0)
+			break;
+
+	} while (time_before(jiffies, stop));
+
+	status = tcm_tis_status(chip);
+	if (status & TCM_STS_DATA_AVAIL) {	/* retry? */
+		dev_err(chip->dev, "Error left over data\n");
+		size = -EIO;
+		goto out;
+	}
+
+out:
+	tcm_tis_ready(chip);
+	release_locality(chip, chip->vendor.locality, 0);
+	if (size < 0)
+		tcm_handle_err(chip);
+	return size;
+}
+
+/*
+ * Send the @len-byte command in @buf to the chip and wait until the response
+ * is available.
+ *
+ * Sequence: request locality 0, bring the chip into the command-ready state,
+ * write all but the last byte to the FIFO in bursts, write the last byte,
+ * check that the chip expects no more data, write TCM_STS_GO, then poll for
+ * data-available for the duration tcm_calc_ordinal_duration() gives for the
+ * command ordinal. All waits are polled through wait_for_stat().
+ *
+ * On failure after the locality was obtained, the chip is reset through
+ * tcm_handle_err() and the whole sequence is retried up to three more times.
+ *
+ * Returns @len on success, -EBUSY if the locality cannot be obtained, or the
+ * last error (-ETIME, -EIO or a get_burstcount() error) once retries are
+ * exhausted.
+ *
+ * NOTE(review): @len == 0 would make "len - 1" wrap (size_t), and the ordinal
+ * is read from buf + 6, so callers presumably always pass at least a full
+ * header - confirm.
+ */
+static int tcm_tis_send(struct tcm_chip *chip, u8 *buf, size_t len)
+{
+ int rc, status, burstcnt;
+ size_t count = 0;
+ u32 ordinal;
+ unsigned long stop;
+ int send_again = 0;
+
+tcm_tis_send_again:
+ count = 0;
+ if (request_locality(chip, 0) < 0) {
+ dev_err(chip->dev, "send, tcm is busy\n");
+ return -EBUSY;
+ }
+ status = tcm_tis_status(chip);
+
+ /* make sure the chip is ready to accept a new command */
+ if ((status & TCM_STS_COMMAND_READY) == 0) {
+ tcm_tis_ready(chip);
+ if (wait_for_stat(chip, TCM_STS_COMMAND_READY,
+ chip->vendor.timeout_b, &chip->vendor.int_queue) < 0) {
+ dev_err(chip->dev, "send, tcm wait time out1\n");
+ rc = -ETIME;
+ goto out_err;
+ }
+ }
+
+ /* write everything except the last byte, one burst at a time */
+ while (count < len - 1) {
+ burstcnt = get_burstcount(chip);
+ if (burstcnt < 0) {
+ dev_err(chip->dev, "Unable to read burstcount\n");
+ rc = burstcnt;
+ goto out_err;
+ }
+ for (; burstcnt > 0 && count < len - 1; burstcnt--) {
+ tcm_tis_writeb(chip->dev,
+ TCM_DATA_FIFO(chip->vendor.locality), buf[count]);
+ count++;
+ }
+
+ wait_for_stat(chip, TCM_STS_VALID, chip->vendor.timeout_c,
+ &chip->vendor.int_queue);
+ }
+
+ /* write last byte */
+ tcm_tis_writeb(chip->dev,
+ TCM_DATA_FIFO(chip->vendor.locality), buf[count]);
+
+ wait_for_stat(chip, TCM_STS_VALID,
+ chip->vendor.timeout_c, &chip->vendor.int_queue);
+ /* after the last byte the chip must stop expecting data */
+ stop = jiffies + chip->vendor.timeout_c;
+ do {
+ msleep(TCM_TIMEOUT);
+ status = tcm_tis_status(chip);
+ if ((status & TCM_STS_DATA_EXPECT) == 0)
+ break;
+
+ } while (time_before(jiffies, stop));
+
+ if ((status & TCM_STS_DATA_EXPECT) != 0) {
+ dev_err(chip->dev, "send, tcm expect data\n");
+ rc = -EIO;
+ goto out_err;
+ }
+
+ /* go and do it */
+ tcm_tis_writeb(chip->dev, TCM_STS(chip->vendor.locality), TCM_STS_GO);
+
+ /* the command ordinal is the big-endian word at offset 6 */
+ ordinal = be32_to_cpu(*((__be32 *)(buf + 6)));
+ if (wait_for_stat(chip, TCM_STS_DATA_AVAIL | TCM_STS_VALID,
+ tcm_calc_ordinal_duration(chip, ordinal),
+ &chip->vendor.read_queue) < 0) {
+ dev_err(chip->dev, "send, tcm wait time out2\n");
+ rc = -ETIME;
+ goto out_err;
+ }
+
+ return len;
+
+out_err:
+ tcm_tis_ready(chip);
+ release_locality(chip, chip->vendor.locality, 0);
+ tcm_handle_err(chip);
+ /* retry the whole command up to three more times */
+ if (send_again++ < 3) {
+ goto tcm_tis_send_again;
+ }
+
+ dev_err(chip->dev, "kylin send, err: %d\n", rc);
+ return rc;
+}
+
+/*
+ * File operations of the /dev/tcm<N> misc device; all handlers come from the
+ * core tcm code. Never modified at run time, hence const.
+ */
+static const struct file_operations tis_ops = {
+	.owner = THIS_MODULE,
+	.llseek = no_llseek,
+	.open = tcm_open,
+	.read = tcm_read,
+	.write = tcm_write,
+	.release = tcm_release,
+};
+
+/* sysfs attributes: seven read-only status files plus write-only "cancel". */
+static DEVICE_ATTR(pubek, S_IRUGO, tcm_show_pubek, NULL);
+static DEVICE_ATTR(pcrs, S_IRUGO, tcm_show_pcrs, NULL);
+static DEVICE_ATTR(enabled, S_IRUGO, tcm_show_enabled, NULL);
+static DEVICE_ATTR(active, S_IRUGO, tcm_show_active, NULL);
+static DEVICE_ATTR(owned, S_IRUGO, tcm_show_owned, NULL);
+static DEVICE_ATTR(temp_deactivated, S_IRUGO, tcm_show_temp_deactivated,
+ NULL);
+static DEVICE_ATTR(caps, S_IRUGO, tcm_show_caps, NULL);
+static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tcm_store_cancel);
+
+/* NULL-terminated list of the attributes above */
+static struct attribute *tis_attrs[] = {
+ &dev_attr_pubek.attr,
+ &dev_attr_pcrs.attr,
+ &dev_attr_enabled.attr,
+ &dev_attr_active.attr,
+ &dev_attr_owned.attr,
+ &dev_attr_temp_deactivated.attr,
+ &dev_attr_caps.attr,
+ &dev_attr_cancel.attr, NULL,
+};
+
+/* group handed to the core through tcm_tis.attr_group */
+static struct attribute_group tis_attr_grp = {
+ .attrs = tis_attrs
+};
+
+static struct tcm_vendor_specific tcm_tis = {
+ .status = tcm_tis_status,
+ .recv = tcm_tis_recv,
+ .send = tcm_tis_send,
+ .cancel = tcm_tis_ready,
+ .req_complete_mask = TCM_STS_DATA_AVAIL | TCM_STS_VALID,
+ .req_complete_val = TCM_STS_DATA_AVAIL | TCM_STS_VALID,
+ .req_canceled = TCM_STS_COMMAND_READY,
+ .attr_group = &tis_attr_grp,
+ .miscdev = {
+ .fops = &tis_ops,
+ },
+};
+
+static struct tcm_chip *chip;
+/*
+ * Probe callback for the SPI-attached TCM.
+ *
+ * Allocates the per-device phy state, configures the SPI link, registers
+ * the chip with the TCM core, checks the vendor ID, reads the revision and
+ * interface capabilities, and requests locality 0.
+ *
+ * Returns 0 on success or a negative errno. Once the chip has been
+ * registered, every failure path unregisters it again through out_err.
+ */
+static int tcm_tis_spi_probe(struct spi_device *spi)
+{
+ int ret;
+ u8 revid;
+ u32 vendor, intfcaps;
+ struct tcm_tis_spi_phy *phy;
+ struct chip_data *spi_chip;
+
+ pr_info("TCM(ky): __func__(v=%d) ..\n",
+ 10);
+
+ tcm_dbg("TCM-dbg: %s/%d, enter\n", __func__, __LINE__);
+ phy = devm_kzalloc(&spi->dev, sizeof(struct tcm_tis_spi_phy),
+ GFP_KERNEL);
+ if (!phy)
+ return -ENOMEM;
+
+ phy->iobuf = devm_kmalloc(&spi->dev, MAX_SPI_FRAMESIZE, GFP_KERNEL);
+ if (!phy->iobuf)
+ return -ENOMEM;
+
+ phy->spi_device = spi;
+ init_completion(&phy->ready);
+
+ tcm_dbg("TCM-dbg: %s/%d\n", __func__, __LINE__);
+ /* init spi dev */
+ spi->chip_select = 0; /* cs0 */
+ spi->mode = SPI_MODE_0;
+ spi->bits_per_word = 8;
+ spi->max_speed_hz = spi->max_speed_hz ? : 24000000;
+ /* do not continue on a link that could not be configured */
+ ret = spi_setup(spi);
+ if (ret < 0) {
+ dev_err(&spi->dev, "spi setup failed: %d\n", ret);
+ return ret;
+ }
+
+ spi_chip = spi_get_ctldata(spi);
+ if (!spi_chip) {
+ pr_err("There was wrong in spi master\n");
+ return -ENODEV;
+ }
+ /* tcm does not support interrupt mode, we use poll mode instead. */
+ spi_chip->poll_mode = 1;
+
+ tcm_dbg("TCM-dbg: %s/%d\n", __func__, __LINE__);
+ /* register tcm hw */
+ chip = tcm_register_hardware(&spi->dev, &tcm_tis);
+ if (!chip) {
+ /* chip is NULL here, so log against the SPI device instead */
+ dev_err(&spi->dev, "tcm register hardware err\n");
+ return -ENODEV;
+ }
+
+ dev_set_drvdata(chip->dev, phy);
+
+ /**
+ * phytium2000a4 spi controller's clk level is unstable,
+ * so it is solved by using the low level of gpio output.
+ **/
+ if (is_ft_all() && (spi->chip_select == 0)) {
+ /* reuse conf reg base */
+ reuse_conf_reg = ioremap(REUSE_CONF_REG_BASE, 0x10);
+ if (!reuse_conf_reg) {
+ dev_err(&spi->dev, "Failed to ioremap reuse conf reg\n");
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ /* gpio1 a5 base addr */
+ gpio1_a5 = ioremap(REUSE_GPIO1_A5_BASE, 0x10);
+ if (!gpio1_a5) {
+ dev_err(&spi->dev, "Failed to ioremap gpio1 a5\n");
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ /* reuse cs0 to gpio1_a5 */
+ iowrite32((ioread32(reuse_conf_reg) | 0xFFFF0) & 0xFFF9004F,
+ reuse_conf_reg);
+ /* set gpio1 a5 to output */
+ iowrite32(0x20, gpio1_a5 + 0x4);
+ }
+
+ tcm_dbg("TCM-dbg: %s/%d\n",
+ __func__, __LINE__);
+ ret = tcm_tis_readl(chip->dev, TCM_DID_VID(0), &vendor);
+ if (ret < 0)
+ goto out_err;
+
+ tcm_dbg("TCM-dbg: %s/%d, vendor: 0x%x\n",
+ __func__, __LINE__, vendor);
+ if ((vendor & 0xffff) != 0x19f5 && (vendor & 0xffff) != 0x1B4E) {
+ dev_err(chip->dev, "there is no Nationz TCM on you computer\n");
+ /* ret is >= 0 here; without this the probe would report success */
+ ret = -ENODEV;
+ goto out_err;
+ }
+
+ ret = tcm_tis_readb(chip->dev, TCM_RID(0), &revid);
+ tcm_dbg("TCM-dbg: %s/%d, revid: 0x%x\n",
+ __func__, __LINE__, revid);
+ if (ret < 0)
+ goto out_err;
+ dev_info(chip->dev, "kylin: 2019-09-21 1.2 TCM "
+ "(device-id 0x%X, rev-id %d)\n",
+ vendor >> 16, revid);
+
+ /* Default timeouts */
+ chip->vendor.timeout_a = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
+ chip->vendor.timeout_b = msecs_to_jiffies(TIS_LONG_TIMEOUT);
+ chip->vendor.timeout_c = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
+ chip->vendor.timeout_d = msecs_to_jiffies(TIS_SHORT_TIMEOUT);
+
+ tcm_dbg("TCM-dbg: %s/%d\n",
+ __func__, __LINE__);
+ /* Figure out the capabilities */
+ ret = tcm_tis_readl(chip->dev,
+ TCM_INTF_CAPS(chip->vendor.locality), &intfcaps);
+ if (ret < 0)
+ goto out_err;
+
+ tcm_dbg("TCM-dbg: %s/%d, intfcaps: 0x%x\n",
+ __func__, __LINE__, intfcaps);
+ if (request_locality(chip, 0) != 0) {
+ dev_err(chip->dev, "tcm request_locality err\n");
+ ret = -ENODEV;
+ goto out_err;
+ }
+
+ INIT_LIST_HEAD(&chip->vendor.list);
+ spin_lock(&tis_lock);
+ list_add(&chip->vendor.list, &tis_chips);
+ spin_unlock(&tis_lock);
+
+ tcm_get_timeouts(chip);
+ tcm_startup(chip);
+
+ tcm_dbg("TCM-dbg: %s/%d, exit\n", __func__, __LINE__);
+ return 0;
+
+out_err:
+ if (is_ft_all()) {
+ /* clear the globals so a later probe/remove never unmaps them twice */
+ if (reuse_conf_reg) {
+ iounmap(reuse_conf_reg);
+ reuse_conf_reg = NULL;
+ }
+ if (gpio1_a5) {
+ iounmap(gpio1_a5);
+ gpio1_a5 = NULL;
+ }
+ }
+ tcm_dbg("TCM-dbg: %s/%d, error\n", __func__, __LINE__);
+ dev_set_drvdata(chip->dev, chip);
+ tcm_remove_hardware(chip->dev);
+
+ return ret;
+}
+
+/*
+ * Remove callback: unmaps the reuse-config and GPIO register mappings
+ * created in probe and unregisters the chip. Always returns 0.
+ */
+static int tcm_tis_spi_remove(struct spi_device *dev)
+{
+ if (is_ft_all()) {
+ /* clear the globals so a stale pointer is never unmapped twice */
+ if (reuse_conf_reg) {
+ iounmap(reuse_conf_reg);
+ reuse_conf_reg = NULL;
+ }
+ if (gpio1_a5) {
+ iounmap(gpio1_a5);
+ gpio1_a5 = NULL;
+ }
+ }
+
+ dev_info(&dev->dev, "%s\n", __func__);
+ dev_set_drvdata(chip->dev, chip);
+ tcm_remove_hardware(&dev->dev);
+
+ return 0;
+}
+
+static const struct acpi_device_id tcm_tis_spi_acpi_match[] = {
+ {"TCMS0001", 0},
+ {"SMO0768", 0},
+ {"ZIC0601", 0},
+ {}
+};
+MODULE_DEVICE_TABLE(acpi, tcm_tis_spi_acpi_match);
+
+static const struct spi_device_id tcm_tis_spi_id_table[] = {
+ {"SMO0768", 0},
+ {"ZIC0601", 0},
+ {}
+};
+MODULE_DEVICE_TABLE(spi, tcm_tis_spi_id_table);
+
+static struct spi_driver tcm_tis_spi_drv = {
+ .driver = {
+ .name = "tcm_tis_spi",
+ .acpi_match_table = ACPI_PTR(tcm_tis_spi_acpi_match),
+ },
+ .id_table = tcm_tis_spi_id_table,
+ .probe = tcm_tis_spi_probe,
+ .remove = tcm_tis_spi_remove,
+};
+
+#if 1
+module_spi_driver(tcm_tis_spi_drv);
+#else/*0*/
+
+static int __init __spi_driver_init(void)
+{
+ pr_info("TCM(ky): __init __func__(ver=%2d)\n",
+ 10);
+ return spi_register_driver(&tcm_tis_spi_drv);
+}
+
+static void __exit __spi_driver_exit(void)
+{
+ pr_info("TCM(ky): __exit __func__\n");
+ spi_unregister_driver(&tcm_tis_spi_drv);
+}
+
+module_init(__spi_driver_init);
+module_exit(__spi_driver_exit);
+#endif/*0*/
+
+MODULE_AUTHOR("xiongxin<xiongxin(a)tj.kylinos.cn>");
+MODULE_DESCRIPTION("TCM Driver Base Spi");
+MODULE_VERSION("6.1.0.2");
+MODULE_LICENSE("GPL");
--
2.23.0
1
1

23 Feb '21
1
0

23 Feb '21
From: Sang Yan <sangyan(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
Introduce a CPU PARK feature to save the time spent taking cpus
down and up during kexec, which may cost 250ms per cpu to go down
and 30ms per cpu to come up.
As a result, for 128 cores, it costs more than 30 seconds
to take cpus down and up during kexec. Think about 256 cores and more.
CPU PARK is a state in which a cpu stays powered on and spins in a loop,
polling for an exit condition, such as an exit address being written.
A block of memory is reserved and filled with the cpu park text section,
exit address and park-magic-flag of each cpu. In the implementation,
one page is reserved for each cpu core.
Cpus go to the park state instead of going down in machine_shutdown().
Cpus leave the park state in smp_init instead of being brought up.
Layout of one cpu park section in the pre-reserved memory block:
+--------------+
+ exit address +
+--------------+
+ park magic +
+--------------+
+ park codes +
+ . +
+ . +
+ . +
+--------------+
Signed-off-by: Sang Yan <sangyan(a)huawei.com>
---
arch/arm64/Kconfig | 12 ++
arch/arm64/include/asm/kexec.h | 6 +
arch/arm64/include/asm/smp.h | 15 +++
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/cpu-park.S | 59 ++++++++++
arch/arm64/kernel/machine_kexec.c | 2 +-
arch/arm64/kernel/process.c | 4 +
arch/arm64/kernel/smp.c | 229 ++++++++++++++++++++++++++++++++++++++
arch/arm64/mm/init.c | 55 +++++++++
9 files changed, 382 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/kernel/cpu-park.S
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b9c5654..0885668 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -345,6 +345,18 @@ config KASAN_SHADOW_OFFSET
default 0xeffffff900000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS
default 0xffffffffffffffff
+config ARM64_CPU_PARK
+ bool "Support CPU PARK on kexec"
+ depends on SMP
+ depends on KEXEC_CORE
+ help
+ This enables support for the CPU PARK feature, which
+ saves the time spent taking cpus down and up across kexec.
+ Instead of dying before the jump to the new kernel,
+ each secondary cpu parks in a spin loop and leaves
+ the loop for the new kernel's entry point during
+ smp_init.
+
source "arch/arm64/Kconfig.platforms"
menu "Kernel Features"
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index 79909ae..a133889 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -36,6 +36,11 @@
#define CRASH_ADDR_HIGH_MAX MEMBLOCK_ALLOC_ACCESSIBLE
+#ifdef CONFIG_ARM64_CPU_PARK
+/* CPU park state flag: "park" */
+#define PARK_MAGIC 0x7061726b
+#endif
+
#ifndef __ASSEMBLY__
/**
@@ -104,6 +109,7 @@ static inline void crash_post_resume(void) {}
#ifdef CONFIG_KEXEC_CORE
extern void __init reserve_crashkernel(void);
#endif
+void machine_kexec_mask_interrupts(void);
#ifdef CONFIG_KEXEC_FILE
#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 2e7f529..8c5d2d6 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -145,6 +145,21 @@ bool cpus_are_stuck_in_kernel(void);
extern void crash_smp_send_stop(void);
extern bool smp_crash_stop_failed(void);
+#ifdef CONFIG_ARM64_CPU_PARK
+#define PARK_SECTION_SIZE 1024
+struct cpu_park_info {
+ /* Physical address of reserved park memory. */
+ unsigned long start;
+ /* park reserve mem len should be PARK_SECTION_SIZE * NR_CPUS */
+ unsigned long len;
+ /* Virtual address of reserved park memory. */
+ unsigned long start_v;
+};
+extern struct cpu_park_info park_info;
+extern void enter_cpu_park(unsigned long text, unsigned long exit);
+extern void do_cpu_park(unsigned long exit);
+extern int kexec_smp_send_park(void);
+#endif
#endif /* ifndef __ASSEMBLY__ */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2621d5c..60478d2 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \
cpu-reset.o
+obj-$(CONFIG_ARM64_CPU_PARK) += cpu-park.o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o
obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o
arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
diff --git a/arch/arm64/kernel/cpu-park.S b/arch/arm64/kernel/cpu-park.S
new file mode 100644
index 0000000..10c685c
--- /dev/null
+++ b/arch/arm64/kernel/cpu-park.S
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CPU park routines
+ *
+ * Copyright (C) 2020 Huawei Technologies., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/kexec.h>
+#include <asm/sysreg.h>
+#include <asm/virt.h>
+
+.text
+.pushsection .idmap.text, "awx"
+
+/* cpu park helper in idmap section */
+SYM_CODE_START(enter_cpu_park)
+ /* Clear sctlr_el1 flags. */
+ mrs x12, sctlr_el1
+ mov_q x13, SCTLR_ELx_FLAGS
+ bic x12, x12, x13
+ pre_disable_mmu_workaround
+ msr sctlr_el1, x12 /* disable mmu */
+ isb
+
+ mov x18, x0
+ mov x0, x1 /* secondary_entry addr */
+ br x18 /* call do_cpu_park of each cpu */
+SYM_CODE_END(enter_cpu_park)
+
+.popsection
+
+SYM_CODE_START(do_cpu_park)
+ ldr x18, =PARK_MAGIC /* magic number "park" */
+ add x1, x0, #8
+ str x18, [x1] /* set on-park flag */
+ dc civac, x1 /* flush cache of "park" */
+ dsb nsh
+ isb
+
+.Lloop:
+ wfe
+ isb
+ ldr x19, [x0]
+ cmp x19, #0 /* test secondary_entry */
+ b.eq .Lloop
+
+ ic iallu /* invalidate the local I-cache */
+ dsb nsh
+ isb
+
+ br x19 /* jump to secondary_entry */
+SYM_CODE_END(do_cpu_park)
+
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index a0b144c..f47ce96 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -213,7 +213,7 @@ void machine_kexec(struct kimage *kimage)
BUG(); /* Should never get here. */
}
-static void machine_kexec_mask_interrupts(void)
+void machine_kexec_mask_interrupts(void)
{
unsigned int i;
struct irq_desc *desc;
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 73e3b32..10cffee 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -146,6 +146,10 @@ void arch_cpu_idle_dead(void)
*/
void machine_shutdown(void)
{
+#ifdef CONFIG_ARM64_CPU_PARK
+ if (kexec_smp_send_park() == 0)
+ return;
+#endif
smp_shutdown_nonboot_cpus(reboot_cpu);
}
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 18e9727..bc475d5 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -32,6 +32,8 @@
#include <linux/irq_work.h>
#include <linux/kernel_stat.h>
#include <linux/kexec.h>
+#include <linux/console.h>
+
#include <linux/kvm_host.h>
#include <asm/alternative.h>
@@ -93,6 +95,167 @@ static inline int op_cpu_kill(unsigned int cpu)
}
#endif
+#ifdef CONFIG_ARM64_CPU_PARK
+struct cpu_park_section {
+ unsigned long exit; /* exit address of park loop */
+ unsigned long magic; /* magic representing park state */
+ char text[0]; /* text section of park */
+};
+
+static int mmap_cpu_park_mem(void)
+{
+ if (!park_info.start)
+ return -ENOMEM;
+
+ if (park_info.start_v)
+ return 0;
+
+ park_info.start_v = (unsigned long)__ioremap(park_info.start,
+ park_info.len,
+ PAGE_KERNEL_EXEC);
+ if (!park_info.start_v) {
+ pr_warn("map park memory failed.");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static inline unsigned long cpu_park_section_v(unsigned int cpu)
+{
+ return park_info.start_v + PARK_SECTION_SIZE * (cpu - 1);
+}
+
+static inline unsigned long cpu_park_section_p(unsigned int cpu)
+{
+ return park_info.start + PARK_SECTION_SIZE * (cpu - 1);
+}
+
+/*
+ * Write the secondary_entry to exit section of park state.
+ * Then the secondary cpu will jump straight into the kernel
+ * by the secondary_entry.
+ */
+static int write_park_exit(unsigned int cpu)
+{
+ struct cpu_park_section *park_section;
+ unsigned long *park_exit;
+ unsigned long *park_text;
+
+ if (mmap_cpu_park_mem() != 0)
+ return -EPERM;
+
+ park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+ park_exit = &park_section->exit;
+ park_text = (unsigned long *)park_section->text;
+ pr_debug("park_text 0x%lx : 0x%lx, do_cpu_park text 0x%lx : 0x%lx",
+ (unsigned long)park_text, *park_text,
+ (unsigned long)do_cpu_park,
+ *(unsigned long *)do_cpu_park);
+
+ /*
+ * Test first 8 bytes to determine
+ * whether needs to write cpu park exit.
+ */
+ if (*park_text == *(unsigned long *)do_cpu_park) {
+ writeq_relaxed(__pa_symbol(secondary_entry), park_exit);
+ __flush_dcache_area((__force void *)park_exit,
+ sizeof(unsigned long));
+ flush_icache_range((unsigned long)park_exit,
+ (unsigned long)(park_exit + 1));
+ sev();
+ dsb(sy);
+ isb();
+
+ pr_debug("Write cpu %u secondary entry 0x%lx to 0x%lx.",
+ cpu, *park_exit, (unsigned long)park_exit);
+ pr_info("Boot cpu %u from PARK state.", cpu);
+ return 0;
+ }
+
+ return -EPERM;
+}
+
+/* Install cpu park sections for the specific cpu. */
+static int install_cpu_park(unsigned int cpu)
+{
+ struct cpu_park_section *park_section;
+ unsigned long *park_exit;
+ unsigned long *park_magic;
+ unsigned long park_text_len;
+
+ park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+ pr_debug("Install cpu park on cpu %u park exit 0x%lx park text 0x%lx",
+ cpu, (unsigned long)park_section,
+ (unsigned long)(park_section->text));
+
+ park_exit = &park_section->exit;
+ park_magic = &park_section->magic;
+ park_text_len = PARK_SECTION_SIZE - sizeof(struct cpu_park_section);
+
+ *park_exit = 0UL;
+ *park_magic = 0UL;
+ memcpy((void *)park_section->text, do_cpu_park, park_text_len);
+ __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE);
+
+ return 0;
+}
+
+static int uninstall_cpu_park(unsigned int cpu)
+{
+ unsigned long park_section;
+
+ if (mmap_cpu_park_mem() != 0)
+ return -EPERM;
+
+ park_section = cpu_park_section_v(cpu);
+ memset((void *)park_section, 0, PARK_SECTION_SIZE);
+ __flush_dcache_area((void *)park_section, PARK_SECTION_SIZE);
+
+ return 0;
+}
+
+/*
+ * Wait for @cpu to announce that it has parked.
+ *
+ * Polls the magic word of the cpu's park section up to USEC_PER_SEC times
+ * with a 1us delay between polls.
+ *
+ * Returns non-zero if PARK_MAGIC was observed, 0 otherwise.
+ */
+static int cpu_wait_park(unsigned int cpu)
+{
+ long timeout;
+ int parked;
+ struct cpu_park_section *park_section;
+
+ volatile unsigned long *park_magic;
+
+ park_section = (struct cpu_park_section *)cpu_park_section_v(cpu);
+ park_magic = &park_section->magic;
+
+ timeout = USEC_PER_SEC;
+ while (*park_magic != PARK_MAGIC && timeout--)
+ udelay(1);
+
+ /*
+ * Judge by the flag itself: timeout is already 0 when the magic shows
+ * up on the very last poll, so "timeout > 0" would log a failure even
+ * though the cpu parked and success is returned.
+ */
+ parked = (*park_magic == PARK_MAGIC);
+ if (parked)
+ pr_debug("cpu %u park done.", cpu);
+ else
+ pr_err("cpu %u park failed.", cpu);
+
+ return parked;
+}
+
+static void cpu_park(unsigned int cpu)
+{
+ unsigned long park_section_p;
+ unsigned long park_exit_phy;
+ unsigned long do_park;
+ typeof(enter_cpu_park) *park;
+
+ park_section_p = cpu_park_section_p(cpu);
+ park_exit_phy = park_section_p;
+ pr_debug("Go to park cpu %u exit address 0x%lx", cpu, park_exit_phy);
+
+ do_park = park_section_p + sizeof(struct cpu_park_section);
+ park = (void *)__pa_symbol(enter_cpu_park);
+
+ cpu_install_idmap();
+ park(do_park, park_exit_phy);
+ unreachable();
+}
+#endif
/*
* Boot a secondary CPU, and assign it the specified idle task.
@@ -102,6 +265,10 @@ static int boot_secondary(unsigned int cpu, struct task_struct *idle)
{
const struct cpu_operations *ops = get_cpu_ops(cpu);
+#ifdef CONFIG_ARM64_CPU_PARK
+ if (write_park_exit(cpu) == 0)
+ return 0;
+#endif
if (ops->cpu_boot)
return ops->cpu_boot(cpu);
@@ -131,6 +298,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
return ret;
}
+#ifdef CONFIG_ARM64_CPU_PARK
+ uninstall_cpu_park(cpu);
+#endif
/*
* CPU was successfully started, wait for it to come online or
* time out.
@@ -844,10 +1014,30 @@ void arch_irq_work_raise(void)
static void local_cpu_stop(void)
{
+ int cpu;
+ const struct cpu_operations *ops = NULL;
+
set_cpu_online(smp_processor_id(), false);
local_daif_mask();
sdei_mask_local_cpu();
+
+#ifdef CONFIG_ARM64_CPU_PARK
+ /*
+ * Go to cpu park state.
+ * Otherwise go to cpu die.
+ */
+ cpu = smp_processor_id();
+ if (kexec_in_progress && park_info.start_v) {
+ machine_kexec_mask_interrupts();
+ cpu_park(cpu);
+
+ ops = get_cpu_ops(cpu);
+ if (ops && ops->cpu_die)
+ ops->cpu_die(cpu);
+ }
+#endif
+
cpu_park_loop();
}
@@ -1053,6 +1243,45 @@ void smp_send_stop(void)
sdei_mask_local_cpu();
}
+#ifdef CONFIG_ARM64_CPU_PARK
+int kexec_smp_send_park(void)
+{
+ unsigned long cpu;
+
+ if (WARN_ON(!kexec_in_progress)) {
+ pr_crit("%s called not in kexec progress.", __func__);
+ return -EPERM;
+ }
+
+ if (mmap_cpu_park_mem() != 0) {
+ pr_info("no cpuparkmem, goto normal way.");
+ return -EPERM;
+ }
+
+ local_irq_disable();
+
+ if (num_online_cpus() > 1) {
+ cpumask_t mask;
+
+ cpumask_copy(&mask, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &mask);
+
+ for_each_cpu(cpu, &mask)
+ install_cpu_park(cpu);
+ smp_cross_call(&mask, IPI_CPU_STOP);
+
+ /* Wait for other CPUs to park */
+ for_each_cpu(cpu, &mask)
+ cpu_wait_park(cpu);
+ pr_info("smp park other cpus done\n");
+ }
+
+ sdei_mask_local_cpu();
+
+ return 0;
+}
+#endif
+
#ifdef CONFIG_KEXEC_CORE
void crash_smp_send_stop(void)
{
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 794f992..d01259c 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -236,6 +236,57 @@ static void __init fdt_enforce_memory_region(void)
memblock_add(usable_rgns[1].base, usable_rgns[1].size);
}
+#ifdef CONFIG_ARM64_CPU_PARK
+struct cpu_park_info park_info = {
+ .start = 0,
+ .len = PARK_SECTION_SIZE * NR_CPUS,
+ .start_v = 0,
+};
+
+static int __init parse_park_mem(char *p)
+{
+ if (!p)
+ return 0;
+
+ park_info.start = PAGE_ALIGN(memparse(p, NULL));
+ if (park_info.start == 0)
+ pr_info("cpu park mem params[%s]", p);
+
+ return 0;
+}
+early_param("cpuparkmem", parse_park_mem);
+
+static int __init reserve_park_mem(void)
+{
+ if (park_info.start == 0 || park_info.len == 0)
+ return 0;
+
+ park_info.start = PAGE_ALIGN(park_info.start);
+ park_info.len = PAGE_ALIGN(park_info.len);
+
+ if (!memblock_is_region_memory(park_info.start, park_info.len)) {
+ pr_warn("cannot reserve park mem: region is not memory!");
+ goto out;
+ }
+
+ if (memblock_is_region_reserved(park_info.start, park_info.len)) {
+ pr_warn("cannot reserve park mem: region overlaps reserved memory!");
+ goto out;
+ }
+
+ memblock_remove(park_info.start, park_info.len);
+ pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%ld MB)",
+ park_info.start, park_info.start + park_info.len,
+ park_info.len >> 20);
+
+ return 0;
+out:
+ park_info.start = 0;
+ park_info.len = 0;
+ return -EINVAL;
+}
+#endif
+
void __init arm64_memblock_init(void)
{
const s64 linear_region_size = BIT(vabits_actual - 1);
@@ -357,6 +408,10 @@ void __init arm64_memblock_init(void)
reserve_crashkernel();
+#ifdef CONFIG_ARM64_CPU_PARK
+ reserve_park_mem();
+#endif
+
reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
--
2.9.5
2
3

22 Feb '21
From: Sang Yan <sangyan(a)huawei.com>
hulk inclusion
category: feature
bugzilla: 48159
CVE: N/A
In normal kexec, relocating the kernel may cost 5 ~ 10 seconds, to
copy all segments from vmalloced memory to kernel boot memory,
because the mmu is disabled.
We introduce quick kexec to save the time of copying memory as above,
just like kdump (kexec on crash), by using reserved memory
"Quick Kexec".
The quick kimage is constructed in the same way as the crash kernel,
then all segments of the kimage are simply copied to reserved memory.
We also add this support to the kexec_load syscall using the
KEXEC_QUICK flag.
Signed-off-by: Sang Yan <sangyan(a)huawei.com>
---
arch/Kconfig | 10 ++++++++++
include/linux/ioport.h | 1 +
include/linux/kexec.h | 11 ++++++++++-
include/uapi/linux/kexec.h | 1 +
kernel/kexec.c | 10 ++++++++++
kernel/kexec_core.c | 42 +++++++++++++++++++++++++++++++++---------
6 files changed, 65 insertions(+), 10 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 2592b4b..7811eee 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -18,6 +18,16 @@ config KEXEC_CORE
select CRASH_CORE
bool
+config QUICK_KEXEC
+ bool "Support for quick kexec"
+ depends on KEXEC_CORE
+ help
+ It uses pre-reserved memory to accelerate kexec, just like
+ crash kexec, loads new kernel and initrd to reserved memory,
+ and boots new kernel on that memory. It will save the time
+ of relocating kernel.
+
+
config KEXEC_ELF
bool
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 5135d4b..84a716f 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -139,6 +139,7 @@ enum {
IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
IORES_DESC_RESERVED = 7,
IORES_DESC_SOFT_RESERVED = 8,
+ IORES_DESC_QUICK_KEXEC = 9,
};
/*
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index f301f2f..7fff410 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -269,9 +269,10 @@ struct kimage {
unsigned long control_page;
/* Flags to indicate special processing */
- unsigned int type : 1;
+ unsigned int type : 2;
#define KEXEC_TYPE_DEFAULT 0
#define KEXEC_TYPE_CRASH 1
+#define KEXEC_TYPE_QUICK 2
unsigned int preserve_context : 1;
/* If set, we are using file mode kexec syscall */
unsigned int file_mode:1;
@@ -331,6 +332,11 @@ extern int kexec_load_disabled;
#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
#endif
+#ifdef CONFIG_QUICK_KEXEC
+/*
+ * Extend the legal kexec_load flags with KEXEC_QUICK. Keep
+ * KEXEC_PRESERVE_CONTEXT legal when kexec jump is configured, so enabling
+ * quick kexec does not make previously accepted flags invalid.
+ * NOTE(review): CONFIG_KEXEC_JUMP is assumed to be the guard of the
+ * KEXEC_FLAGS definition just above - confirm against the full header.
+ */
+#undef KEXEC_FLAGS
+#ifdef CONFIG_KEXEC_JUMP
+#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_QUICK)
+#else
+#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_QUICK)
+#endif
+#endif
+
/* List of defined/legal kexec file flags */
#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
KEXEC_FILE_NO_INITRAMFS)
@@ -338,6 +344,9 @@ extern int kexec_load_disabled;
/* Location of a reserved region to hold the crash kernel.
*/
extern note_buf_t __percpu *crash_notes;
+#ifdef CONFIG_QUICK_KEXEC
+extern struct resource quick_kexec_res;
+#endif
/* flag to track if kexec reboot is in progress */
extern bool kexec_in_progress;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 05669c8..d891d80 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -12,6 +12,7 @@
/* kexec flags for different usage scenarios */
#define KEXEC_ON_CRASH 0x00000001
#define KEXEC_PRESERVE_CONTEXT 0x00000002
+#define KEXEC_QUICK 0x00000004
#define KEXEC_ARCH_MASK 0xffff0000
/*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c82c6c0..4acc909 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -44,6 +44,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
int ret;
struct kimage *image;
bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+#ifdef CONFIG_QUICK_KEXEC
+ bool kexec_on_quick = flags & KEXEC_QUICK;
+#endif
if (kexec_on_panic) {
/* Verify we have a valid entry point */
@@ -69,6 +72,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
image->type = KEXEC_TYPE_CRASH;
}
+#ifdef CONFIG_QUICK_KEXEC
+ if (kexec_on_quick) {
+ image->control_page = quick_kexec_res.start;
+ image->type = KEXEC_TYPE_QUICK;
+ }
+#endif
+
ret = sanity_check_segment_list(image);
if (ret)
goto out_free_image;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 2ca8875..c7e2aa2 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -53,6 +53,17 @@ note_buf_t __percpu *crash_notes;
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
+/* Resource for quick kexec */
+#ifdef CONFIG_QUICK_KEXEC
+struct resource quick_kexec_res = {
+ .name = "Quick kexec",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_QUICK_KEXEC
+};
+#endif
+
int kexec_should_crash(struct task_struct *p)
{
/*
@@ -396,8 +407,9 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
return pages;
}
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
- unsigned int order)
+static struct page *kimage_alloc_special_control_pages(struct kimage *image,
+ unsigned int order,
+ unsigned long end)
{
/* Control pages are special, they are the intermediaries
* that are needed while we copy the rest of the pages
@@ -427,7 +439,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
size = (1 << order) << PAGE_SHIFT;
hole_start = (image->control_page + (size - 1)) & ~(size - 1);
hole_end = hole_start + size - 1;
- while (hole_end <= crashk_res.end) {
+ while (hole_end <= end) {
unsigned long i;
cond_resched();
@@ -462,7 +474,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
return pages;
}
-
struct page *kimage_alloc_control_pages(struct kimage *image,
unsigned int order)
{
@@ -473,8 +484,15 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
pages = kimage_alloc_normal_control_pages(image, order);
break;
case KEXEC_TYPE_CRASH:
- pages = kimage_alloc_crash_control_pages(image, order);
+ pages = kimage_alloc_special_control_pages(image, order,
+ crashk_res.end);
+ break;
+#ifdef CONFIG_QUICK_KEXEC
+ case KEXEC_TYPE_QUICK:
+ pages = kimage_alloc_special_control_pages(image, order,
+ quick_kexec_res.end);
break;
+#endif
}
return pages;
@@ -830,11 +848,12 @@ static int kimage_load_normal_segment(struct kimage *image,
return result;
}
-static int kimage_load_crash_segment(struct kimage *image,
+static int kimage_load_special_segment(struct kimage *image,
struct kexec_segment *segment)
{
- /* For crash dumps kernels we simply copy the data from
- * user space to it's destination.
+ /*
+ * For crash dumps kernels and quick kexec kernels
+ * we simply copy the data from user space to it's destination.
* We do things a page at a time for the sake of kmap.
*/
unsigned long maddr;
@@ -908,8 +927,13 @@ int kimage_load_segment(struct kimage *image,
result = kimage_load_normal_segment(image, segment);
break;
case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
+ result = kimage_load_special_segment(image, segment);
break;
+#ifdef CONFIG_QUICK_KEXEC
+ case KEXEC_TYPE_QUICK:
+ result = kimage_load_special_segment(image, segment);
+ break;
+#endif
}
return result;
--
2.9.5
2
3