From: Xunlei Pang xlpang@linux.alibaba.com
Export "cpu|io|memory.pressure" to cgroup v1 "cpuacct" subsystem.
hulk inclusion category: feature bugzilla: 182979 https://gitee.com/openeuler/kernel/issues/I4HOX6
------------------------------------------
Reviewed-by: Joseph Qi joseph.qi@linux.alibaba.com Signed-off-by: Xunlei Pang xlpang@linux.alibaba.com Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- init/Kconfig | 10 ++++++++++ kernel/cgroup/cgroup.c | 28 ++++++++++++++++++++++++++++ kernel/sched/cpuacct.c | 10 ++++++++++ kernel/sched/psi.c | 4 ++++ kernel/sched/sched.h | 4 ++++ 5 files changed, 56 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig index 04bc46ca0b9e..0afdb08131eb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -635,6 +635,16 @@ config PSI_DEFAULT_DISABLED
Say N if unsure.
+config PSI_CGROUP_V1 + bool "Support PSI under cgroup v1" + default Y + depends on PSI + help + If set, pressure stall information tracking will be used + for cgroup v1 other than v2. + + Say N if unsure. + endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 701ef7ba4f95..86ab4a1305f6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3659,6 +3659,34 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { psi_trigger_replace(&of->priv, NULL); } + +struct cftype cgroup_v1_psi_files[] = { + { + .name = "io.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NO_PREFIX, + .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, + { } /* terminate */ +}; #endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 941c28cf9738..4e5488659339 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -374,3 +374,13 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { .legacy_cftypes = files, .early_init = true, }; + +#ifdef CONFIG_PSI +static int __init cgroup_v1_psi_init(void) +{ + cgroup_add_legacy_cftypes(&cpuacct_cgrp_subsys, cgroup_v1_psi_files); + return 0; +} + +late_initcall_sync(cgroup_v1_psi_init); +#endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index d50a31ecedee..0b48a74cbfac 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -752,7 +752,11 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter) struct cgroup *cgroup = NULL;
if (!*iter) +#ifdef CONFIG_PSI_CGROUP_V1 + cgroup = task_cgroup(task, cpuacct_cgrp_id); +#else cgroup = task->cgroups->dfl_cgrp; +#endif else if (*iter == &psi_system) return NULL; else diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4ce573eeca4c..e29f051f824f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2585,6 +2585,10 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned } #endif
+#ifdef CONFIG_PSI +extern struct cftype cgroup_v1_psi_files[]; +#endif + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
From: Joseph Qi joseph.qi@linux.alibaba.com
hulk inclusion category: feature bugzilla: 182979 https://gitee.com/openeuler/kernel/issues/I4HOX6
------------------------------------------
Instead using static kconfig CONFIG_PSI_CGROUP_V1, we introduce a boot parameter psi_v1 to enable psi cgroup v1 support. Default it is disabled
Signed-off-by: Joseph Qi joseph.qi@linux.alibaba.com Acked-by: Xunlei Pang xlpang@linux.alibaba.com Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ include/linux/psi.h | 2 ++ init/Kconfig | 10 ---------- kernel/sched/cpuacct.c | 13 +++++++++++++ kernel/sched/psi.c | 14 +++++++------- 5 files changed, 26 insertions(+), 17 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b0b9265fe5f3..8eca743da732 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4085,6 +4085,10 @@ tracking. Format: <bool>
+ psi_v1= [KNL] Enable or disable pressure stall information + tracking on cgroup v1. + Format: <bool> + psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to probe for; one of (bare|imps|exps|lifebook|any). psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports diff --git a/include/linux/psi.h b/include/linux/psi.h index 7361023f3fdd..8f59276b566b 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -14,6 +14,8 @@ struct css_set; extern struct static_key_false psi_disabled; extern struct psi_group psi_system;
+extern struct static_key_false psi_v1_disabled; + void psi_init(void);
void psi_task_change(struct task_struct *task, int clear, int set); diff --git a/init/Kconfig b/init/Kconfig index 0afdb08131eb..04bc46ca0b9e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -635,16 +635,6 @@ config PSI_DEFAULT_DISABLED
Say N if unsure.
-config PSI_CGROUP_V1 - bool "Support PSI under cgroup v1" - default Y - depends on PSI - help - If set, pressure stall information tracking will be used - for cgroup v1 other than v2. - - Say N if unsure. - endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4e5488659339..60a95eb67d9b 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -376,8 +376,21 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { };
#ifdef CONFIG_PSI + +static bool psi_v1_enable; +static int __init setup_psi_v1(char *str) +{ + return kstrtobool(str, &psi_v1_enable) == 0; +} +__setup("psi_v1=", setup_psi_v1); + static int __init cgroup_v1_psi_init(void) { + if (!psi_v1_enable) { + static_branch_enable(&psi_v1_disabled); + return 0; + } + cgroup_add_legacy_cftypes(&cpuacct_cgrp_subsys, cgroup_v1_psi_files); return 0; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0b48a74cbfac..9f363830e40d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -145,6 +145,7 @@ static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled); +DEFINE_STATIC_KEY_FALSE(psi_v1_disabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED static bool psi_enable; @@ -751,13 +752,12 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter) #ifdef CONFIG_CGROUPS struct cgroup *cgroup = NULL;
- if (!*iter) -#ifdef CONFIG_PSI_CGROUP_V1 - cgroup = task_cgroup(task, cpuacct_cgrp_id); -#else - cgroup = task->cgroups->dfl_cgrp; -#endif - else if (*iter == &psi_system) + if (!*iter) { + if (static_branch_likely(&psi_v1_disabled)) + cgroup = task->cgroups->dfl_cgrp; + else + cgroup = task_cgroup(task, cpuacct_cgrp_id); + } else if (*iter == &psi_system) return NULL; else cgroup = cgroup_parent(*iter);
From: Steve Sistare steven.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
Provide struct sparsemask and functions to manipulate it. A sparsemask is a sparse bitmap. It reduces cache contention vs the usual bitmap when many threads concurrently set, clear, and visit elements, by reducing the number of significant bits per cacheline. For each cacheline chunk of the mask, only the first K bits of the first word are used, and the remaining bits are ignored, where K is a creation time parameter. Thus a sparsemask that can represent a set of N elements is approximately (N/K * CACHELINE) bytes in size.
This type is simpler and more efficient than the struct sbitmap used by block drivers.
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/sparsemask.h | 210 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 kernel/sched/sparsemask.h
diff --git a/kernel/sched/sparsemask.h b/kernel/sched/sparsemask.h new file mode 100644 index 000000000000..11948620a1a2 --- /dev/null +++ b/kernel/sched/sparsemask.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sparsemask.h - sparse bitmap operations + * + * Copyright (c) 2018 Oracle Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __LINUX_SPARSEMASK_H +#define __LINUX_SPARSEMASK_H + +#include <linux/kernel.h> +#include <linux/bitmap.h> +#include <linux/bug.h> + +/* + * A sparsemask is a sparse bitmap. It reduces cache contention vs the usual + * bitmap when many threads concurrently set, clear, and visit elements. For + * each cacheline chunk of the mask, only the first K bits of the first word are + * used, and the remaining bits are ignored, where K is a creation time + * parameter. Thus a sparsemask that can represent a set of N elements is + * approximately (N/K * CACHELINE) bytes in size. + * + * Clients pass and receive element numbers in the public API, and the + * implementation translates them to bit numbers to perform the bitmap + * operations. + */ + +struct sparsemask_chunk { + unsigned long word; /* the significant bits */ +} ____cacheline_aligned_in_smp; + +struct sparsemask { + short nelems; /* current number of elements */ + short density; /* store 2^density elements per chunk */ + struct sparsemask_chunk chunks[0]; /* embedded array of chunks */ +}; + +#define _SMASK_INDEX(density, elem) ((elem) >> (density)) +#define _SMASK_BIT(density, elem) ((elem) & ((1U << (density)) - 1U)) +#define SMASK_INDEX(mask, elem) _SMASK_INDEX((mask)->density, elem) +#define SMASK_BIT(mask, elem) _SMASK_BIT((mask)->density, elem) +#define SMASK_WORD(mask, elem) \ + (&(mask)->chunks[SMASK_INDEX((mask), (elem))].word) + +/* + * sparsemask_next() - Return the next one bit in a bitmap, starting at a + * specified position and wrapping from the last bit to the first, up to but + * not including a specified origin. This is a helper, so do not call it + * directly. + * + * @mask: Bitmap to search. + * @origin: Origin. + * @prev: Previous bit. Start search after this bit number. + * If -1, start search at @origin. + * + * Return: the bit number, else mask->nelems if no bits are set in the range. + */ +static inline int +sparsemask_next(const struct sparsemask *mask, int origin, int prev) +{ + int density = mask->density; + int bits_per_word = 1U << density; + const struct sparsemask_chunk *chunk; + int nelems = mask->nelems; + int next, bit, nbits; + unsigned long word; + + /* Calculate number of bits to be searched. */ + if (prev == -1) { + nbits = nelems; + next = origin; + } else if (prev < origin) { + nbits = origin - prev; + next = prev + 1; + } else { + nbits = nelems - prev + origin - 1; + next = prev + 1; + } + + if (unlikely(next >= nelems)) + return nelems; + + /* + * Fetch and adjust first word. Clear word bits below @next, and round + * @next down to @bits_per_word boundary because later ffs will add + * those bits back. + */ + chunk = &mask->chunks[_SMASK_INDEX(density, next)]; + bit = _SMASK_BIT(density, next); + word = chunk->word & (~0UL << bit); + next -= bit; + nbits += bit; + + while (!word) { + next += bits_per_word; + nbits -= bits_per_word; + if (nbits <= 0) + return nelems; + + if (next >= nelems) { + chunk = mask->chunks; + nbits -= (next - nelems); + next = 0; + } else { + chunk++; + } + word = chunk->word; + } + + next += __ffs(word); + if (next >= origin && prev != -1) + return nelems; + return next; +} + +/****************** The public API ********************/ + +/* + * Max value for the density parameter, limited by 64 bits in the chunk word. + */ +#define SMASK_DENSITY_MAX 6 + +/* + * Return bytes to allocate for a sparsemask, for custom allocators. + */ +static inline size_t sparsemask_size(int nelems, int density) +{ + int index = _SMASK_INDEX(density, nelems) + 1; + + return offsetof(struct sparsemask, chunks[index]); +} + +/* + * Initialize an allocated sparsemask, for custom allocators. + */ +static inline void +sparsemask_init(struct sparsemask *mask, int nelems, int density) +{ + WARN_ON(density < 0 || density > SMASK_DENSITY_MAX || nelems < 0); + mask->nelems = nelems; + mask->density = density; +} + +/* + * sparsemask_alloc_node() - Allocate, initialize, and return a sparsemask. + * + * @nelems - maximum number of elements. + * @density - store 2^density elements per cacheline chunk. + * values from 0 to SMASK_DENSITY_MAX inclusive. + * @flags - kmalloc allocation flags + * @node - numa node + */ +static inline struct sparsemask * +sparsemask_alloc_node(int nelems, int density, gfp_t flags, int node) +{ + int nbytes = sparsemask_size(nelems, density); + struct sparsemask *mask = kmalloc_node(nbytes, flags, node); + + if (mask) + sparsemask_init(mask, nelems, density); + return mask; +} + +static inline void sparsemask_free(struct sparsemask *mask) +{ + kfree(mask); +} + +static inline void sparsemask_set_elem(struct sparsemask *dst, int elem) +{ + set_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem)); +} + +static inline void sparsemask_clear_elem(struct sparsemask *dst, int elem) +{ + clear_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem)); +} + +static inline int sparsemask_test_elem(const struct sparsemask *mask, int elem) +{ + return test_bit(SMASK_BIT(mask, elem), SMASK_WORD(mask, elem)); +} + +/* + * sparsemask_for_each() - iterate over each set bit in a bitmap, starting at a + * specified position, and wrapping from the last bit to the first. + * + * @mask: Bitmap to iterate over. + * @origin: Bit number at which to start searching. + * @elem: Iterator. Can be signed or unsigned integer. + * + * The implementation does not assume any bit in @mask is set, including + * @origin. After the loop, @elem = @mask->nelems. + */ +#define sparsemask_for_each(mask, origin, elem) \ + for ((elem) = -1; \ + (elem) = sparsemask_next((mask), (origin), (elem)), \ + (elem) < (mask)->nelems;) + +#endif /* __LINUX_SPARSEMASK_H */
From: Steve Sistare steven.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
Add functions sd_llc_alloc_all() and sd_llc_free_all() to allocate and free data pointed to by struct sched_domain_shared at the last-level-cache domain. sd_llc_alloc_all() is called after the SD hierarchy is known, to eliminate the unnecessary allocations that would occur if we instead allocated in __sdt_alloc() and then figured out which shared nodes are redundant.
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/topology.c | 75 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dd7770226086..b6a4a767da3a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -10,6 +10,12 @@ DEFINE_MUTEX(sched_domains_mutex); static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2;
+struct s_data; +static int sd_llc_alloc(struct sched_domain *sd); +static void sd_llc_free(struct sched_domain *sd); +static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d); +static void sd_llc_free_all(const struct cpumask *cpu_map); + #ifdef CONFIG_SCHED_DEBUG
static int __init sched_debug_setup(char *str) @@ -596,8 +602,10 @@ static void destroy_sched_domain(struct sched_domain *sd) */ free_sched_groups(sd->groups, 1);
- if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) { + sd_llc_free(sd); kfree(sd->shared); + } kfree(sd); }
@@ -1238,6 +1246,7 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_percpu(d->sd); fallthrough; case sa_sd_storage: + sd_llc_free_all(cpu_map); __sdt_free(cpu_map); fallthrough; case sa_none: @@ -1849,6 +1858,62 @@ static void __sdt_free(const struct cpumask *cpu_map) } }
+static int sd_llc_alloc(struct sched_domain *sd) +{ + /* Allocate sd->shared data here. Empty for now. */ + + return 0; +} + +static void sd_llc_free(struct sched_domain *sd) +{ + struct sched_domain_shared *sds = sd->shared; + + if (!sds) + return; + + /* Free data here. Empty for now. */ +} + +static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d) +{ + struct sched_domain *sd, *hsd; + int i; + + for_each_cpu(i, cpu_map) { + /* Find highest domain that shares resources */ + hsd = NULL; + for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) + break; + hsd = sd; + } + if (hsd && sd_llc_alloc(hsd)) + return 1; + } + + return 0; +} + +static void sd_llc_free_all(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + struct sched_domain *sd; + struct sd_data *sdd; + int j; + + for_each_sd_topology(tl) { + sdd = &tl->data; + if (!sdd) + continue; + for_each_cpu(j, cpu_map) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd) + sd_llc_free(sd); + } + } +} + static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int dflags, int cpu) @@ -2049,6 +2114,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } }
+ /* + * Allocate shared sd data at last level cache. Must be done after + * domains are built above, but before the data is used in + * cpu_attach_domain and descendants below. + */ + if (sd_llc_alloc_all(cpu_map, &d)) + goto error; + /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) {
From: Steve Sistare steve.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
Define and initialize a sparse bitmap of overloaded CPUs, per last-level-cache scheduling domain, for use by the CFS scheduling class. Save a pointer to cfs_overload_cpus in the rq for efficient access.
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched/topology.h | 1 + kernel/sched/sched.h | 2 ++ kernel/sched/topology.c | 25 +++++++++++++++++++++++-- 3 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 9ef7bf686a9f..c98908eb7c24 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -74,6 +74,7 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; + struct sparsemask *cfs_overload_cpus; };
struct sched_domain { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e29f051f824f..97f54c881baf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -86,6 +86,7 @@
struct rq; struct cpuidle_state; +struct sparsemask;
/* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 @@ -938,6 +939,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; + struct sparsemask *cfs_overload_cpus;
#ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b6a4a767da3a..486965561754 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,6 +3,7 @@ * Scheduler topology setup/handling methods */ #include "sched.h" +#include "sparsemask.h"
DEFINE_MUTEX(sched_domains_mutex);
@@ -646,7 +647,9 @@ DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu) { + struct sparsemask *cfs_overload_cpus = NULL; struct sched_domain_shared *sds = NULL; + struct rq *rq = cpu_rq(cpu); struct sched_domain *sd; int id = cpu; int size = 1; @@ -656,8 +659,10 @@ static void update_top_cache_domain(int cpu) id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); sds = sd->shared; + cfs_overload_cpus = sds->cfs_overload_cpus; }
+ rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; @@ -1860,7 +1865,22 @@ static void __sdt_free(const struct cpumask *cpu_map)
static int sd_llc_alloc(struct sched_domain *sd) { - /* Allocate sd->shared data here. Empty for now. */ + struct sched_domain_shared *sds = sd->shared; + struct cpumask *span = sched_domain_span(sd); + int nid = cpu_to_node(cpumask_first(span)); + int flags = __GFP_ZERO | GFP_KERNEL; + struct sparsemask *mask; + + /* + * Allocate the bitmap if not already allocated. This is called for + * every CPU in the LLC but only allocates once per sd_llc_shared. + */ + if (!sds->cfs_overload_cpus) { + mask = sparsemask_alloc_node(nr_cpu_ids, 3, flags, nid); + if (!mask) + return 1; + sds->cfs_overload_cpus = mask; + }
return 0; } @@ -1872,7 +1892,8 @@ static void sd_llc_free(struct sched_domain *sd) if (!sds) return;
- /* Free data here. Empty for now. */ + sparsemask_free(sds->cfs_overload_cpus); + sds->cfs_overload_cpus = NULL; }
static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
From: Steve Sistare steven.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
An overloaded CPU has more than 1 runnable task. When a CFS task wakes on a CPU, if h_nr_running transitions from 1 to more, then set the CPU in the cfs_overload_cpus bitmap. When a CFS task sleeps, if h_nr_running transitions from 2 to less, then clear the CPU in cfs_overload_cpus.
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 54 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 27641d5c6db2..450d16e5651b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,7 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "sparsemask.h"
/* * Targeted preemption latency for CPU-bound tasks: @@ -4152,6 +4153,28 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); }
+static void overload_clear(struct rq *rq) +{ + struct sparsemask *overload_cpus; + + rcu_read_lock(); + overload_cpus = rcu_dereference(rq->cfs_overload_cpus); + if (overload_cpus) + sparsemask_clear_elem(overload_cpus, rq->cpu); + rcu_read_unlock(); +} + +static void overload_set(struct rq *rq) +{ + struct sparsemask *overload_cpus; + + rcu_read_lock(); + overload_cpus = rcu_dereference(rq->cfs_overload_cpus); + if (overload_cpus) + sparsemask_set_elem(overload_cpus, rq->cpu); + rcu_read_unlock(); +} + #else /* CONFIG_SMP */
#define UPDATE_TG 0x0 @@ -4175,6 +4198,9 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) return 0; }
+static inline void overload_clear(struct rq *rq) {} +static inline void overload_set(struct rq *rq) {} + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
@@ -4877,6 +4903,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); + unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; @@ -4931,8 +4958,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue = 0; }
- if (!se) + if (!se) { sub_nr_running(rq, task_delta); + if (prev_nr >= 2 && prev_nr - task_delta < 2) + overload_clear(rq); + }
/* * Note: distribution will already see us throttled via the @@ -4946,6 +4976,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); + unsigned int prev_nr = rq->cfs.h_nr_running; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; @@ -5007,6 +5038,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); + if (prev_nr < 2 && prev_nr + task_delta >= 2) + overload_set(rq);
unthrottle_throttle: /* @@ -5618,6 +5651,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); int task_new = !(flags & ENQUEUE_WAKEUP); + unsigned int prev_nr = rq->cfs.h_nr_running;
/* * The code below (indirectly) updates schedutil which looks at @@ -5675,6 +5709,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); + if (prev_nr == 1) + overload_set(rq);
/* * Since new tasks are assigned an initial util_avg equal to @@ -5727,6 +5763,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); + unsigned int prev_nr = rq->cfs.h_nr_running; bool was_sched_idle = sched_idle_rq(rq);
util_est_dequeue(&rq->cfs, p); @@ -5775,6 +5812,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); + if (prev_nr == 2) + overload_clear(rq);
/* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) @@ -7133,6 +7172,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct sched_entity *se; + unsigned int prev_nr = cfs_rq->h_nr_running; long task_delta, idle_task_delta, dequeue = 1;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -7159,8 +7199,12 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) dequeue = 0; }
- if (!se) + if (!se) { sub_nr_running(rq, task_delta); + if (prev_nr >= 2 && prev_nr - task_delta < 2) + overload_clear(rq); + + }
cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -7174,6 +7218,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; + unsigned int prev_nr = cfs_rq->h_nr_running; long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)]; @@ -7209,8 +7254,11 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
assert_list_leaf_cfs_rq(rq);
- if (!se) + if (!se) { add_nr_running(rq, task_delta); + if (prev_nr < 2 && prev_nr + task_delta >= 2) + overload_set(rq); + }
/* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running)
From: Steve Sistare steven.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
Move the update of idle_stamp from idle_balance to the call site in pick_next_task_fair, to prepare for a future patch that adds work to pick_next_task_fair which must be included in the idle_stamp interval. No functional change.
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 450d16e5651b..4b0ca4173bc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4153,6 +4153,16 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); }
+static inline void rq_idle_stamp_update(struct rq *rq) +{ + rq->idle_stamp = rq_clock(rq); +} + +static inline void rq_idle_stamp_clear(struct rq *rq) +{ + rq->idle_stamp = 0; +} + static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus; @@ -4198,6 +4208,8 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) return 0; }
+static inline void rq_idle_stamp_update(struct rq *rq) {} +static inline void rq_idle_stamp_clear(struct rq *rq) {} static inline void overload_clear(struct rq *rq) {} static inline void overload_set(struct rq *rq) {}
@@ -7437,8 +7449,18 @@ done: __maybe_unused; if (!rf) return NULL;
+ /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + rq_idle_stamp_update(rq); + new_tasks = newidle_balance(rq, rf);
+ if (new_tasks) + rq_idle_stamp_clear(rq); + + /* * Because newidle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -10906,11 +10928,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) u64 curr_cost = 0;
update_misfit_status(NULL, this_rq); - /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. - */ - this_rq->idle_stamp = rq_clock(this_rq);
/* * Do not pull tasks towards !active CPUs... @@ -11000,9 +11017,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1;
- if (pulled_task) - this_rq->idle_stamp = 0; - rq_repin_lock(this_rq, rf);
return pulled_task;
From: Steve Sistare steven.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
The detach_task function takes a struct lb_env argument, but only needs a few of its members. Pass the rq and cpu arguments explicitly so the function may be called from code that is not based on lb_env. No functional change.
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b0ca4173bc3..5653a3a5cd0e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7943,14 +7943,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) }
/* - * detach_task() -- detach the task for the migration specified in env + * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. */ -static void detach_task(struct task_struct *p, struct lb_env *env) +static void detach_task(struct task_struct *p, struct rq *src_rq, int dst_cpu) { - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_held(&src_rq->lock);
- deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); - set_task_cpu(p, env->dst_cpu); + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, dst_cpu); }
/* @@ -7970,7 +7970,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) if (!can_migrate_task(p, env)) continue;
- detach_task(p, env); + detach_task(p, env->src_rq, env->dst_cpu);
/* * Right now, this is only the second place where @@ -8078,7 +8078,7 @@ static int detach_tasks(struct lb_env *env) break; }
- detach_task(p, env); + detach_task(p, env->src_rq, env->dst_cpu); list_add(&p->se.group_node, &env->tasks);
detached++;
From: Steve Sistare steven.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
Define a simpler version of can_migrate_task called can_migrate_task_llc which does not require a struct lb_env argument, and judges whether a migration from one CPU to another within the same LLC should be allowed.
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5653a3a5cd0e..7bc282428cc4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7942,6 +7942,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; }
+/* + * Return true if task @p can migrate from @rq to @dst_rq in the same LLC. + * No need to test for co-locality, and no need to test task_hot(), as sharing + * LLC provides cache warmth at that level. + */ +static bool +can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq) +{ + int dst_cpu = dst_rq->cpu; + + lockdep_assert_held(&rq->lock); + + if (throttled_lb_pair(task_group(p), cpu_of(rq), dst_cpu)) + return false; + + if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) { + schedstat_inc(p->se.statistics.nr_failed_migrations_affine); + return false; + } + + if (task_running(rq, p)) { + schedstat_inc(p->se.statistics.nr_failed_migrations_running); + return false; + } + + return true; +} + /* * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. */
From: Steve Sistare steven.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
When a CPU has no more CFS tasks to run, and idle_balance() fails to find a task, then attempt to steal a task from an overloaded CPU in the same LLC, using the cfs_overload_cpus bitmap to efficiently identify candidates. To minimize search time, steal the first migratable task that is found when the bitmap is traversed. For fairness, search for migratable tasks on an overloaded CPU in order of next to run.
This simple stealing yields a higher CPU utilization than idle_balance() alone, because the search is cheap, so it may be called every time the CPU is about to go idle. idle_balance() does more work because it searches widely for the busiest queue, so to limit its CPU consumption, it declines to search if the system is too busy. Simple stealing does not offload the globally busiest queue, but it is much better than running nothing at all.
Stealing is controlled by the sched feature SCHED_STEAL, which is enabled by default.
Stealing imprroves utilization with only a modest CPU overhead in scheduler code. In the following experiment, hackbench is run with varying numbers of groups (40 tasks per group), and the delta in /proc/schedstat is shown for each run, averaged per CPU, augmented with these non-standard stats:
%find - percent of time spent in old and new functions that search for idle CPUs and tasks to steal and set the overloaded CPUs bitmap.
steal - number of times a task is stolen from another CPU.
X6-2: 1 socket * 10 cores * 2 hyperthreads = 20 CPUs Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz hackbench <grps> process 100000 sched_wakeup_granularity_ns=15000000
baseline grps time %busy slice sched idle wake %find steal 1 8.084 75.02 0.10 105476 46291 59183 0.31 0 2 13.892 85.33 0.10 190225 70958 119264 0.45 0 3 19.668 89.04 0.10 263896 87047 176850 0.49 0 4 25.279 91.28 0.10 322171 94691 227474 0.51 0 8 47.832 94.86 0.09 630636 144141 486322 0.56 0
new grps time %busy slice sched idle wake %find steal %speedup 1 5.938 96.80 0.24 31255 7190 24061 0.63 7433 36.1 2 11.491 99.23 0.16 74097 4578 69512 0.84 19463 20.9 3 16.987 99.66 0.15 115824 1985 113826 0.77 24707 15.8 4 22.504 99.80 0.14 167188 2385 164786 0.75 29353 12.3 8 44.441 99.86 0.11 389153 1616 387401 0.67 38190 7.6
Elapsed time improves by 8 to 36%, and CPU busy utilization is up by 5 to 22% hitting 99% for 2 or more groups (80 or more tasks). The cost is at most 0.4% more find time.
Additional performance results follow. A negative "speedup" is a regression. Note: for all hackbench runs, sched_wakeup_granularity_ns is set to 15 msec. Otherwise, preemptions increase at higher loads and distort the comparison between baseline and new.
------------------ 1 Socket Results ------------------
X6-2: 1 socket * 10 cores * 2 hyperthreads = 20 CPUs Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz Average of 10 runs of: hackbench <groups> process 100000
--- base -- --- new --- groups time %stdev time %stdev %speedup 1 8.008 0.1 5.905 0.2 35.6 2 13.814 0.2 11.438 0.1 20.7 3 19.488 0.2 16.919 0.1 15.1 4 25.059 0.1 22.409 0.1 11.8 8 47.478 0.1 44.221 0.1 7.3
X6-2: 1 socket * 22 cores * 2 hyperthreads = 44 CPUs Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz Average of 10 runs of: hackbench <groups> process 100000
--- base -- --- new --- groups time %stdev time %stdev %speedup 1 4.586 0.8 4.596 0.6 -0.3 2 7.693 0.2 5.775 1.3 33.2 3 10.442 0.3 8.288 0.3 25.9 4 13.087 0.2 11.057 0.1 18.3 8 24.145 0.2 22.076 0.3 9.3 16 43.779 0.1 41.741 0.2 4.8
KVM 4-cpu Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz tbench, average of 11 runs
clients %speedup 1 16.2 2 11.7 4 9.9 8 12.8 16 13.7
KVM 2-cpu Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
Benchmark %speedup specjbb2015_critical_jops 5.7 mysql_sysb1.0.14_mutex_2 40.6 mysql_sysb1.0.14_oltp_2 3.9
------------------ 2 Socket Results ------------------
X6-2: 2 sockets * 10 cores * 2 hyperthreads = 40 CPUs Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz Average of 10 runs of: hackbench <groups> process 100000
--- base -- --- new --- groups time %stdev time %stdev %speedup 1 7.945 0.2 7.219 8.7 10.0 2 8.444 0.4 6.689 1.5 26.2 3 12.100 1.1 9.962 2.0 21.4 4 15.001 0.4 13.109 1.1 14.4 8 27.960 0.2 26.127 0.3 7.0
X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz Average of 10 runs of: hackbench <groups> process 100000
--- base -- --- new --- groups time %stdev time %stdev %speedup 1 5.826 5.4 5.840 5.0 -0.3 2 5.041 5.3 6.171 23.4 -18.4 3 6.839 2.1 6.324 3.8 8.1 4 8.177 0.6 7.318 3.6 11.7 8 14.429 0.7 13.966 1.3 3.3 16 26.401 0.3 25.149 1.5 4.9
X6-2: 2 sockets * 22 cores * 2 hyperthreads = 88 CPUs Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz Oracle database OLTP, logging disabled, NVRAM storage
Customers Users %speedup 1200000 40 -1.2 2400000 80 2.7 3600000 120 8.9 4800000 160 4.4 6000000 200 3.0
X6-2: 2 sockets * 14 cores * 2 hyperthreads = 56 CPUs Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz Results from the Oracle "Performance PIT".
Benchmark %speedup
mysql_sysb1.0.14_fileio_56_rndrd 19.6 mysql_sysb1.0.14_fileio_56_seqrd 12.1 mysql_sysb1.0.14_fileio_56_rndwr 0.4 mysql_sysb1.0.14_fileio_56_seqrewr -0.3
pgsql_sysb1.0.14_fileio_56_rndrd 19.5 pgsql_sysb1.0.14_fileio_56_seqrd 8.6 pgsql_sysb1.0.14_fileio_56_rndwr 1.0 pgsql_sysb1.0.14_fileio_56_seqrewr 0.5
opatch_time_ASM_12.2.0.1.0_HP2M 7.5 select-1_users-warm_asmm_ASM_12.2.0.1.0_HP2M 5.1 select-1_users_asmm_ASM_12.2.0.1.0_HP2M 4.4 swingbenchv3_asmm_soebench_ASM_12.2.0.1.0_HP2M 5.8
lm3_memlat_L2 4.8 lm3_memlat_L1 0.0
ub_gcc_56CPUs-56copies_Pipe-based_Context_Switching 60.1 ub_gcc_56CPUs-56copies_Shell_Scripts_1_concurrent 5.2 ub_gcc_56CPUs-56copies_Shell_Scripts_8_concurrent -3.0 ub_gcc_56CPUs-56copies_File_Copy_1024_bufsize_2000_maxblocks 2.4
X5-2: 2 sockets * 18 cores * 2 hyperthreads = 72 CPUs Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
NAS_OMP bench class ncpu %improved(Mops) dc B 72 1.3 is C 72 0.9 is D 72 0.7
sysbench mysql, average of 24 runs --- base --- --- new --- nthr events %stdev events %stdev %speedup 1 331.0 0.25 331.0 0.24 -0.1 2 661.3 0.22 661.8 0.22 0.0 4 1297.0 0.88 1300.5 0.82 0.2 8 2420.8 0.04 2420.5 0.04 -0.1 16 4826.3 0.07 4825.4 0.05 -0.1 32 8815.3 0.27 8830.2 0.18 0.1 64 12823.0 0.24 12823.6 0.26 0.0
-------------------------------------------------------------
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 164 ++++++++++++++++++++++++++++++++++++++-- kernel/sched/features.h | 6 ++ 2 files changed, 165 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7bc282428cc4..9ad4399eb24f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4167,6 +4167,9 @@ static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus;
+ if (!sched_feat(STEAL)) + return; + rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) @@ -4178,6 +4181,9 @@ static void overload_set(struct rq *rq) { struct sparsemask *overload_cpus;
+ if (!sched_feat(STEAL)) + return; + rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) @@ -4185,6 +4191,8 @@ static void overload_set(struct rq *rq) rcu_read_unlock(); }
+static int try_steal(struct rq *this_rq, struct rq_flags *rf); + #else /* CONFIG_SMP */
#define UPDATE_TG 0x0 @@ -7450,21 +7458,23 @@ done: __maybe_unused; return NULL;
/* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. + * We must set idle_stamp _before_ calling try_steal() or + * idle_balance(), such that we measure the duration as idle time. */ rq_idle_stamp_update(rq);
new_tasks = newidle_balance(rq, rf); + if (new_tasks == 0) + new_tasks = try_steal(rq, rf);
if (new_tasks) rq_idle_stamp_clear(rq);
/* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is - * possible for any higher priority task to appear. In that case we - * must re-start the pick_next_entity() loop. + * Because try_steal() and idle_balance() release (and re-acquire) + * rq->lock, it is possible for any higher priority task to appear. + * In that case we must re-start the pick_next_entity() loop. */ if (new_tasks < 0) return RETRY_TASK; @@ -11091,6 +11101,150 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); }
+/* + * Search the runnable tasks in @cfs_rq in order of next to run, and find + * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry. + * On success, dequeue the task from @cfs_rq and return it, else return NULL. + */ +static struct task_struct * +detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq) +{ + int dst_cpu = dst_rq->cpu; + struct task_struct *p; + struct rq *rq = rq_of(cfs_rq); + + lockdep_assert_held(&rq_of(cfs_rq)->lock); + + list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) { + if (can_migrate_task_llc(p, rq, dst_rq)) { + detach_task(p, rq, dst_cpu); + return p; + } + } + return NULL; +} + +/* + * Attempt to migrate a CFS task from @src_cpu to @dst_rq. @locked indicates + * whether @dst_rq is already locked on entry. This function may lock or + * unlock @dst_rq, and updates @locked to indicate the locked state on return. + * The locking protocol is based on idle_balance(). + * Returns 1 on success and 0 on failure. + */ +static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, + int src_cpu) +{ + struct task_struct *p; + struct rq_flags rf; + int stolen = 0; + int dst_cpu = dst_rq->cpu; + struct rq *src_rq = cpu_rq(src_cpu); + + if (dst_cpu == src_cpu || src_rq->cfs.h_nr_running < 2) + return 0; + + if (*locked) { + rq_unpin_lock(dst_rq, dst_rf); + raw_spin_unlock(&dst_rq->lock); + *locked = false; + } + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.h_nr_running < 2 || !cpu_active(src_cpu)) + p = NULL; + else + p = detach_next_task(&src_rq->cfs, dst_rq); + + rq_unlock(src_rq, &rf); + + if (p) { + raw_spin_lock(&dst_rq->lock); + rq_repin_lock(dst_rq, dst_rf); + *locked = true; + update_rq_clock(dst_rq); + attach_task(dst_rq, p); + stolen = 1; + } + local_irq_restore(rf.flags); + + return stolen; +} + +/* + * Conservative upper bound on the max cost of a steal, in nsecs (the typical + * cost is 1-2 microsec). Do not steal if average idle time is less. + */ +#define SCHED_STEAL_COST 10000 + +/* + * Try to steal a runnable CFS task from a CPU in the same LLC as @dst_rq, + * and migrate it to @dst_rq. rq_lock is held on entry and return, but + * may be dropped in between. Return 1 on success, 0 on failure, and -1 + * if a task in a different scheduling class has become runnable on @dst_rq. + */ +static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) +{ + int src_cpu; + int dst_cpu = dst_rq->cpu; + bool locked = true; + int stolen = 0; + struct sparsemask *overload_cpus; + + if (!sched_feat(STEAL)) + return 0; + + if (!cpu_active(dst_cpu)) + return 0; + + if (dst_rq->avg_idle < SCHED_STEAL_COST) + return 0; + + /* Get bitmap of overloaded CPUs in the same LLC as @dst_rq */ + + rcu_read_lock(); + overload_cpus = rcu_dereference(dst_rq->cfs_overload_cpus); + if (!overload_cpus) { + rcu_read_unlock(); + return 0; + } + +#ifdef CONFIG_SCHED_SMT + /* + * First try overloaded CPUs on the same core to preserve cache warmth. + */ + if (static_branch_likely(&sched_smt_present)) { + for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) { + if (sparsemask_test_elem(overload_cpus, src_cpu) && + steal_from(dst_rq, dst_rf, &locked, src_cpu)) { + stolen = 1; + goto out; + } + } + } +#endif /* CONFIG_SCHED_SMT */ + + /* Accept any suitable task in the LLC */ + + sparsemask_for_each(overload_cpus, dst_cpu, src_cpu) { + if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) { + stolen = 1; + goto out; + } + } + +out: + rcu_read_unlock(); + if (!locked) { + raw_spin_lock(&dst_rq->lock); + rq_repin_lock(dst_rq, dst_rf); + } + stolen |= (dst_rq->cfs.h_nr_running > 0); + if (dst_rq->nr_running != dst_rq->cfs.h_nr_running) + stolen = -1; + return stolen; +} + static void rq_online_fair(struct rq *rq) { update_sysctl(); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 129ca8f50cf5..98925e29b1b9 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,12 @@ SCHED_FEAT(TTWU_QUEUE, true) */ SCHED_FEAT(SIS_PROP, true)
+/* + * Steal a CFS task from another CPU when going idle. + * Improves CPU utilization. + */ +SCHED_FEAT(STEAL, true) + /* * Issue a WARN when we do multiple update_rq_clock() calls * in a single rq->lock section. Default disabled because the
From: Steve Sistare steven.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
The STEAL feature causes regressions on hackbench on larger NUMA systems, so disable it on systems with more than sched_steal_node_limit nodes (default 2). Note that the feature remains enabled as seen in features.h and /sys/kernel/debug/sched_features, but stealing is only performed if nodes <= sched_steal_node_limit. This arrangement allows users to activate stealing on reboot by setting the kernel parameter sched_steal_node_limit on kernels built without CONFIG_SCHED_DEBUG. The parameter is temporary and will be deleted when the regression is fixed.
Details of the regression follow. With the STEAL feature set, hackbench is slower on many-node systems:
X5-8: 8 sockets * 18 cores * 2 hyperthreads = 288 CPUs Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz Average of 10 runs of: hackbench <groups> processes 50000
--- base -- --- new --- groups time %stdev time %stdev %speedup 1 3.627 15.8 3.876 7.3 -6.5 2 4.545 24.7 5.583 16.7 -18.6 3 5.716 25.0 7.367 14.2 -22.5 4 6.901 32.9 7.718 14.5 -10.6 8 8.604 38.5 9.111 16.0 -5.6 16 7.734 6.8 11.007 8.2 -29.8
Total CPU time increases. Profiling shows that CPU time increases uniformly across all functions, suggesting a systemic increase in cache or memory latency. This may be due to NUMA migrations, as they cause loss of LLC cache footprint and remote memory latencies.
The domains for this system and their flags are:
domain0 (SMT) : 1 core SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING SD_SHARE_CPUCAPACITY SD_WAKE_AFFINE
domain1 (MC) : 1 socket SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_SHARE_PKG_RESOURCES SD_PREFER_SIBLING SD_WAKE_AFFINE
domain2 (NUMA) : 4 sockets SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_SERIALIZE SD_OVERLAP SD_NUMA SD_WAKE_AFFINE
domain3 (NUMA) : 8 sockets SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_SERIALIZE SD_OVERLAP SD_NUMA
Schedstats point to the root cause of the regression. hackbench is run 10 times per group and the average schedstat accumulation per-run and per-cpu is shown below. Note that domain3 moves are zero because SD_WAKE_AFFINE is not set there.
NO_STEAL --- domain2 --- --- domain3 --- grp time %busy sched idle wake steal remote move pull remote move pull 1 20.3 10.3 28710 14346 14366 0 490 3378 0 4039 0 0 2 26.4 18.8 56721 28258 28469 0 792 7026 12 9229 0 7 3 29.9 28.3 90191 44933 45272 0 5380 7204 19 16481 0 3 4 30.2 35.8 121324 60409 60933 0 7012 9372 27 21438 0 5 8 27.7 64.2 229174 111917 117272 0 11991 1837 168 44006 0 32 16 32.6 74.0 334615 146784 188043 0 3404 1468 49 61405 0 8
STEAL --- domain2 --- --- domain3 --- grp time %busy sched idle wake steal remote move pull remote move pull 1 20.6 10.2 28490 14232 14261 18 3 3525 0 4254 0 0 2 27.9 18.8 56757 28203 28562 303 1675 7839 5 9690 0 2 3 35.3 27.7 87337 43274 44085 698 741 12785 14 15689 0 3 4 36.8 36.0 118630 58437 60216 1579 2973 14101 28 18732 0 7 8 48.1 73.8 289374 133681 155600 18646 35340 10179 171 65889 0 34 16 41.4 82.5 268925 91908 177172 47498 17206 6940 176 71776 0 20
Cross-numa-node migrations are caused by load balancing pulls and wake_affine moves. Pulls are small and similar for no_steal and steal. However, moves are significantly higher for steal, and rows above with the highest moves have the worst regressions for time; see for example grp=8.
Moves increase for steal due to the following logic in wake_affine_idle() for synchronous wakeup:
if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; // move the task
The steal feature does a better job of smoothing the load between idle and busy CPUs, so nr_running is 1 more often, and moves are performed more often. For hackbench, cross-node affine moves early in the run are good because they colocate wakers and wakees from the same group on the same node, but continued moves later in the run are bad, because the wakee is moved away from peers on its previous node. Note that even no_steal is far from optimal; binding an instance of "hackbench 2" to each of the 8 NUMA nodes runs much faster than running "hackbench 16" with no binding.
Clearing SD_WAKE_AFFINE for domain2 eliminates the affine cross-node migrations and eliminates the difference between no_steal and steal performance. However, overall performance is lower than WA_IDLE because some migrations are helpful as explained above.
I have tried many heuristics in a attempt to optimize the number of cross-node moves in all conditions, with limited success. The fundamental problem is that the scheduler does not track which groups of tasks talk to each other. Parts of several groups become entrenched on the same node, filling it to capacity, leaving no room for either group to pull its peers over, and there is neither data nor mechanism for the scheduler to evict one group to make room for the other.
For now, disable STEAL on such systems until we can do better, or it is shown that hackbench is atypical and most workloads benefit from stealing.
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 16 +++++++++++++--- kernel/sched/sched.h | 3 ++- kernel/sched/topology.c | 25 +++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9ad4399eb24f..78d711aa8af6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4163,11 +4163,21 @@ static inline void rq_idle_stamp_clear(struct rq *rq) rq->idle_stamp = 0; }
+static inline bool steal_enabled(void) +{ +#ifdef CONFIG_NUMA + bool allow = static_branch_likely(&sched_steal_allow); +#else + bool allow = true; +#endif + return sched_feat(STEAL) && allow; +} + static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus;
- if (!sched_feat(STEAL)) + if (!steal_enabled()) return;
rcu_read_lock(); @@ -4181,7 +4191,7 @@ static void overload_set(struct rq *rq) { struct sparsemask *overload_cpus;
- if (!sched_feat(STEAL)) + if (!steal_enabled()) return;
rcu_read_lock(); @@ -11191,7 +11201,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) int stolen = 0; struct sparsemask *overload_cpus;
- if (!sched_feat(STEAL)) + if (!steal_enabled()) return 0;
if (!cpu_active(dst_cpu)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 97f54c881baf..b0f41c06a310 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1085,7 +1085,6 @@ static inline int cpu_of(struct rq *rq) #endif }
- #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq);
@@ -1348,6 +1347,8 @@ this_rq_lock_irq(struct rq_flags *rf) }
#ifdef CONFIG_NUMA +extern struct static_key_true sched_steal_allow; + enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 486965561754..8a37e8328a82 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1563,6 +1563,30 @@ static void init_numa_topology_type(void) } }
+DEFINE_STATIC_KEY_TRUE(sched_steal_allow); +static int sched_steal_node_limit; +#define SCHED_STEAL_NODE_LIMIT_DEFAULT 2 + +static int __init steal_node_limit_setup(char *buf) +{ + get_option(&buf, &sched_steal_node_limit); + return 0; +} + +early_param("sched_steal_node_limit", steal_node_limit_setup); + +static void check_node_limit(void) +{ + int n = num_possible_nodes(); + + if (sched_steal_node_limit == 0) + sched_steal_node_limit = SCHED_STEAL_NODE_LIMIT_DEFAULT; + if (n > sched_steal_node_limit) { + static_branch_disable(&sched_steal_allow); + pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n); + } +} + void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -1711,6 +1735,7 @@ void sched_init_numa(void) sched_max_numa_distance = sched_domains_numa_distance[level - 1];
init_numa_topology_type(); + check_node_limit(); }
void sched_domains_numa_masks_set(unsigned int cpu)
From: Steve Sistare steven.sistare@oracle.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
Add schedstats to measure the effectiveness of searching for idle CPUs and stealing tasks. This is a temporary patch intended for use during development only. SCHEDSTAT_VERSION is bumped to 16, and the following fields are added to the per-CPU statistics of /proc/schedstat:
field 10: # of times select_idle_sibling "easily" found an idle CPU -- prev or target is idle. field 11: # of times select_idle_sibling searched and found an idle cpu. field 12: # of times select_idle_sibling searched and found an idle core. field 13: # of times select_idle_sibling failed to find anything idle. field 14: time in nanoseconds spent in functions that search for idle CPUs and search for tasks to steal. field 15: # of times an idle CPU steals a task from another CPU. field 16: # of times try_steal finds overloaded CPUs but no task is migratable.
Signed-off-by: Steve Sistare steven.sistare@oracle.com Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/core.c | 31 ++++++++++++++++++++++++++-- kernel/sched/fair.c | 48 ++++++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 9 +++++++++ kernel/sched/stats.c | 11 +++++++++- kernel/sched/stats.h | 13 ++++++++++++ 5 files changed, 105 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25121223987b..acccd222814a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3146,17 +3146,44 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, DEFINE_STATIC_KEY_FALSE(sched_schedstats); static bool __initdata __sched_schedstats = false;
+unsigned long schedstat_skid; + +static void compute_skid(void) +{ + int i, n = 0; + s64 t; + int skid = 0; + + for (i = 0; i < 100; i++) { + t = local_clock(); + t = local_clock() - t; + if (t > 0 && t < 1000) { /* only use sane samples */ + skid += (int) t; + n++; + } + } + + if (n > 0) + schedstat_skid = skid / n; + else + schedstat_skid = 0; + pr_info("schedstat_skid = %lu\n", schedstat_skid); +} + static void set_schedstats(bool enabled) { - if (enabled) + if (enabled) { + compute_skid(); static_branch_enable(&sched_schedstats); - else + } else { static_branch_disable(&sched_schedstats); + } }
void force_schedstat_enabled(void) { if (!schedstat_enabled()) { + compute_skid(); pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); static_branch_enable(&sched_schedstats); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78d711aa8af6..c74adb65732c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4176,29 +4176,35 @@ static inline bool steal_enabled(void) static void overload_clear(struct rq *rq) { struct sparsemask *overload_cpus; + unsigned long time;
if (!steal_enabled()) return;
+ time = schedstat_start_time(); rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) sparsemask_clear_elem(overload_cpus, rq->cpu); rcu_read_unlock(); + schedstat_end_time(rq->find_time, time); }
static void overload_set(struct rq *rq) { struct sparsemask *overload_cpus; + unsigned long time;
if (!steal_enabled()) return;
+ time = schedstat_start_time(); rcu_read_lock(); overload_cpus = rcu_dereference(rq->cfs_overload_cpus); if (overload_cpus) sparsemask_set_elem(overload_cpus, rq->cpu); rcu_read_unlock(); + schedstat_end_time(rq->find_time, time); }
static int try_steal(struct rq *this_rq, struct rq_flags *rf); @@ -6416,6 +6422,16 @@ static inline bool asym_fits_capacity(int task_util, int cpu) return true; }
+#define SET_STAT(STAT) \ + do { \ + if (schedstat_enabled()) { \ + struct rq *rq = this_rq(); \ + \ + if (rq) \ + __schedstat_inc(rq->STAT); \ + } \ + } while (0) + /* * Try and locate an idle core/thread in the LLC cache domain. */ @@ -6435,16 +6451,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) }
if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_capacity(task_util, target)) + asym_fits_capacity(task_util, target)) { + SET_STAT(found_idle_cpu_easy); return target; + }
/* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_capacity(task_util, prev)) + asym_fits_capacity(task_util, prev)) { + SET_STAT(found_idle_cpu_easy); return prev; + }
/* * Allow a per-cpu kthread to stack with the wakee if the @@ -6457,6 +6477,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (is_per_cpu_kthread(current) && prev == smp_processor_id() && this_rq()->nr_running <= 1) { + SET_STAT(found_idle_cpu_easy); return prev; }
@@ -6472,6 +6493,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: */ + SET_STAT(found_idle_cpu_easy); p->recent_used_cpu = prev; return recent_used_cpu; } @@ -6492,18 +6514,24 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (sd) { i = select_idle_capacity(p, sd, target); + SET_STAT(found_idle_cpu_capacity); return ((unsigned)i < nr_cpumask_bits) ? i : target; } }
sd = rcu_dereference(per_cpu(sd_llc, target)); - if (!sd) + if (!sd) { + SET_STAT(nofound_idle_cpu); return target; + }
i = select_idle_cpu(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) + if ((unsigned)i < nr_cpumask_bits) { + SET_STAT(found_idle_cpu); return i; + }
+ SET_STAT(nofound_idle_cpu); return target; }
@@ -6897,6 +6925,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { + unsigned long time = schedstat_start_time(); struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; @@ -6949,6 +6978,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f current->recent_used_cpu = cpu; } rcu_read_unlock(); + schedstat_end_time(cpu_rq(cpu)->find_time, time);
return new_cpu; } @@ -7342,6 +7372,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf struct sched_entity *se; struct task_struct *p; int new_tasks; + unsigned long time;
again: if (!sched_fair_runnable(rq)) @@ -7467,6 +7498,8 @@ done: __maybe_unused; if (!rf) return NULL;
+ time = schedstat_start_time(); + /* * We must set idle_stamp _before_ calling try_steal() or * idle_balance(), such that we measure the duration as idle time. @@ -7480,6 +7513,8 @@ done: __maybe_unused; if (new_tasks) rq_idle_stamp_clear(rq);
+ schedstat_end_time(rq->find_time, time); +
/* * Because try_steal() and idle_balance() release (and re-acquire) @@ -11175,6 +11210,7 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked, update_rq_clock(dst_rq); attach_task(dst_rq, p); stolen = 1; + schedstat_inc(dst_rq->steal); } local_irq_restore(rf.flags);
@@ -11199,6 +11235,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) int dst_cpu = dst_rq->cpu; bool locked = true; int stolen = 0; + bool any_overload = false; struct sparsemask *overload_cpus;
if (!steal_enabled()) @@ -11241,6 +11278,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) stolen = 1; goto out; } + any_overload = true; }
out: @@ -11252,6 +11290,8 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) stolen |= (dst_rq->cfs.h_nr_running > 0); if (dst_rq->nr_running != dst_rq->cfs.h_nr_running) stolen = -1; + if (!stolen && any_overload) + schedstat_inc(dst_rq->steal_fail); return stolen; }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b0f41c06a310..67e6d89ca7bc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1052,6 +1052,15 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; + + /* Idle search stats */ + unsigned int found_idle_cpu_capacity; + unsigned int found_idle_cpu; + unsigned int found_idle_cpu_easy; + unsigned int nofound_idle_cpu; + unsigned long find_time; + unsigned int steal; + unsigned int steal_fail; #endif
#ifdef CONFIG_CPU_IDLE diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed..e4d1d9805d46 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -10,7 +10,7 @@ * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 15 +#define SCHEDSTAT_VERSION 16
static int show_schedstat(struct seq_file *seq, void *v) { @@ -37,6 +37,15 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+ seq_printf(seq, " %u %u %u %u %lu %u %u", + rq->found_idle_cpu_easy, + rq->found_idle_cpu_capacity, + rq->found_idle_cpu, + rq->nofound_idle_cpu, + rq->find_time, + rq->steal, + rq->steal_fail); + seq_printf(seq, "\n");
#ifdef CONFIG_SMP diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 33d0daf83842..dc5ca31f5c2d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -39,6 +39,17 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) +#define schedstat_start_time() schedstat_val_or_zero(local_clock()) +#define schedstat_end_time(stat, time) \ + do { \ + unsigned long endtime; \ + \ + if (schedstat_enabled() && (time)) { \ + endtime = local_clock() - (time) - schedstat_skid; \ + schedstat_add((stat), endtime); \ + } \ + } while (0) +extern unsigned long schedstat_skid;
#else /* !CONFIG_SCHEDSTATS: */ static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } @@ -53,6 +64,8 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_set(var, val) do { } while (0) # define schedstat_val(var) 0 # define schedstat_val_or_zero(var) 0 +# define schedstat_start_time() 0 +# define schedstat_end_time(stat, t) do { } while (0) #endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_PSI
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
Steal tasks to improve CPU utilization can solve some performance problems such as mysql, but not all scenarios are optimized, such as hackbench.
So turn off by default.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 98925e29b1b9..d7c63040deb7 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -60,7 +60,7 @@ SCHED_FEAT(SIS_PROP, true) * Steal a CFS task from another CPU when going idle. * Improves CPU utilization. */ -SCHED_FEAT(STEAL, true) +SCHED_FEAT(STEAL, false)
/* * Issue a WARN when we do multiple update_rq_clock() calls
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: feature bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
Introduce CONFIG_SCHED_STEAL to limit the impact of steal task.
1). If turn off CONFIG_SCHED_STEAL, then all the changes will not exist, for we use some empty functions, so this depends on compiler optimization.
2). enable CONFIG_SCHED_STEAL, but disable STEAL and schedstats, it will introduce some impact whith schedstat check. but this has little effect on performance. This will be our default choice.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched/topology.h | 2 ++ init/Kconfig | 15 +++++++++++++++ kernel/sched/core.c | 4 ++++ kernel/sched/fair.c | 30 ++++++++++++++++++++++++------ kernel/sched/features.h | 2 ++ kernel/sched/sched.h | 9 +++++++++ kernel/sched/stats.c | 6 ++++++ kernel/sched/stats.h | 11 +++++++++-- kernel/sched/topology.c | 22 +++++++++++++++++++++- 9 files changed, 92 insertions(+), 9 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index c98908eb7c24..e50f2b0a2444 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -74,7 +74,9 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; +#ifdef CONFIG_SCHED_STEAL struct sparsemask *cfs_overload_cpus; +#endif };
struct sched_domain { diff --git a/init/Kconfig b/init/Kconfig index 04bc46ca0b9e..310082cd88fe 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1240,6 +1240,21 @@ config IMA_NS
endif # NAMESPACES
+config SCHED_STEAL + bool "Steal tasks to improve CPU utilization" + depends on SMP + default n + help + When a CPU has no more CFS tasks to run, and idle_balance() fails + to find a task, then attempt to steal a task from an overloaded + CPU in the same LLC. Maintain and use a bitmap of overloaded CPUs + to efficiently identify candidates. To minimize search time, steal + the first migratable task that is found when the bitmap is traversed. + For fairness, search for migratable tasks on an overloaded CPU in + order of next to run. + + If unsure, say N here. + config CHECKPOINT_RESTORE bool "Checkpoint/restore support" select PROC_CHILDREN diff --git a/kernel/sched/core.c b/kernel/sched/core.c index acccd222814a..685cb8c215d8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3146,6 +3146,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, DEFINE_STATIC_KEY_FALSE(sched_schedstats); static bool __initdata __sched_schedstats = false;
+#ifdef CONFIG_SCHED_STEAL unsigned long schedstat_skid;
static void compute_skid(void) @@ -3169,6 +3170,9 @@ static void compute_skid(void) schedstat_skid = 0; pr_info("schedstat_skid = %lu\n", schedstat_skid); } +#else +static inline void compute_skid(void) {} +#endif
static void set_schedstats(bool enabled) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c74adb65732c..5c7caca3aa96 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,7 +21,9 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#ifdef CONFIG_SCHED_STEAL #include "sparsemask.h" +#endif
/* * Targeted preemption latency for CPU-bound tasks: @@ -4163,6 +4165,8 @@ static inline void rq_idle_stamp_clear(struct rq *rq) rq->idle_stamp = 0; }
+#ifdef CONFIG_SCHED_STEAL + static inline bool steal_enabled(void) { #ifdef CONFIG_NUMA @@ -4187,7 +4191,7 @@ static void overload_clear(struct rq *rq) if (overload_cpus) sparsemask_clear_elem(overload_cpus, rq->cpu); rcu_read_unlock(); - schedstat_end_time(rq->find_time, time); + schedstat_end_time(rq, time); }
static void overload_set(struct rq *rq) @@ -4204,10 +4208,15 @@ static void overload_set(struct rq *rq) if (overload_cpus) sparsemask_set_elem(overload_cpus, rq->cpu); rcu_read_unlock(); - schedstat_end_time(rq->find_time, time); + schedstat_end_time(rq, time); }
static int try_steal(struct rq *this_rq, struct rq_flags *rf); +#else +static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; } +static inline void overload_clear(struct rq *rq) {} +static inline void overload_set(struct rq *rq) {} +#endif
#else /* CONFIG_SMP */
@@ -6422,6 +6431,7 @@ static inline bool asym_fits_capacity(int task_util, int cpu) return true; }
+#ifdef CONFIG_SCHED_STEAL #define SET_STAT(STAT) \ do { \ if (schedstat_enabled()) { \ @@ -6431,6 +6441,9 @@ static inline bool asym_fits_capacity(int task_util, int cpu) __schedstat_inc(rq->STAT); \ } \ } while (0) +#else +#define SET_STAT(STAT) +#endif
/* * Try and locate an idle core/thread in the LLC cache domain. @@ -6925,13 +6938,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { - unsigned long time = schedstat_start_time(); + unsigned long time; struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+ time = schedstat_start_time(); + if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p);
@@ -6978,7 +6993,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f current->recent_used_cpu = cpu; } rcu_read_unlock(); - schedstat_end_time(cpu_rq(cpu)->find_time, time); + schedstat_end_time(cpu_rq(cpu), time);
return new_cpu; } @@ -7509,12 +7524,11 @@ done: __maybe_unused; new_tasks = newidle_balance(rq, rf); if (new_tasks == 0) new_tasks = try_steal(rq, rf); + schedstat_end_time(rq, time);
if (new_tasks) rq_idle_stamp_clear(rq);
- schedstat_end_time(rq->find_time, time); -
/* * Because try_steal() and idle_balance() release (and re-acquire) @@ -7997,6 +8011,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; }
+#ifdef CONFIG_SCHED_STEAL /* * Return true if task @p can migrate from @rq to @dst_rq in the same LLC. * No need to test for co-locality, and no need to test task_hot(), as sharing @@ -8024,6 +8039,7 @@ can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
return true; } +#endif
/* * detach_task() -- detach the task for the migration from @src_rq to @dst_cpu. @@ -11146,6 +11162,7 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); }
+#ifdef CONFIG_SCHED_STEAL /* * Search the runnable tasks in @cfs_rq in order of next to run, and find * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry. @@ -11294,6 +11311,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf) schedstat_inc(dst_rq->steal_fail); return stolen; } +#endif
static void rq_online_fair(struct rq *rq) { diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d7c63040deb7..97ed11bd25e7 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,11 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true) */ SCHED_FEAT(SIS_PROP, true)
+#ifdef CONFIG_SCHED_STEAL /* * Steal a CFS task from another CPU when going idle. * Improves CPU utilization. */ SCHED_FEAT(STEAL, false) +#endif
/* * Issue a WARN when we do multiple update_rq_clock() calls diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 67e6d89ca7bc..9ec230220ee3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -86,7 +86,9 @@
struct rq; struct cpuidle_state; +#ifdef CONFIG_SCHED_STEAL struct sparsemask; +#endif
/* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 @@ -939,7 +941,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_STEAL struct sparsemask *cfs_overload_cpus; +#endif
#ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -1053,6 +1057,7 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local;
+#ifdef CONFIG_SCHED_STEAL /* Idle search stats */ unsigned int found_idle_cpu_capacity; unsigned int found_idle_cpu; @@ -1061,6 +1066,7 @@ struct rq { unsigned long find_time; unsigned int steal; unsigned int steal_fail; +#endif /* CONFIG_SCHED_STEAL */ #endif
#ifdef CONFIG_CPU_IDLE @@ -1094,6 +1100,7 @@ static inline int cpu_of(struct rq *rq) #endif }
+ #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq);
@@ -1356,7 +1363,9 @@ this_rq_lock_irq(struct rq_flags *rf) }
#ifdef CONFIG_NUMA +#ifdef CONFIG_SCHED_STEAL extern struct static_key_true sched_steal_allow; +#endif
enum numa_topology_type { NUMA_DIRECT, diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index e4d1d9805d46..616c4b3c4307 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -10,7 +10,11 @@ * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ +#ifdef CONFIG_SCHED_STEAL #define SCHEDSTAT_VERSION 16 +#else +#define SCHEDSTAT_VERSION 15 +#endif
static int show_schedstat(struct seq_file *seq, void *v) { @@ -37,6 +41,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+#ifdef CONFIG_SCHED_STEAL seq_printf(seq, " %u %u %u %u %lu %u %u", rq->found_idle_cpu_easy, rq->found_idle_cpu_capacity, @@ -45,6 +50,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->find_time, rq->steal, rq->steal_fail); +#endif /* CONFIG_SCHED_STEAL */
seq_printf(seq, "\n");
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index dc5ca31f5c2d..06cf8202c178 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -39,8 +39,9 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) +#ifdef CONFIG_SCHED_STEAL #define schedstat_start_time() schedstat_val_or_zero(local_clock()) -#define schedstat_end_time(stat, time) \ +#define __schedstat_end_time(stat, time) \ do { \ unsigned long endtime; \ \ @@ -49,7 +50,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) schedstat_add((stat), endtime); \ } \ } while (0) +#define schedstat_end_time(rq, time) \ + __schedstat_end_time(((rq)->find_time), time) extern unsigned long schedstat_skid; +#else /* !CONFIG_SCHED_STEAL */ +# define schedstat_start_time() 0 +# define schedstat_end_time(rq, t) do { } while (0) +#endif /* CONFIG_SCHED_STEAL */
#else /* !CONFIG_SCHEDSTATS: */ static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } @@ -65,7 +72,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val(var) 0 # define schedstat_val_or_zero(var) 0 # define schedstat_start_time() 0 -# define schedstat_end_time(stat, t) do { } while (0) +# define schedstat_end_time(rq, t) do { } while (0) #endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_PSI diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8a37e8328a82..0564aeabbcb8 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,7 +3,9 @@ * Scheduler topology setup/handling methods */ #include "sched.h" +#ifdef CONFIG_SCHED_STEAL #include "sparsemask.h" +#endif
DEFINE_MUTEX(sched_domains_mutex);
@@ -11,11 +13,17 @@ DEFINE_MUTEX(sched_domains_mutex); static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2;
+#ifdef CONFIG_SCHED_STEAL struct s_data; static int sd_llc_alloc(struct sched_domain *sd); static void sd_llc_free(struct sched_domain *sd); static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d); static void sd_llc_free_all(const struct cpumask *cpu_map); +#else +static inline void sd_llc_free(struct sched_domain *sd) {} +static inline int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d) { return 0; } +static inline void sd_llc_free_all(const struct cpumask *cpu_map) {} +#endif
#ifdef CONFIG_SCHED_DEBUG
@@ -647,9 +655,11 @@ DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu) { +#ifdef CONFIG_SCHED_STEAL + struct rq *rq = cpu_rq(cpu); struct sparsemask *cfs_overload_cpus = NULL; +#endif struct sched_domain_shared *sds = NULL; - struct rq *rq = cpu_rq(cpu); struct sched_domain *sd; int id = cpu; int size = 1; @@ -659,10 +669,14 @@ static void update_top_cache_domain(int cpu) id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); sds = sd->shared; +#ifdef CONFIG_SCHED_STEAL cfs_overload_cpus = sds->cfs_overload_cpus; +#endif }
+#ifdef CONFIG_SCHED_STEAL rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus); +#endif rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; @@ -1563,6 +1577,7 @@ static void init_numa_topology_type(void) } }
+#ifdef CONFIG_SCHED_STEAL DEFINE_STATIC_KEY_TRUE(sched_steal_allow); static int sched_steal_node_limit; #define SCHED_STEAL_NODE_LIMIT_DEFAULT 2 @@ -1586,6 +1601,9 @@ static void check_node_limit(void) pr_debug("Suppressing sched STEAL. To enable, reboot with sched_steal_node_limit=%d", n); } } +#else +static inline void check_node_limit(void) { } +#endif /* CONFIG_SCHED_STEAL */
void sched_init_numa(void) { @@ -1888,6 +1906,7 @@ static void __sdt_free(const struct cpumask *cpu_map) } }
+#ifdef CONFIG_SCHED_STEAL static int sd_llc_alloc(struct sched_domain *sd) { struct sched_domain_shared *sds = sd->shared; @@ -1959,6 +1978,7 @@ static void sd_llc_free_all(const struct cpumask *cpu_map) } } } +#endif
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr,
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: config bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
-------------------------------------------------
Enable steal tasks by default to improve CPU utilization
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index b657941e2836..cb9f3645d84c 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -160,6 +160,7 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_STEAL=y CONFIG_SCHED_AUTOGROUP=y # CONFIG_SYSFS_DEPRECATED is not set CONFIG_RELAY=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 57631bbc8839..11e0d90b5f06 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -180,6 +180,7 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_STEAL=y CONFIG_SCHED_AUTOGROUP=y # CONFIG_SYSFS_DEPRECATED is not set CONFIG_RELAY=y
From: Cheng Jian cj.chengjian@huawei.com
hulk inclusion category: bugfix bugzilla: 38261, https://gitee.com/openeuler/kernel/issues/I49XPZ CVE: NA
---------------------------
If we disable CONFIG_SMP, try_steal will lose its definition, resulting in a compile error as follows.
kernel/sched/fair.c: In function ‘pick_next_task_fair’: kernel/sched/fair.c:7001:15: error: implicit declaration of function ‘try_steal’ [-Werror=implicit-function-declaration] new_tasks = try_steal(rq, rf); ^~~~~~~~~
We can use allnoconfig to reproduce this problem.
Signed-off-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Bin Li huawei.libin@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c7caca3aa96..9c34ad6f9a67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4243,6 +4243,7 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
static inline void rq_idle_stamp_update(struct rq *rq) {} static inline void rq_idle_stamp_clear(struct rq *rq) {} +static inline int try_steal(struct rq *this_rq, struct rq_flags *rf) { return 0; } static inline void overload_clear(struct rq *rq) {} static inline void overload_set(struct rq *rq) {}
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit 2224d8485492e499ca2e5d25407f8502cc06f149 category: feature bugzilla:https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Patch series "Introduce Data Access MONitor (DAMON)", v34.
Introduction
============
DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON called 'region based sampling' and 'adaptive regions adjustment' (refer to 'mechanisms.rst' in the 11th patch of this patchset for the detail) make it
- accurate (The monitored information is useful for DRAM level memory management. It might not appropriate for Cache-level accuracy, though.),
- light-weight (The monitoring overhead is low enough to be applied online while making no impact on the performance of the target workloads.), and
- scalable (the upper-bound of the instrumentation overhead is controllable regardless of the size of target workloads.).
Using this framework, therefore, several memory management mechanisms such as reclamation and THP can be optimized to aware real data access patterns. Experimental access pattern aware memory management optimization works that incurring high instrumentation overhead will be able to have another try.
Though DAMON is for kernel subsystems, it can be easily exposed to the user space by writing a DAMON-wrapper kernel subsystem. Then, user space users who have some special workloads will be able to write personalized tools or applications for deeper understanding and specialized optimizations of their systems.
DAMON is also merged in two public Amazon Linux kernel trees that based on v5.4.y[1] and v5.10.y[2].
[1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon
The userspace tool[1] is available, released under GPLv2, and actively being maintained. I am also planning to implement another basic user interface in perf[2]. Also, the basic test suite for DAMON is available under GPLv2[3].
[1] https://github.com/awslabs/damo [2] https://lore.kernel.org/linux-mm/20210107120729.22328-1-sjpark@amazon.com/ [3] https://github.com/awslabs/damon-tests
Long-term Plan --------------
DAMON is a part of a project called Data Access-aware Operating System (DAOS). As the name implies, I want to improve the performance and efficiency of systems using fine-grained data access patterns. The optimizations are for both kernel and user spaces. I will therefore modify or create kernel subsystems, export some of those to user space and implement user space library / tools. Below shows the layers and components for the project.
--------------------------------------------------------------------------- Primitives: PTE Accessed bit, PG_idle, rmap, (Intel CMT), ... Framework: DAMON Features: DAMOS, virtual addr, physical addr, ... Applications: DAMON-debugfs, (DARC), ... ^^^^^^^^^^^^^^^^^^^^^^^ KERNEL SPACE ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Raw Interface: debugfs, (sysfs), (damonfs), tracepoints, (sys_damon), ...
vvvvvvvvvvvvvvvvvvvvvvv USER SPACE vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv Library: (libdamon), ... Tools: DAMO, (perf), ... ---------------------------------------------------------------------------
The components in parentheses or marked as '...' are not implemented yet but in the future plan. IOW, those are the TODO tasks of DAOS project. For more detail, please refer to the plans: https://lore.kernel.org/linux-mm/20201202082731.24828-1-sjpark@amazon.com/
Evaluations ===========
We evaluated DAMON's overhead, monitoring quality and usefulness using 24 realistic workloads on my QEMU/KVM based virtual machine running a kernel that v24 DAMON patchset is applied.
DAMON is lightweight. It increases system memory usage by 0.39% and slows target workloads down by 1.16%.
DAMON is accurate and useful for memory management optimizations. An experimental DAMON-based operation scheme for THP, namely 'ethp', removes 76.15% of THP memory overheads while preserving 51.25% of THP speedup. Another experimental DAMON-based 'proactive reclamation' implementation, 'prcl', reduces 93.38% of residential sets and 23.63% of system memory footprint while incurring only 1.22% runtime overhead in the best case (parsec3/freqmine).
NOTE that the experimental THP optimization and proactive reclamation are not for production but only for proof of concepts.
Please refer to the official document[1] or "Documentation/admin-guide/mm: Add a document for DAMON" patch in this patchset for detailed evaluation setup and results.
[1] https://damonitor.github.io/doc/html/latest-damon/admin-guide/mm/damon/eval....
Real-world User Story =====================
In summary, DAMON has used on production systems and proved its usefulness.
DAMON as a profiler -------------------
We analyzed characteristics of a large scale production systems of our customers using DAMON. The systems utilize 70GB DRAM and 36 CPUs. From this, we were able to find interesting things below.
There were obviously different access pattern under idle workload and active workload. Under the idle workload, it accessed large memory regions with low frequency, while the active workload accessed small memory regions with high freuqnecy.
DAMON found a 7GB memory region that showing obviously high access frequency under the active workload. We believe this is the performance-effective working set and need to be protected.
There was a 4KB memory region that showing highest access frequency under not only active but also idle workloads. We think this must be a hottest code section like thing that should never be paged out.
For this analysis, DAMON used only 0.3-1% of single CPU time. Because we used recording-based analysis, it consumed about 3-12 MB of disk space per 20 minutes. This is only small amount of disk space, but we can further reduce the disk usage by using non-recording-based DAMON features. I'd like to argue that only DAMON can do such detailed analysis (finding 4KB highest region in 70GB memory) with the light overhead.
DAMON as a system optimization tool -----------------------------------
We also found below potential performance problems on the systems and made DAMON-based solutions.
The system doesn't want to make the workload suffer from the page reclamation and thus it utilizes enough DRAM but no swap device. However, we found the system is actively reclaiming file-backed pages, because the system has intensive file IO. The file IO turned out to be not performance critical for the workload, but the customer wanted to ensure performance critical file-backed pages like code section to not mistakenly be evicted.
Using direct IO should or `mlock()` would be a straightforward solution, but modifying the user space code is not easy for the customer. Alternatively, we could use DAMON-based operation scheme[1]. By using it, we can ask DAMON to track access frequency of each region and make 'process_madvise(MADV_WILLNEED)[2]' call for regions having specific size and access frequency for a time interval.
We also found the system is having high number of TLB misses. We tried 'always' THP enabled policy and it greatly reduced TLB misses, but the page reclamation also been more frequent due to the THP internal fragmentation caused memory bloat. We could try another DAMON-based operation scheme that applies 'MADV_HUGEPAGE' to memory regions having
=2MB size and high access frequency, while applying 'MADV_NOHUGEPAGE' to
regions having <2MB size and low access frequency.
We do not own the systems so we only reported the analysis results and possible optimization solutions to the customers. The customers satisfied about the analysis results and promised to try the optimization guides.
[1] https://lore.kernel.org/linux-mm/20201006123931.5847-1-sjpark@amazon.com/ [2] https://lore.kernel.org/linux-api/20200622192900.22757-4-minchan@kernel.org/
Comparison with Idle Page Tracking ==================================
Idle Page Tracking allows users to set and read idleness of pages using a bitmap file which represents each page with each bit of the file. One recommended usage of it is working set size detection. Users can do that by
1. find PFN of each page for workloads in interest, 2. set all the pages as idle by doing writes to the bitmap file, 3. wait until the workload accesses its working set, and 4. read the idleness of the pages again and count pages became not idle.
NOTE: While Idle Page Tracking is for user space users, DAMON is primarily designed for kernel subsystems though it can easily exposed to the user space. Hence, this section only assumes such user space use of DAMON.
For what use cases Idle Page Tracking would be better? ------------------------------------------------------
1. Flexible usecases other than hotness monitoring.
Because Idle Page Tracking allows users to control the primitive (Page idleness) by themselves, Idle Page Tracking users can do anything they want. Meanwhile, DAMON is primarily designed to monitor the hotness of each memory region. For this, DAMON asks users to provide sampling interval and aggregation interval. For the reason, there could be some use case that using Idle Page Tracking is simpler.
2. Physical memory monitoring.
Idle Page Tracking receives PFN range as input, so natively supports physical memory monitoring.
DAMON is designed to be extensible for multiple address spaces and use cases by implementing and using primitives for the given use case. Therefore, by theory, DAMON has no limitation in the type of target address space as long as primitives for the given address space exists. However, the default primitives introduced by this patchset supports only virtual address spaces.
Therefore, for physical memory monitoring, you should implement your own primitives and use it, or simply use Idle Page Tracking.
Nonetheless, RFC patchsets[1] for the physical memory address space primitives is already available. It also supports user memory same to Idle Page Tracking.
[1] https://lore.kernel.org/linux-mm/20200831104730.28970-1-sjpark@amazon.com/
For what use cases DAMON is better? -----------------------------------
1. Hotness Monitoring.
Idle Page Tracking let users know only if a page frame is accessed or not. For hotness check, the user should write more code and use more memory. DAMON do that by itself.
2. Low Monitoring Overhead
DAMON receives user's monitoring request with one step and then provide the results. So, roughly speaking, DAMON require only O(1) user/kernel context switches.
In case of Idle Page Tracking, however, because the interface receives contiguous page frames, the number of user/kernel context switches increases as the monitoring target becomes complex and huge. As a result, the context switch overhead could be not negligible.
Moreover, DAMON is born to handle with the monitoring overhead. Because the core mechanism is pure logical, Idle Page Tracking users might be able to implement the mechanism on their own, but it would be time consuming and the user/kernel context switching will still more frequent than that of DAMON. Also, the kernel subsystems cannot use the logic in this case.
3. Page granularity working set size detection.
Until v22 of this patchset, this was categorized as the thing Idle Page Tracking could do better, because DAMON basically maintains additional metadata for each of the monitoring target regions. So, in the page granularity working set size detection use case, DAMON would incur (number of monitoring target pages * size of metadata) memory overhead. Size of the single metadata item is about 54 bytes, so assuming 4KB pages, about 1.3% of monitoring target pages will be additionally used.
All essential metadata for Idle Page Tracking are embedded in 'struct page' and page table entries. Therefore, in this use case, only one counter variable for working set size accounting is required if Idle Page Tracking is used.
There are more details to consider, but roughly speaking, this is true in most cases.
However, the situation changed from v23. Now DAMON supports arbitrary types of monitoring targets, which don't use the metadata. Using that, DAMON can do the working set size detection with no additional space overhead but less user-kernel context switch. A first draft for the implementation of monitoring primitives for this usage is available in a DAMON development tree[1]. An RFC patchset for it based on this patchset will also be available soon.
Since v24, the arbitrary type support is dropped from this patchset because this patchset doesn't introduce real use of the type. You can still get it from the DAMON development tree[2], though.
[1] https://github.com/sjp38/linux/tree/damon/pgidle_hack [2] https://github.com/sjp38/linux/tree/damon/master
4. More future usecases
While Idle Page Tracking has tight coupling with base primitives (PG_Idle and page table Accessed bits), DAMON is designed to be extensible for many use cases and address spaces. If you need some special address type or want to use special h/w access check primitives, you can write your own primitives for that and configure DAMON to use those. Therefore, if your use case could be changed a lot in future, using DAMON could be better.
Can I use both Idle Page Tracking and DAMON? --------------------------------------------
Yes, though using them concurrently for overlapping memory regions could result in interference to each other. Nevertheless, such use case would be rare or makes no sense at all. Even in the case, the noise would bot be really significant. So, you can choose whatever you want depending on the characteristics of your use cases.
More Information ================
We prepared a showcase web site[1] that you can get more information. There are
- the official documentations[2], - the heatmap format dynamic access pattern of various realistic workloads for heap area[3], mmap()-ed area[4], and stack[5] area, - the dynamic working set size distribution[6] and chronological working set size changes[7], and - the latest performance test results[8].
[1] https://damonitor.github.io/_index [2] https://damonitor.github.io/doc/html/latest-damon [3] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.0.png.html [4] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html [5] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.2.png.html [6] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html [7] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html [8] https://damonitor.github.io/test/result/perf/latest/html/index.html
Baseline and Complete Git Trees ===============================
The patches are based on the latest -mm tree, specifically v5.14-rc1-mmots-2021-07-15-18-47 of https://github.com/hnaz/linux-mm. You can also clone the complete git tree:
$ git clone git://github.com/sjp38/linux -b damon/patches/v34
The web is also available: https://github.com/sjp38/linux/releases/tag/damon/patches/v34
Development Trees -----------------
There are a couple of trees for entire DAMON patchset series and features for future release.
- For latest release: https://github.com/sjp38/linux/tree/damon/master - For next release: https://github.com/sjp38/linux/tree/damon/next
Long-term Support Trees -----------------------
For people who want to test DAMON but using LTS kernels, there are another couple of trees based on two latest LTS kernels respectively and containing the 'damon/master' backports.
- For v5.4.y: https://github.com/sjp38/linux/tree/damon/for-v5.4.y - For v5.10.y: https://github.com/sjp38/linux/tree/damon/for-v5.10.y
Amazon Linux Kernel Trees -------------------------
DAMON is also merged in two public Amazon Linux kernel trees that based on v5.4.y[1] and v5.10.y[2].
[1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon
Git Tree for Diff of Patches ============================
For easy review of diff between different versions of each patch, I prepared a git tree containing all versions of the DAMON patchset series: https://github.com/sjp38/damon-patches
You can clone it and use 'diff' for easy review of changes between different versions of the patchset. For example:
$ git clone https://github.com/sjp38/damon-patches && cd damon-patches $ diff -u damon/v33 damon/v34
Sequence Of Patches ===================
First three patches implement the core logics of DAMON. The 1st patch introduces basic sampling based hotness monitoring for arbitrary types of targets. Following two patches implement the core mechanisms for control of overhead and accuracy, namely regions based sampling (patch 2) and adaptive regions adjustment (patch 3).
Now the essential parts of DAMON is complete, but it cannot work unless someone provides monitoring primitives for a specific use case. The following two patches make it just work for virtual address spaces monitoring. The 4th patch makes 'PG_idle' can be used by DAMON and the 5th patch implements the virtual memory address space specific monitoring primitives using page table Accessed bits and the 'PG_idle' page flag.
Now DAMON just works for virtual address space monitoring via the kernel space api. To let the user space users can use DAMON, following four patches add interfaces for them. The 6th patch adds a tracepoint for monitoring results. The 7th patch implements a DAMON application kernel module, namely damon-dbgfs, that simply wraps DAMON and exposes DAMON interface to the user space via the debugfs interface. The 8th patch further exports pid of monitoring thread (kdamond) to user space for easier cpu usage accounting, and the 9th patch makes the debugfs interface to support multiple contexts.
Three patches for maintainability follows. The 10th patch adds documentations for both the user space and the kernel space. The 11th patch provides unit tests (based on the kunit) while the 12th patch adds user space tests (based on the kselftest).
Finally, the last patch (13th) updates the MAINTAINERS file.
This patch (of 13):
DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON make it
- accurate (the monitoring output is useful enough for DRAM level performance-centric memory management; It might be inappropriate for CPU cache levels, though), - light-weight (the monitoring overhead is normally low enough to be applied online), and - scalable (the upper-bound of the overhead is in constant range regardless of the size of target workloads).
Using this framework, hence, we can easily write efficient kernel space data access monitoring applications. For example, the kernel's memory management mechanisms can make advanced decisions using this. Experimental data access aware optimization works that incurring high access monitoring overhead could again be implemented on top of this.
Due to its simple and flexible interface, providing user space interface would be also easy. Then, user space users who have some special workloads can write personalized applications for better understanding and optimizations of their workloads and systems.
===
Nevertheless, this commit is defining and implementing only basic access check part without the overhead-accuracy handling core logic. The basic access check is as below.
The output of DAMON says what memory regions are how frequently accessed for a given duration. The resolution of the access frequency is controlled by setting ``sampling interval`` and ``aggregation interval``. In detail, DAMON checks access to each page per ``sampling interval`` and aggregates the results. In other words, counts the number of the accesses to each region. After each ``aggregation interval`` passes, DAMON calls callback functions that previously registered by users so that users can read the aggregated results and then clears the results. This can be described in below simple pseudo-code::
init() while monitoring_on: for page in monitoring_target: if accessed(page): nr_accesses[page] += 1 if time() % aggregation_interval == 0: for callback in user_registered_callbacks: callback(monitoring_target, nr_accesses) for page in monitoring_target: nr_accesses[page] = 0 if time() % update_interval == 0: update() sleep(sampling interval)
The target regions constructed at the beginning of the monitoring and updated after each ``regions_update_interval``, because the target regions could be dynamically changed (e.g., mmap() or memory hotplug). The monitoring overhead of this mechanism will arbitrarily increase as the size of the target workload grows.
The basic monitoring primitives for actual access check and dynamic target regions construction aren't in the core part of DAMON. Instead, it allows users to implement their own primitives that are optimized for their use case and configure DAMON to use those. In other words, users cannot use current version of DAMON without some additional works.
Following commits will implement the core mechanisms for the overhead-accuracy control and default primitives implementations.
Link: https://lkml.kernel.org/r/20210716081449.22187-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-2-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Leonard Foerster foersleo@amazon.de Reviewed-by: Fernand Sieber sieberf@amazon.com Acked-by: Shakeel Butt shakeelb@google.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Jonathan Corbet corbet@lwn.net Cc: David Hildenbrand david@redhat.com Cc: David Woodhouse dwmw@amazon.com Cc: Marco Elver elver@google.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Joe Perches joe@perches.com Cc: Mel Gorman mgorman@suse.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Minchan Kim minchan@kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: David Rientjes rientjes@google.com Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Shuah Khan shuah@kernel.org Cc: Vlastimil Babka vbabka@suse.cz Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Brendan Higgins brendanhiggins@google.com Cc: Markus Boehme markubo@amazon.de Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 2224d8485492e499ca2e5d25407f8502cc06f149) Conflicts: mm/Kconfig Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/damon.h | 167 ++++++++++++++++++++++ mm/Kconfig | 3 + mm/Makefile | 1 + mm/damon/Kconfig | 15 ++ mm/damon/Makefile | 3 + mm/damon/core.c | 320 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 509 insertions(+) create mode 100644 include/linux/damon.h create mode 100644 mm/damon/Kconfig create mode 100644 mm/damon/Makefile create mode 100644 mm/damon/core.c
diff --git a/include/linux/damon.h b/include/linux/damon.h new file mode 100644 index 000000000000..2f652602b1ea --- /dev/null +++ b/include/linux/damon.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON api + * + * Author: SeongJae Park sjpark@amazon.de + */ + +#ifndef _DAMON_H_ +#define _DAMON_H_ + +#include <linux/mutex.h> +#include <linux/time64.h> +#include <linux/types.h> + +struct damon_ctx; + +/** + * struct damon_primitive Monitoring primitives for given use cases. + * + * @init: Initialize primitive-internal data structures. + * @update: Update primitive-internal data structures. + * @prepare_access_checks: Prepare next access check of target regions. + * @check_accesses: Check the accesses to target regions. + * @reset_aggregated: Reset aggregated accesses monitoring results. + * @target_valid: Determine if the target is valid. + * @cleanup: Clean up the context. + * + * DAMON can be extended for various address spaces and usages. For this, + * users should register the low level primitives for their target address + * space and usecase via the &damon_ctx.primitive. Then, the monitoring thread + * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting + * the monitoring, @update after each &damon_ctx.primitive_update_interval, and + * @check_accesses, @target_valid and @prepare_access_checks after each + * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each + * &damon_ctx.aggr_interval. + * + * @init should initialize primitive-internal data structures. For example, + * this could be used to construct proper monitoring target regions and link + * those to @damon_ctx.target. + * @update should update the primitive-internal data structures. For example, + * this could be used to update monitoring target regions for current status. + * @prepare_access_checks should manipulate the monitoring regions to be + * prepared for the next access check. + * @check_accesses should check the accesses to each region that made after the + * last preparation and update the number of observed accesses of each region. + * @reset_aggregated should reset the access monitoring results that aggregated + * by @check_accesses. + * @target_valid should check whether the target is still valid for the + * monitoring. + * @cleanup is called from @kdamond just before its termination. + */ +struct damon_primitive { + void (*init)(struct damon_ctx *context); + void (*update)(struct damon_ctx *context); + void (*prepare_access_checks)(struct damon_ctx *context); + void (*check_accesses)(struct damon_ctx *context); + void (*reset_aggregated)(struct damon_ctx *context); + bool (*target_valid)(void *target); + void (*cleanup)(struct damon_ctx *context); +}; + +/* + * struct damon_callback Monitoring events notification callbacks. + * + * @before_start: Called before starting the monitoring. + * @after_sampling: Called after each sampling. + * @after_aggregation: Called after each aggregation. + * @before_terminate: Called before terminating the monitoring. + * @private: User private data. + * + * The monitoring thread (&damon_ctx.kdamond) calls @before_start and + * @before_terminate just before starting and finishing the monitoring, + * respectively. Therefore, those are good places for installing and cleaning + * @private. + * + * The monitoring thread calls @after_sampling and @after_aggregation for each + * of the sampling intervals and aggregation intervals, respectively. + * Therefore, users can safely access the monitoring results without additional + * protection. For the reason, users are recommended to use these callback for + * the accesses to the results. + * + * If any callback returns non-zero, monitoring stops. + */ +struct damon_callback { + void *private; + + int (*before_start)(struct damon_ctx *context); + int (*after_sampling)(struct damon_ctx *context); + int (*after_aggregation)(struct damon_ctx *context); + int (*before_terminate)(struct damon_ctx *context); +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. + * + * @sample_interval: The time between access samplings. + * @aggr_interval: The time between monitor results aggregations. + * @primitive_update_interval: The time between monitoring primitive updates. + * + * For each @sample_interval, DAMON checks whether each region is accessed or + * not. It aggregates and keeps the access information (number of accesses to + * each region) for @aggr_interval time. DAMON also checks whether the target + * memory regions need update (e.g., by ``mmap()`` calls from the application, + * in case of virtual memory monitoring) and applies the changes for each + * @primitive_update_interval. All time intervals are in micro-seconds. + * Please refer to &struct damon_primitive and &struct damon_callback for more + * detail. + * + * @kdamond: Kernel thread who does the monitoring. + * @kdamond_stop: Notifies whether kdamond should stop. + * @kdamond_lock: Mutex for the synchronizations with @kdamond. + * + * For each monitoring context, one kernel thread for the monitoring is + * created. The pointer to the thread is stored in @kdamond. + * + * Once started, the monitoring thread runs until explicitly required to be + * terminated or every monitoring target is invalid. The validity of the + * targets is checked via the &damon_primitive.target_valid of @primitive. The + * termination can also be explicitly requested by writing non-zero to + * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. + * Therefore, users can know whether the monitoring is ongoing or terminated by + * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from + * outside of the monitoring thread must be protected by @kdamond_lock. + * + * Note that the monitoring thread protects only @kdamond and @kdamond_stop via + * @kdamond_lock. Accesses to other fields must be protected by themselves. + * + * @primitive: Set of monitoring primitives for given use cases. + * @callback: Set of callbacks for monitoring events notifications. + * + * @target: Pointer to the user-defined monitoring target. + */ +struct damon_ctx { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long primitive_update_interval; + +/* private: internal use only */ + struct timespec64 last_aggregation; + struct timespec64 last_primitive_update; + +/* public: */ + struct task_struct *kdamond; + bool kdamond_stop; + struct mutex kdamond_lock; + + struct damon_primitive primitive; + struct damon_callback callback; + + void *target; +}; + +#ifdef CONFIG_DAMON + +struct damon_ctx *damon_new_ctx(void); +void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int); + +int damon_start(struct damon_ctx **ctxs, int nr_ctxs); +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); + +#endif /* CONFIG_DAMON */ + +#endif /* _DAMON_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 8d1f074b615b..c002358860c8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -916,4 +916,7 @@ config PID_RESERVE We record the pid of dump task in the reserve memory, and reserve the pids before init task start. In restore process, free the reserved pids and realloc them for use. + +source "mm/damon/Kconfig" + endmenu diff --git a/mm/Makefile b/mm/Makefile index 0ae832fe9df6..4d07adb60619 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -114,6 +114,7 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o +obj-$(CONFIG_DAMON) += damon/ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_ZONE_DEVICE) += memremap.o diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig new file mode 100644 index 000000000000..d00e99ac1a15 --- /dev/null +++ b/mm/damon/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Data Access Monitoring" + +config DAMON + bool "DAMON: Data Access Monitoring Framework" + help + This builds a framework that allows kernel subsystems to monitor + access frequency of each memory region. The information can be useful + for performance-centric DRAM level memory management. + + See https://damonitor.github.io/doc/html/latest-damon/index.html for + more information. + +endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile new file mode 100644 index 000000000000..4fd2edb4becf --- /dev/null +++ b/mm/damon/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_DAMON) := core.o diff --git a/mm/damon/core.c b/mm/damon/core.c new file mode 100644 index 000000000000..651590bf49b1 --- /dev/null +++ b/mm/damon/core.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data Access Monitor + * + * Author: SeongJae Park sjpark@amazon.de + */ + +#define pr_fmt(fmt) "damon: " fmt + +#include <linux/damon.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/slab.h> + +static DEFINE_MUTEX(damon_lock); +static int nr_running_ctxs; + +struct damon_ctx *damon_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ctx->sample_interval = 5 * 1000; + ctx->aggr_interval = 100 * 1000; + ctx->primitive_update_interval = 60 * 1000 * 1000; + + ktime_get_coarse_ts64(&ctx->last_aggregation); + ctx->last_primitive_update = ctx->last_aggregation; + + mutex_init(&ctx->kdamond_lock); + + ctx->target = NULL; + + return ctx; +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + if (ctx->primitive.cleanup) + ctx->primitive.cleanup(ctx); + kfree(ctx); +} + +/** + * damon_set_attrs() - Set attributes for the monitoring. + * @ctx: monitoring context + * @sample_int: time interval between samplings + * @aggr_int: time interval between aggregations + * @primitive_upd_int: time interval between monitoring primitive updates + * + * This function should not be called while the kdamond is running. + * Every time interval is in micro-seconds. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int) +{ + ctx->sample_interval = sample_int; + ctx->aggr_interval = aggr_int; + ctx->primitive_update_interval = primitive_upd_int; + + return 0; +} + +static bool damon_kdamond_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + + return running; +} + +static int kdamond_fn(void *data); + +/* + * __damon_start() - Starts monitoring with given context. + * @ctx: monitoring context + * + * This function should be called while damon_lock is hold. + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_start(struct damon_ctx *ctx) +{ + int err = -EBUSY; + + mutex_lock(&ctx->kdamond_lock); + if (!ctx->kdamond) { + err = 0; + ctx->kdamond_stop = false; + ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", + nr_running_ctxs); + if (IS_ERR(ctx->kdamond)) { + err = PTR_ERR(ctx->kdamond); + ctx->kdamond = 0; + } + } + mutex_unlock(&ctx->kdamond_lock); + + return err; +} + +/** + * damon_start() - Starts the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to start monitoring + * @nr_ctxs: size of @ctxs + * + * This function starts a group of monitoring threads for a group of monitoring + * contexts. One thread per each context is created and run in parallel. The + * caller should handle synchronization between the threads by itself. If a + * group of threads that created by other 'damon_start()' call is currently + * running, this function does nothing but returns -EBUSY. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_start(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i; + int err = 0; + + mutex_lock(&damon_lock); + if (nr_running_ctxs) { + mutex_unlock(&damon_lock); + return -EBUSY; + } + + for (i = 0; i < nr_ctxs; i++) { + err = __damon_start(ctxs[i]); + if (err) + break; + nr_running_ctxs++; + } + mutex_unlock(&damon_lock); + + return err; +} + +/* + * __damon_stop() - Stops monitoring of given context. + * @ctx: monitoring context + * + * Return: 0 on success, negative error code otherwise. + */ +static int __damon_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); + while (damon_kdamond_running(ctx)) + usleep_range(ctx->sample_interval, + ctx->sample_interval * 2); + return 0; + } + mutex_unlock(&ctx->kdamond_lock); + + return -EPERM; +} + +/** + * damon_stop() - Stops the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to stop monitoring + * @nr_ctxs: size of @ctxs + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i, err = 0; + + for (i = 0; i < nr_ctxs; i++) { + /* nr_running_ctxs is decremented in kdamond_fn */ + err = __damon_stop(ctxs[i]); + if (err) + return err; + } + + return err; +} + +/* + * damon_check_reset_time_interval() - Check if a time interval is elapsed. + * @baseline: the time to check whether the interval has elapsed since + * @interval: the time interval (microseconds) + * + * See whether the given time interval has passed since the given baseline + * time. If so, it also updates the baseline to current time for next check. + * + * Return: true if the time interval has passed, or false otherwise. + */ +static bool damon_check_reset_time_interval(struct timespec64 *baseline, + unsigned long interval) +{ + struct timespec64 now; + + ktime_get_coarse_ts64(&now); + if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) < + interval * 1000) + return false; + *baseline = now; + return true; +} + +/* + * Check whether it is time to flush the aggregated information + */ +static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_aggregation, + ctx->aggr_interval); +} + +/* + * Check whether it is time to check and apply the target monitoring regions + * + * Returns true if it is. + */ +static bool kdamond_need_update_primitive(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_primitive_update, + ctx->primitive_update_interval); +} + +/* + * Check whether current monitoring should be stopped + * + * The monitoring is stopped when either the user requested to stop, or all + * monitoring targets are invalid. + * + * Returns true if need to stop current monitoring. + */ +static bool kdamond_need_stop(struct damon_ctx *ctx) +{ + bool stop; + + mutex_lock(&ctx->kdamond_lock); + stop = ctx->kdamond_stop; + mutex_unlock(&ctx->kdamond_lock); + if (stop) + return true; + + if (!ctx->primitive.target_valid) + return false; + + return !ctx->primitive.target_valid(ctx->target); +} + +static void set_kdamond_stop(struct damon_ctx *ctx) +{ + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond_stop = true; + mutex_unlock(&ctx->kdamond_lock); +} + +/* + * The monitoring daemon that runs as a kernel thread + */ +static int kdamond_fn(void *data) +{ + struct damon_ctx *ctx = (struct damon_ctx *)data; + + mutex_lock(&ctx->kdamond_lock); + pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); + mutex_unlock(&ctx->kdamond_lock); + + if (ctx->primitive.init) + ctx->primitive.init(ctx); + if (ctx->callback.before_start && ctx->callback.before_start(ctx)) + set_kdamond_stop(ctx); + + while (!kdamond_need_stop(ctx)) { + if (ctx->primitive.prepare_access_checks) + ctx->primitive.prepare_access_checks(ctx); + if (ctx->callback.after_sampling && + ctx->callback.after_sampling(ctx)) + set_kdamond_stop(ctx); + + usleep_range(ctx->sample_interval, ctx->sample_interval + 1); + + if (ctx->primitive.check_accesses) + ctx->primitive.check_accesses(ctx); + + if (kdamond_aggregate_interval_passed(ctx)) { + if (ctx->callback.after_aggregation && + ctx->callback.after_aggregation(ctx)) + set_kdamond_stop(ctx); + if (ctx->primitive.reset_aggregated) + ctx->primitive.reset_aggregated(ctx); + } + + if (kdamond_need_update_primitive(ctx)) { + if (ctx->primitive.update) + ctx->primitive.update(ctx); + } + } + + if (ctx->callback.before_terminate && + ctx->callback.before_terminate(ctx)) + set_kdamond_stop(ctx); + if (ctx->primitive.cleanup) + ctx->primitive.cleanup(ctx); + + pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid); + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond = NULL; + mutex_unlock(&ctx->kdamond_lock); + + mutex_lock(&damon_lock); + nr_running_ctxs--; + mutex_unlock(&damon_lock); + + do_exit(0); +}
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit f23b8eee1871a6db5c37f90831147de5426c40b7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- To avoid the unbounded increase of the overhead, DAMON groups adjacent pages that are assumed to have the same access frequencies into a region. As long as the assumption (pages in a region have the same access frequencies) is kept, only one page in the region is required to be checked. Thus, for each ``sampling interval``,
1. the 'prepare_access_checks' primitive picks one page in each region, 2. waits for one ``sampling interval``, 3. checks whether the page is accessed meanwhile, and 4. increases the access count of the region if so.
Therefore, the monitoring overhead is controllable by adjusting the number of regions. DAMON allows both the underlying primitives and user callbacks to adjust regions for the trade-off. In other words, this commit makes DAMON to use not only time-based sampling but also space-based sampling.
This scheme, however, cannot preserve the quality of the output if the assumption is not guaranteed. Next commit will address this problem.
Link: https://lkml.kernel.org/r/20210716081449.22187-3-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Leonard Foerster foersleo@amazon.de Reviewed-by: Fernand Sieber sieberf@amazon.com Acked-by: Shakeel Butt shakeelb@google.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Marco Elver elver@google.com Cc: Markus Boehme markubo@amazon.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit f23b8eee1871a6db5c37f90831147de5426c40b7) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/damon.h | 77 ++++++++++++++++++++++- mm/damon/core.c | 143 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 213 insertions(+), 7 deletions(-)
diff --git a/include/linux/damon.h b/include/linux/damon.h index 2f652602b1ea..67db309ad61b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,48 @@ #include <linux/time64.h> #include <linux/types.h>
+/** + * struct damon_addr_range - Represents an address region of [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_addr_range { + unsigned long start; + unsigned long end; +}; + +/** + * struct damon_region - Represents a monitoring target region. + * @ar: The address range of the region. + * @sampling_addr: Address of the sample for the next access check. + * @nr_accesses: Access frequency of this region. + * @list: List head for siblings. + */ +struct damon_region { + struct damon_addr_range ar; + unsigned long sampling_addr; + unsigned int nr_accesses; + struct list_head list; +}; + +/** + * struct damon_target - Represents a monitoring target. + * @id: Unique identifier for this target. + * @regions_list: Head of the monitoring target regions of this target. + * @list: List head for siblings. + * + * Each monitoring context could have multiple targets. For example, a context + * for virtual memory address spaces could have multiple target processes. The + * @id of each target should be unique among the targets of the context. For + * example, in the virtual address monitoring context, it could be a pidfd or + * an address of an mm_struct. + */ +struct damon_target { + unsigned long id; + struct list_head regions_list; + struct list_head list; +}; + struct damon_ctx;
/** @@ -36,7 +78,7 @@ struct damon_ctx; * * @init should initialize primitive-internal data structures. For example, * this could be used to construct proper monitoring target regions and link - * those to @damon_ctx.target. + * those to @damon_ctx.adaptive_targets. * @update should update the primitive-internal data structures. For example, * this could be used to update monitoring target regions for current status. * @prepare_access_checks should manipulate the monitoring regions to be @@ -130,7 +172,7 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @target: Pointer to the user-defined monitoring target. + * @region_targets: Head of monitoring targets (&damon_target) list. */ struct damon_ctx { unsigned long sample_interval; @@ -149,11 +191,40 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback;
- void *target; + struct list_head region_targets; };
+#define damon_next_region(r) \ + (container_of(r->list.next, struct damon_region, list)) + +#define damon_prev_region(r) \ + (container_of(r->list.prev, struct damon_region, list)) + +#define damon_for_each_region(r, t) \ + list_for_each_entry(r, &t->regions_list, list) + +#define damon_for_each_region_safe(r, next, t) \ + list_for_each_entry_safe(r, next, &t->regions_list, list) + +#define damon_for_each_target(t, ctx) \ + list_for_each_entry(t, &(ctx)->region_targets, list) + +#define damon_for_each_target_safe(t, next, ctx) \ + list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + #ifdef CONFIG_DAMON
+struct damon_region *damon_new_region(unsigned long start, unsigned long end); +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next); +void damon_add_region(struct damon_region *r, struct damon_target *t); +void damon_destroy_region(struct damon_region *r); + +struct damon_target *damon_new_target(unsigned long id); +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +void damon_free_target(struct damon_target *t); +void damon_destroy_target(struct damon_target *t); + struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, diff --git a/mm/damon/core.c b/mm/damon/core.c index 651590bf49b1..947486a150ce 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -15,6 +15,101 @@ static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs;
+/* + * Construct a damon_region struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_region *damon_new_region(unsigned long start, unsigned long end) +{ + struct damon_region *region; + + region = kmalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return NULL; + + region->ar.start = start; + region->ar.end = end; + region->nr_accesses = 0; + INIT_LIST_HEAD(®ion->list); + + return region; +} + +/* + * Add a region between two other regions + */ +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next) +{ + __list_add(&r->list, &prev->list, &next->list); +} + +void damon_add_region(struct damon_region *r, struct damon_target *t) +{ + list_add_tail(&r->list, &t->regions_list); +} + +static void damon_del_region(struct damon_region *r) +{ + list_del(&r->list); +} + +static void damon_free_region(struct damon_region *r) +{ + kfree(r); +} + +void damon_destroy_region(struct damon_region *r) +{ + damon_del_region(r); + damon_free_region(r); +} + +/* + * Construct a damon_target struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_target *damon_new_target(unsigned long id) +{ + struct damon_target *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return NULL; + + t->id = id; + INIT_LIST_HEAD(&t->regions_list); + + return t; +} + +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) +{ + list_add_tail(&t->list, &ctx->region_targets); +} + +static void damon_del_target(struct damon_target *t) +{ + list_del(&t->list); +} + +void damon_free_target(struct damon_target *t) +{ + struct damon_region *r, *next; + + damon_for_each_region_safe(r, next, t) + damon_free_region(r); + kfree(t); +} + +void damon_destroy_target(struct damon_target *t) +{ + damon_del_target(t); + damon_free_target(t); +} + struct damon_ctx *damon_new_ctx(void) { struct damon_ctx *ctx; @@ -32,15 +127,27 @@ struct damon_ctx *damon_new_ctx(void)
mutex_init(&ctx->kdamond_lock);
- ctx->target = NULL; + INIT_LIST_HEAD(&ctx->region_targets);
return ctx; }
-void damon_destroy_ctx(struct damon_ctx *ctx) +static void damon_destroy_targets(struct damon_ctx *ctx) { - if (ctx->primitive.cleanup) + struct damon_target *t, *next_t; + + if (ctx->primitive.cleanup) { ctx->primitive.cleanup(ctx); + return; + } + + damon_for_each_target_safe(t, next_t, ctx) + damon_destroy_target(t); +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_targets(ctx); kfree(ctx); }
@@ -217,6 +324,21 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) ctx->aggr_interval); }
+/* + * Reset the aggregated monitoring results ('nr_accesses' of each region). + */ +static void kdamond_reset_aggregated(struct damon_ctx *c) +{ + struct damon_target *t; + + damon_for_each_target(t, c) { + struct damon_region *r; + + damon_for_each_region(r, t) + r->nr_accesses = 0; + } +} + /* * Check whether it is time to check and apply the target monitoring regions * @@ -238,6 +360,7 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx) */ static bool kdamond_need_stop(struct damon_ctx *ctx) { + struct damon_target *t; bool stop;
mutex_lock(&ctx->kdamond_lock); @@ -249,7 +372,12 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) if (!ctx->primitive.target_valid) return false;
- return !ctx->primitive.target_valid(ctx->target); + damon_for_each_target(t, ctx) { + if (ctx->primitive.target_valid(t)) + return false; + } + + return true; }
static void set_kdamond_stop(struct damon_ctx *ctx) @@ -265,6 +393,8 @@ static void set_kdamond_stop(struct damon_ctx *ctx) static int kdamond_fn(void *data) { struct damon_ctx *ctx = (struct damon_ctx *)data; + struct damon_target *t; + struct damon_region *r, *next;
mutex_lock(&ctx->kdamond_lock); pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); @@ -291,6 +421,7 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); + kdamond_reset_aggregated(ctx); if (ctx->primitive.reset_aggregated) ctx->primitive.reset_aggregated(ctx); } @@ -300,6 +431,10 @@ static int kdamond_fn(void *data) ctx->primitive.update(ctx); } } + damon_for_each_target(t, ctx) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r); + }
if (ctx->callback.before_terminate && ctx->callback.before_terminate(ctx))
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit b9a6ac4e4ede4172d165c133398b93e3233b0ba7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- Even somehow the initial monitoring target regions are well constructed to fulfill the assumption (pages in same region have similar access frequencies), the data access pattern can be dynamically changed. This will result in low monitoring quality. To keep the assumption as much as possible, DAMON adaptively merges and splits each region based on their access frequency.
For each ``aggregation interval``, it compares the access frequencies of adjacent regions and merges those if the frequency difference is small. Then, after it reports and clears the aggregated access frequency of each region, it splits each region into two or three regions if the total number of regions will not exceed the user-specified maximum number of regions after the split.
In this way, DAMON provides its best-effort quality and minimal overhead while keeping the upper-bound overhead that users set.
Link: https://lkml.kernel.org/r/20210716081449.22187-4-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Leonard Foerster foersleo@amazon.de Reviewed-by: Fernand Sieber sieberf@amazon.com Acked-by: Shakeel Butt shakeelb@google.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Marco Elver elver@google.com Cc: Markus Boehme markubo@amazon.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit b9a6ac4e4ede4172d165c133398b93e3233b0ba7) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/damon.h | 30 ++++-- mm/damon/core.c | 224 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 237 insertions(+), 17 deletions(-)
diff --git a/include/linux/damon.h b/include/linux/damon.h index 67db309ad61b..ce2a84b26cd7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,9 @@ #include <linux/time64.h> #include <linux/types.h>
+/* Minimal region size. Every damon_region is aligned by this. */ +#define DAMON_MIN_REGION PAGE_SIZE + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). @@ -39,6 +42,7 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target. * @id: Unique identifier for this target. + * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * @@ -50,6 +54,7 @@ struct damon_region { */ struct damon_target { unsigned long id; + unsigned int nr_regions; struct list_head regions_list; struct list_head list; }; @@ -85,6 +90,8 @@ struct damon_ctx; * prepared for the next access check. * @check_accesses should check the accesses to each region that made after the * last preparation and update the number of observed accesses of each region. + * It should also return max number of observed accesses that made as a result + * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. * @target_valid should check whether the target is still valid for the @@ -95,7 +102,7 @@ struct damon_primitive { void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); - void (*check_accesses)(struct damon_ctx *context); + unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); @@ -172,7 +179,9 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @region_targets: Head of monitoring targets (&damon_target) list. + * @min_nr_regions: The minimum number of adaptive monitoring regions. + * @max_nr_regions: The maximum number of adaptive monitoring regions. + * @adaptive_targets: Head of monitoring targets (&damon_target) list. */ struct damon_ctx { unsigned long sample_interval; @@ -191,7 +200,9 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback;
- struct list_head region_targets; + unsigned long min_nr_regions; + unsigned long max_nr_regions; + struct list_head adaptive_targets; };
#define damon_next_region(r) \ @@ -207,28 +218,31 @@ struct damon_ctx { list_for_each_entry_safe(r, next, &t->regions_list, list)
#define damon_for_each_target(t, ctx) \ - list_for_each_entry(t, &(ctx)->region_targets, list) + list_for_each_entry(t, &(ctx)->adaptive_targets, list)
#define damon_for_each_target_safe(t, next, ctx) \ - list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list)
#ifdef CONFIG_DAMON
struct damon_region *damon_new_region(unsigned long start, unsigned long end); inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next); + struct damon_region *prev, struct damon_region *next, + struct damon_target *t); void damon_add_region(struct damon_region *r, struct damon_target *t); -void damon_destroy_region(struct damon_region *r); +void damon_destroy_region(struct damon_region *r, struct damon_target *t);
struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); +unsigned int damon_nr_regions(struct damon_target *t);
struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int); + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg);
int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/core.c b/mm/damon/core.c index 947486a150ce..28a2c78914fa 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,8 +10,12 @@ #include <linux/damon.h> #include <linux/delay.h> #include <linux/kthread.h> +#include <linux/random.h> #include <linux/slab.h>
+/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs;
@@ -40,19 +44,23 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) * Add a region between two other regions */ inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next) + struct damon_region *prev, struct damon_region *next, + struct damon_target *t) { __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; }
void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); + t->nr_regions++; }
-static void damon_del_region(struct damon_region *r) +static void damon_del_region(struct damon_region *r, struct damon_target *t) { list_del(&r->list); + t->nr_regions--; }
static void damon_free_region(struct damon_region *r) @@ -60,9 +68,9 @@ static void damon_free_region(struct damon_region *r) kfree(r); }
-void damon_destroy_region(struct damon_region *r) +void damon_destroy_region(struct damon_region *r, struct damon_target *t) { - damon_del_region(r); + damon_del_region(r, t); damon_free_region(r); }
@@ -80,6 +88,7 @@ struct damon_target *damon_new_target(unsigned long id) return NULL;
t->id = id; + t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list);
return t; @@ -87,7 +96,7 @@ struct damon_target *damon_new_target(unsigned long id)
void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) { - list_add_tail(&t->list, &ctx->region_targets); + list_add_tail(&t->list, &ctx->adaptive_targets); }
static void damon_del_target(struct damon_target *t) @@ -110,6 +119,11 @@ void damon_destroy_target(struct damon_target *t) damon_free_target(t); }
+unsigned int damon_nr_regions(struct damon_target *t) +{ + return t->nr_regions; +} + struct damon_ctx *damon_new_ctx(void) { struct damon_ctx *ctx; @@ -127,7 +141,10 @@ struct damon_ctx *damon_new_ctx(void)
mutex_init(&ctx->kdamond_lock);
- INIT_LIST_HEAD(&ctx->region_targets); + ctx->min_nr_regions = 10; + ctx->max_nr_regions = 1000; + + INIT_LIST_HEAD(&ctx->adaptive_targets);
return ctx; } @@ -157,6 +174,8 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * @sample_int: time interval between samplings * @aggr_int: time interval between aggregations * @primitive_upd_int: time interval between monitoring primitive updates + * @min_nr_reg: minimal number of regions + * @max_nr_reg: maximum number of regions * * This function should not be called while the kdamond is running. * Every time interval is in micro-seconds. @@ -164,15 +183,49 @@ void damon_destroy_ctx(struct damon_ctx *ctx) * Return: 0 on success, negative error code otherwise. */ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int) + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg) { + if (min_nr_reg < 3) { + pr_err("min_nr_regions (%lu) must be at least 3\n", + min_nr_reg); + return -EINVAL; + } + if (min_nr_reg > max_nr_reg) { + pr_err("invalid nr_regions. min (%lu) > max (%lu)\n", + min_nr_reg, max_nr_reg); + return -EINVAL; + } + ctx->sample_interval = sample_int; ctx->aggr_interval = aggr_int; ctx->primitive_update_interval = primitive_upd_int; + ctx->min_nr_regions = min_nr_reg; + ctx->max_nr_regions = max_nr_reg;
return 0; }
+/* Returns the size upper limit for each monitoring region */ +static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sz = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + sz += r->ar.end - r->ar.start; + } + + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + return sz; +} + static bool damon_kdamond_running(struct damon_ctx *ctx) { bool running; @@ -339,6 +392,150 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } }
+#define sz_damon_region(r) (r->ar.end - r->ar.start) + +/* + * Merge two adjacent regions into one region + */ +static void damon_merge_two_regions(struct damon_target *t, + struct damon_region *l, struct damon_region *r) +{ + unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + + l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / + (sz_l + sz_r); + l->ar.end = r->ar.end; + damon_destroy_region(r, t); +} + +#define diff_of(a, b) (a > b ? a - b : b - a) + +/* + * Merge adjacent regions having similar access frequencies + * + * t target affected by this merge operation + * thres '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + */ +static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, + unsigned long sz_limit) +{ + struct damon_region *r, *prev = NULL, *next; + + damon_for_each_region_safe(r, next, t) { + if (prev && prev->ar.end == r->ar.start && + diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_merge_two_regions(t, prev, r); + else + prev = r; + } +} + +/* + * Merge adjacent regions having similar access frequencies + * + * threshold '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + * + * This function merges monitoring target regions which are adjacent and their + * access frequencies are similar. This is for minimizing the monitoring + * overhead under the dynamically changeable access pattern. If a merge was + * unnecessarily made, later 'kdamond_split_regions()' will revert it. + */ +static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, + unsigned long sz_limit) +{ + struct damon_target *t; + + damon_for_each_target(t, c) + damon_merge_regions_of(t, threshold, sz_limit); +} + +/* + * Split a region in two + * + * r the region to be split + * sz_r size of the first sub-region that will be made + */ +static void damon_split_region_at(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + unsigned long sz_r) +{ + struct damon_region *new; + + new = damon_new_region(r->ar.start + sz_r, r->ar.end); + if (!new) + return; + + r->ar.end = new->ar.start; + + damon_insert_region(new, r, damon_next_region(r), t); +} + +/* Split every region in the given target into 'nr_subs' regions */ +static void damon_split_regions_of(struct damon_ctx *ctx, + struct damon_target *t, int nr_subs) +{ + struct damon_region *r, *next; + unsigned long sz_region, sz_sub = 0; + int i; + + damon_for_each_region_safe(r, next, t) { + sz_region = r->ar.end - r->ar.start; + + for (i = 0; i < nr_subs - 1 && + sz_region > 2 * DAMON_MIN_REGION; i++) { + /* + * Randomly select size of left sub-region to be at + * least 10 percent and at most 90% of original region + */ + sz_sub = ALIGN_DOWN(damon_rand(1, 10) * + sz_region / 10, DAMON_MIN_REGION); + /* Do not allow blank region */ + if (sz_sub == 0 || sz_sub >= sz_region) + continue; + + damon_split_region_at(ctx, t, r, sz_sub); + sz_region = sz_sub; + } + } +} + +/* + * Split every target region into randomly-sized small regions + * + * This function splits every target region into random-sized small regions if + * current total number of the regions is equal or smaller than half of the + * user-specified maximum number of regions. This is for maximizing the + * monitoring accuracy under the dynamically changeable access patterns. If a + * split was unnecessarily made, later 'kdamond_merge_regions()' will revert + * it. + */ +static void kdamond_split_regions(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_regions = 0; + static unsigned int last_nr_regions; + int nr_subregions = 2; + + damon_for_each_target(t, ctx) + nr_regions += damon_nr_regions(t); + + if (nr_regions > ctx->max_nr_regions / 2) + return; + + /* Maybe the middle of the region has different access frequency */ + if (last_nr_regions == nr_regions && + nr_regions < ctx->max_nr_regions / 3) + nr_subregions = 3; + + damon_for_each_target(t, ctx) + damon_split_regions_of(ctx, t, nr_subregions); + + last_nr_regions = nr_regions; +} + /* * Check whether it is time to check and apply the target monitoring regions * @@ -395,6 +592,8 @@ static int kdamond_fn(void *data) struct damon_ctx *ctx = (struct damon_ctx *)data; struct damon_target *t; struct damon_region *r, *next; + unsigned int max_nr_accesses = 0; + unsigned long sz_limit = 0;
mutex_lock(&ctx->kdamond_lock); pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); @@ -405,6 +604,8 @@ static int kdamond_fn(void *data) if (ctx->callback.before_start && ctx->callback.before_start(ctx)) set_kdamond_stop(ctx);
+ sz_limit = damon_region_sz_limit(ctx); + while (!kdamond_need_stop(ctx)) { if (ctx->primitive.prepare_access_checks) ctx->primitive.prepare_access_checks(ctx); @@ -415,13 +616,17 @@ static int kdamond_fn(void *data) usleep_range(ctx->sample_interval, ctx->sample_interval + 1);
if (ctx->primitive.check_accesses) - ctx->primitive.check_accesses(ctx); + max_nr_accesses = ctx->primitive.check_accesses(ctx);
if (kdamond_aggregate_interval_passed(ctx)) { + kdamond_merge_regions(ctx, + max_nr_accesses / 10, + sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); kdamond_reset_aggregated(ctx); + kdamond_split_regions(ctx); if (ctx->primitive.reset_aggregated) ctx->primitive.reset_aggregated(ctx); } @@ -429,11 +634,12 @@ static int kdamond_fn(void *data) if (kdamond_need_update_primitive(ctx)) { if (ctx->primitive.update) ctx->primitive.update(ctx); + sz_limit = damon_region_sz_limit(ctx); } } damon_for_each_target(t, ctx) { damon_for_each_region_safe(r, next, t) - damon_destroy_region(r); + damon_destroy_region(r, t); }
if (ctx->callback.before_terminate &&
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit 1c676e0d9b1a59b98885b24a0e16a81fe4cc8301 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- PG_idle and PG_young allow the two PTE Accessed bit users, Idle Page Tracking and the reclaim logic concurrently work while not interfering with each other. That is, when they need to clear the Accessed bit, they set PG_young to represent the previous state of the bit, respectively. And when they need to read the bit, if the bit is cleared, they further read the PG_young to know whether the other has cleared the bit meanwhile or not.
For yet another user of the PTE Accessed bit, we could add another page flag, or extend the mechanism to use the flags. For the DAMON usecase, however, we don't need to do that just yet. IDLE_PAGE_TRACKING and DAMON are mutually exclusive, so there's only ever going to be one user of the current set of flags.
In this commit, we split out the CONFIG options to allow for the use of PG_young and PG_idle outside of idle page tracking.
In the next commit, DAMON's reference implementation of the virtual memory address space monitoring primitives will use it.
[sjpark@amazon.de: set PAGE_EXTENSION for non-64BIT] Link: https://lkml.kernel.org/r/20210806095153.6444-1-sj38.park@gmail.com [akpm@linux-foundation.org: tweak Kconfig text] [sjpark@amazon.de: hide PAGE_IDLE_FLAG from users] Link: https://lkml.kernel.org/r/20210813081238.34705-1-sj38.park@gmail.com
Link: https://lkml.kernel.org/r/20210716081449.22187-5-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Shakeel Butt shakeelb@google.com Reviewed-by: Fernand Sieber sieberf@amazon.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Leonard Foerster foersleo@amazon.de Cc: Marco Elver elver@google.com Cc: Markus Boehme markubo@amazon.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 1c676e0d9b1a59b98885b24a0e16a81fe4cc8301) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page-flags.h | 4 ++-- include/linux/page_ext.h | 2 +- include/linux/page_idle.h | 6 +++--- include/trace/events/mmflags.h | 2 +- mm/Kconfig | 10 +++++++++- mm/page_ext.c | 12 +++++++++++- mm/page_idle.c | 10 ---------- 7 files changed, 27 insertions(+), 19 deletions(-)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5fed83d29ea8..56d9a03119a3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -132,7 +132,7 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif @@ -438,7 +438,7 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) TESTCLEARFLAG(Young, young, PF_ANY) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index aff81ba31bd8..fabb2e1e087f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -19,7 +19,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 1e894d34bdce..d8a6aecf99cb 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -6,7 +6,7 @@ #include <linux/page-flags.h> #include <linux/page_ext.h>
-#ifdef CONFIG_IDLE_PAGE_TRACKING +#ifdef CONFIG_PAGE_IDLE_FLAG
#ifdef CONFIG_64BIT static inline bool page_is_young(struct page *page) @@ -106,7 +106,7 @@ static inline void clear_page_idle(struct page *page) } #endif /* CONFIG_64BIT */
-#else /* !CONFIG_IDLE_PAGE_TRACKING */ +#else /* !CONFIG_PAGE_IDLE_FLAG */
static inline bool page_is_young(struct page *page) { @@ -135,6 +135,6 @@ static inline void clear_page_idle(struct page *page) { }
-#endif /* CONFIG_IDLE_PAGE_TRACKING */ +#endif /* CONFIG_PAGE_IDLE_FLAG */
#endif /* _LINUX_MM_PAGE_IDLE_H */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 673c6f590800..792e0a2fa775 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -73,7 +73,7 @@ #define IF_HAVE_PG_HWPOISON(flag,string) #endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) #define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string} #else #define IF_HAVE_PG_IDLE(flag,string) diff --git a/mm/Kconfig b/mm/Kconfig index c002358860c8..59fdace319fd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -788,10 +788,18 @@ config DEFERRED_STRUCT_PAGE_INIT lifetime of the system until these kthreads finish the initialisation.
+config PAGE_IDLE_FLAG + bool + select PAGE_EXTENSION if !64BIT + help + This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed + bit writers can set the state of the bit in the flags so that PTE + Accessed bit readers may avoid disturbance. + config IDLE_PAGE_TRACKING bool "Enable idle page tracking" depends on SYSFS && MMU - select PAGE_EXTENSION if !64BIT + select PAGE_IDLE_FLAG help This feature allows to estimate the amount of user pages that have not been touched during a given period of time. This information can diff --git a/mm/page_ext.c b/mm/page_ext.c index df6f74aac8e1..8e59da0f4367 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -58,11 +58,21 @@ * can utilize this callback to initialize the state of it correctly. */
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) +static bool need_page_idle(void) +{ + return true; +} +struct page_ext_operations page_idle_ops = { + .need = need_page_idle, +}; +#endif + static struct page_ext_operations *page_ext_ops[] = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif }; diff --git a/mm/page_idle.c b/mm/page_idle.c index 64e5344a992c..edead6a8a5f9 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -207,16 +207,6 @@ static const struct attribute_group page_idle_attr_group = { .name = "page_idle", };
-#ifndef CONFIG_64BIT -static bool need_page_idle(void) -{ - return true; -} -struct page_ext_operations page_idle_ops = { - .need = need_page_idle, -}; -#endif - static int __init page_idle_init(void) { int err;
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit 3f49584b262cf8f42b25f4c1ad9f5bfd3bdc1bca category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- This commit introduces a reference implementation of the address space specific low level primitives for the virtual address space, so that users of DAMON can easily monitor the data accesses on virtual address spaces of specific processes by simply configuring the implementation to be used by DAMON.
The low level primitives for the fundamental access monitoring are defined in two parts:
1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space.
The reference implementation for the virtual address space does the works as below.
PTE Accessed-bit Based Access Check -----------------------------------
The implementation uses PTE Accessed-bit for basic access checks. That is, it clears the bit for the next sampling target page and checks whether it is set again after one sampling period. This could disturb the reclaim logic. DAMON uses ``PG_idle`` and ``PG_young`` page flags to solve the conflict, as Idle page tracking does.
VMA-based Target Address Range Construction -------------------------------------------
Only small parts in the super-huge virtual address space of the processes are mapped to physical memory and accessed. Thus, tracking the unmapped address regions is just wasteful. However, because DAMON can deal with some level of noise using the adaptive regions adjustment mechanism, tracking every mapping is not strictly required but could even incur a high overhead in some cases. That said, too huge unmapped areas inside the monitoring target should be removed to not take the time for the adaptive mechanism.
For the reason, this implementation converts the complex mappings to three distinct regions that cover every mapped area of the address space. Also, the two gaps between the three regions are the two biggest unmapped areas in the given address space. The two biggest unmapped areas would be the gap between the heap and the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed region and the stack in most of the cases. Because these gaps are exceptionally huge in usual address spaces, excluding these will be sufficient to make a reasonable trade-off. Below shows this in detail::
<heap> <BIG UNMAPPED REGION 1> <uppermost mmap()-ed region> (small mmap()-ed regions and munmap()-ed regions) <lowermost mmap()-ed region> <BIG UNMAPPED REGION 2> <stack>
[akpm@linux-foundation.org: mm/damon/vaddr.c needs highmem.h for kunmap_atomic()] [sjpark@amazon.de: remove unnecessary PAGE_EXTENSION setup] Link: https://lkml.kernel.org/r/20210806095153.6444-2-sj38.park@gmail.com [sjpark@amazon.de: safely walk page table] Link: https://lkml.kernel.org/r/20210831161800.29419-1-sj38.park@gmail.com
Link: https://lkml.kernel.org/r/20210716081449.22187-6-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Leonard Foerster foersleo@amazon.de Reviewed-by: Fernand Sieber sieberf@amazon.com Acked-by: Shakeel Butt shakeelb@google.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Marco Elver elver@google.com Cc: Markus Boehme markubo@amazon.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 3f49584b262cf8f42b25f4c1ad9f5bfd3bdc1bca) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/damon.h | 13 + mm/damon/Kconfig | 8 + mm/damon/Makefile | 1 + mm/damon/vaddr.c | 665 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 687 insertions(+) create mode 100644 mm/damon/vaddr.c
diff --git a/include/linux/damon.h b/include/linux/damon.h index ce2a84b26cd7..edb350e52b93 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -249,4 +249,17 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
#endif /* CONFIG_DAMON */
+#ifdef CONFIG_DAMON_VADDR + +/* Monitoring primitives for virtual memory address spaces */ +void damon_va_init(struct damon_ctx *ctx); +void damon_va_update(struct damon_ctx *ctx); +void damon_va_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_va_check_accesses(struct damon_ctx *ctx); +bool damon_va_target_valid(void *t); +void damon_va_cleanup(struct damon_ctx *ctx); +void damon_va_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_VADDR */ + #endif /* _DAMON_H */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index d00e99ac1a15..5cbb5db54158 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -12,4 +12,12 @@ config DAMON See https://damonitor.github.io/doc/html/latest-damon/index.html for more information.
+config DAMON_VADDR + bool "Data access monitoring primitives for virtual address spaces" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring primitives for DAMON + that works for virtual address spaces. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 4fd2edb4becf..6ebbd08aed67 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_DAMON) := core.o +obj-$(CONFIG_DAMON_VADDR) += vaddr.o diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c new file mode 100644 index 000000000000..897aa8cf96c8 --- /dev/null +++ b/mm/damon/vaddr.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for Virtual Address Spaces + * + * Author: SeongJae Park sjpark@amazon.de + */ + +#define pr_fmt(fmt) "damon-va: " fmt + +#include <linux/damon.h> +#include <linux/hugetlb.h> +#include <linux/mm.h> +#include <linux/mmu_notifier.h> +#include <linux/highmem.h> +#include <linux/page_idle.h> +#include <linux/pagewalk.h> +#include <linux/random.h> +#include <linux/sched/mm.h> +#include <linux/slab.h> + +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + +/* + * 't->id' should be the pointer to the relevant 'struct pid' having reference + * count. Caller must put the returned task, unless it is NULL. + */ +#define damon_get_task_struct(t) \ + (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) + +/* + * Get the mm_struct of the given target + * + * Caller _must_ put the mm_struct after use, unless it is NULL. + * + * Returns the mm_struct of the target on success, NULL on failure + */ +static struct mm_struct *damon_get_mm(struct damon_target *t) +{ + struct task_struct *task; + struct mm_struct *mm; + + task = damon_get_task_struct(t); + if (!task) + return NULL; + + mm = get_task_mm(task); + put_task_struct(task); + return mm; +} + +/* + * Functions for the initial monitoring target regions construction + */ + +/* + * Size-evenly split a region into 'nr_pieces' small regions + * + * Returns 0 on success, or negative error code otherwise. + */ +static int damon_va_evenly_split_region(struct damon_target *t, + struct damon_region *r, unsigned int nr_pieces) +{ + unsigned long sz_orig, sz_piece, orig_end; + struct damon_region *n = NULL, *next; + unsigned long start; + + if (!r || !nr_pieces) + return -EINVAL; + + orig_end = r->ar.end; + sz_orig = r->ar.end - r->ar.start; + sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); + + if (!sz_piece) + return -EINVAL; + + r->ar.end = r->ar.start + sz_piece; + next = damon_next_region(r); + for (start = r->ar.end; start + sz_piece <= orig_end; + start += sz_piece) { + n = damon_new_region(start, start + sz_piece); + if (!n) + return -ENOMEM; + damon_insert_region(n, r, next, t); + r = n; + } + /* complement last region for possible rounding error */ + if (n) + n->ar.end = orig_end; + + return 0; +} + +static unsigned long sz_range(struct damon_addr_range *r) +{ + return r->end - r->start; +} + +static void swap_ranges(struct damon_addr_range *r1, + struct damon_addr_range *r2) +{ + struct damon_addr_range tmp; + + tmp = *r1; + *r1 = *r2; + *r2 = tmp; +} + +/* + * Find three regions separated by two biggest unmapped regions + * + * vma the head vma of the target address space + * regions an array of three address ranges that results will be saved + * + * This function receives an address space and finds three regions in it which + * separated by the two biggest unmapped regions in the space. Please refer to + * below comments of '__damon_va_init_regions()' function to know why this is + * necessary. + * + * Returns 0 if success, or negative error code otherwise. + */ +static int __damon_va_three_regions(struct vm_area_struct *vma, + struct damon_addr_range regions[3]) +{ + struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; + struct vm_area_struct *last_vma = NULL; + unsigned long start = 0; + struct rb_root rbroot; + + /* Find two biggest gaps so that first_gap > second_gap > others */ + for (; vma; vma = vma->vm_next) { + if (!last_vma) { + start = vma->vm_start; + goto next; + } + + if (vma->rb_subtree_gap <= sz_range(&second_gap)) { + rbroot.rb_node = &vma->vm_rb; + vma = rb_entry(rb_last(&rbroot), + struct vm_area_struct, vm_rb); + goto next; + } + + gap.start = last_vma->vm_end; + gap.end = vma->vm_start; + if (sz_range(&gap) > sz_range(&second_gap)) { + swap_ranges(&gap, &second_gap); + if (sz_range(&second_gap) > sz_range(&first_gap)) + swap_ranges(&second_gap, &first_gap); + } +next: + last_vma = vma; + } + + if (!sz_range(&second_gap) || !sz_range(&first_gap)) + return -EINVAL; + + /* Sort the two biggest gaps by address */ + if (first_gap.start > second_gap.start) + swap_ranges(&first_gap, &second_gap); + + /* Store the result */ + regions[0].start = ALIGN(start, DAMON_MIN_REGION); + regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); + regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); + regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); + regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); + regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); + + return 0; +} + +/* + * Get the three regions in the given target (task) + * + * Returns 0 on success, negative error code otherwise. + */ +static int damon_va_three_regions(struct damon_target *t, + struct damon_addr_range regions[3]) +{ + struct mm_struct *mm; + int rc; + + mm = damon_get_mm(t); + if (!mm) + return -EINVAL; + + mmap_read_lock(mm); + rc = __damon_va_three_regions(mm->mmap, regions); + mmap_read_unlock(mm); + + mmput(mm); + return rc; +} + +/* + * Initialize the monitoring target regions for the given target (task) + * + * t the given target + * + * Because only a number of small portions of the entire address space + * is actually mapped to the memory and accessed, monitoring the unmapped + * regions is wasteful. That said, because we can deal with small noises, + * tracking every mapping is not strictly required but could even incur a high + * overhead if the mapping frequently changes or the number of mappings is + * high. The adaptive regions adjustment mechanism will further help to deal + * with the noise by simply identifying the unmapped areas as a region that + * has no access. Moreover, applying the real mappings that would have many + * unmapped areas inside will make the adaptive mechanism quite complex. That + * said, too huge unmapped areas inside the monitoring target should be removed + * to not take the time for the adaptive mechanism. + * + * For the reason, we convert the complex mappings to three distinct regions + * that cover every mapped area of the address space. Also the two gaps + * between the three regions are the two biggest unmapped areas in the given + * address space. In detail, this function first identifies the start and the + * end of the mappings and the two biggest unmapped areas of the address space. + * Then, it constructs the three regions as below: + * + * [mappings[0]->start, big_two_unmapped_areas[0]->start) + * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start) + * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end) + * + * As usual memory map of processes is as below, the gap between the heap and + * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed + * region and the stack will be two biggest unmapped regions. Because these + * gaps are exceptionally huge areas in usual address space, excluding these + * two biggest unmapped regions will be sufficient to make a trade-off. + * + * <heap> + * <BIG UNMAPPED REGION 1> + * <uppermost mmap()-ed region> + * (other mmap()-ed regions and small unmapped regions) + * <lowermost mmap()-ed region> + * <BIG UNMAPPED REGION 2> + * <stack> + */ +static void __damon_va_init_regions(struct damon_ctx *ctx, + struct damon_target *t) +{ + struct damon_region *r; + struct damon_addr_range regions[3]; + unsigned long sz = 0, nr_pieces; + int i; + + if (damon_va_three_regions(t, regions)) { + pr_err("Failed to get three regions of target %lu\n", t->id); + return; + } + + for (i = 0; i < 3; i++) + sz += regions[i].end - regions[i].start; + if (ctx->min_nr_regions) + sz /= ctx->min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + /* Set the initial three regions of the target */ + for (i = 0; i < 3; i++) { + r = damon_new_region(regions[i].start, regions[i].end); + if (!r) { + pr_err("%d'th init region creation failed\n", i); + return; + } + damon_add_region(r, t); + + nr_pieces = (regions[i].end - regions[i].start) / sz; + damon_va_evenly_split_region(t, r, nr_pieces); + } +} + +/* Initialize '->regions_list' of every target (task) */ +void damon_va_init(struct damon_ctx *ctx) +{ + struct damon_target *t; + + damon_for_each_target(t, ctx) { + /* the user may set the target regions as they want */ + if (!damon_nr_regions(t)) + __damon_va_init_regions(ctx, t); + } +} + +/* + * Functions for the dynamic monitoring target regions update + */ + +/* + * Check whether a region is intersecting an address range + * + * Returns true if it is. + */ +static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +{ + return !(r->ar.end <= re->start || re->end <= r->ar.start); +} + +/* + * Update damon regions for the three big regions of the given target + * + * t the given target + * bregions the three big regions of the target + */ +static void damon_va_apply_three_regions(struct damon_target *t, + struct damon_addr_range bregions[3]) +{ + struct damon_region *r, *next; + unsigned int i = 0; + + /* Remove regions which are not in the three big regions now */ + damon_for_each_region_safe(r, next, t) { + for (i = 0; i < 3; i++) { + if (damon_intersect(r, &bregions[i])) + break; + } + if (i == 3) + damon_destroy_region(r, t); + } + + /* Adjust intersecting regions to fit with the three big regions */ + for (i = 0; i < 3; i++) { + struct damon_region *first = NULL, *last; + struct damon_region *newr; + struct damon_addr_range *br; + + br = &bregions[i]; + /* Get the first and last regions which intersects with br */ + damon_for_each_region(r, t) { + if (damon_intersect(r, br)) { + if (!first) + first = r; + last = r; + } + if (r->ar.start >= br->end) + break; + } + if (!first) { + /* no damon_region intersects with this big region */ + newr = damon_new_region( + ALIGN_DOWN(br->start, + DAMON_MIN_REGION), + ALIGN(br->end, DAMON_MIN_REGION)); + if (!newr) + continue; + damon_insert_region(newr, damon_prev_region(r), r, t); + } else { + first->ar.start = ALIGN_DOWN(br->start, + DAMON_MIN_REGION); + last->ar.end = ALIGN(br->end, DAMON_MIN_REGION); + } + } +} + +/* + * Update regions for current memory mappings + */ +void damon_va_update(struct damon_ctx *ctx) +{ + struct damon_addr_range three_regions[3]; + struct damon_target *t; + + damon_for_each_target(t, ctx) { + if (damon_va_three_regions(t, three_regions)) + continue; + damon_va_apply_three_regions(t, three_regions); + } +} + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. + */ +static struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, + unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (pte_young(*pte)) { + referenced = true; + *pte = pte_mkold(*pte); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, + unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmd_young(*pmd)) { + referenced = true; + *pmd = pmd_mkold(*pmd); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + ((1UL) << HPAGE_PMD_SHIFT))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} + +static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + + if (pmd_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (pmd_huge(*pmd)) { + damon_pmdp_mkold(pmd, walk->mm, addr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return 0; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + damon_ptep_mkold(pte, walk->mm, addr); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +static struct mm_walk_ops damon_mkold_ops = { + .pmd_entry = damon_mkold_pmd_entry, +}; + +static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); + mmap_read_unlock(mm); +} + +/* + * Functions for the access checking of the regions + */ + +static void damon_va_prepare_access_check(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_va_mkold(mm, r->sampling_addr); +} + +void damon_va_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) + damon_va_prepare_access_check(ctx, mm, r); + mmput(mm); + } +} + +struct damon_young_walk_private { + unsigned long *page_sz; + bool young; +}; + +static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + struct page *page; + struct damon_young_walk_private *priv = walk->private; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (!pmd_huge(*pmd)) { + spin_unlock(ptl); + goto regular_page; + } + page = damon_get_page(pmd_pfn(*pmd)); + if (!page) + goto huge_out; + if (pmd_young(*pmd) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, + addr)) { + *priv->page_sz = ((1UL) << HPAGE_PMD_SHIFT); + priv->young = true; + } + put_page(page); +huge_out: + spin_unlock(ptl); + return 0; + } + +regular_page: +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return -EINVAL; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + page = damon_get_page(pte_pfn(*pte)); + if (!page) + goto out; + if (pte_young(*pte) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = PAGE_SIZE; + priv->young = true; + } + put_page(page); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +static struct mm_walk_ops damon_young_ops = { + .pmd_entry = damon_young_pmd_entry, +}; + +static bool damon_va_young(struct mm_struct *mm, unsigned long addr, + unsigned long *page_sz) +{ + struct damon_young_walk_private arg = { + .page_sz = page_sz, + .young = false, + }; + + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); + mmap_read_unlock(mm); + return arg.young; +} + +/* + * Check whether the region was accessed after the last preparation + * + * mm 'mm_struct' for the given virtual address space + * r the region to be checked + */ +static void damon_va_check_access(struct damon_ctx *ctx, + struct mm_struct *mm, struct damon_region *r) +{ + static struct mm_struct *last_mm; + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz))) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_mm = mm; + last_addr = r->sampling_addr; +} + +unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) { + damon_va_check_access(ctx, mm, r); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + } + mmput(mm); + } + + return max_nr_accesses; +} + +/* + * Functions for the target validity check and cleanup + */ + +bool damon_va_target_valid(void *target) +{ + struct damon_target *t = target; + struct task_struct *task; + + task = damon_get_task_struct(t); + if (task) { + put_task_struct(task); + return true; + } + + return false; +} + +void damon_va_set_primitives(struct damon_ctx *ctx) +{ + ctx->primitive.init = damon_va_init; + ctx->primitive.update = damon_va_update; + ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks; + ctx->primitive.check_accesses = damon_va_check_accesses; + ctx->primitive.reset_aggregated = NULL; + ctx->primitive.target_valid = damon_va_target_valid; + ctx->primitive.cleanup = NULL; +}
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit 2fcb93629ad8911c846cdc44521c746e53cc4e6d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- This commit adds a tracepoint for DAMON. It traces the monitoring results of each region for each aggregation interval. Using this, DAMON can easily integrated with tracepoints supporting tools such as perf.
Link: https://lkml.kernel.org/r/20210716081449.22187-7-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Leonard Foerster foersleo@amazon.de Reviewed-by: Steven Rostedt (VMware) rostedt@goodmis.org Reviewed-by: Fernand Sieber sieberf@amazon.com Acked-by: Shakeel Butt shakeelb@google.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Marco Elver elver@google.com Cc: Markus Boehme markubo@amazon.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shuah Khan shuah@kernel.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 2fcb93629ad8911c846cdc44521c746e53cc4e6d) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/events/damon.h | 43 ++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 7 +++++- 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/damon.h
diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h new file mode 100644 index 000000000000..2f422f4f1fb9 --- /dev/null +++ b/include/trace/events/damon.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM damon + +#if !defined(_TRACE_DAMON_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DAMON_H + +#include <linux/damon.h> +#include <linux/types.h> +#include <linux/tracepoint.h> + +TRACE_EVENT(damon_aggregated, + + TP_PROTO(struct damon_target *t, struct damon_region *r, + unsigned int nr_regions), + + TP_ARGS(t, r, nr_regions), + + TP_STRUCT__entry( + __field(unsigned long, target_id) + __field(unsigned int, nr_regions) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, nr_accesses) + ), + + TP_fast_assign( + __entry->target_id = t->id; + __entry->nr_regions = nr_regions; + __entry->start = r->ar.start; + __entry->end = r->ar.end; + __entry->nr_accesses = r->nr_accesses; + ), + + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u", + __entry->target_id, __entry->nr_regions, + __entry->start, __entry->end, __entry->nr_accesses) +); + +#endif /* _TRACE_DAMON_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/mm/damon/core.c b/mm/damon/core.c index 28a2c78914fa..ee24d64e8019 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -13,6 +13,9 @@ #include <linux/random.h> #include <linux/slab.h>
+#define CREATE_TRACE_POINTS +#include <trace/events/damon.h> + /* Get a random number in [l, r) */ #define damon_rand(l, r) (l + prandom_u32_max(r - l))
@@ -387,8 +390,10 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_target(t, c) { struct damon_region *r;
- damon_for_each_region(r, t) + damon_for_each_region(r, t) { + trace_damon_aggregated(t, r, damon_nr_regions(t)); r->nr_accesses = 0; + } } }
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit 4bc05954d0076655cfaf6f0135585bdc20cd6b11 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- DAMON is designed to be used by kernel space code such as the memory management subsystems, and therefore it provides only kernel space API. That said, letting the user space control DAMON could provide some benefits to them. For example, it will allow user space to analyze their specific workloads and make their own special optimizations.
For such cases, this commit implements a simple DAMON application kernel module, namely 'damon-dbgfs', which merely wraps the DAMON api and exports those to the user space via the debugfs.
'damon-dbgfs' exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under its debugfs directory, ``<debugfs>/damon/``.
Attributes ----------
Users can read and write the ``sampling interval``, ``aggregation interval``, ``regions update interval``, and min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. For example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10, 1000 and check it again::
# cd <debugfs>/damon # echo 5000 100000 1000000 10 1000 > attrs # cat attrs 5000 100000 1000000 10 1000
Target IDs ----------
Some types of address spaces supports multiple monitoring target. For example, the virtual memory address spaces monitoring can have multiple processes as the monitoring targets. Users can set the targets by writing relevant id values of the targets to, and get the ids of the current targets by reading from the ``target_ids`` file. In case of the virtual address spaces monitoring, the values should be pids of the monitoring target processes. For example, below commands set processes having pids 42 and 4242 as the monitoring targets and check it again::
# cd <debugfs>/damon # echo 42 4242 > target_ids # cat target_ids 42 4242
Note that setting the target ids doesn't start the monitoring.
Turning On/Off --------------
Setting the files as described above doesn't incur effect unless you explicitly start the monitoring. You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on`` file. Writing ``on`` to the file starts the monitoring of the targets with the attributes. Writing ``off`` to the file stops those. DAMON also stops if every targets are invalidated (in case of the virtual memory monitoring, target processes are invalidated when terminated). Below example commands turn on, off, and check the status of DAMON::
# cd <debugfs>/damon # echo on > monitor_on # echo off > monitor_on # cat monitor_on off
Please note that you cannot write to the above-mentioned debugfs files while the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned.
[akpm@linux-foundation.org: remove unneeded "alloc failed" printks] [akpm@linux-foundation.org: replace macro with static inline]
Link: https://lkml.kernel.org/r/20210716081449.22187-8-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Leonard Foerster foersleo@amazon.de Reviewed-by: Fernand Sieber sieberf@amazon.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Marco Elver elver@google.com Cc: Markus Boehme markubo@amazon.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shakeel Butt shakeelb@google.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 4bc05954d0076655cfaf6f0135585bdc20cd6b11) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/damon.h | 3 + mm/damon/Kconfig | 9 + mm/damon/Makefile | 1 + mm/damon/core.c | 47 +++++ mm/damon/dbgfs.c | 397 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 457 insertions(+) create mode 100644 mm/damon/dbgfs.c
diff --git a/include/linux/damon.h b/include/linux/damon.h index edb350e52b93..d68b67b8d458 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -240,9 +240,12 @@ unsigned int damon_nr_regions(struct damon_target *t);
struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_nr_running_ctxs(void);
int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 5cbb5db54158..c8e3dba6fb4c 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -20,4 +20,13 @@ config DAMON_VADDR This builds the default data access monitoring primitives for DAMON that works for virtual address spaces.
+config DAMON_DBGFS + bool "DAMON debugfs interface" + depends on DAMON_VADDR && DEBUG_FS + help + This builds the debugfs interface for DAMON. The user space admins + can use the interface for arbitrary data access monitoring. + + If unsure, say N. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 6ebbd08aed67..fed4be3bace3 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -2,3 +2,4 @@
obj-$(CONFIG_DAMON) := core.o obj-$(CONFIG_DAMON_VADDR) += vaddr.o +obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o diff --git a/mm/damon/core.c b/mm/damon/core.c index ee24d64e8019..59033488402e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -171,6 +171,39 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); }
+/** + * damon_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @ids: array of target ids + * @nr_ids: number of entries in @ids + * + * This function should not be called while the kdamond is running. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_destroy_targets(ctx); + + for (i = 0; i < nr_ids; i++) { + t = damon_new_target(ids[i]); + if (!t) { + pr_err("Failed to alloc damon_target\n"); + /* The caller should do cleanup of the ids itself */ + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + return -ENOMEM; + } + damon_add_target(ctx, t); + } + + return 0; +} + /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context @@ -209,6 +242,20 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, return 0; }
+/** + * damon_nr_running_ctxs() - Return number of currently running contexts. + */ +int damon_nr_running_ctxs(void) +{ + int nr_ctxs; + + mutex_lock(&damon_lock); + nr_ctxs = nr_running_ctxs; + mutex_unlock(&damon_lock); + + return nr_ctxs; +} + /* Returns the size upper limit for each monitoring region */ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) { diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c new file mode 100644 index 000000000000..d2e0a547eb3f --- /dev/null +++ b/mm/damon/dbgfs.c @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Debugfs Interface + * + * Author: SeongJae Park sjpark@amazon.de + */ + +#define pr_fmt(fmt) "damon-dbgfs: " fmt + +#include <linux/damon.h> +#include <linux/debugfs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/page_idle.h> +#include <linux/slab.h> + +static struct damon_ctx **dbgfs_ctxs; +static int dbgfs_nr_ctxs; +static struct dentry **dbgfs_dirs; + +/* + * Returns non-empty string on success, negative error code otherwise. + */ +static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret; + + /* We do not accept continuous write */ + if (*ppos) + return ERR_PTR(-EINVAL); + + kbuf = kmalloc(count + 1, GFP_KERNEL); + if (!kbuf) + return ERR_PTR(-ENOMEM); + + ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); + if (ret != count) { + kfree(kbuf); + return ERR_PTR(-EIO); + } + kbuf[ret] = '\0'; + + return kbuf; +} + +static ssize_t dbgfs_attrs_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char kbuf[128]; + int ret; + + mutex_lock(&ctx->kdamond_lock); + ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", + ctx->sample_interval, ctx->aggr_interval, + ctx->primitive_update_interval, ctx->min_nr_regions, + ctx->max_nr_regions); + mutex_unlock(&ctx->kdamond_lock); + + return simple_read_from_buffer(buf, count, ppos, kbuf, ret); +} + +static ssize_t dbgfs_attrs_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + unsigned long s, a, r, minr, maxr; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (sscanf(kbuf, "%lu %lu %lu %lu %lu", + &s, &a, &r, &minr, &maxr) != 5) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_attrs(ctx, s, a, r, minr, maxr); + if (err) + ret = err; +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +out: + kfree(kbuf); + return ret; +} + +static inline bool targetid_is_pid(const struct damon_ctx *ctx) +{ + return ctx->primitive.target_valid == damon_va_target_valid; +} + +static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) +{ + struct damon_target *t; + unsigned long id; + int written = 0; + int rc; + + damon_for_each_target(t, ctx) { + id = t->id; + if (targetid_is_pid(ctx)) + /* Show pid numbers to debugfs users */ + id = (unsigned long)pid_vnr((struct pid *)id); + + rc = scnprintf(&buf[written], len - written, "%lu ", id); + if (!rc) + return -ENOMEM; + written += rc; + } + if (written) + written -= 1; + written += scnprintf(&buf[written], len - written, "\n"); + return written; +} + +static ssize_t dbgfs_target_ids_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + ssize_t len; + char ids_buf[320]; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_target_ids(ctx, ids_buf, 320); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + return len; + + return simple_read_from_buffer(buf, count, ppos, ids_buf, len); +} + +/* + * Converts a string into an array of unsigned long integers + * + * Returns an array of unsigned long integers if the conversion success, or + * NULL otherwise. + */ +static unsigned long *str_to_target_ids(const char *str, ssize_t len, + ssize_t *nr_ids) +{ + unsigned long *ids; + const int max_nr_ids = 32; + unsigned long id; + int pos = 0, parsed, ret; + + *nr_ids = 0; + ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL); + if (!ids) + return NULL; + while (*nr_ids < max_nr_ids && pos < len) { + ret = sscanf(&str[pos], "%lu%n", &id, &parsed); + pos += parsed; + if (ret != 1) + break; + ids[*nr_ids] = id; + *nr_ids += 1; + } + + return ids; +} + +static void dbgfs_put_pids(unsigned long *ids, int nr_ids) +{ + int i; + + for (i = 0; i < nr_ids; i++) + put_pid((struct pid *)ids[i]); +} + +static ssize_t dbgfs_target_ids_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf, *nrs; + unsigned long *targets; + ssize_t nr_targets; + ssize_t ret = count; + int i; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + nrs = kbuf; + + targets = str_to_target_ids(nrs, ret, &nr_targets); + if (!targets) { + ret = -ENOMEM; + goto out; + } + + if (targetid_is_pid(ctx)) { + for (i = 0; i < nr_targets; i++) { + targets[i] = (unsigned long)find_get_pid( + (int)targets[i]); + if (!targets[i]) { + dbgfs_put_pids(targets, i); + ret = -EINVAL; + goto free_targets_out; + } + } + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + if (targetid_is_pid(ctx)) + dbgfs_put_pids(targets, nr_targets); + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_targets(ctx, targets, nr_targets); + if (err) { + if (targetid_is_pid(ctx)) + dbgfs_put_pids(targets, nr_targets); + ret = err; + } + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +free_targets_out: + kfree(targets); +out: + kfree(kbuf); + return ret; +} + +static int damon_dbgfs_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + + return nonseekable_open(inode, file); +} + +static const struct file_operations attrs_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_attrs_read, + .write = dbgfs_attrs_write, +}; + +static const struct file_operations target_ids_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_target_ids_read, + .write = dbgfs_target_ids_write, +}; + +static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) +{ + const char * const file_names[] = {"attrs", "target_ids"}; + const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops}; + int i; + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); +} + +static int dbgfs_before_terminate(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + if (!targetid_is_pid(ctx)) + return 0; + + damon_for_each_target_safe(t, next, ctx) { + put_pid((struct pid *)t->id); + damon_destroy_target(t); + } + return 0; +} + +static struct damon_ctx *dbgfs_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = damon_new_ctx(); + if (!ctx) + return NULL; + + damon_va_set_primitives(ctx); + ctx->callback.before_terminate = dbgfs_before_terminate; + return ctx; +} + +static ssize_t dbgfs_monitor_on_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + char monitor_on_buf[5]; + bool monitor_on = damon_nr_running_ctxs() != 0; + int len; + + len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n"); + + return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); +} + +static ssize_t dbgfs_monitor_on_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + ssize_t ret = count; + char *kbuf; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Remove white space */ + if (sscanf(kbuf, "%s", kbuf) != 1) { + kfree(kbuf); + return -EINVAL; + } + + if (!strncmp(kbuf, "on", count)) + err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + else if (!strncmp(kbuf, "off", count)) + err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + else + err = -EINVAL; + + if (err) + ret = err; + kfree(kbuf); + return ret; +} + +static const struct file_operations monitor_on_fops = { + .read = dbgfs_monitor_on_read, + .write = dbgfs_monitor_on_write, +}; + +static int __init __damon_dbgfs_init(void) +{ + struct dentry *dbgfs_root; + const char * const file_names[] = {"monitor_on"}; + const struct file_operations *fops[] = {&monitor_on_fops}; + int i; + + dbgfs_root = debugfs_create_dir("damon", NULL); + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, + fops[i]); + dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); + + dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL); + if (!dbgfs_dirs) { + debugfs_remove(dbgfs_root); + return -ENOMEM; + } + dbgfs_dirs[0] = dbgfs_root; + + return 0; +} + +/* + * Functions for the initialization + */ + +static int __init damon_dbgfs_init(void) +{ + int rc; + + dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); + if (!dbgfs_ctxs) + return -ENOMEM; + dbgfs_ctxs[0] = dbgfs_new_ctx(); + if (!dbgfs_ctxs[0]) { + kfree(dbgfs_ctxs); + return -ENOMEM; + } + dbgfs_nr_ctxs = 1; + + rc = __damon_dbgfs_init(); + if (rc) { + kfree(dbgfs_ctxs[0]); + kfree(dbgfs_ctxs); + pr_err("%s: dbgfs init failed\n", __func__); + } + + return rc; +} + +module_init(damon_dbgfs_init);
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit 429538e85410c3ae12719ec42b89ab873ed6d47b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- For CPU usage accounting, knowing pid of the monitoring thread could be helpful. For example, users could use cpuaccount cgroups with the pid.
This commit therefore exports the pid of currently running monitoring thread to the user space via 'kdamond_pid' file in the debugfs directory.
Link: https://lkml.kernel.org/r/20210716081449.22187-9-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Fernand Sieber sieberf@amazon.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Leonard Foerster foersleo@amazon.de Cc: Marco Elver elver@google.com Cc: Markus Boehme markubo@amazon.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shakeel Butt shakeelb@google.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 429538e85410c3ae12719ec42b89ab873ed6d47b) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/damon/dbgfs.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-)
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index d2e0a547eb3f..e850be4077f5 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -239,6 +239,32 @@ static ssize_t dbgfs_target_ids_write(struct file *file, return ret; }
+static ssize_t dbgfs_kdamond_pid_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid); + else + len = scnprintf(kbuf, count, "none\n"); + mutex_unlock(&ctx->kdamond_lock); + if (!len) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + static int damon_dbgfs_open(struct inode *inode, struct file *file) { file->private_data = inode->i_private; @@ -258,10 +284,17 @@ static const struct file_operations target_ids_fops = { .write = dbgfs_target_ids_write, };
+static const struct file_operations kdamond_pid_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_kdamond_pid_read, +}; + static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) { - const char * const file_names[] = {"attrs", "target_ids"}; - const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops}; + const char * const file_names[] = {"attrs", "target_ids", + "kdamond_pid"}; + const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops, + &kdamond_pid_fops}; int i;
for (i = 0; i < ARRAY_SIZE(file_names); i++)
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit 75c1c2b53c78bf3b3188ebb7b3508dadbf98bba1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- In some use cases, users would want to run multiple monitoring context. For example, if a user wants a high precision monitoring and dedicating multiple CPUs for the job is ok, because DAMON creates one monitoring thread per one context, the user can split the monitoring target regions into multiple small regions and create one context for each region. Or, someone might want to simultaneously monitor different address spaces, e.g., both virtual address space and physical address space.
The DAMON's API allows such usage, but 'damon-dbgfs' does not. Therefore, only kernel space DAMON users can do multiple contexts monitoring.
This commit allows the user space DAMON users to use multiple contexts monitoring by introducing two new 'damon-dbgfs' debugfs files, 'mk_context' and 'rm_context'. Users can create a new monitoring context by writing the desired name of the new context to 'mk_context'. Then, a new directory with the name and having the files for setting of the context ('attrs', 'target_ids' and 'record') will be created under the debugfs directory. Writing the name of the context to remove to 'rm_context' will remove the related context and directory.
Link: https://lkml.kernel.org/r/20210716081449.22187-10-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Fernand Sieber sieberf@amazon.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Leonard Foerster foersleo@amazon.de Cc: Marco Elver elver@google.com Cc: Markus Boehme markubo@amazon.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shakeel Butt shakeelb@google.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 75c1c2b53c78bf3b3188ebb7b3508dadbf98bba1) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/damon/dbgfs.c | 195 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 2 deletions(-)
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index e850be4077f5..31ad550ecba2 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -18,6 +18,7 @@ static struct damon_ctx **dbgfs_ctxs; static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; +static DEFINE_MUTEX(damon_dbgfs_lock);
/* * Returns non-empty string on success, negative error code otherwise. @@ -328,6 +329,186 @@ static struct damon_ctx *dbgfs_new_ctx(void) return ctx; }
+static void dbgfs_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_ctx(ctx); +} + +/* + * Make a context of @name and create a debugfs directory for it. + * + * This function should be called while holding damon_dbgfs_lock. + * + * Returns 0 on success, negative error code otherwise. + */ +static int dbgfs_mk_context(char *name) +{ + struct dentry *root, **new_dirs, *new_dir; + struct damon_ctx **new_ctxs, *new_ctx; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_ctxs) + return -ENOMEM; + dbgfs_ctxs = new_ctxs; + + new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_dirs) + return -ENOMEM; + dbgfs_dirs = new_dirs; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + new_dir = debugfs_create_dir(name, root); + dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; + + new_ctx = dbgfs_new_ctx(); + if (!new_ctx) { + debugfs_remove(new_dir); + dbgfs_dirs[dbgfs_nr_ctxs] = NULL; + return -ENOMEM; + } + + dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx; + dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs], + dbgfs_ctxs[dbgfs_nr_ctxs]); + dbgfs_nr_ctxs++; + + return 0; +} + +static ssize_t dbgfs_mk_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + char *ctx_name; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + err = dbgfs_mk_context(ctx_name); + if (err) + ret = err; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + +/* + * Remove a context of @name and its debugfs directory. + * + * This function should be called while holding damon_dbgfs_lock. + * + * Return 0 on success, negative error code otherwise. + */ +static int dbgfs_rm_context(char *name) +{ + struct dentry *root, *dir, **new_dirs; + struct damon_ctx **new_ctxs; + int i, j; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + dir = debugfs_lookup(name, root); + if (!dir) + return -ENOENT; + + new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), + GFP_KERNEL); + if (!new_dirs) + return -ENOMEM; + + new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), + GFP_KERNEL); + if (!new_ctxs) { + kfree(new_dirs); + return -ENOMEM; + } + + for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { + if (dbgfs_dirs[i] == dir) { + debugfs_remove(dbgfs_dirs[i]); + dbgfs_destroy_ctx(dbgfs_ctxs[i]); + continue; + } + new_dirs[j] = dbgfs_dirs[i]; + new_ctxs[j++] = dbgfs_ctxs[i]; + } + + kfree(dbgfs_dirs); + kfree(dbgfs_ctxs); + + dbgfs_dirs = new_dirs; + dbgfs_ctxs = new_ctxs; + dbgfs_nr_ctxs--; + + return 0; +} + +static ssize_t dbgfs_rm_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret = count; + int err; + char *ctx_name; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + err = dbgfs_rm_context(ctx_name); + if (err) + ret = err; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + static ssize_t dbgfs_monitor_on_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -370,6 +551,14 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return ret; }
+static const struct file_operations mk_contexts_fops = { + .write = dbgfs_mk_context_write, +}; + +static const struct file_operations rm_contexts_fops = { + .write = dbgfs_rm_context_write, +}; + static const struct file_operations monitor_on_fops = { .read = dbgfs_monitor_on_read, .write = dbgfs_monitor_on_write, @@ -378,8 +567,10 @@ static const struct file_operations monitor_on_fops = { static int __init __damon_dbgfs_init(void) { struct dentry *dbgfs_root; - const char * const file_names[] = {"monitor_on"}; - const struct file_operations *fops[] = {&monitor_on_fops}; + const char * const file_names[] = {"mk_contexts", "rm_contexts", + "monitor_on"}; + const struct file_operations *fops[] = {&mk_contexts_fops, + &rm_contexts_fops, &monitor_on_fops}; int i;
dbgfs_root = debugfs_create_dir("damon", NULL);
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit c4ba6014aec39e74ad3c10229dcfd187c42ee4f3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- This commit adds documents for DAMON under `Documentation/admin-guide/mm/damon/` and `Documentation/vm/damon/`.
Link: https://lkml.kernel.org/r/20210716081449.22187-11-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Fernand Sieber sieberf@amazon.com Reviewed-by: Markus Boehme markubo@amazon.de Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Leonard Foerster foersleo@amazon.de Cc: Marco Elver elver@google.com Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shakeel Butt shakeelb@google.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit c4ba6014aec39e74ad3c10229dcfd187c42ee4f3) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/mm/damon/index.rst | 15 ++ Documentation/admin-guide/mm/damon/start.rst | 114 +++++++++++++ Documentation/admin-guide/mm/damon/usage.rst | 112 +++++++++++++ Documentation/admin-guide/mm/index.rst | 1 + Documentation/vm/damon/api.rst | 20 +++ Documentation/vm/damon/design.rst | 166 +++++++++++++++++++ Documentation/vm/damon/faq.rst | 51 ++++++ Documentation/vm/damon/index.rst | 30 ++++ Documentation/vm/index.rst | 1 + 9 files changed, 510 insertions(+) create mode 100644 Documentation/admin-guide/mm/damon/index.rst create mode 100644 Documentation/admin-guide/mm/damon/start.rst create mode 100644 Documentation/admin-guide/mm/damon/usage.rst create mode 100644 Documentation/vm/damon/api.rst create mode 100644 Documentation/vm/damon/design.rst create mode 100644 Documentation/vm/damon/faq.rst create mode 100644 Documentation/vm/damon/index.rst
diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst new file mode 100644 index 000000000000..8c5dde3a5754 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -0,0 +1,15 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +Monitoring Data Accesses +======================== + +:doc:`DAMON </vm/damon/index>` allows light-weight data access monitoring. +Using DAMON, users can analyze the memory access patterns of their systems and +optimize those. + +.. toctree:: + :maxdepth: 2 + + start + usage diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst new file mode 100644 index 000000000000..d5eb89a8fc38 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -0,0 +1,114 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Getting Started +=============== + +This document briefly describes how you can use DAMON by demonstrating its +default user space tool. Please note that this document describes only a part +of its features for brevity. Please refer to :doc:`usage` for more details. + + +TL; DR +====== + +Follow the commands below to monitor and visualize the memory access pattern of +your workload. :: + + # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot + # mount -t debugfs none /sys/kernel/debug/ + # git clone https://github.com/awslabs/damo + # ./damo/damo record $(pidof <your workload>) + # ./damo/damo report heat --plot_ascii + +The final command draws the access heatmap of ``<your workload>``. The heatmap +shows which memory region (x-axis) is accessed when (y-axis) and how frequently +(number; the higher the more accesses have been observed). :: + + 111111111111111111111111111111111111111111111111111111110000 + 111121111111111111111111111111211111111111111111111111110000 + 000000000000000000000000000000000000000000000000001555552000 + 000000000000000000000000000000000000000000000222223555552000 + 000000000000000000000000000000000000000011111677775000000000 + 000000000000000000000000000000000000000488888000000000000000 + 000000000000000000000000000000000177888400000000000000000000 + 000000000000000000000000000046666522222100000000000000000000 + 000000000000000000000014444344444300000000000000000000000000 + 000000000000000002222245555510000000000000000000000000000000 + # access_frequency: 0 1 2 3 4 5 6 7 8 9 + # x-axis: space (140286319947776-140286426374096: 101.496 MiB) + # y-axis: time (605442256436361-605479951866441: 37.695430s) + # resolution: 60x10 (1.692 MiB and 3.770s for each character) + + +Prerequisites +============= + +Kernel +------ + +You should first ensure your system is running on a kernel built with +``CONFIG_DAMON_*=y``. + + +User Space Tool +--------------- + +For the demonstration, we will use the default user space tool for DAMON, +called DAMON Operator (DAMO). It is available at +https://github.com/awslabs/damo. The examples below assume that ``damo`` is on +your ``$PATH``. It's not mandatory, though. + +Because DAMO is using the debugfs interface (refer to :doc:`usage` for the +detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as +below:: + + # mount -t debugfs none /sys/kernel/debug/ + +or append the following line to your ``/etc/fstab`` file so that your system +can automatically mount debugfs upon booting:: + + debugfs /sys/kernel/debug debugfs defaults 0 0 + + +Recording Data Access Patterns +============================== + +The commands below record the memory access patterns of a program and save the +monitoring results to a file. :: + + $ git clone https://github.com/sjp38/masim + $ cd masim; make; ./masim ./configs/zigzag.cfg & + $ sudo damo record -o damon.data $(pidof masim) + +The first two lines of the commands download an artificial memory access +generator program and run it in the background. The generator will repeatedly +access two 100 MiB sized memory regions one by one. You can substitute this +with your real workload. The last line asks ``damo`` to record the access +pattern in the ``damon.data`` file. + + +Visualizing Recorded Patterns +============================= + +The following three commands visualize the recorded access patterns and save +the results as separate image files. :: + + $ damo report heats --heatmap access_pattern_heatmap.png + $ damo report wss --range 0 101 1 --plot wss_dist.png + $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png + +- ``access_pattern_heatmap.png`` will visualize the data access pattern in a + heatmap, showing which memory region (y-axis) got accessed when (x-axis) + and how frequently (color). +- ``wss_dist.png`` will show the distribution of the working set size. +- ``wss_chron_change.png`` will show how the working set size has + chronologically changed. + +You can view the visualizations of this example workload at [1]_. +Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_. + +.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#vis... +.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html +.. [3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html +.. [4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst new file mode 100644 index 000000000000..a72cda374aba --- /dev/null +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -0,0 +1,112 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Detailed Usages +=============== + +DAMON provides below three interfaces for different users. + +- *DAMON user space tool.* + This is for privileged people such as system administrators who want a + just-working human-friendly interface. Using this, users can use the DAMON’s + major features in a human-friendly way. It may not be highly tuned for + special cases, though. It supports only virtual address spaces monitoring. +- *debugfs interface.* + This is for privileged user space programmers who want more optimized use of + DAMON. Using this, users can use DAMON’s major features by reading + from and writing to special debugfs files. Therefore, you can write and use + your personalized DAMON debugfs wrapper programs that reads/writes the + debugfs files instead of you. The DAMON user space tool is also a reference + implementation of such programs. It supports only virtual address spaces + monitoring. +- *Kernel Space Programming Interface.* + This is for kernel space programmers. Using this, users can utilize every + feature of DAMON most flexibly and efficiently by writing kernel space + DAMON application programs for you. You can even extend DAMON for various + address spaces. + +Nevertheless, you could write your own user space tool using the debugfs +interface. A reference implementation is available at +https://github.com/awslabs/damo. If you are a kernel programmer, you could +refer to :doc:`/vm/damon/api` for the kernel space programming interface. For +the reason, this document describes only the debugfs interface + +debugfs Interface +================= + +DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under +its debugfs directory, ``<debugfs>/damon/``. + + +Attributes +---------- + +Users can get and set the ``sampling interval``, ``aggregation interval``, +``regions update interval``, and min/max number of monitoring target regions by +reading from and writing to the ``attrs`` file. To know about the monitoring +attributes in detail, please refer to the :doc:`/vm/damon/design`. For +example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and +1000, and then check it again:: + + # cd <debugfs>/damon + # echo 5000 100000 1000000 10 1000 > attrs + # cat attrs + 5000 100000 1000000 10 1000 + + +Target IDs +---------- + +Some types of address spaces supports multiple monitoring target. For example, +the virtual memory address spaces monitoring can have multiple processes as the +monitoring targets. Users can set the targets by writing relevant id values of +the targets to, and get the ids of the current targets by reading from the +``target_ids`` file. In case of the virtual address spaces monitoring, the +values should be pids of the monitoring target processes. For example, below +commands set processes having pids 42 and 4242 as the monitoring targets and +check it again:: + + # cd <debugfs>/damon + # echo 42 4242 > target_ids + # cat target_ids + 42 4242 + +Note that setting the target ids doesn't start the monitoring. + + +Turning On/Off +-------------- + +Setting the files as described above doesn't incur effect unless you explicitly +start the monitoring. You can start, stop, and check the current status of the +monitoring by writing to and reading from the ``monitor_on`` file. Writing +``on`` to the file starts the monitoring of the targets with the attributes. +Writing ``off`` to the file stops those. DAMON also stops if every target +process is terminated. Below example commands turn on, off, and check the +status of DAMON:: + + # cd <debugfs>/damon + # echo on > monitor_on + # echo off > monitor_on + # cat monitor_on + off + +Please note that you cannot write to the above-mentioned debugfs files while +the monitoring is turned on. If you write to the files while DAMON is running, +an error code such as ``-EBUSY`` will be returned. + + +Tracepoint for Monitoring Results +================================= + +DAMON provides the monitoring results via a tracepoint, +``damon:damon_aggregated``. While the monitoring is turned on, you could +record the tracepoint events and show results using tracepoint supporting tools +like ``perf``. For example:: + + # echo on > monitor_on + # perf record -e damon:damon_aggregated & + # sleep 5 + # kill 9 $(pidof perf) + # echo off > monitor_on + # perf script diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index cd727cfc1b04..32c27fbf1913 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -27,6 +27,7 @@ the Linux memory management.
concepts cma_debugfs + damon/index hugetlbpage idle_page_tracking ksm diff --git a/Documentation/vm/damon/api.rst b/Documentation/vm/damon/api.rst new file mode 100644 index 000000000000..08f34df45523 --- /dev/null +++ b/Documentation/vm/damon/api.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +API Reference +============= + +Kernel space programs can use every feature of DAMON using below APIs. All you +need to do is including ``damon.h``, which is located in ``include/linux/`` of +the source tree. + +Structures +========== + +.. kernel-doc:: include/linux/damon.h + + +Functions +========= + +.. kernel-doc:: mm/damon/core.c diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst new file mode 100644 index 000000000000..b05159c295f4 --- /dev/null +++ b/Documentation/vm/damon/design.rst @@ -0,0 +1,166 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +Design +====== + +Configurable Layers +=================== + +DAMON provides data access monitoring functionality while making the accuracy +and the overhead controllable. The fundamental access monitorings require +primitives that dependent on and optimized for the target address space. On +the other hand, the accuracy and overhead tradeoff mechanism, which is the core +of DAMON, is in the pure logic space. DAMON separates the two parts in +different layers and defines its interface to allow various low level +primitives implementations configurable with the core logic. + +Due to this separated design and the configurable interface, users can extend +DAMON for any address space by configuring the core logics with appropriate low +level primitive implementations. If appropriate one is not provided, users can +implement the primitives on their own. + +For example, physical memory, virtual memory, swap space, those for specific +processes, NUMA nodes, files, and backing memory devices would be supportable. +Also, if some architectures or devices support special optimized access check +primitives, those will be easily configurable. + + +Reference Implementations of Address Space Specific Primitives +============================================================== + +The low level primitives for the fundamental access monitoring are defined in +two parts: + +1. Identification of the monitoring target address range for the address space. +2. Access check of specific address range in the target space. + +DAMON currently provides the implementation of the primitives for only the +virtual address spaces. Below two subsections describe how it works. + + +VMA-based Target Address Range Construction +------------------------------------------- + +Only small parts in the super-huge virtual address space of the processes are +mapped to the physical memory and accessed. Thus, tracking the unmapped +address regions is just wasteful. However, because DAMON can deal with some +level of noise using the adaptive regions adjustment mechanism, tracking every +mapping is not strictly required but could even incur a high overhead in some +cases. That said, too huge unmapped areas inside the monitoring target should +be removed to not take the time for the adaptive mechanism. + +For the reason, this implementation converts the complex mappings to three +distinct regions that cover every mapped area of the address space. The two +gaps between the three regions are the two biggest unmapped areas in the given +address space. The two biggest unmapped areas would be the gap between the +heap and the uppermost mmap()-ed region, and the gap between the lowermost +mmap()-ed region and the stack in most of the cases. Because these gaps are +exceptionally huge in usual address spaces, excluding these will be sufficient +to make a reasonable trade-off. Below shows this in detail:: + + <heap> + <BIG UNMAPPED REGION 1> + <uppermost mmap()-ed region> + (small mmap()-ed regions and munmap()-ed regions) + <lowermost mmap()-ed region> + <BIG UNMAPPED REGION 2> + <stack> + + +PTE Accessed-bit Based Access Check +----------------------------------- + +The implementation for the virtual address space uses PTE Accessed-bit for +basic access checks. It finds the relevant PTE Accessed bit from the address +by walking the page table for the target task of the address. In this way, the +implementation finds and clears the bit for next sampling target address and +checks whether the bit set again after one sampling period. This could disturb +other kernel subsystems using the Accessed bits, namely Idle page tracking and +the reclaim logic. To avoid such disturbances, DAMON makes it mutually +exclusive with Idle page tracking and uses ``PG_idle`` and ``PG_young`` page +flags to solve the conflict with the reclaim logic, as Idle page tracking does. + + +Address Space Independent Core Mechanisms +========================================= + +Below four sections describe each of the DAMON core mechanisms and the five +monitoring attributes, ``sampling interval``, ``aggregation interval``, +``regions update interval``, ``minimum number of regions``, and ``maximum +number of regions``. + + +Access Frequency Monitoring +--------------------------- + +The output of DAMON says what pages are how frequently accessed for a given +duration. The resolution of the access frequency is controlled by setting +``sampling interval`` and ``aggregation interval``. In detail, DAMON checks +access to each page per ``sampling interval`` and aggregates the results. In +other words, counts the number of the accesses to each page. After each +``aggregation interval`` passes, DAMON calls callback functions that previously +registered by users so that users can read the aggregated results and then +clears the results. This can be described in below simple pseudo-code:: + + while monitoring_on: + for page in monitoring_target: + if accessed(page): + nr_accesses[page] += 1 + if time() % aggregation_interval == 0: + for callback in user_registered_callbacks: + callback(monitoring_target, nr_accesses) + for page in monitoring_target: + nr_accesses[page] = 0 + sleep(sampling interval) + +The monitoring overhead of this mechanism will arbitrarily increase as the +size of the target workload grows. + + +Region Based Sampling +--------------------- + +To avoid the unbounded increase of the overhead, DAMON groups adjacent pages +that assumed to have the same access frequencies into a region. As long as the +assumption (pages in a region have the same access frequencies) is kept, only +one page in the region is required to be checked. Thus, for each ``sampling +interval``, DAMON randomly picks one page in each region, waits for one +``sampling interval``, checks whether the page is accessed meanwhile, and +increases the access frequency of the region if so. Therefore, the monitoring +overhead is controllable by setting the number of regions. DAMON allows users +to set the minimum and the maximum number of regions for the trade-off. + +This scheme, however, cannot preserve the quality of the output if the +assumption is not guaranteed. + + +Adaptive Regions Adjustment +--------------------------- + +Even somehow the initial monitoring target regions are well constructed to +fulfill the assumption (pages in same region have similar access frequencies), +the data access pattern can be dynamically changed. This will result in low +monitoring quality. To keep the assumption as much as possible, DAMON +adaptively merges and splits each region based on their access frequency. + +For each ``aggregation interval``, it compares the access frequencies of +adjacent regions and merges those if the frequency difference is small. Then, +after it reports and clears the aggregated access frequency of each region, it +splits each region into two or three regions if the total number of regions +will not exceed the user-specified maximum number of regions after the split. + +In this way, DAMON provides its best-effort quality and minimal overhead while +keeping the bounds users set for their trade-off. + + +Dynamic Target Space Updates Handling +------------------------------------- + +The monitoring target address range could dynamically changed. For example, +virtual memory could be dynamically mapped and unmapped. Physical memory could +be hot-plugged. + +As the changes could be quite frequent in some cases, DAMON checks the dynamic +memory mapping changes and applies it to the abstracted target area only for +each of a user-specified time interval (``regions update interval``). diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst new file mode 100644 index 000000000000..cb3d8b585a8b --- /dev/null +++ b/Documentation/vm/damon/faq.rst @@ -0,0 +1,51 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +Frequently Asked Questions +========================== + +Why a new subsystem, instead of extending perf or other user space tools? +========================================================================= + +First, because it needs to be lightweight as much as possible so that it can be +used online, any unnecessary overhead such as kernel - user space context +switching cost should be avoided. Second, DAMON aims to be used by other +programs including the kernel. Therefore, having a dependency on specific +tools like perf is not desirable. These are the two biggest reasons why DAMON +is implemented in the kernel space. + + +Can 'idle pages tracking' or 'perf mem' substitute DAMON? +========================================================= + +Idle page tracking is a low level primitive for access check of the physical +address space. 'perf mem' is similar, though it can use sampling to minimize +the overhead. On the other hand, DAMON is a higher-level framework for the +monitoring of various address spaces. It is focused on memory management +optimization and provides sophisticated accuracy/overhead handling mechanisms. +Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of +DAMON's output, but cannot substitute DAMON. + + +Does DAMON support virtual memory only? +======================================= + +No. The core of the DAMON is address space independent. The address space +specific low level primitive parts including monitoring target regions +constructions and actual access checks can be implemented and configured on the +DAMON core by the users. In this way, DAMON users can monitor any address +space with any access check technique. + +Nonetheless, DAMON provides vma tracking and PTE Accessed bit check based +implementations of the address space dependent functions for the virtual memory +by default, for a reference and convenient use. In near future, we will +provide those for physical memory address space. + + +Can I simply monitor page granularity? +====================================== + +Yes. You can do so by setting the ``min_nr_regions`` attribute higher than the +working set size divided by the page size. Because the monitoring target +regions size is forced to be ``>=page size``, the region split will make no +effect. diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst new file mode 100644 index 000000000000..a2858baf3bf1 --- /dev/null +++ b/Documentation/vm/damon/index.rst @@ -0,0 +1,30 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +DAMON: Data Access MONitor +========================== + +DAMON is a data access monitoring framework subsystem for the Linux kernel. +The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it + + - *accurate* (the monitoring output is useful enough for DRAM level memory + management; It might not appropriate for CPU Cache levels, though), + - *light-weight* (the monitoring overhead is low enough to be applied online), + and + - *scalable* (the upper-bound of the overhead is in constant range regardless + of the size of target workloads). + +Using this framework, therefore, the kernel's memory management mechanisms can +make advanced decisions. Experimental memory management optimization works +that incurring high data accesses monitoring overhead could implemented again. +In user space, meanwhile, users who have some special workloads can write +personalized applications for better understanding and optimizations of their +workloads and systems. + +.. toctree:: + :maxdepth: 2 + + faq + design + api + plans diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index eff5fbd492d0..b51f0d8992f8 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -32,6 +32,7 @@ descriptions of data structures and algorithms. arch_pgtable_helpers balance cleancache + damon/index free_page_reporting frontswap highmem
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit 17ccae8bb5c928946f6f3af14626ec458f74e6ad category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- This commit adds kunit based unit tests for the core and the virtual address spaces monitoring primitives of DAMON.
Link: https://lkml.kernel.org/r/20210716081449.22187-12-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Brendan Higgins brendanhiggins@google.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Fernand Sieber sieberf@amazon.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Leonard Foerster foersleo@amazon.de Cc: Marco Elver elver@google.com Cc: Markus Boehme markubo@amazon.de Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shakeel Butt shakeelb@google.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 17ccae8bb5c928946f6f3af14626ec458f74e6ad) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/damon/Kconfig | 36 +++++ mm/damon/core-test.h | 253 ++++++++++++++++++++++++++++++++ mm/damon/core.c | 7 + mm/damon/dbgfs-test.h | 126 ++++++++++++++++ mm/damon/dbgfs.c | 2 + mm/damon/vaddr-test.h | 329 ++++++++++++++++++++++++++++++++++++++++++ mm/damon/vaddr.c | 7 + 7 files changed, 760 insertions(+) create mode 100644 mm/damon/core-test.h create mode 100644 mm/damon/dbgfs-test.h create mode 100644 mm/damon/vaddr-test.h
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index c8e3dba6fb4c..37024798a97c 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -12,6 +12,18 @@ config DAMON See https://damonitor.github.io/doc/html/latest-damon/index.html for more information.
+config DAMON_KUNIT_TEST + bool "Test for damon" if !KUNIT_ALL_TESTS + depends on DAMON && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + config DAMON_VADDR bool "Data access monitoring primitives for virtual address spaces" depends on DAMON && MMU @@ -20,6 +32,18 @@ config DAMON_VADDR This builds the default data access monitoring primitives for DAMON that works for virtual address spaces.
+config DAMON_VADDR_KUNIT_TEST + bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS + depends on DAMON_VADDR && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON virtual addresses primitives Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + config DAMON_DBGFS bool "DAMON debugfs interface" depends on DAMON_VADDR && DEBUG_FS @@ -29,4 +53,16 @@ config DAMON_DBGFS
If unsure, say N.
+config DAMON_DBGFS_KUNIT_TEST + bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS + depends on DAMON_DBGFS && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON debugfs interface Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + endmenu diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h new file mode 100644 index 000000000000..c938a9c34e6c --- /dev/null +++ b/mm/damon/core-test.h @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * Author: SeongJae Park sjpark@amazon.de + */ + +#ifdef CONFIG_DAMON_KUNIT_TEST + +#ifndef _DAMON_CORE_TEST_H +#define _DAMON_CORE_TEST_H + +#include <kunit/test.h> + +static void damon_test_regions(struct kunit *test) +{ + struct damon_region *r; + struct damon_target *t; + + r = damon_new_region(1, 2); + KUNIT_EXPECT_EQ(test, 1ul, r->ar.start); + KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + + t = damon_new_target(42); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t)); + + damon_del_region(r, t); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_free_target(t); +} + +static unsigned int nr_damon_targets(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_targets = 0; + + damon_for_each_target(t, ctx) + nr_targets++; + + return nr_targets; +} + +static void damon_test_target(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + + t = damon_new_target(42); + KUNIT_EXPECT_EQ(test, 42ul, t->id); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_add_target(c, t); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c)); + + damon_destroy_target(t); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_destroy_ctx(c); +} + +/* + * Test kdamond_reset_aggregated() + * + * DAMON checks access to each region and aggregates this information as the + * access frequency of each region. In detail, it increases '->nr_accesses' of + * regions that an access has confirmed. 'kdamond_reset_aggregated()' flushes + * the aggregated information ('->nr_accesses' of each regions) to the result + * buffer. As a result of the flushing, the '->nr_accesses' of regions are + * initialized to zero. + */ +static void damon_test_aggregate(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long target_ids[] = {1, 2, 3}; + unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; + unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; + unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; + struct damon_target *t; + struct damon_region *r; + int it, ir; + + damon_set_targets(ctx, target_ids, 3); + + it = 0; + damon_for_each_target(t, ctx) { + for (ir = 0; ir < 3; ir++) { + r = damon_new_region(saddr[it][ir], eaddr[it][ir]); + r->nr_accesses = accesses[it][ir]; + damon_add_region(r, t); + } + it++; + } + kdamond_reset_aggregated(ctx); + it = 0; + damon_for_each_target(t, ctx) { + ir = 0; + /* '->nr_accesses' should be zeroed */ + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + ir++; + } + /* regions should be preserved */ + KUNIT_EXPECT_EQ(test, 3, ir); + it++; + } + /* targets also should be preserved */ + KUNIT_EXPECT_EQ(test, 3, it); + + damon_destroy_ctx(ctx); +} + +static void damon_test_split_at(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(42); + r = damon_new_region(0, 100); + damon_add_region(r, t); + damon_split_region_at(c, t, r, 25); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); + + r = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r->ar.start, 25ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 100ul); + + damon_free_target(t); + damon_destroy_ctx(c); +} + +static void damon_test_merge_two(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r, *r2, *r3; + int i; + + t = damon_new_target(42); + r = damon_new_region(0, 100); + r->nr_accesses = 10; + damon_add_region(r, t); + r2 = damon_new_region(100, 300); + r2->nr_accesses = 20; + damon_add_region(r2, t); + + damon_merge_two_regions(t, r, r2); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 300ul); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u); + + i = 0; + damon_for_each_region(r3, t) { + KUNIT_EXPECT_PTR_EQ(test, r, r3); + i++; + } + KUNIT_EXPECT_EQ(test, i, 1); + + damon_free_target(t); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +static void damon_test_merge_regions_of(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184}; + unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230}; + unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2}; + + unsigned long saddrs[] = {0, 114, 130, 156, 170}; + unsigned long eaddrs[] = {112, 130, 156, 170, 230}; + int i; + + t = damon_new_target(42); + for (i = 0; i < ARRAY_SIZE(sa); i++) { + r = damon_new_region(sa[i], ea[i]); + r->nr_accesses = nrs[i]; + damon_add_region(r, t); + } + + damon_merge_regions_of(t, 9, 9999); + /* 0-112, 114-130, 130-156, 156-170 */ + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + for (i = 0; i < 5; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]); + KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]); + } + damon_free_target(t); +} + +static void damon_test_split_regions_of(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(42); + r = damon_new_region(0, 22); + damon_add_region(r, t); + damon_split_regions_of(c, t, 2); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2u); + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(0, 220); + damon_add_region(r, t); + damon_split_regions_of(c, t, 4); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 4u); + damon_free_target(t); + damon_destroy_ctx(c); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_target), + KUNIT_CASE(damon_test_regions), + KUNIT_CASE(damon_test_aggregate), + KUNIT_CASE(damon_test_split_at), + KUNIT_CASE(damon_test_merge_two), + KUNIT_CASE(damon_test_merge_regions_of), + KUNIT_CASE(damon_test_split_regions_of), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_CORE_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 59033488402e..30e9211f494a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -16,6 +16,11 @@ #define CREATE_TRACE_POINTS #include <trace/events/damon.h>
+#ifdef CONFIG_DAMON_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + /* Get a random number in [l, r) */ #define damon_rand(l, r) (l + prandom_u32_max(r - l))
@@ -711,3 +716,5 @@ static int kdamond_fn(void *data)
do_exit(0); } + +#include "core-test.h" diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h new file mode 100644 index 000000000000..930e83bceef0 --- /dev/null +++ b/mm/damon/dbgfs-test.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON Debugfs Interface Unit Tests + * + * Author: SeongJae Park sjpark@amazon.de + */ + +#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST + +#ifndef _DAMON_DBGFS_TEST_H +#define _DAMON_DBGFS_TEST_H + +#include <kunit/test.h> + +static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) +{ + char *question; + unsigned long *answers; + unsigned long expected[] = {12, 35, 46}; + ssize_t nr_integers = 0, i; + + question = "123"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + kfree(answers); + + question = "123abc"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + kfree(answers); + + question = "a123"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "12 35"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 46"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 abc 46"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < 2; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = ""; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "\n"; + answers = str_to_target_ids(question, strnlen(question, 128), + &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); +} + +static void damon_dbgfs_test_set_targets(struct kunit *test) +{ + struct damon_ctx *ctx = dbgfs_new_ctx(); + unsigned long ids[] = {1, 2, 3}; + char buf[64]; + + /* Make DAMON consider target id as plain number */ + ctx->primitive.target_valid = NULL; + ctx->primitive.cleanup = NULL; + + damon_set_targets(ctx, ids, 3); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); + + damon_set_targets(ctx, NULL, 0); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + damon_set_targets(ctx, (unsigned long []){1, 2}, 2); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); + + damon_set_targets(ctx, (unsigned long []){2}, 1); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); + + damon_set_targets(ctx, NULL, 0); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + dbgfs_destroy_ctx(ctx); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), + KUNIT_CASE(damon_dbgfs_test_set_targets), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-dbgfs", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 31ad550ecba2..faee070977d8 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -619,3 +619,5 @@ static int __init damon_dbgfs_init(void) }
module_init(damon_dbgfs_init); + +#include "dbgfs-test.h" diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h new file mode 100644 index 000000000000..1f5c13257dba --- /dev/null +++ b/mm/damon/vaddr-test.h @@ -0,0 +1,329 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * Author: SeongJae Park sjpark@amazon.de + */ + +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST + +#ifndef _DAMON_VADDR_TEST_H +#define _DAMON_VADDR_TEST_H + +#include <kunit/test.h> + +static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) +{ + int i, j; + unsigned long largest_gap, gap; + + if (!nr_vmas) + return; + + for (i = 0; i < nr_vmas - 1; i++) { + vmas[i].vm_next = &vmas[i + 1]; + + vmas[i].vm_rb.rb_left = NULL; + vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb; + + largest_gap = 0; + for (j = i; j < nr_vmas; j++) { + if (j == 0) + continue; + gap = vmas[j].vm_start - vmas[j - 1].vm_end; + if (gap > largest_gap) + largest_gap = gap; + } + vmas[i].rb_subtree_gap = largest_gap; + } + vmas[i].vm_next = NULL; + vmas[i].vm_rb.rb_right = NULL; + vmas[i].rb_subtree_gap = 0; +} + +/* + * Test __damon_va_three_regions() function + * + * In case of virtual memory address spaces monitoring, DAMON converts the + * complex and dynamic memory mappings of each target task to three + * discontiguous regions which cover every mapped areas. However, the three + * regions should not include the two biggest unmapped areas in the original + * mapping, because the two biggest areas are normally the areas between 1) + * heap and the mmap()-ed regions, and 2) the mmap()-ed regions and stack. + * Because these two unmapped areas are very huge but obviously never accessed, + * covering the region is just a waste. + * + * '__damon_va_three_regions() receives an address space of a process. It + * first identifies the start of mappings, end of mappings, and the two biggest + * unmapped areas. After that, based on the information, it constructs the + * three regions and returns. For more detail, refer to the comment of + * 'damon_init_regions_of()' function definition in 'mm/damon.c' file. + * + * For example, suppose virtual address ranges of 10-20, 20-25, 200-210, + * 210-220, 300-305, and 307-330 (Other comments represent this mappings in + * more short form: 10-20-25, 200-210-220, 300-305, 307-330) of a process are + * mapped. To cover every mappings, the three regions should start with 10, + * and end with 305. The process also has three unmapped areas, 25-200, + * 220-300, and 305-307. Among those, 25-200 and 220-300 are the biggest two + * unmapped areas, and thus it should be converted to three regions of 10-25, + * 200-220, and 300-330. + */ +static void damon_test_three_regions_in_vmas(struct kunit *test) +{ + struct damon_addr_range regions[3] = {0,}; + /* 10-20-25, 200-210-220, 300-305, 307-330 */ + struct vm_area_struct vmas[] = { + (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, + (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, + (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, + (struct vm_area_struct) {.vm_start = 210, .vm_end = 220}, + (struct vm_area_struct) {.vm_start = 300, .vm_end = 305}, + (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, + }; + + __link_vmas(vmas, 6); + + __damon_va_three_regions(&vmas[0], regions); + + KUNIT_EXPECT_EQ(test, 10ul, regions[0].start); + KUNIT_EXPECT_EQ(test, 25ul, regions[0].end); + KUNIT_EXPECT_EQ(test, 200ul, regions[1].start); + KUNIT_EXPECT_EQ(test, 220ul, regions[1].end); + KUNIT_EXPECT_EQ(test, 300ul, regions[2].start); + KUNIT_EXPECT_EQ(test, 330ul, regions[2].end); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +/* + * Test 'damon_va_apply_three_regions()' + * + * test kunit object + * regions an array containing start/end addresses of current + * monitoring target regions + * nr_regions the number of the addresses in 'regions' + * three_regions The three regions that need to be applied now + * expected start/end addresses of monitoring target regions that + * 'three_regions' are applied + * nr_expected the number of addresses in 'expected' + * + * The memory mapping of the target processes changes dynamically. To follow + * the change, DAMON periodically reads the mappings, simplifies it to the + * three regions, and updates the monitoring target regions to fit in the three + * regions. The update of current target regions is the role of + * 'damon_va_apply_three_regions()'. + * + * This test passes the given target regions and the new three regions that + * need to be applied to the function and check whether it updates the regions + * as expected. + */ +static void damon_do_test_apply_three_regions(struct kunit *test, + unsigned long *regions, int nr_regions, + struct damon_addr_range *three_regions, + unsigned long *expected, int nr_expected) +{ + struct damon_ctx *ctx = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + int i; + + t = damon_new_target(42); + for (i = 0; i < nr_regions / 2; i++) { + r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); + damon_add_region(r, t); + } + damon_add_target(ctx, t); + + damon_va_apply_three_regions(t, three_regions); + + for (i = 0; i < nr_expected / 2; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]); + KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); + } + + damon_destroy_ctx(ctx); +} + +/* + * This function test most common case where the three big regions are only + * slightly changed. Target regions should adjust their boundary (10-20-30, + * 50-55, 70-80, 90-100) to fit with the new big regions or remove target + * regions (57-79) that now out of the three regions. + */ +static void damon_test_apply_three_regions1(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 45-55, 73-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 45, .end = 55}, + (struct damon_addr_range){.start = 73, .end = 104} }; + /* 5-20-27, 45-55, 73-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 45, 55, + 73, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test slightly bigger change. Similar to above, but the second big region + * now require two target regions (50-55, 57-59) to be removed. + */ +static void damon_test_apply_three_regions2(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 56-57, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 56, .end = 57}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 56-57, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 56, 57, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test a big change. The second big region has totally freed and mapped to + * different area (50-59 -> 61-63). The target regions which were in the old + * second big region (50-55-57-59) should be removed and new target region + * covering the second big region (61-63) should be created. + */ +static void damon_test_apply_three_regions3(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 61-63, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 61, .end = 63}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 61-63, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 61, 63, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test another big change. Both of the second and third big regions (50-59 + * and 70-100) has totally freed and mapped to different area (30-32 and + * 65-68). The target regions which were in the old second and third big + * regions should now be removed and new target regions covering the new second + * and third big regions should be crated. + */ +static void damon_test_apply_three_regions4(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-7, 30-32, 65-68 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 7}, + (struct damon_addr_range){.start = 30, .end = 32}, + (struct damon_addr_range){.start = 65, .end = 68} }; + /* expect 5-7, 30-32, 65-68 */ + unsigned long expected[] = {5, 7, 30, 32, 65, 68}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +static void damon_test_split_evenly(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + unsigned long i; + + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), + -EINVAL); + + t = damon_new_target(42); + r = damon_new_region(0, 100); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 0), -EINVAL); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 10), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 10u); + + i = 0; + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, i++ * 10); + KUNIT_EXPECT_EQ(test, r->ar.end, i * 10); + } + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(5, 59); + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 5), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + + i = 0; + damon_for_each_region(r, t) { + if (i == 4) + break; + KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i++); + KUNIT_EXPECT_EQ(test, r->ar.end, 5 + 10 * i); + } + KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i); + KUNIT_EXPECT_EQ(test, r->ar.end, 59ul); + damon_free_target(t); + + t = damon_new_target(42); + r = damon_new_region(5, 6); + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 2), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); + + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, 5ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 6ul); + } + damon_free_target(t); + damon_destroy_ctx(c); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_three_regions_in_vmas), + KUNIT_CASE(damon_test_apply_three_regions1), + KUNIT_CASE(damon_test_apply_three_regions2), + KUNIT_CASE(damon_test_apply_three_regions3), + KUNIT_CASE(damon_test_apply_three_regions4), + KUNIT_CASE(damon_test_split_evenly), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-primitives", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_VADDR_TEST_H */ + +#endif /* CONFIG_DAMON_VADDR_KUNIT_TEST */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 897aa8cf96c8..58c1fb2aafa9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -18,6 +18,11 @@ #include <linux/sched/mm.h> #include <linux/slab.h>
+#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + /* Get a random number in [l, r) */ #define damon_rand(l, r) (l + prandom_u32_max(r - l))
@@ -663,3 +668,5 @@ void damon_va_set_primitives(struct damon_ctx *ctx) ctx->primitive.target_valid = damon_va_target_valid; ctx->primitive.cleanup = NULL; } + +#include "vaddr-test.h"
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit b348eb7abd0987b849420113ced27ad7a1bc6cf3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- This commit adds a simple user space tests for DAMON. The tests are using kselftest framework.
Link: https://lkml.kernel.org/r/20210716081449.22187-13-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Markus Boehme markubo@amazon.de Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Fernand Sieber sieberf@amazon.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Leonard Foerster foersleo@amazon.de Cc: Marco Elver elver@google.com Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shakeel Butt shakeelb@google.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit b348eb7abd0987b849420113ced27ad7a1bc6cf3) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- tools/testing/selftests/damon/Makefile | 7 ++ .../selftests/damon/_chk_dependency.sh | 28 +++++++ .../testing/selftests/damon/debugfs_attrs.sh | 75 +++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100644 tools/testing/selftests/damon/Makefile create mode 100644 tools/testing/selftests/damon/_chk_dependency.sh create mode 100644 tools/testing/selftests/damon/debugfs_attrs.sh
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile new file mode 100644 index 000000000000..8a3f2cd9fec0 --- /dev/null +++ b/tools/testing/selftests/damon/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for damon selftests + +TEST_FILES = _chk_dependency.sh +TEST_PROGS = debugfs_attrs.sh + +include ../lib.mk diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh new file mode 100644 index 000000000000..0189db81550b --- /dev/null +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +DBGFS=/sys/kernel/debug/damon + +if [ $EUID -ne 0 ]; +then + echo "Run as root" + exit $ksft_skip +fi + +if [ ! -d "$DBGFS" ] +then + echo "$DBGFS not found" + exit $ksft_skip +fi + +for f in attrs target_ids monitor_on +do + if [ ! -f "$DBGFS/$f" ] + then + echo "$f not found" + exit 1 + fi +done diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh new file mode 100644 index 000000000000..bfabb19dc0d3 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +test_write_result() { + file=$1 + content=$2 + orig_content=$3 + expect_reason=$4 + expected=$5 + + echo "$content" > "$file" + if [ $? -ne "$expected" ] + then + echo "writing $content to $file doesn't return $expected" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +test_write_succ() { + test_write_result "$1" "$2" "$3" "$4" 0 +} + +test_write_fail() { + test_write_result "$1" "$2" "$3" "$4" 1 +} + +test_content() { + file=$1 + orig_content=$2 + expected=$3 + expect_reason=$4 + + content=$(cat "$file") + if [ "$content" != "$expected" ] + then + echo "reading $file expected $expected but $content" + echo "expected because: $expect_reason" + echo "$orig_content" > "$file" + exit 1 + fi +} + +source ./_chk_dependency.sh + +# Test attrs file +# =============== + +file="$DBGFS/attrs" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5" "$orig_content" "valid input" +test_write_fail "$file" "1 2 3 4" "$orig_content" "no enough fields" +test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ + "min_nr_regions > max_nr_regions" +test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" +echo "$orig_content" > "$file" + +# Test target_ids file +# ==================== + +file="$DBGFS/target_ids" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4" "$orig_content" "valid input" +test_write_succ "$file" "1 2 abc 4" "$orig_content" "still valid input" +test_content "$file" "$orig_content" "1 2" "non-integer was there" +test_write_succ "$file" "abc 2 3" "$orig_content" "the file allows wrong input" +test_content "$file" "$orig_content" "" "wrong input written" +test_write_succ "$file" "" "$orig_content" "empty input" +test_content "$file" "$orig_content" "" "empty input written" +echo "$orig_content" > "$file" + +echo "PASS"
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-5.15-rc1 commit 75e39b1a3668c008c32c7ffbebe93386e67c7352 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4GVMK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- This commit updates MAINTAINERS file for DAMON related files.
Link: https://lkml.kernel.org/r/20210716081449.22187-14-sj38.park@gmail.com Signed-off-by: SeongJae Park sjpark@amazon.de Reviewed-by: Markus Boehme markubo@amazon.de Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Amit Shah amit@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Brendan Higgins brendanhiggins@google.com Cc: David Hildenbrand david@redhat.com Cc: David Rientjes rientjes@google.com Cc: David Woodhouse dwmw@amazon.com Cc: Fan Du fan.du@intel.com Cc: Fernand Sieber sieberf@amazon.com Cc: Greg Kroah-Hartman greg@kroah.com Cc: Greg Thelen gthelen@google.com Cc: Ingo Molnar mingo@redhat.com Cc: Joe Perches joe@perches.com Cc: Jonathan Cameron Jonathan.Cameron@huawei.com Cc: Jonathan Corbet corbet@lwn.net Cc: Leonard Foerster foersleo@amazon.de Cc: Marco Elver elver@google.com Cc: Maximilian Heyne mheyne@amazon.de Cc: Mel Gorman mgorman@suse.de Cc: Minchan Kim minchan@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Rik van Riel riel@surriel.com Cc: Shakeel Butt shakeelb@google.com Cc: Shuah Khan shuah@kernel.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 75e39b1a3668c008c32c7ffbebe93386e67c7352) Signed-off-by: Yue Zou zouyue3@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/MAINTAINERS b/MAINTAINERS index 20ea04bb5542..16a81020ead0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4872,6 +4872,17 @@ F: net/ax25/ax25_out.c F: net/ax25/ax25_timer.c F: net/ax25/sysctl_net_ax25.c
+DATA ACCESS MONITOR +M: SeongJae Park sjpark@amazon.de +L: linux-mm@kvack.org +S: Maintained +F: Documentation/admin-guide/mm/damon/ +F: Documentation/vm/damon/ +F: include/linux/damon.h +F: include/trace/events/damon.h +F: mm/damon/ +F: tools/testing/selftests/damon/ + DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER L: netdev@vger.kernel.org S: Orphan
From: Lorenzo Bianconi lorenzo@kernel.org
mainline inclusion from mainline-v5.11-rc1 commit 7886244736a4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Introduce the capability to batch page_pool ptr_ring refill since it is usually run inside the driver NAPI tx completion loop.
Suggested-by: Jesper Dangaard Brouer brouer@redhat.com Co-developed-by: Jesper Dangaard Brouer brouer@redhat.com Signed-off-by: Jesper Dangaard Brouer brouer@redhat.com Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: John Fastabend john.fastabend@gmail.com Acked-by: Ilias Apalodimas ilias.apalodimas@linaro.org Link: https://lore.kernel.org/bpf/08dd249c9522c001313f520796faa777c4089e1c.1605267... Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com (fix conflicts: delete modifications to net/core/xdp.c) Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/page_pool.h | 26 +++++++++++++++ net/core/page_pool.c | 70 +++++++++++++++++++++++++++++++++++------ 2 files changed, 86 insertions(+), 10 deletions(-)
diff --git a/include/net/page_pool.h b/include/net/page_pool.h index bd296cebab6a..b4b6de909c93 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -152,6 +152,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); void page_pool_release_page(struct page_pool *pool, struct page *page); +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -165,6 +167,11 @@ static inline void page_pool_release_page(struct page_pool *pool, struct page *page) { } + +static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ +} #endif
void page_pool_put_page(struct page_pool *pool, struct page *page, @@ -225,4 +232,23 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } + +static inline void page_pool_ring_lock(struct page_pool *pool) + __acquires(&pool->ring.producer_lock) +{ + if (in_serving_softirq()) + spin_lock(&pool->ring.producer_lock); + else + spin_lock_bh(&pool->ring.producer_lock); +} + +static inline void page_pool_ring_unlock(struct page_pool *pool) + __releases(&pool->ring.producer_lock) +{ + if (in_serving_softirq()) + spin_unlock(&pool->ring.producer_lock); + else + spin_unlock_bh(&pool->ring.producer_lock); +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 91911a459580..3abf00511799 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,6 +11,8 @@ #include <linux/device.h>
#include <net/page_pool.h> +#include <net/xdp.h> + #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> @@ -401,8 +403,9 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -418,15 +421,12 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size);
- if (allow_direct && in_serving_softirq()) - if (page_pool_recycle_in_cache(page, pool)) - return; + if (allow_direct && in_serving_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL;
- if (!page_pool_recycle_in_ring(pool, page)) { - /* Cache full, fallback to free pages */ - page_pool_return_page(pool, page); - } - return; + /* Page found as candidate for recycling */ + return page; } /* Fallback/non-XDP mode: API user have elevated refcnt. * @@ -444,9 +444,59 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); + + return NULL; +} + +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_put_page);
+/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + page_pool_ring_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) + break; /* ring full */ + } + page_pool_ring_unlock(pool); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page;
From: Jonathan Lemon jonathan.lemon@gmail.com
mainline inclusion from mainline-v5.12-rc1-dontuse commit 70c4316749f6 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
RX zerocopy fragment pages which are not allocated from the system page pool require special handling. Give the callback in skb_zcopy_clear() a chance to process them first.
Signed-off-by: Jonathan Lemon jonathan.lemon@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/skbuff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 825e6b988003..91661352dca8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -614,13 +614,14 @@ static void skb_release_data(struct sk_buff *skb) &shinfo->dataref)) return;
+ skb_zcopy_clear(skb, true); + for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i]);
if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list);
- skb_zcopy_clear(skb, true); skb_free_head(skb); }
From: Alexander Lobakin alobakin@pm.me
mainline inclusion from mainline-v5.12-rc1-dontuse commit 05656132a874 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
pool_page_reusable() is a leftover from pre-NUMA-aware times. For now, this function is just a redundant wrapper over page_is_pfmemalloc(), so inline it into its sole call site.
Signed-off-by: Alexander Lobakin alobakin@pm.me Acked-by: Jesper Dangaard Brouer brouer@redhat.com Reviewed-by: Ilias Apalodimas ilias.apalodimas@linaro.org Reviewed-by: Jesse Brandeburg jesse.brandeburg@intel.com Acked-by: David Rientjes rientjes@google.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/page_pool.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 3abf00511799..3c4c4c7a0402 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -389,14 +389,6 @@ static bool page_pool_recycle_in_cache(struct page *page, return true; }
-/* page is NOT reusable when: - * 1) allocated when system is under some pressure. (page_is_pfmemalloc) - */ -static bool pool_page_reusable(struct page_pool *pool, struct page *page) -{ - return !page_is_pfmemalloc(page); -} - /* If the page refcnt == 1, this will try to recycle the page. * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). @@ -412,9 +404,11 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * regular page allocator APIs. * * refcnt == 1 means page_pool owns page, and can recycle it. + * + * page is NOT reusable when allocated when system is under + * some pressure. (page_is_pfmemalloc) */ - if (likely(page_ref_count(page) == 1 && - pool_page_reusable(pool, page))) { + if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { /* Read barrier done in page_ref_count / READ_ONCE */
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
From: Matteo Croce mcroce@microsoft.com
mainline inclusion from mainline-v5.14-rc1 commit c07aea3ef4d4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
This is needed by the page_pool to avoid recycling a page not allocated via page_pool.
The page->signature field is aliased to page->lru.next and page->compound_head, but it can't be set by mistake because the signature value is a bad pointer, and can't trigger a false positive in PageTail() because the last bit is 0.
Co-developed-by: Matthew Wilcox (Oracle) willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) willy@infradead.org Signed-off-by: Matteo Croce mcroce@microsoft.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm.h | 11 ++++++----- include/linux/mm_types.h | 7 +++++++ include/linux/poison.h | 3 +++ net/core/page_pool.c | 6 ++++++ 4 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index 414664e651b5..656f524ba7d3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1582,10 +1582,11 @@ struct address_space *page_mapping_file(struct page *page); static inline bool page_is_pfmemalloc(const struct page *page) { /* - * Page index cannot be this large so this must be - * a pfmemalloc page. + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. */ - return page->index == -1UL; + return (uintptr_t)page->lru.next & BIT(1); }
/* @@ -1594,12 +1595,12 @@ static inline bool page_is_pfmemalloc(const struct page *page) */ static inline void set_page_pfmemalloc(struct page *page) { - page->index = -1UL; + page->lru.next = (void *)BIT(1); }
static inline void clear_page_pfmemalloc(struct page *page) { - page->index = 0; + page->lru.next = NULL; }
/* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 72ec8fbc50d5..fe38904ec52e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,13 @@ struct page { unsigned long private; }; struct { /* page_pool used by netstack */ + /** + * @pp_magic: magic value to avoid recycling non + * page_pool allocated pages. + */ + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; /** * @dma_addr: might require a 64-bit value on * 32-bit architectures. diff --git a/include/linux/poison.h b/include/linux/poison.h index dc8ae5d8db03..028133fb1405 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -82,4 +82,7 @@ /********** security/ **********/ #define KEY_DESTROY 0xbd
+/********** net/core/page_pool.c **********/ +#define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) + #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 3c4c4c7a0402..e1321bc9d316 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -17,6 +17,7 @@ #include <linux/dma-mapping.h> #include <linux/page-flags.h> #include <linux/mm.h> /* for __put_page() */ +#include <linux/poison.h>
#include <trace/events/page_pool.h>
@@ -221,6 +222,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; }
+ page->pp_magic |= PP_SIGNATURE; + /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); @@ -263,6 +266,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } + page->pp_magic |= PP_SIGNATURE; pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -341,6 +345,8 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: + page->pp_magic = 0; + /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */
From: Matteo Croce mcroce@microsoft.com
mainline inclusion from mainline-v5.14-rc1 commit c420c98982fa category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
This is a prerequisite patch, the next one is enabling recycling of skbs and fragments. Add an extra argument on __skb_frag_unref() to handle recycling, and update the current users of the function with that.
Signed-off-by: Matteo Croce mcroce@microsoft.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/marvell/sky2.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +- include/linux/skbuff.h | 8 +++++--- net/core/skbuff.c | 4 ++-- net/tls/tls_device.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index e049fd653669..a035773a82e6 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -2501,7 +2501,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
if (length == 0) { /* don't need this page */ - __skb_frag_unref(frag); + __skb_frag_unref(frag, false); --skb_shinfo(skb)->nr_frags; } else { size = min(length, (unsigned) PAGE_SIZE); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 502d1b97855c..82f2ac16168b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -526,7 +526,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, fail: while (nr > 0) { nr--; - __skb_frag_unref(skb_shinfo(skb)->frags + nr); + __skb_frag_unref(skb_shinfo(skb)->frags + nr, false); } return 0; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 054cde38ebc8..47ffc683015d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3027,10 +3027,12 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment + * @recycle: recycle the page if allocated via page_pool * - * Releases a reference on the paged fragment @frag. + * Releases a reference on the paged fragment @frag + * or recycles the page via the page_pool API. */ -static inline void __skb_frag_unref(skb_frag_t *frag) +static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { put_page(skb_frag_page(frag)); } @@ -3044,7 +3046,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag) */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { - __skb_frag_unref(&skb_shinfo(skb)->frags[f]); + __skb_frag_unref(&skb_shinfo(skb)->frags[f], false); }
/** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 91661352dca8..9580a6603d6f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -617,7 +617,7 @@ static void skb_release_data(struct sk_buff *skb) skb_zcopy_clear(skb, true);
for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i]); + __skb_frag_unref(&shinfo->frags[i], false);
if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); @@ -3419,7 +3419,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge];
skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom); + __skb_frag_unref(fragfrom, false); }
/* Reposition in the original skb */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index f718c7346088..5a8f1c65ce80 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -128,7 +128,7 @@ static void destroy_record(struct tls_record_info *record) int i;
for (i = 0; i < record->num_frags; i++) - __skb_frag_unref(&record->frags[i]); + __skb_frag_unref(&record->frags[i], false); kfree(record); }
From: Ilias Apalodimas ilias.apalodimas@linaro.org
mainline inclusion from mainline-v5.14-rc1 commit 6a5bcd84e886 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Up to now several high speed NICs have custom mechanisms of recycling the allocated memory they use for their payloads. Our page_pool API already has recycling capabilities that are always used when we are running in 'XDP mode'. So let's tweak the API and the kernel network stack slightly and allow the recycling to happen even during the standard operation. The API doesn't take into account 'split page' policies used by those drivers currently, but can be extended once we have users for that.
The idea is to be able to intercept the packet on skb_release_data(). If it's a buffer coming from our page_pool API recycle it back to the pool for further usage or just release the packet entirely.
To achieve that we introduce a bit in struct sk_buff (pp_recycle:1) and a field in struct page (page->pp) to store the page_pool pointer. Storing the information in page->pp allows us to recycle both SKBs and their fragments. We could have skipped the skb bit entirely, since identical information can bederived from struct page. However, in an effort to affect the free path as less as possible, reading a single bit in the skb which is already in cache, is better that trying to derive identical information for the page stored data.
The driver or page_pool has to take care of the sync operations on it's own during the buffer recycling since the buffer is, after opting-in to the recycling, never unmapped.
Since the gain on the drivers depends on the architecture, we are not enabling recycling by default if the page_pool API is used on a driver. In order to enable recycling the driver must call skb_mark_for_recycle() to store the information we need for recycling in page->pp and enabling the recycling bit, or page_pool_store_mem_info() for a fragment.
Co-developed-by: Jesper Dangaard Brouer brouer@redhat.com Signed-off-by: Jesper Dangaard Brouer brouer@redhat.com Co-developed-by: Matteo Croce mcroce@microsoft.com Signed-off-by: Matteo Croce mcroce@microsoft.com Signed-off-by: Ilias Apalodimas ilias.apalodimas@linaro.org Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com (fix conflicts: replace skb_get_kcov_handle by skb_csum_is_sctp) Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/skbuff.h | 33 ++++++++++++++++++++++++++++++--- include/net/page_pool.h | 9 +++++++++ net/core/page_pool.c | 22 ++++++++++++++++++++++ net/core/skbuff.c | 20 ++++++++++++++++---- 4 files changed, 77 insertions(+), 7 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47ffc683015d..a15e9e2146ba 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -37,6 +37,7 @@ #include <linux/in6.h> #include <linux/if_packet.h> #include <net/flow.h> +#include <net/page_pool.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_common.h> #endif @@ -664,6 +665,8 @@ typedef unsigned char *sk_buff_data_t; * @head_frag: skb was allocated from page fragments, * not allocated by kmalloc() or vmalloc(). * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves + * @pp_recycle: mark the packet for recycling instead of freeing (implies + * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed @@ -784,10 +787,12 @@ struct sk_buff { fclone:2, peeked:1, head_frag:1, - pfmemalloc:1; + pfmemalloc:1, + pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif + /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ @@ -3034,7 +3039,13 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - put_page(skb_frag_page(frag)); + struct page *page = skb_frag_page(frag); + +#ifdef CONFIG_PAGE_POOL + if (recycle && page_pool_return_skb_page(page)) + return; +#endif + put_page(page); }
/** @@ -3046,7 +3057,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { - __skb_frag_unref(&skb_shinfo(skb)->frags[f], false); + __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle); }
/** @@ -4661,5 +4672,21 @@ static inline bool skb_csum_is_sctp(struct sk_buff *skb) return skb->csum_not_inet; }
+#ifdef CONFIG_PAGE_POOL +static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page, + struct page_pool *pp) +{ + skb->pp_recycle = 1; + page_pool_store_mem_info(page, pp); +} +#endif + +static inline bool skb_pp_recycle(struct sk_buff *skb, void *data) +{ + if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) + return false; + return page_pool_return_skb_page(virt_to_page(data)); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/page_pool.h b/include/net/page_pool.h index b4b6de909c93..3dd62dd73027 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -146,6 +146,8 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) return pool->p.dma_dir; }
+bool page_pool_return_skb_page(struct page *page); + struct page_pool *page_pool_create(const struct page_pool_params *params);
#ifdef CONFIG_PAGE_POOL @@ -251,4 +253,11 @@ static inline void page_pool_ring_unlock(struct page_pool *pool) spin_unlock_bh(&pool->ring.producer_lock); }
+/* Store mem_info on struct page and use it while recycling skb frags */ +static inline +void page_pool_store_mem_info(struct page *page, struct page_pool *pp) +{ + page->pp = pp; +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e1321bc9d316..5e4eb45b139c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -628,3 +628,25 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool page_pool_return_skb_page(struct page *page) +{ + struct page_pool *pp; + + page = compound_head(page); + if (unlikely(page->pp_magic != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page->pp = NULL; + page_pool_put_full_page(pp, page, false); + + return true; +} +EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9580a6603d6f..e7bf21d7d531 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -69,6 +69,7 @@ #include <net/xfrm.h> #include <net/mpls.h> #include <net/mptcp.h> +#include <net/page_pool.h>
#include <linux/uaccess.h> #include <trace/events/skb.h> @@ -598,10 +599,13 @@ static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head;
- if (skb->head_frag) + if (skb->head_frag) { + if (skb_pp_recycle(skb, head)) + return; skb_free_frag(head); - else + } else { kfree(head); + } }
static void skb_release_data(struct sk_buff *skb) @@ -617,7 +621,7 @@ static void skb_release_data(struct sk_buff *skb) skb_zcopy_clear(skb, true);
for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i], false); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); @@ -1004,6 +1008,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->nohdr = 0; n->peeked = 0; C(pfmemalloc); + C(pp_recycle); n->destructor = NULL; C(tail); C(end); @@ -3419,7 +3424,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge];
skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom, false); + __skb_frag_unref(fragfrom, skb->pp_recycle); }
/* Reposition in the original skb */ @@ -5192,6 +5197,13 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false;
+ /* The page pool signature of struct page will eventually figure out + * which pages can be recycled or not but for now let's prohibit slab + * allocated and page_pool allocated SKBs from being coalesced. + */ + if (to->pp_recycle != from->pp_recycle) + return false; + if (len <= skb_tailroom(to)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
From: Matteo Croce mcroce@microsoft.com
mainline inclusion from mainline-v5.14-rc1 commit 133637fcfab2 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Use the new recycling API for page_pool. In a drop rate test, the packet rate is almost doubled, from 1110 Kpps to 2128 Kpps.
perf top on a stock system shows:
Overhead Shared Object Symbol 34.88% [kernel] [k] page_pool_release_page 8.06% [kernel] [k] free_unref_page 6.42% [mvpp2] [k] mvpp2_rx 6.07% [kernel] [k] eth_type_trans 5.18% [kernel] [k] __netif_receive_skb_core 4.95% [kernel] [k] build_skb 4.88% [kernel] [k] kmem_cache_free 3.97% [kernel] [k] kmem_cache_alloc 3.45% [kernel] [k] dev_gro_receive 2.73% [kernel] [k] page_frag_free 2.07% [kernel] [k] __alloc_pages_bulk 1.99% [kernel] [k] arch_local_irq_save 1.84% [kernel] [k] skb_release_data 1.20% [kernel] [k] netif_receive_skb_list_internal
With packet rate stable at 1100 Kpps:
tx: 0 bps 0 pps rx: 532.7 Mbps 1110 Kpps tx: 0 bps 0 pps rx: 532.6 Mbps 1110 Kpps tx: 0 bps 0 pps rx: 532.4 Mbps 1109 Kpps tx: 0 bps 0 pps rx: 532.1 Mbps 1109 Kpps tx: 0 bps 0 pps rx: 531.9 Mbps 1108 Kpps tx: 0 bps 0 pps rx: 531.9 Mbps 1108 Kpps
And this is the same output with recycling enabled:
Overhead Shared Object Symbol 12.91% [kernel] [k] eth_type_trans 12.54% [mvpp2] [k] mvpp2_rx 9.67% [kernel] [k] build_skb 9.63% [kernel] [k] __netif_receive_skb_core 8.44% [kernel] [k] page_pool_put_page 8.07% [kernel] [k] kmem_cache_free 7.79% [kernel] [k] kmem_cache_alloc 6.86% [kernel] [k] dev_gro_receive 3.19% [kernel] [k] skb_release_data 2.41% [kernel] [k] netif_receive_skb_list_internal 2.18% [kernel] [k] page_pool_refill_alloc_cache 1.76% [kernel] [k] napi_gro_receive 1.61% [kernel] [k] kfree_skb 1.20% [kernel] [k] dma_sync_single_for_device 1.16% [mvpp2] [k] mvpp2_poll 1.12% [mvpp2] [k] mvpp2_read
With packet rate above 2100 Kpps:
tx: 0 bps 0 pps rx: 1021 Mbps 2128 Kpps tx: 0 bps 0 pps rx: 1021 Mbps 2127 Kpps tx: 0 bps 0 pps rx: 1021 Mbps 2128 Kpps tx: 0 bps 0 pps rx: 1021 Mbps 2128 Kpps tx: 0 bps 0 pps rx: 1022 Mbps 2128 Kpps tx: 0 bps 0 pps rx: 1022 Mbps 2129 Kpps
The major performance increase is explained by the fact that the most CPU consuming functions (page_pool_release_page, page_frag_free and free_unref_page) are no longer called on a per packet basis.
The test was done by sending to the macchiatobin 64 byte ethernet frames with an invalid ethertype, so the packets are dropped early in the RX path.
Signed-off-by: Matteo Croce mcroce@microsoft.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index bafdf287c478..e4360b1803bf 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3639,7 +3639,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, }
if (pp) - page_pool_release_page(pp, virt_to_page(data)); + skb_mark_for_recycle(skb, virt_to_page(data), pp); else dma_unmap_single_attrs(dev->dev.parent, dma_addr, bm_pool->buf_size, DMA_FROM_DEVICE,
From: Matteo Croce mcroce@microsoft.com
mainline inclusion from mainline-v5.14-rc1 commit e4017570daee category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Use the new recycling API for page_pool. In a drop rate test, the packet rate increased by 10%, from 296 Kpps to 326 Kpps.
perf top on a stock system shows:
Overhead Shared Object Symbol 23.66% [kernel] [k] __pi___inval_dcache_area 22.85% [mvneta] [k] mvneta_rx_swbm 7.54% [kernel] [k] kmem_cache_alloc 6.49% [kernel] [k] eth_type_trans 3.94% [kernel] [k] dev_gro_receive 3.91% [kernel] [k] __netif_receive_skb_core 3.91% [kernel] [k] kmem_cache_free 3.76% [kernel] [k] page_pool_release_page 3.56% [kernel] [k] free_unref_page 2.40% [kernel] [k] build_skb 1.49% [kernel] [k] skb_release_data 1.45% [kernel] [k] __alloc_pages_bulk 1.30% [kernel] [k] page_frag_free
And this is the same output with recycling enabled:
Overhead Shared Object Symbol 26.41% [kernel] [k] __pi___inval_dcache_area 25.00% [mvneta] [k] mvneta_rx_swbm 8.14% [kernel] [k] kmem_cache_alloc 6.84% [kernel] [k] eth_type_trans 4.44% [kernel] [k] __netif_receive_skb_core 4.38% [kernel] [k] kmem_cache_free 4.16% [kernel] [k] dev_gro_receive 3.21% [kernel] [k] page_pool_put_page 2.41% [kernel] [k] build_skb 1.82% [kernel] [k] skb_release_data 1.61% [kernel] [k] napi_gro_receive 1.25% [kernel] [k] page_pool_refill_alloc_cache 1.16% [kernel] [k] __netif_receive_skb_list_core
We can see that page_pool_release_page(), free_unref_page() and __alloc_pages_bulk() are no longer on top of the list when receiving traffic.
The test was done with mausezahn on the TX side with 64 byte raw ethernet frames.
Signed-off-by: Matteo Croce mcroce@microsoft.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/marvell/mvneta.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index f7d4f2d6adea..694944e2a4eb 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2303,7 +2303,7 @@ mvneta_swbm_add_rx_fragment(struct mvneta_port *pp, }
static struct sk_buff * -mvneta_swbm_build_skb(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, +mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, struct xdp_buff *xdp, u32 desc_status) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -2314,7 +2314,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, if (!skb) return ERR_PTR(-ENOMEM);
- page_pool_release_page(rxq->page_pool, virt_to_page(xdp->data)); + skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool);
skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); @@ -2326,7 +2326,10 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, skb_frag_page(frag), skb_frag_off(frag), skb_frag_size(frag), PAGE_SIZE); - page_pool_release_page(rxq->page_pool, skb_frag_page(frag)); + /* We don't need to reset pp_recycle here. It's already set, so + * just mark fragments for recycling. + */ + page_pool_store_mem_info(skb_frag_page(frag), pool); }
return skb; @@ -2405,7 +2408,7 @@ static int mvneta_rx_swbm(struct napi_struct *napi, mvneta_run_xdp(pp, rxq, xdp_prog, &xdp_buf, frame_sz, &ps)) goto next;
- skb = mvneta_swbm_build_skb(pp, rxq, &xdp_buf, desc_status); + skb = mvneta_swbm_build_skb(pp, rxq->page_pool, &xdp_buf, desc_status); if (IS_ERR(skb)) { struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);
From: Matteo Croce mcroce@microsoft.com
mainline inclusion from mainline-v5.14-rc1 commit d8ea89fe8a49 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
In the RX buffer, the received data starts after a headroom used to align the IP header and to allow prepending headers efficiently. The prefetch() should take this into account, and prefetch from the very start of the received data.
We can see that ether_addr_equal_64bits(), which is the first function to access the data, drops from the top of the perf top output.
prefetch(data):
Overhead Shared Object Symbol 11.64% [kernel] [k] eth_type_trans
prefetch(data + MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM):
Overhead Shared Object Symbol 13.42% [kernel] [k] build_skb 10.35% [mvpp2] [k] mvpp2_rx 9.35% [kernel] [k] __netif_receive_skb_core 8.24% [kernel] [k] kmem_cache_free 7.97% [kernel] [k] dev_gro_receive 7.68% [kernel] [k] page_pool_put_page 7.32% [kernel] [k] kmem_cache_alloc 7.09% [mvpp2] [k] mvpp2_bm_pool_put 3.36% [kernel] [k] eth_type_trans
Also, move the eth_type_trans() call a bit down, to give the RAM more time to prefetch the data.
Signed-off-by: Matteo Croce mcroce@microsoft.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index e4360b1803bf..55c6f90f818b 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3580,7 +3580,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, goto err_drop_frame;
/* Prefetch header */ - prefetch(data); + prefetch(data + MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM);
if (bm_pool->frag_size > PAGE_SIZE) frag_size = 0; @@ -3650,8 +3650,8 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
skb_reserve(skb, MVPP2_MH_SIZE + MVPP2_SKB_HEADROOM); skb_put(skb, rx_bytes); - skb->protocol = eth_type_trans(skb, dev); mvpp2_rx_csum(port, rx_status, skb); + skb->protocol = eth_type_trans(skb, dev);
napi_gro_receive(napi, skb); continue;
From: Matteo Croce mcroce@microsoft.com
mainline inclusion from mainline-v5.14-rc1 commit 2f128eb3308a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Most of the time during the RX is caused by the compound_head() call done at the end of the RX loop:
│ build_skb(): [...] │ static inline struct page *compound_head(struct page *page) │ { │ unsigned long head = READ_ONCE(page->compound_head); 65.23 │ ldr x2, [x1, #8]
Prefetch the page struct as soon as possible, to speedup the RX path noticeabily by a ~3-4% packet rate in a drop test.
│ build_skb(): [...] │ static inline struct page *compound_head(struct page *page) │ { │ unsigned long head = READ_ONCE(page->compound_head); 17.92 │ ldr x2, [x1, #8]
Signed-off-by: Matteo Croce mcroce@microsoft.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 55c6f90f818b..42ad6c15db16 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3542,15 +3542,19 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, phys_addr_t phys_addr; u32 rx_status, timestamp; int pool, rx_bytes, err, ret; + struct page *page; void *data;
+ phys_addr = mvpp2_rxdesc_cookie_get(port, rx_desc); + data = (void *)phys_to_virt(phys_addr); + page = virt_to_page(data); + prefetch(page); + rx_done++; rx_status = mvpp2_rxdesc_status_get(port, rx_desc); rx_bytes = mvpp2_rxdesc_size_get(port, rx_desc); rx_bytes -= MVPP2_MH_SIZE; dma_addr = mvpp2_rxdesc_dma_addr_get(port, rx_desc); - phys_addr = mvpp2_rxdesc_cookie_get(port, rx_desc); - data = (void *)phys_to_virt(phys_addr);
pool = (rx_status & MVPP2_RXD_BM_POOL_ID_MASK) >> MVPP2_RXD_BM_POOL_ID_OFFS; @@ -3639,7 +3643,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, }
if (pp) - skb_mark_for_recycle(skb, virt_to_page(data), pp); + skb_mark_for_recycle(skb, page, pp); else dma_unmap_single_attrs(dev->dev.parent, dma_addr, bm_pool->buf_size, DMA_FROM_DEVICE,
From: Lorenzo Bianconi lorenzo@kernel.org
mainline inclusion from mainline-v5.14-rc1 commit a078d981f863 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
As already done for mvneta and mvpp2, enable skb recycling for ti ethernet drivers
ti driver on net-next: ---------------------- [perf top] 47.15% [kernel] [k] _raw_spin_unlock_irqrestore 11.77% [kernel] [k] __cpdma_chan_free 3.16% [kernel] [k] ___bpf_prog_run 2.52% [kernel] [k] cpsw_rx_vlan_encap 2.34% [kernel] [k] __netif_receive_skb_core 2.27% [kernel] [k] free_unref_page 2.26% [kernel] [k] kmem_cache_free 2.24% [kernel] [k] kmem_cache_alloc 1.69% [kernel] [k] __softirqentry_text_start 1.61% [kernel] [k] cpsw_rx_handler 1.19% [kernel] [k] page_pool_release_page 1.19% [kernel] [k] clear_bits_ll 1.15% [kernel] [k] page_frag_free 1.06% [kernel] [k] __dma_page_dev_to_cpu 0.99% [kernel] [k] memset 0.94% [kernel] [k] __alloc_pages_bulk 0.92% [kernel] [k] kfree_skb 0.85% [kernel] [k] packet_rcv 0.78% [kernel] [k] page_address 0.75% [kernel] [k] v7_dma_inv_range 0.71% [kernel] [k] __lock_text_start
[iperf3 tcp] [ 5] 0.00-10.00 sec 873 MBytes 732 Mbits/sec 0 sender [ 5] 0.00-10.01 sec 866 MBytes 726 Mbits/sec receiver
ti + skb recycling: ------------------- [perf top] 40.58% [kernel] [k] _raw_spin_unlock_irqrestore 16.18% [kernel] [k] __softirqentry_text_start 10.33% [kernel] [k] __cpdma_chan_free 2.62% [kernel] [k] ___bpf_prog_run 2.05% [kernel] [k] cpsw_rx_vlan_encap 2.00% [kernel] [k] kmem_cache_alloc 1.86% [kernel] [k] __netif_receive_skb_core 1.80% [kernel] [k] kmem_cache_free 1.63% [kernel] [k] cpsw_rx_handler 1.12% [kernel] [k] cpsw_rx_mq_poll 1.11% [kernel] [k] page_pool_put_page 1.04% [kernel] [k] _raw_spin_unlock 0.97% [kernel] [k] clear_bits_ll 0.90% [kernel] [k] packet_rcv 0.88% [kernel] [k] __dma_page_dev_to_cpu 0.85% [kernel] [k] kfree_skb 0.80% [kernel] [k] memset 0.71% [kernel] [k] __lock_text_start 0.66% [kernel] [k] v7_dma_inv_range 0.64% [kernel] [k] gen_pool_free_owner
[iperf3 tcp] [ 5] 0.00-10.00 sec 884 MBytes 742 Mbits/sec 0 sender [ 5] 0.00-10.01 sec 878 MBytes 735 Mbits/sec receiver
Tested-by: Grygorii Strashko grygorii.strashko@ti.com Reviewed-by: Grygorii Strashko grygorii.strashko@ti.com Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/ti/cpsw.c | 4 ++-- drivers/net/ethernet/ti/cpsw_new.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index ca1168bdcf72..8494c4459788 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -438,8 +438,8 @@ static void cpsw_rx_handler(void *token, int len, int status) cpts_rx_timestamp(cpsw->cpts, skb); skb->protocol = eth_type_trans(skb, ndev);
- /* unmap page as no netstack skb page recycling */ - page_pool_release_page(pool, page); + /* mark skb for recycling */ + skb_mark_for_recycle(skb, page, pool); netif_receive_skb(skb);
ndev->stats.rx_bytes += len; diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 1634947ef488..8a72541d8484 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -381,8 +381,8 @@ static void cpsw_rx_handler(void *token, int len, int status) cpts_rx_timestamp(cpsw->cpts, skb); skb->protocol = eth_type_trans(skb, ndev);
- /* unmap page as no netstack skb page recycling */ - page_pool_release_page(pool, page); + /* mark skb for recycling */ + skb_mark_for_recycle(skb, page, pool); netif_receive_skb(skb);
ndev->stats.rx_bytes += len;
From: Ilias Apalodimas ilias.apalodimas@linaro.org
mainline inclusion from mainline-v5.14-rc3 commit 2cc3aeb5eccc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
As Alexander points out, when we are trying to recycle a cloned/expanded SKB we might trigger a race. The recycling code relies on the pp_recycle bit to trigger, which we carry over to cloned SKBs. If that cloned SKB gets expanded or if we get references to the frags, call skb_release_data() and overwrite skb->head, we are creating separate instances accessing the same page frags. Since the skb_release_data() will first try to recycle the frags, there's a potential race between the original and cloned SKB, since both will have the pp_recycle bit set.
Fix this by explicitly those SKBs not recyclable. The atomic_sub_return effectively limits us to a single release case, and when we are calling skb_release_data we are also releasing the option to perform the recycling, or releasing the pages from the page pool.
Fixes: 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling") Reported-by: Alexander Duyck alexanderduyck@fb.com Suggested-by: Alexander Duyck alexanderduyck@fb.com Reviewed-by: Alexander Duyck alexanderduyck@fb.com Acked-by: Jesper Dangaard Brouer brouer@redhat.com Signed-off-by: Ilias Apalodimas ilias.apalodimas@linaro.org Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/skbuff.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e7bf21d7d531..f73ea11c0874 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -616,7 +616,7 @@ static void skb_release_data(struct sk_buff *skb) if (skb->cloned && atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, &shinfo->dataref)) - return; + goto exit;
skb_zcopy_clear(skb, true);
@@ -627,6 +627,17 @@ static void skb_release_data(struct sk_buff *skb) kfree_skb_list(shinfo->frag_list);
skb_free_head(skb); +exit: + /* When we clone an SKB we copy the reycling bit. The pp_recycle + * bit is only set on the head though, so in order to avoid races + * while trying to recycle fragments on __skb_frag_unref() we need + * to make one SKB responsible for triggering the recycle path. + * So disable the recycling bit if an SKB is cloned and we have + * additional references to to the fragmented part of the SKB. + * Eventually the last SKB will have the recycling bit set and it's + * dataref set to 0, which will trigger the recycling + */ + skb->pp_recycle = 0; }
/*
From: Yunsheng Lin linyunsheng@huawei.com
mainline inclusion from mainline-v5.14-rc6 commit 0fa32ca438b4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
As mentioned in commit c07aea3ef4d4 ("mm: add a signature in struct page"): "The page->signature field is aliased to page->lru.next and page->compound_head."
And as the comment in page_is_pfmemalloc(): "lru.next has bit 1 set if the page is allocated from the pfmemalloc reserves. Callers may simply overwrite it if they do not need to preserve that information."
The page->signature is OR’ed with PP_SIGNATURE when a page is allocated in page pool, see __page_pool_alloc_pages_slow(), and page->signature is checked directly with PP_SIGNATURE in page_pool_return_skb_page(), which might cause resoure leaking problem for a page from page pool if bit 1 of lru.next is set for a pfmemalloc page. What happens here is that the original pp->signature is OR'ed with PP_SIGNATURE after the allocation in order to preserve any existing bits(such as the bit 1, used to indicate a pfmemalloc page), so when those bits are present, those page is not considered to be from page pool and the DMA mapping of those pages will be left stale.
As bit 0 is for page->compound_head, So mask both bit 0/1 before the checking in page_pool_return_skb_page(). And we will return those pfmemalloc pages back to the page allocator after cleaning up the DMA mapping.
Fixes: 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling") Reviewed-by: Ilias Apalodimas ilias.apalodimas@linaro.org Signed-off-by: Yunsheng Lin linyunsheng@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/page_pool.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5e4eb45b139c..8ab7b402244c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -634,7 +634,15 @@ bool page_pool_return_skb_page(struct page *page) struct page_pool *pp;
page = compound_head(page); - if (unlikely(page->pp_magic != PP_SIGNATURE)) + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) return false;
pp = page->pp;
From: Yunsheng Lin linyunsheng@huawei.com
mainline inclusion from mainline-v5.15-rc1 commit 57f05bc2ab24 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently, page->pp is cleared and set everytime the page is recycled, which is unnecessary.
So only set the page->pp when the page is added to the page pool and only clear it when the page is released from the page pool.
This is also a preparation to support allocating frag page in page pool.
Reviewed-by: Ilias Apalodimas ilias.apalodimas@linaro.org Signed-off-by: Yunsheng Lin linyunsheng@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/marvell/mvneta.c | 6 +----- .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +- drivers/net/ethernet/ti/cpsw.c | 2 +- drivers/net/ethernet/ti/cpsw_new.c | 2 +- include/linux/skbuff.h | 4 +--- include/net/page_pool.h | 7 ------- net/core/page_pool.c | 21 +++++++++++++++---- 7 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 694944e2a4eb..7fc44c43f4ee 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2314,7 +2314,7 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, if (!skb) return ERR_PTR(-ENOMEM);
- skb_mark_for_recycle(skb, virt_to_page(xdp->data), pool); + skb_mark_for_recycle(skb);
skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); @@ -2326,10 +2326,6 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, skb_frag_page(frag), skb_frag_off(frag), skb_frag_size(frag), PAGE_SIZE); - /* We don't need to reset pp_recycle here. It's already set, so - * just mark fragments for recycling. - */ - page_pool_store_mem_info(skb_frag_page(frag), pool); }
return skb; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 42ad6c15db16..ac75a215772d 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3643,7 +3643,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi, }
if (pp) - skb_mark_for_recycle(skb, page, pp); + skb_mark_for_recycle(skb); else dma_unmap_single_attrs(dev->dev.parent, dma_addr, bm_pool->buf_size, DMA_FROM_DEVICE, diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 8494c4459788..37fab882d649 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -439,7 +439,7 @@ static void cpsw_rx_handler(void *token, int len, int status) skb->protocol = eth_type_trans(skb, ndev);
/* mark skb for recycling */ - skb_mark_for_recycle(skb, page, pool); + skb_mark_for_recycle(skb); netif_receive_skb(skb);
ndev->stats.rx_bytes += len; diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 8a72541d8484..3cc5884b7c40 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -382,7 +382,7 @@ static void cpsw_rx_handler(void *token, int len, int status) skb->protocol = eth_type_trans(skb, ndev);
/* mark skb for recycling */ - skb_mark_for_recycle(skb, page, pool); + skb_mark_for_recycle(skb); netif_receive_skb(skb);
ndev->stats.rx_bytes += len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a15e9e2146ba..2529415e3483 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4673,11 +4673,9 @@ static inline bool skb_csum_is_sctp(struct sk_buff *skb) }
#ifdef CONFIG_PAGE_POOL -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page, - struct page_pool *pp) +static inline void skb_mark_for_recycle(struct sk_buff *skb) { skb->pp_recycle = 1; - page_pool_store_mem_info(page, pp); } #endif
diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 3dd62dd73027..8d7744d1c7c1 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -253,11 +253,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool) spin_unlock_bh(&pool->ring.producer_lock); }
-/* Store mem_info on struct page and use it while recycling skb frags */ -static inline -void page_pool_store_mem_info(struct page *page, struct page_pool *pp) -{ - page->pp = pp; -} - #endif /* _NET_PAGE_POOL_H */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 8ab7b402244c..c22f30ae5c42 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -206,6 +206,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) return true; }
+static void page_pool_set_pp_info(struct page_pool *pool, + struct page *page) +{ + page->pp = pool; + page->pp_magic |= PP_SIGNATURE; +} + +static void page_pool_clear_pp_info(struct page *page) +{ + page->pp_magic = 0; + page->pp = NULL; +} + static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { @@ -222,7 +235,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; }
- page->pp_magic |= PP_SIGNATURE; + page_pool_set_pp_info(pool, page);
/* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -266,7 +279,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } - page->pp_magic |= PP_SIGNATURE; + + page_pool_set_pp_info(pool, page); pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -345,7 +359,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: - page->pp_magic = 0; + page_pool_clear_pp_info(page);
/* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. @@ -652,7 +666,6 @@ bool page_pool_return_skb_page(struct page *page) * The page will be returned to the pool here regardless of the * 'flipped' fragment being in use or not. */ - page->pp = NULL; page_pool_put_full_page(pp, page, false);
return true;
From: Yunsheng Lin linyunsheng@huawei.com
mainline inclusion from mainline-v5.15-rc1 commit 0e9d2a0a3a83 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
For 32 bit systems with 64 bit dma, dma_addr[1] is used to store the upper 32 bit dma addr, those system should be rare those days.
For normal system, the dma_addr[1] in 'struct page' is not used, so we can reuse dma_addr[1] for storing frag count, which means how many frags this page might be splited to.
In order to simplify the page frag support in the page pool, the PAGE_POOL_DMA_USE_PP_FRAG_COUNT macro is added to indicate the 32 bit systems with 64 bit dma, and the page frag support in page pool is disabled for such system.
The newly added page_pool_set_frag_count() is called to reserve the maximum frag count before any page frag is passed to the user. The page_pool_atomic_sub_frag_count_return() is called when user is done with the page frag.
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm_types.h | 18 +++++++++++----- include/net/page_pool.h | 46 ++++++++++++++++++++++++++++++++++------ net/core/page_pool.c | 4 ++++ 3 files changed, 56 insertions(+), 12 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fe38904ec52e..6121e844d77b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -103,11 +103,19 @@ struct page { unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad; - /** - * @dma_addr: might require a 64-bit value on - * 32-bit architectures. - */ - unsigned long dma_addr[2]; + unsigned long dma_addr; + union { + /** + * dma_addr_upper: might require a 64-bit + * value on 32-bit architectures. + */ + unsigned long dma_addr_upper; + /** + * For frag page support, not supported in + * 32-bit architectures with 64-bit DMA. + */ + atomic_long_t pp_frag_count; + }; }; struct { /* slab, slob and slub */ union { diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 8d7744d1c7c1..42e6997e637d 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -45,7 +45,10 @@ * Please note DMA-sync-for-CPU is still * device driver responsibility */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) +#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ + PP_FLAG_DMA_SYNC_DEV |\ + PP_FLAG_PAGE_FRAG)
/* * Fast allocation side cache array/stack @@ -198,19 +201,48 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); }
+#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ + (sizeof(dma_addr_t) > sizeof(unsigned long)) + static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { - dma_addr_t ret = page->dma_addr[0]; - if (sizeof(dma_addr_t) > sizeof(unsigned long)) - ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16; + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; + return ret; }
static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { - page->dma_addr[0] = addr; - if (sizeof(dma_addr_t) > sizeof(unsigned long)) - page->dma_addr[1] = upper_32_bits(addr); + page->dma_addr = addr; + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + page->dma_addr_upper = upper_32_bits(addr); +} + +static inline void page_pool_set_frag_count(struct page *page, long nr) +{ + atomic_long_set(&page->pp_frag_count, nr); +} + +static inline long page_pool_atomic_sub_frag_count_return(struct page *page, + long nr) +{ + long ret; + + /* As suggested by Alexander, atomic_long_read() may cover up the + * reference count errors, so avoid calling atomic_long_read() in + * the cases of freeing or draining the page_frags, where we would + * not expect it to match or that are slowpath anyway. + */ + if (__builtin_constant_p(nr) && + atomic_long_read(&page->pp_frag_count) == nr) + return 0; + + ret = atomic_long_sub_return(nr, &page->pp_frag_count); + WARN_ON(ret < 0); + return ret; }
static inline bool is_page_pool_compiled_in(void) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index c22f30ae5c42..e05d0b9323fc 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -67,6 +67,10 @@ static int page_pool_init(struct page_pool *pool, */ }
+ if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && + pool->p.flags & PP_FLAG_PAGE_FRAG) + return -EINVAL; + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM;
From: Yunsheng Lin linyunsheng@huawei.com
mainline inclusion from mainline-v5.15-rc1 commit 53e0961da1c7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
Currently page pool only support page recycling when there is only one user of the page, and the split page reusing implemented in the most driver can not use the page pool as bing-pong way of reusing requires the multi user support in page pool.
Those reusing or recycling has below limitations: 1. page from page pool can only be used be one user in order for the page recycling to happen. 2. Bing-pong way of reusing in most driver does not support multi desc using different part of the same page in order to save memory.
So add multi-users support and frag page recycling in page pool to overcome the above limitation.
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/page_pool.h | 15 +++++++ net/core/page_pool.c | 87 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+)
diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 42e6997e637d..a4082406a003 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -91,6 +91,9 @@ struct page_pool { unsigned long defer_warn;
u32 pages_state_hold_cnt; + unsigned int frag_offset; + struct page *frag_page; + long frag_users;
/* * Data structure for allocation side @@ -140,6 +143,18 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) return page_pool_alloc_pages(pool, gfp); }
+struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, + unsigned int size, gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_frag(pool, offset, size, gfp); +} + /* get the stored dma direction. A driver might decide to treat this locally and * avoid the extra cache line from page_pool to determine the direction */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e05d0b9323fc..e1409056965a 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -24,6 +24,8 @@ #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ)
+#define BIAS_MAX LONG_MAX + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -423,6 +425,11 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + /* It is not the last user for the page frag case */ + if (pool->p.flags & PP_FLAG_PAGE_FRAG && + page_pool_atomic_sub_frag_count_return(page, 1)) + return NULL; + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -515,6 +522,84 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, } EXPORT_SYMBOL(page_pool_put_page_bulk);
+static struct page *page_pool_drain_frag(struct page_pool *pool, + struct page *page) +{ + long drain_count = BIAS_MAX - pool->frag_users; + + /* Some user is still using the page frag */ + if (likely(page_pool_atomic_sub_frag_count_return(page, + drain_count))) + return NULL; + + if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, -1); + + return page; + } + + page_pool_return_page(pool, page); + return NULL; +} + +static void page_pool_free_frag(struct page_pool *pool) +{ + long drain_count = BIAS_MAX - pool->frag_users; + struct page *page = pool->frag_page; + + pool->frag_page = NULL; + + if (!page || + page_pool_atomic_sub_frag_count_return(page, drain_count)) + return; + + page_pool_return_page(pool, page); +} + +struct page *page_pool_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + struct page *page = pool->frag_page; + + if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || + size > max_size)) + return NULL; + + size = ALIGN(size, dma_get_cache_alignment()); + *offset = pool->frag_offset; + + if (page && *offset + size > max_size) { + page = page_pool_drain_frag(pool, page); + if (page) + goto frag_reset; + } + + if (!page) { + page = page_pool_alloc_pages(pool, gfp); + if (unlikely(!page)) { + pool->frag_page = NULL; + return NULL; + } + + pool->frag_page = page; + +frag_reset: + pool->frag_users = 1; + *offset = 0; + pool->frag_offset = size; + page_pool_set_frag_count(page, BIAS_MAX); + return page; + } + + pool->frag_users++; + pool->frag_offset = *offset + size; + return page; +} +EXPORT_SYMBOL(page_pool_alloc_frag); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; @@ -620,6 +705,8 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_put(pool)) return;
+ page_pool_free_frag(pool); + if (!page_pool_release(pool)) return;
From: Yunsheng Lin linyunsheng@huawei.com
mainline inclusion from mainline-v5.15-rc1 commit 93188e9642c3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
This patch adds skb's frag page recycling support based on the frag page support in page pool.
The performance improves above 10~20% for single thread iperf TCP flow with IOMMU disabled when iperf server and irq/NAPI have a different CPU.
The performance improves about 135%(14Gbit to 33Gbit) for single thread iperf TCP flow when IOMMU is in strict mode and iperf server shares the same cpu with irq/NAPI.
Signed-off-by: Yunsheng Lin linyunsheng@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/Kconfig | 1 + .../net/ethernet/hisilicon/hns3/hns3_enet.c | 79 +++++++++++++++++-- .../net/ethernet/hisilicon/hns3/hns3_enet.h | 3 + 3 files changed, 78 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/Kconfig b/drivers/net/ethernet/hisilicon/Kconfig index 094e4a37a295..2ba0e7bd3466 100644 --- a/drivers/net/ethernet/hisilicon/Kconfig +++ b/drivers/net/ethernet/hisilicon/Kconfig @@ -91,6 +91,7 @@ config HNS3 tristate "Hisilicon Network Subsystem Support HNS3 (Framework)" depends on PCI select NET_DEVLINK + select PAGE_POOL help This selects the framework support for Hisilicon Network Subsystem 3. This layer facilitates clients like ENET, RoCE and user-space ethernet diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 83e640814256..43f91fe11e3c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3178,6 +3178,21 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring, unsigned int order = hns3_page_order(ring); struct page *p;
+ if (ring->page_pool) { + p = page_pool_dev_alloc_frag(ring->page_pool, + &cb->page_offset, + hns3_buf_size(ring)); + if (unlikely(!p)) + return -ENOMEM; + + cb->priv = p; + cb->buf = page_address(p); + cb->dma = page_pool_get_dma_addr(p); + cb->type = DESC_TYPE_PP_FRAG; + cb->reuse_flag = 0; + return 0; + } + p = dev_alloc_pages(order); if (!p) return -ENOMEM; @@ -3200,8 +3215,13 @@ static void hns3_free_buffer(struct hns3_enet_ring *ring, if (cb->type & (DESC_TYPE_SKB | DESC_TYPE_BOUNCE_HEAD | DESC_TYPE_BOUNCE_ALL | DESC_TYPE_SGL_SKB)) napi_consume_skb(cb->priv, budget); - else if (!HNAE3_IS_TX_RING(ring) && cb->pagecnt_bias) - __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); + else if (!HNAE3_IS_TX_RING(ring)) { + if (cb->type & DESC_TYPE_PAGE && cb->pagecnt_bias) + __page_frag_cache_drain(cb->priv, cb->pagecnt_bias); + else if (cb->type & DESC_TYPE_PP_FRAG) + page_pool_put_full_page(ring->page_pool, cb->priv, + false); + } memset(cb, 0, sizeof(*cb)); }
@@ -3288,7 +3308,7 @@ static int hns3_alloc_and_map_buffer(struct hns3_enet_ring *ring, int ret;
ret = hns3_alloc_buffer(ring, cb); - if (ret) + if (ret || ring->page_pool) goto out;
ret = hns3_map_buffer(ring, cb); @@ -3310,7 +3330,8 @@ static int hns3_alloc_and_attach_buffer(struct hns3_enet_ring *ring, int i) if (ret) return ret;
- ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma); + ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + + ring->desc_cb[i].page_offset);
return 0; } @@ -3340,7 +3361,8 @@ static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i, { hns3_unmap_buffer(ring, &ring->desc_cb[i]); ring->desc_cb[i] = *res_cb; - ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma); + ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma + + ring->desc_cb[i].page_offset); ring->desc[i].rx.bd_base_info = 0; }
@@ -3512,6 +3534,12 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i, u32 frag_size = size - pull_len; bool reused;
+ if (ring->page_pool) { + skb_add_rx_frag(skb, i, desc_cb->priv, frag_offset, + frag_size, truesize); + return; + } + /* Avoid re-using remote or pfmem page */ if (unlikely(!dev_page_is_reusable(desc_cb->priv))) goto out; @@ -3829,6 +3857,9 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, /* We can reuse buffer as-is, just make sure it is reusable */ if (dev_page_is_reusable(desc_cb->priv)) desc_cb->reuse_flag = 1; + else if (desc_cb->type & DESC_TYPE_PP_FRAG) + page_pool_put_full_page(ring->page_pool, desc_cb->priv, + false); else /* This page cannot be reused so discard it */ __page_frag_cache_drain(desc_cb->priv, desc_cb->pagecnt_bias); @@ -3836,6 +3867,10 @@ static int hns3_alloc_skb(struct hns3_enet_ring *ring, unsigned int length, hns3_rx_ring_move_fw(ring); return 0; } + + if (ring->page_pool) + skb_mark_for_recycle(skb); + u64_stats_update_begin(&ring->syncp); ring->stats.seg_pkt_cnt++; u64_stats_update_end(&ring->syncp); @@ -3874,6 +3909,10 @@ static int hns3_add_frag(struct hns3_enet_ring *ring) "alloc rx fraglist skb fail\n"); return -ENXIO; } + + if (ring->page_pool) + skb_mark_for_recycle(new_skb); + ring->frag_num = 0;
if (ring->tail_skb) { @@ -4676,6 +4715,29 @@ static void hns3_put_ring_config(struct hns3_nic_priv *priv) priv->ring = NULL; }
+static void hns3_alloc_page_pool(struct hns3_enet_ring *ring) +{ + struct page_pool_params pp_params = { + .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG | + PP_FLAG_DMA_SYNC_DEV, + .order = hns3_page_order(ring), + .pool_size = ring->desc_num * hns3_buf_size(ring) / + (PAGE_SIZE << hns3_page_order(ring)), + .nid = dev_to_node(ring_to_dev(ring)), + .dev = ring_to_dev(ring), + .dma_dir = DMA_FROM_DEVICE, + .offset = 0, + .max_len = PAGE_SIZE << hns3_page_order(ring), + }; + + ring->page_pool = page_pool_create(&pp_params); + if (IS_ERR(ring->page_pool)) { + dev_warn(ring_to_dev(ring), "page pool creation failed: %ld\n", + PTR_ERR(ring->page_pool)); + ring->page_pool = NULL; + } +} + static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) { int ret; @@ -4695,6 +4757,8 @@ static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) goto out_with_desc_cb;
if (!HNAE3_IS_TX_RING(ring)) { + hns3_alloc_page_pool(ring); + ret = hns3_alloc_ring_buffers(ring); if (ret) goto out_with_desc; @@ -4735,6 +4799,11 @@ void hns3_fini_ring(struct hns3_enet_ring *ring) devm_kfree(ring_to_dev(ring), tx_spare); ring->tx_spare = NULL; } + + if (!HNAE3_IS_TX_RING(ring) && ring->page_pool) { + page_pool_destroy(ring->page_pool); + ring->page_pool = NULL; + } }
static int hns3_buf_size2type(u32 buf_size) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 0810d4d78353..6162d9f88e37 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -6,6 +6,7 @@
#include <linux/dim.h> #include <linux/if_vlan.h> +#include <net/page_pool.h>
#include "hnae3.h"
@@ -313,6 +314,7 @@ enum hns3_desc_type { DESC_TYPE_BOUNCE_ALL = 1 << 3, DESC_TYPE_BOUNCE_HEAD = 1 << 4, DESC_TYPE_SGL_SKB = 1 << 5, + DESC_TYPE_PP_FRAG = 1 << 6, };
struct hns3_desc_cb { @@ -457,6 +459,7 @@ struct hns3_enet_ring { struct hnae3_queue *tqp; int queue_index; struct device *dev; /* will be used for DMA mapping of descriptors */ + struct page_pool *page_pool;
/* statistic */ struct ring_stats stats;
From: Yunsheng Lin linyunsheng@huawei.com
mainline inclusion from mainline-v5.15-rc2 commit f7ec554b73c5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
When page pool is added to the hns3 driver, it is always enabled unconditionally, which means spilt page handling in the hns3 driver is dead code.
As there is a requirement to test the performance between spilt page handling in driver and page pool, so add a module param to support disabling the page pool.
When the page pool is proved to perform better in most case, the spilt page handling in driver can be removed.
Fixes: 93188e9642c3 ("net: hns3: support skb's frag page recycling based on page pool") Signed-off-by: Yunsheng Lin linyunsheng@huawei.com Signed-off-by: Guangbin Huang huangguangbin2@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 43f91fe11e3c..24925be7b131 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -61,6 +61,9 @@ static unsigned int tx_sgl = 1; module_param(tx_sgl, uint, 0600); MODULE_PARM_DESC(tx_sgl, "Minimum number of frags when using dma_map_sg() to optimize the IOMMU mapping");
+static bool page_pool_enabled = true; +module_param(page_pool_enabled, bool, 0400); + #define HNS3_SGL_SIZE(nfrag) (sizeof(struct scatterlist) * (nfrag) + \ sizeof(struct sg_table)) #define HNS3_MAX_SGL_SIZE ALIGN(HNS3_SGL_SIZE(HNS3_MAX_TSO_BD_NUM), \ @@ -4757,7 +4760,8 @@ static int hns3_alloc_ring_memory(struct hns3_enet_ring *ring) goto out_with_desc_cb;
if (!HNAE3_IS_TX_RING(ring)) { - hns3_alloc_page_pool(ring); + if (page_pool_enabled) + hns3_alloc_page_pool(ring);
ret = hns3_alloc_ring_buffers(ring); if (ret)
From: Yunsheng Lin linyunsheng@huawei.com
mainline inclusion from mainline-v5.15-rc1 commit 7fb9b66dc9ce category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
There is no need to synchronize the account updating, so use the relaxed atomic to avoid some memory barrier in the data path.
Acked-by: Jesper Dangaard Brouer brouer@redhat.com Signed-off-by: Yunsheng Lin linyunsheng@huawei.com Acked-by: Ilias Apalodimas ilias.apalodimas@linaro.org Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/page_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e1409056965a..1a6978427d6c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -370,7 +370,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ - count = atomic_inc_return(&pool->pages_state_release_cnt); + count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); } EXPORT_SYMBOL(page_pool_release_page);
From: Yunsheng Lin linyunsheng@huawei.com
mainline inclusion from mainline-master commit d00e60ee54b1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
As the 32-bit arch with 64-bit DMA seems to rare those days, and page pool might carry a lot of code and complexity for systems that possibly.
So disable dma mapping support for such systems, if drivers really want to work on such systems, they have to implement their own DMA-mapping fallback tracking outside page_pool.
Reviewed-by: Ilias Apalodimas ilias.apalodimas@linaro.org Signed-off-by: Yunsheng Lin linyunsheng@huawei.com Acked-by: Jesper Dangaard Brouer brouer@redhat.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mm_types.h | 13 +------------ include/net/page_pool.h | 12 +----------- net/core/page_pool.c | 10 ++++++---- 3 files changed, 8 insertions(+), 27 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6121e844d77b..2729eb58aca4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -104,18 +104,7 @@ struct page { struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; - union { - /** - * dma_addr_upper: might require a 64-bit - * value on 32-bit architectures. - */ - unsigned long dma_addr_upper; - /** - * For frag page support, not supported in - * 32-bit architectures with 64-bit DMA. - */ - atomic_long_t pp_frag_count; - }; + atomic_long_t pp_frag_count; }; struct { /* slab, slob and slub */ union { diff --git a/include/net/page_pool.h b/include/net/page_pool.h index a4082406a003..3855f069627f 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -216,24 +216,14 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); }
-#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ - (sizeof(dma_addr_t) > sizeof(unsigned long)) - static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { - dma_addr_t ret = page->dma_addr; - - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) - ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; - - return ret; + return page->dma_addr; }
static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { page->dma_addr = addr; - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) - page->dma_addr_upper = upper_32_bits(addr); }
static inline void page_pool_set_frag_count(struct page *page, long nr) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1a6978427d6c..9b60e4301a44 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -49,6 +49,12 @@ static int page_pool_init(struct page_pool *pool, * which is the XDP_TX use-case. */ if (pool->p.flags & PP_FLAG_DMA_MAP) { + /* DMA-mapping is not supported on 32-bit systems with + * 64-bit DMA mapping. + */ + if (sizeof(dma_addr_t) > sizeof(unsigned long)) + return -EOPNOTSUPP; + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; @@ -69,10 +75,6 @@ static int page_pool_init(struct page_pool *pool, */ }
- if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && - pool->p.flags & PP_FLAG_PAGE_FRAG) - return -EINVAL; - if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM;
From: Qi Deng dengqi7@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4ETXO CVE: NA
-----------------------------------------
The BMA software is a system management software offered by Huawei. It supports the status monitoring, performance monitoring, and event monitoring of various components, including server CPUs, memory, hard disks, NICs, IB cards, PCIe cards, RAID controller cards, and optical modules.
This host_edma_drv driver is a PCIe driver used by Huawei BMA software. The main function of it is to control the PCIe bus between BMA software and Huawei 1711 chip. The chip will then process the data and display to users. This host_edma_drv driver offers API to send/receive data for other BMA drivers which want to use the PCIe channel in different ways(eg.host_cdev_drv, host_veth_drv).
Link: https://lkml.org/lkml/2020/6/22/752
Signed-off-by: Qi Deng dengqi7@huawei.com Reviewed-by: Qidong Wang wangqindong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/huawei/Kconfig | 1 + drivers/net/ethernet/huawei/Makefile | 1 + drivers/net/ethernet/huawei/bma/Kconfig | 10 + drivers/net/ethernet/huawei/bma/Makefile | 5 + .../net/ethernet/huawei/bma/edma_drv/Makefile | 2 + .../huawei/bma/edma_drv/bma_devintf.c | 597 +++++++ .../huawei/bma/edma_drv/bma_devintf.h | 40 + .../huawei/bma/edma_drv/bma_include.h | 116 ++ .../ethernet/huawei/bma/edma_drv/bma_pci.c | 533 ++++++ .../ethernet/huawei/bma/edma_drv/bma_pci.h | 98 ++ .../ethernet/huawei/bma/edma_drv/edma_host.c | 1462 +++++++++++++++++ .../ethernet/huawei/bma/edma_drv/edma_host.h | 351 ++++ .../huawei/bma/include/bma_ker_intf.h | 94 ++ 13 files changed, 3310 insertions(+) create mode 100644 drivers/net/ethernet/huawei/bma/Kconfig create mode 100644 drivers/net/ethernet/huawei/bma/Makefile create mode 100644 drivers/net/ethernet/huawei/bma/edma_drv/Makefile create mode 100644 drivers/net/ethernet/huawei/bma/edma_drv/bma_devintf.c create mode 100644 drivers/net/ethernet/huawei/bma/edma_drv/bma_devintf.h create mode 100644 drivers/net/ethernet/huawei/bma/edma_drv/bma_include.h create mode 100644 drivers/net/ethernet/huawei/bma/edma_drv/bma_pci.c create mode 100644 drivers/net/ethernet/huawei/bma/edma_drv/bma_pci.h create mode 100644 drivers/net/ethernet/huawei/bma/edma_drv/edma_host.c create mode 100644 drivers/net/ethernet/huawei/bma/edma_drv/edma_host.h create mode 100644 drivers/net/ethernet/huawei/bma/include/bma_ker_intf.h
diff --git a/drivers/net/ethernet/huawei/Kconfig b/drivers/net/ethernet/huawei/Kconfig index c05fce15eb51..0afeb5021f17 100644 --- a/drivers/net/ethernet/huawei/Kconfig +++ b/drivers/net/ethernet/huawei/Kconfig @@ -16,5 +16,6 @@ config NET_VENDOR_HUAWEI if NET_VENDOR_HUAWEI
source "drivers/net/ethernet/huawei/hinic/Kconfig" +source "drivers/net/ethernet/huawei/bma/Kconfig"
endif # NET_VENDOR_HUAWEI diff --git a/drivers/net/ethernet/huawei/Makefile b/drivers/net/ethernet/huawei/Makefile index 2549ad5afe6d..f5bf4ae195a3 100644 --- a/drivers/net/ethernet/huawei/Makefile +++ b/drivers/net/ethernet/huawei/Makefile @@ -4,3 +4,4 @@ #
obj-$(CONFIG_HINIC) += hinic/ +obj-$(CONFIG_BMA) += bma/ diff --git a/drivers/net/ethernet/huawei/bma/Kconfig b/drivers/net/ethernet/huawei/bma/Kconfig new file mode 100644 index 000000000000..3c4c7b5d9757 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/Kconfig @@ -0,0 +1,10 @@ +# +# Huawei BMA software driver configuration +# + +config BMA + tristate "Huawei BMA Driver" + + help + This driver supports Huawei BMA Software. It is used + to communication between Huawei BMA and BMC software. \ No newline at end of file diff --git a/drivers/net/ethernet/huawei/bma/Makefile b/drivers/net/ethernet/huawei/bma/Makefile new file mode 100644 index 000000000000..15b3e29e65f9 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for BMA software driver +# + +obj-$(CONFIG_BMA) += edma_drv/ diff --git a/drivers/net/ethernet/huawei/bma/edma_drv/Makefile b/drivers/net/ethernet/huawei/bma/edma_drv/Makefile new file mode 100644 index 000000000000..46cc51275a71 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/edma_drv/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_BMA) += host_edma_drv.o +host_edma_drv-y := bma_pci.o bma_devintf.o edma_host.o diff --git a/drivers/net/ethernet/huawei/bma/edma_drv/bma_devintf.c b/drivers/net/ethernet/huawei/bma/edma_drv/bma_devintf.c new file mode 100644 index 000000000000..7817f58f8635 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/edma_drv/bma_devintf.c @@ -0,0 +1,597 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <asm/ioctls.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/notifier.h> +#include "../include/bma_ker_intf.h" +#include "bma_include.h" +#include "bma_devintf.h" +#include "bma_pci.h" +#include "edma_host.h" + +static struct bma_dev_s *g_bma_dev; + +static ATOMIC_NOTIFIER_HEAD(bma_int_notify_list); + +static int bma_priv_insert_priv_list(struct bma_priv_data_s *priv, u32 type, + u32 sub_type) +{ + unsigned long flags = 0; + int ret = 0; + struct edma_user_inft_s *user_inft = NULL; + + if (type >= TYPE_MAX || !priv) + return -EFAULT; + + user_inft = edma_host_get_user_inft(type); + + if (user_inft && user_inft->user_register) { + ret = user_inft->user_register(priv); + if (ret) { + BMA_LOG(DLOG_ERROR, "register failed\n"); + return -EFAULT; + } + } else { + if (!g_bma_dev) + return -ENXIO; + + if (atomic_dec_and_test(&g_bma_dev->au_count[type]) == 0) { + BMA_LOG(DLOG_ERROR, + "busy, init_dev_type.type = %d, au_count = %d\n", + type, + atomic_read(&g_bma_dev->au_count[type])); + atomic_inc(&g_bma_dev->au_count[type]); + return -EBUSY; /* already register */ + } + + priv->user.type = type; + priv->user.sub_type = sub_type; + priv->user.user_id = 0; + + spin_lock_irqsave(&g_bma_dev->priv_list_lock, flags); + + list_add_rcu(&priv->user.link, &g_bma_dev->priv_list); + + spin_unlock_irqrestore(&g_bma_dev->priv_list_lock, flags); + } + + return 0; +} + +static int bma_priv_delete_priv_list(struct bma_priv_data_s *priv) +{ + unsigned long flags = 0; + struct edma_user_inft_s *user_inft = NULL; + + if (!priv || priv->user.type >= TYPE_MAX) + return -EFAULT; + user_inft = edma_host_get_user_inft(priv->user.type); + if (user_inft && user_inft->user_register) { + user_inft->user_unregister(priv); + } else { + if (!g_bma_dev) + return -ENXIO; + spin_lock_irqsave(&g_bma_dev->priv_list_lock, flags); + list_del_rcu(&priv->user.link); + spin_unlock_irqrestore(&g_bma_dev->priv_list_lock, flags); + /* release the type */ + atomic_inc(&g_bma_dev->au_count[priv->user.type]); + } + return 0; +} + +static int bma_priv_init(struct bma_priv_data_s **bma_priv) +{ + struct bma_priv_data_s *priv = NULL; + + if (!bma_priv) + return -EFAULT; + + priv = kmalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + BMA_LOG(DLOG_ERROR, "malloc priv failed\n"); + return -ENOMEM; + } + + memset(priv, 0, sizeof(struct bma_priv_data_s)); + + spin_lock_init(&priv->recv_msg_lock); + INIT_LIST_HEAD(&priv->recv_msgs); + init_waitqueue_head(&priv->wait); + + priv->user.type = TYPE_UNKNOWN; + priv->user.sub_type = 0; + priv->user.dma_transfer = 0; + priv->user.seq = 0; + priv->user.cur_recvmsg_nums = 0; + priv->user.max_recvmsg_nums = DEFAULT_MAX_RECV_MSG_NUMS; + + *bma_priv = priv; + + return 0; +} + +static void bma_priv_clean_up(struct bma_priv_data_s *bma_priv) +{ + int ret = 0; + int i = 0; + struct bma_priv_data_s *priv = bma_priv; + struct edma_recv_msg_s *msg = NULL; + + if (!priv) + return; + + if (priv->user.type == TYPE_UNKNOWN) { + BMA_LOG(DLOG_ERROR, "already unknown type\n"); + return; + } + + for (i = 0; i < priv->user.max_recvmsg_nums; i++) { + ret = edma_host_recv_msg(&g_bma_dev->edma_host, priv, &msg); + if (ret) + break; + + kfree(msg); + } + + priv->user.type = TYPE_UNKNOWN; + priv->user.sub_type = 0; + priv->user.dma_transfer = 0; + priv->user.seq = 0; + priv->user.cur_recvmsg_nums = 0; + priv->user.max_recvmsg_nums = DEFAULT_MAX_RECV_MSG_NUMS; + kfree(priv); +} + +static irqreturn_t bma_irq_handle(int irq, void *data) +{ + struct bma_dev_s *bma_dev = (struct bma_dev_s *)data; + + if (!bma_dev) + return IRQ_HANDLED; + + bma_dev->edma_host.statistics.b2h_int++; + + if (!is_edma_b2h_int(&bma_dev->edma_host)) + return edma_host_irq_handle(&bma_dev->edma_host); + + return (irqreturn_t)atomic_notifier_call_chain(&bma_int_notify_list, 0, + data); +} + +int bma_devinft_init(struct bma_pci_dev_s *bma_pci_dev) +{ + int ret = 0; + int i = 0; + struct bma_dev_s *bma_dev = NULL; + + if (!bma_pci_dev) + return -EFAULT; + + bma_dev = kmalloc(sizeof(*bma_dev), (int)GFP_KERNEL); + if (!bma_dev) + return -ENOMEM; + + memset(bma_dev, 0, sizeof(struct bma_dev_s)); + + bma_dev->bma_pci_dev = bma_pci_dev; + bma_pci_dev->bma_dev = bma_dev; + + INIT_LIST_HEAD(&bma_dev->priv_list); + spin_lock_init(&bma_dev->priv_list_lock); + + for (i = 0; i < TYPE_MAX; i++) + atomic_set(&bma_dev->au_count[i], 1); + + ret = edma_host_init(&bma_dev->edma_host); + if (ret) { + BMA_LOG(DLOG_ERROR, "init edma host failed!err = %d\n", ret); + goto err_free_bma_dev; + } + + BMA_LOG(DLOG_DEBUG, "irq = %d\n", bma_pci_dev->pdev->irq); + + ret = request_irq(bma_pci_dev->pdev->irq, bma_irq_handle, IRQF_SHARED, + "EDMA_IRQ", (void *)bma_dev); + if (ret) { + BMA_LOG(DLOG_ERROR, "request_irq failed!err = %d\n", ret); + goto err_edma_host_exit; + } + + g_bma_dev = bma_dev; + BMA_LOG(DLOG_DEBUG, "ok\n"); + + return 0; + +err_edma_host_exit: + edma_host_cleanup(&bma_dev->edma_host); + +err_free_bma_dev: + kfree(bma_dev); + bma_pci_dev->bma_dev = NULL; + + return ret; +} + +void bma_devinft_cleanup(struct bma_pci_dev_s *bma_pci_dev) +{ + if (g_bma_dev) { + if ((bma_pci_dev) && bma_pci_dev->pdev && + bma_pci_dev->pdev->irq) { + BMA_LOG(DLOG_DEBUG, "irq = %d\n", + bma_pci_dev->pdev->irq); + free_irq(bma_pci_dev->pdev->irq, + (void *)bma_pci_dev->bma_dev); + } + + edma_host_cleanup(&g_bma_dev->edma_host); + + if ((bma_pci_dev) && bma_pci_dev->bma_dev) { + kfree(bma_pci_dev->bma_dev); + bma_pci_dev->bma_dev = NULL; + } + + g_bma_dev = NULL; + } +} + +int bma_intf_register_int_notifier(struct notifier_block *nb) +{ + if (!nb) + return -1; + + return atomic_notifier_chain_register(&bma_int_notify_list, nb); +} +EXPORT_SYMBOL_GPL(bma_intf_register_int_notifier); + +void bma_intf_unregister_int_notifier(struct notifier_block *nb) +{ + if (!nb) + return; + + atomic_notifier_chain_unregister(&bma_int_notify_list, nb); +} +EXPORT_SYMBOL_GPL(bma_intf_unregister_int_notifier); + +int bma_intf_register_type(u32 type, u32 sub_type, enum intr_mod support_int, + void **handle) +{ + int ret = 0; + struct bma_priv_data_s *priv = NULL; + + if (!handle) + return -EFAULT; + + ret = bma_priv_init(&priv); + if (ret) { + BMA_LOG(DLOG_ERROR, "bma_priv_init failed! ret = %d\n", ret); + return ret; + } + + ret = bma_priv_insert_priv_list(priv, type, sub_type); + if (ret) { + bma_priv_clean_up(priv); + BMA_LOG(DLOG_ERROR, + "bma_priv_insert_priv_list failed! ret = %d\n", ret); + return ret; + } + + if (support_int) + priv->user.support_int = INTR_ENABLE; + + if (type == TYPE_VETH) { + priv->specific.veth.pdev = g_bma_dev->bma_pci_dev->pdev; + + priv->specific.veth.veth_swap_phy_addr = + g_bma_dev->bma_pci_dev->veth_swap_phy_addr; + priv->specific.veth.veth_swap_addr = + g_bma_dev->bma_pci_dev->veth_swap_addr; + priv->specific.veth.veth_swap_len = + g_bma_dev->bma_pci_dev->veth_swap_len; + } + + *handle = priv; + + return 0; +} +EXPORT_SYMBOL(bma_intf_register_type); + +int bma_intf_unregister_type(void **handle) +{ + struct bma_priv_data_s *priv = NULL; + + if (!handle) { + BMA_LOG(DLOG_ERROR, "edna_priv is NULL\n"); + return -EFAULT; + } + + priv = (struct bma_priv_data_s *)*handle; + *handle = NULL; + + priv->user.cur_recvmsg_nums++; + wake_up_interruptible(&priv->wait); + + msleep(500); + + bma_priv_delete_priv_list(priv); + + bma_priv_clean_up(priv); + + return 0; +} +EXPORT_SYMBOL(bma_intf_unregister_type); + +int bma_intf_check_edma_supported(void) +{ + return !(!g_bma_dev); +} +EXPORT_SYMBOL(bma_intf_check_edma_supported); + +int bma_intf_check_dma_status(enum dma_direction_e dir) +{ + return edma_host_check_dma_status(dir); +} +EXPORT_SYMBOL(bma_intf_check_dma_status); + +void bma_intf_reset_dma(enum dma_direction_e dir) +{ + edma_host_reset_dma(&g_bma_dev->edma_host, dir); +} +EXPORT_SYMBOL(bma_intf_reset_dma); + +void bma_intf_clear_dma_int(enum dma_direction_e dir) +{ + if (dir == BMC_TO_HOST) + clear_int_dmab2h(&g_bma_dev->edma_host); + else if (dir == HOST_TO_BMC) + clear_int_dmah2b(&g_bma_dev->edma_host); + else + return; +} +EXPORT_SYMBOL(bma_intf_clear_dma_int); + +int bma_intf_start_dma(void *handle, struct bma_dma_transfer_s *dma_transfer) +{ + int ret = 0; + struct bma_priv_data_s *priv = (struct bma_priv_data_s *)handle; + + if (!handle || !dma_transfer) + return -EFAULT; + + ret = edma_host_dma_start(&g_bma_dev->edma_host, priv); + if (ret) { + BMA_LOG(DLOG_ERROR, + "edma_host_dma_start failed! result = %d\n", ret); + return ret; + } + + ret = edma_host_dma_transfer(&g_bma_dev->edma_host, priv, dma_transfer); + if (ret) + BMA_LOG(DLOG_ERROR, + "edma_host_dma_transfer failed! ret = %d\n", ret); + + ret = edma_host_dma_stop(&g_bma_dev->edma_host, priv); + if (ret) { + BMA_LOG(DLOG_ERROR, + "edma_host_dma_stop failed! result = %d\n", ret); + return ret; + } + + return ret; +} +EXPORT_SYMBOL(bma_intf_start_dma); + +int bma_intf_int_to_bmc(void *handle) +{ + struct bma_priv_data_s *priv = (struct bma_priv_data_s *)handle; + + if (!handle) + return -EFAULT; + + if (priv->user.support_int == 0) { + BMA_LOG(DLOG_ERROR, "not support int to bmc.\n"); + return -EFAULT; + } + + edma_int_to_bmc(&g_bma_dev->edma_host); + + return 0; +} +EXPORT_SYMBOL(bma_intf_int_to_bmc); + +int bma_intf_is_link_ok(void) +{ + return (g_bma_dev->edma_host.statistics.remote_status == + REGISTERED) ? 1 : 0; +} +EXPORT_SYMBOL(bma_intf_is_link_ok); + +int bma_cdev_recv_msg(void *handle, char __user *data, size_t count) +{ + struct bma_priv_data_s *priv = NULL; + struct edma_recv_msg_s *msg = NULL; + int result = 0; + int len = 0; + + if (!handle || !data || count == 0) { + BMA_LOG(DLOG_DEBUG, "input NULL point!\n"); + return -EFAULT; + } + + priv = (struct bma_priv_data_s *)handle; + + result = edma_host_recv_msg(&g_bma_dev->edma_host, priv, &msg); + if (result != 0) + return -ENODATA; + + if (msg->msg_len > count) { + kfree(msg); + return -EFAULT; + } + + if (copy_to_user(data, (void *)msg->msg_data, msg->msg_len)) { + kfree(msg); + return -EFAULT; + } + + len = msg->msg_len; + + kfree(msg); + + return len; +} +EXPORT_SYMBOL_GPL(bma_cdev_recv_msg); + +int bma_cdev_add_msg(void *handle, const char __user *msg, size_t msg_len) +{ + struct bma_priv_data_s *priv = NULL; + struct edma_msg_hdr_s *hdr = NULL; + unsigned long flags = 0; + int total_len = 0; + int ret = 0; + struct edma_host_s *phost = &g_bma_dev->edma_host; + + if (!handle || !msg || msg_len == 0) { + BMA_LOG(DLOG_DEBUG, "input NULL point!\n"); + return -EFAULT; + } + + if (msg_len > CDEV_MAX_WRITE_LEN) { + BMA_LOG(DLOG_DEBUG, "input data is overlen!\n"); + return -EINVAL; + } + + priv = (struct bma_priv_data_s *)handle; + + if (priv->user.type >= TYPE_MAX) { + BMA_LOG(DLOG_DEBUG, "error type = %d\n", priv->user.type); + return -EFAULT; + } + total_len = SIZE_OF_MSG_HDR + msg_len; + + spin_lock_irqsave(&phost->send_msg_lock, flags); + + if (phost->msg_send_write + total_len <= + HOST_MAX_SEND_MBX_LEN - SIZE_OF_MBX_HDR) { + hdr = (struct edma_msg_hdr_s *)(phost->msg_send_buf + + phost->msg_send_write); + hdr->type = priv->user.type; + hdr->sub_type = priv->user.sub_type; + hdr->user_id = priv->user.user_id; + hdr->datalen = msg_len; + BMA_LOG(DLOG_DEBUG, "msg_len is %ld\n", msg_len); + + if (copy_from_user(hdr->data, msg, msg_len)) { + BMA_LOG(DLOG_ERROR, "copy_from_user error\n"); + ret = -EFAULT; + goto end; + } + + phost->msg_send_write += total_len; + phost->statistics.send_bytes += total_len; + phost->statistics.send_pkgs++; +#ifdef EDMA_TIMER + (void)mod_timer(&phost->timer, jiffies_64); +#endif + BMA_LOG(DLOG_DEBUG, "msg_send_write = %d\n", + phost->msg_send_write); + + ret = msg_len; + goto end; + } else { + BMA_LOG(DLOG_DEBUG, + "msg lost,msg_send_write: %d,msg_len:%d,max_len: %d\n", + phost->msg_send_write, total_len, + HOST_MAX_SEND_MBX_LEN); + ret = -ENOSPC; + goto end; + } + +end: + spin_unlock_irqrestore(&g_bma_dev->edma_host.send_msg_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(bma_cdev_add_msg); + +unsigned int bma_cdev_check_recv(void *handle) +{ + struct bma_priv_data_s *priv = (struct bma_priv_data_s *)handle; + unsigned long flags = 0; + unsigned int result = 0; + + if (priv) { + spin_lock_irqsave(&priv->recv_msg_lock, flags); + + if (!list_empty(&priv->recv_msgs)) + result = 1; + + spin_unlock_irqrestore(&priv->recv_msg_lock, flags); + } + + return result; +} +EXPORT_SYMBOL_GPL(bma_cdev_check_recv); + +void *bma_cdev_get_wait_queue(void *handle) +{ + struct bma_priv_data_s *priv = (struct bma_priv_data_s *)handle; + + return priv ? ((void *)&priv->wait) : NULL; +} +EXPORT_SYMBOL_GPL(bma_cdev_get_wait_queue); + +void bma_intf_set_open_status(void *handle, int s) +{ + struct bma_priv_data_s *priv = (struct bma_priv_data_s *)handle; + int i = 0; + int ret = 0; + unsigned long flags = 0; + char drv_msg[3] = { 0 }; + struct edma_recv_msg_s *tmp_msg = NULL; + + if (!priv || priv->user.type >= TYPE_MAX) + return; + + drv_msg[0] = 1; + drv_msg[1] = priv->user.type; + drv_msg[2] = s; + + (void)edma_host_send_driver_msg((void *)drv_msg, sizeof(drv_msg), + DEV_OPEN_STATUS_ANS); + + spin_lock_irqsave(&priv->recv_msg_lock, flags); + g_bma_dev->edma_host.local_open_status[priv->user.type] = s; + + if (s == DEV_CLOSE && priv->user.cur_recvmsg_nums > 0) { + for (i = 0; i < priv->user.max_recvmsg_nums; i++) { + ret = edma_host_recv_msg(&g_bma_dev->edma_host, + priv, &tmp_msg); + if (ret < 0) + break; + + kfree(tmp_msg); + tmp_msg = NULL; + } + } + + spin_unlock_irqrestore(&priv->recv_msg_lock, flags); +} +EXPORT_SYMBOL_GPL(bma_intf_set_open_status); diff --git a/drivers/net/ethernet/huawei/bma/edma_drv/bma_devintf.h b/drivers/net/ethernet/huawei/bma/edma_drv/bma_devintf.h new file mode 100644 index 000000000000..138d1e278479 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/edma_drv/bma_devintf.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _BMA_DEVINTF_H_ +#define _BMA_DEVINTF_H_ + +#include <linux/mutex.h> +#include "bma_pci.h" +#include "edma_host.h" + +struct bma_dev_s { + /* proc */ + struct proc_dir_entry *proc_bma_root; + + atomic_t au_count[TYPE_MAX]; + + struct list_head priv_list; + /* spinlock for priv list */ + spinlock_t priv_list_lock; + + struct bma_pci_dev_s *bma_pci_dev; + struct edma_host_s edma_host; +}; + +int bma_devinft_init(struct bma_pci_dev_s *bma_pci_dev); +void bma_devinft_cleanup(struct bma_pci_dev_s *bma_pci_dev); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/edma_drv/bma_include.h b/drivers/net/ethernet/huawei/bma/edma_drv/bma_include.h new file mode 100644 index 000000000000..2c122ae91463 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/edma_drv/bma_include.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _BMA_INCLUDE_H_ +#define _BMA_INCLUDE_H_ + +#include <linux/slab.h> +#include <asm/ioctls.h> +#include <linux/capability.h> +#include <linux/uaccess.h> /* copy_*_user */ +#include <linux/delay.h> /* udelay */ +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/pci.h> +#include <linux/mutex.h> +#include <linux/interrupt.h> /*tasklet */ +#include <linux/version.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/version.h> +#include <linux/semaphore.h> +#include <linux/sched.h> + +#define UNUSED(x) (x = x) +#define KBOX_FALSE (-1) +#define KBOX_TRUE 0 + +#define KBOX_IOC_MAGIC (0xB2) + +#define DEFAULT_MAX_RECV_MSG_NUMS 32 +#define MAX_RECV_MSG_NUMS 1024 + +#define STRFICATION(R) #R +#define MICRO_TO_STR(R) STRFICATION(R) + +enum { + DLOG_ERROR = 0, + DLOG_DEBUG = 1, +}; + +enum { + DEV_CLOSE = 0, + DEV_OPEN = 1, + DEV_OPEN_STATUS_REQ = 0xf0, + DEV_OPEN_STATUS_ANS +}; + +struct bma_user_s { + struct list_head link; + + u32 type; + u32 sub_type; + u8 user_id; + + u8 dma_transfer:1, support_int:1; + + u8 reserve1[2]; + u32 seq; + u16 cur_recvmsg_nums; + u16 max_recvmsg_nums; +}; + +struct bma_priv_data_veth_s { + struct pci_dev *pdev; + + unsigned long veth_swap_phy_addr; + void __iomem *veth_swap_addr; + unsigned long veth_swap_len; +}; + +struct bma_priv_data_s { + struct bma_user_s user; + /* spinlock for recv msg list */ + spinlock_t recv_msg_lock; + struct list_head recv_msgs; + struct file *file; + wait_queue_head_t wait; + + union { + struct bma_priv_data_veth_s veth; + } specific; +}; + +#if defined(timer_setup) && defined(from_timer) +#define HAVE_TIMER_SETUP +#endif + +void __iomem *kbox_get_base_addr(void); +unsigned long kbox_get_io_len(void); +unsigned long kbox_get_base_phy_addr(void); +int edma_param_set_debug(const char *buf, const struct kernel_param *kp); + +#define GET_SYS_SECONDS(t) do \ + {\ + struct timespec64 uptime;\ + ktime_get_coarse_real_ts64(&uptime);\ + t = uptime.tv_sec;\ + } while (0) + +#define SECONDS_PER_DAY (24 * 3600) +#define SECONDS_PER_HOUR (3600) +#define SECONDS_PER_MINUTE (60) + +#endif diff --git a/drivers/net/ethernet/huawei/bma/edma_drv/bma_pci.c b/drivers/net/ethernet/huawei/bma/edma_drv/bma_pci.c new file mode 100644 index 000000000000..577acaedb0e2 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/edma_drv/bma_pci.c @@ -0,0 +1,533 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/pci.h> +#include <linux/version.h> +#include <linux/module.h> + +#include "bma_include.h" +#include "bma_devintf.h" +#include "bma_pci.h" + +#define PCI_KBOX_MODULE_NAME "edma_drv" +#define PCI_VENDOR_ID_HUAWEI_FPGA 0x19aa +#define PCI_DEVICE_ID_KBOX_0 0xe004 + +#define PCI_VENDOR_ID_HUAWEI_PME 0x19e5 +#define PCI_DEVICE_ID_KBOX_0_PME 0x1710 +#define PCI_PME_USEABLE_SPACE (4 * 1024 * 1024) +#define PME_DEV_CHECK(device, vendor) ((device) == PCI_DEVICE_ID_KBOX_0_PME && \ + (vendor) == PCI_VENDOR_ID_HUAWEI_PME) + +#define PCI_BAR0_PME_1710 0x85800000 +#define PCI_BAR0 0 +#define PCI_BAR1 1 +#define PCI_USING_DAC_DEFAULT 0 + +#define GET_HIGH_ADDR(address) ((sizeof(unsigned long) == 8) ? \ + ((u64)(address) >> 32) : 0) + +/* The value of the expression is true + * only when dma_set_mask and dma_set_coherent_mask failed. + */ +#define SET_DMA_MASK(p_dev) \ + (dma_set_mask((p_dev), DMA_BIT_MASK(64)) && \ + dma_set_coherent_mask((p_dev), DMA_BIT_MASK(64))) + +int pci_using_dac = PCI_USING_DAC_DEFAULT; +int debug = DLOG_ERROR; +MODULE_PARM_DESC(debug, "Debug switch (0=close debug, 1=open debug)"); + +static struct bma_pci_dev_s *g_bma_pci_dev; + +static int bma_pci_suspend(struct pci_dev *pdev, pm_message_t state); +static int bma_pci_resume(struct pci_dev *pdev); +static int bma_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent); +static void bma_pci_remove(struct pci_dev *pdev); + +static const struct pci_device_id bma_pci_tbl[] = { + {PCI_DEVICE(PCI_VENDOR_ID_HUAWEI_FPGA, PCI_DEVICE_ID_KBOX_0)}, + {PCI_DEVICE(PCI_VENDOR_ID_HUAWEI_PME, PCI_DEVICE_ID_KBOX_0_PME)}, + {} +}; +MODULE_DEVICE_TABLE(pci, bma_pci_tbl); + +int edma_param_get_statics(char *buf, const struct kernel_param *kp) +{ + if (!buf) + return 0; + + return edmainfo_show(buf); +} + +module_param_call(statistics, NULL, edma_param_get_statics, &debug, 0444); +MODULE_PARM_DESC(statistics, "Statistics info of edma driver,readonly"); + +int edma_param_set_debug(const char *buf, const struct kernel_param *kp) +{ + unsigned long val = 0; + int ret = 0; + + if (!buf) + return -EINVAL; + + ret = kstrtoul(buf, 0, &val); + + if (ret) + return ret; + + if (val > 1) + return -EINVAL; + + return param_set_int(buf, kp); +} +EXPORT_SYMBOL_GPL(edma_param_set_debug); + +module_param_call(debug, &edma_param_set_debug, ¶m_get_int, &debug, 0644); + +void __iomem *kbox_get_base_addr(void) +{ + if (!g_bma_pci_dev || (!(g_bma_pci_dev->kbox_base_addr))) { + BMA_LOG(DLOG_ERROR, "kbox_base_addr NULL point\n"); + return NULL; + } + + return g_bma_pci_dev->kbox_base_addr; +} +EXPORT_SYMBOL_GPL(kbox_get_base_addr); + +unsigned long kbox_get_io_len(void) +{ + if (!g_bma_pci_dev) { + BMA_LOG(DLOG_ERROR, "kbox_io_len is error,can not get it\n"); + return 0; + } + + return g_bma_pci_dev->kbox_base_len; +} +EXPORT_SYMBOL_GPL(kbox_get_io_len); + +unsigned long kbox_get_base_phy_addr(void) +{ + if (!g_bma_pci_dev || !g_bma_pci_dev->kbox_base_phy_addr) { + BMA_LOG(DLOG_ERROR, "kbox_base_phy_addr NULL point\n"); + return 0; + } + + return g_bma_pci_dev->kbox_base_phy_addr; +} +EXPORT_SYMBOL_GPL(kbox_get_base_phy_addr); + +static struct pci_driver bma_driver = { + .name = PCI_KBOX_MODULE_NAME, + .id_table = bma_pci_tbl, + .probe = bma_pci_probe, + .remove = bma_pci_remove, + .suspend = bma_pci_suspend, + .resume = bma_pci_resume, +}; + +s32 __atu_config_H(struct pci_dev *pdev, unsigned int region, + unsigned int hostaddr_h, unsigned int hostaddr_l, + unsigned int bmcaddr_h, unsigned int bmcaddr_l, + unsigned int len) +{ + /* atu index reg,inbound and region*/ + (void)pci_write_config_dword(pdev, ATU_VIEWPORT, + REGION_DIR_INPUT + (region & REGION_INDEX_MASK)); + (void)pci_write_config_dword(pdev, ATU_BASE_LOW, hostaddr_l); + (void)pci_write_config_dword(pdev, ATU_BASE_HIGH, hostaddr_h); + (void)pci_write_config_dword(pdev, ATU_LIMIT, hostaddr_l + len - 1); + (void)pci_write_config_dword(pdev, ATU_TARGET_LOW, bmcaddr_l); + (void)pci_write_config_dword(pdev, ATU_TARGET_HIGH, bmcaddr_h); + /* atu ctrl1 reg */ + (void)pci_write_config_dword(pdev, ATU_REGION_CTRL1, ATU_CTRL1_DEFAULT); + /* atu ctrl2 reg */ + (void)pci_write_config_dword(pdev, ATU_REGION_CTRL2, REGION_ENABLE); + + return 0; +} + +static void iounmap_bar_mem(struct bma_pci_dev_s *bma_pci_dev) +{ + if (bma_pci_dev->kbox_base_addr) { + iounmap(bma_pci_dev->kbox_base_addr); + bma_pci_dev->kbox_base_addr = NULL; + } + + if (bma_pci_dev->bma_base_addr) { + iounmap(bma_pci_dev->bma_base_addr); + bma_pci_dev->bma_base_addr = NULL; + bma_pci_dev->edma_swap_addr = NULL; + bma_pci_dev->hostrtc_viraddr = NULL; + } +} + +static int ioremap_pme_bar1_mem(struct pci_dev *pdev, + struct bma_pci_dev_s *bma_pci_dev) +{ + unsigned long bar1_resource_flag = 0; + u32 data = 0; + + bma_pci_dev->kbox_base_len = PCI_PME_USEABLE_SPACE; + BMA_LOG(DLOG_DEBUG, "1710\n"); + + bma_pci_dev->bma_base_phy_addr = + pci_resource_start(pdev, PCI_BAR1); + bar1_resource_flag = pci_resource_flags(pdev, PCI_BAR1); + + if (!(bar1_resource_flag & IORESOURCE_MEM)) { + BMA_LOG(DLOG_ERROR, + "Cannot find proper PCI device base address, aborting\n"); + return -ENODEV; + } + + bma_pci_dev->bma_base_len = pci_resource_len(pdev, PCI_BAR1); + bma_pci_dev->edma_swap_len = EDMA_SWAP_DATA_SIZE; + bma_pci_dev->veth_swap_len = VETH_SWAP_DATA_SIZE; + + BMA_LOG(DLOG_DEBUG, + "bar1: bma_base_len = 0x%lx, edma_swap_len = %ld, veth_swap_len = %ld(0x%lx)\n", + bma_pci_dev->bma_base_len, bma_pci_dev->edma_swap_len, + bma_pci_dev->veth_swap_len, bma_pci_dev->veth_swap_len); + + bma_pci_dev->hostrtc_phyaddr = bma_pci_dev->bma_base_phy_addr; + /* edma */ + bma_pci_dev->edma_swap_phy_addr = + bma_pci_dev->bma_base_phy_addr + EDMA_SWAP_BASE_OFFSET; + /* veth */ + bma_pci_dev->veth_swap_phy_addr = + bma_pci_dev->edma_swap_phy_addr + EDMA_SWAP_DATA_SIZE; + + BMA_LOG(DLOG_DEBUG, + "bar1: hostrtc_phyaddr = 0x%lx, edma_swap_phy_addr = 0x%lx, veth_swap_phy_addr = 0x%lx\n", + bma_pci_dev->hostrtc_phyaddr, + bma_pci_dev->edma_swap_phy_addr, + bma_pci_dev->veth_swap_phy_addr); + + __atu_config_H(pdev, 0, + GET_HIGH_ADDR(bma_pci_dev->kbox_base_phy_addr), + (bma_pci_dev->kbox_base_phy_addr & 0xffffffff), + 0, PCI_BAR0_PME_1710, PCI_PME_USEABLE_SPACE); + + __atu_config_H(pdev, 1, + GET_HIGH_ADDR(bma_pci_dev->hostrtc_phyaddr), + (bma_pci_dev->hostrtc_phyaddr & 0xffffffff), + 0, HOSTRTC_REG_BASE, HOSTRTC_REG_SIZE); + + __atu_config_H(pdev, 2, + GET_HIGH_ADDR(bma_pci_dev->edma_swap_phy_addr), + (bma_pci_dev->edma_swap_phy_addr & 0xffffffff), + 0, EDMA_SWAP_DATA_BASE, EDMA_SWAP_DATA_SIZE); + + __atu_config_H(pdev, 3, + GET_HIGH_ADDR(bma_pci_dev->veth_swap_phy_addr), + (bma_pci_dev->veth_swap_phy_addr & 0xffffffff), + 0, VETH_SWAP_DATA_BASE, VETH_SWAP_DATA_SIZE); + + if (bar1_resource_flag & IORESOURCE_CACHEABLE) { + bma_pci_dev->bma_base_addr = + ioremap(bma_pci_dev->bma_base_phy_addr, + bma_pci_dev->bma_base_len); + } else { + bma_pci_dev->bma_base_addr = + IOREMAP(bma_pci_dev->bma_base_phy_addr, + bma_pci_dev->bma_base_len); + } + + if (!bma_pci_dev->bma_base_addr) { + BMA_LOG(DLOG_ERROR, + "Cannot map device registers, aborting\n"); + + return -ENODEV; + } + + bma_pci_dev->hostrtc_viraddr = bma_pci_dev->bma_base_addr; + bma_pci_dev->edma_swap_addr = + (unsigned char *)bma_pci_dev->bma_base_addr + + EDMA_SWAP_BASE_OFFSET; + bma_pci_dev->veth_swap_addr = + (unsigned char *)bma_pci_dev->edma_swap_addr + + EDMA_SWAP_DATA_SIZE; + + (void)pci_read_config_dword(pdev, 0x78, &data); + data = data & 0xfffffff0; + (void)pci_write_config_dword(pdev, 0x78, data); + (void)pci_read_config_dword(pdev, 0x78, &data); + + return 0; +} + +static int ioremap_bar_mem(struct pci_dev *pdev, + struct bma_pci_dev_s *bma_pci_dev) +{ + int err = 0; + unsigned long bar0_resource_flag = 0; + + bar0_resource_flag = pci_resource_flags(pdev, PCI_BAR0); + + if (!(bar0_resource_flag & IORESOURCE_MEM)) { + BMA_LOG(DLOG_ERROR, + "Cannot find proper PCI device base address, aborting\n"); + err = -ENODEV; + return err; + } + + bma_pci_dev->kbox_base_phy_addr = pci_resource_start(pdev, PCI_BAR0); + + bma_pci_dev->kbox_base_len = pci_resource_len(pdev, PCI_BAR0); + + BMA_LOG(DLOG_DEBUG, + "bar0: kbox_base_phy_addr = 0x%lx, base_len = %ld(0x%lx)\n", + bma_pci_dev->kbox_base_phy_addr, bma_pci_dev->kbox_base_len, + bma_pci_dev->kbox_base_len); + + if (PME_DEV_CHECK(pdev->device, pdev->vendor)) { + err = ioremap_pme_bar1_mem(pdev, bma_pci_dev); + if (err != 0) + return err; + } + + BMA_LOG(DLOG_DEBUG, "remap BAR0 KBOX\n"); + + if (bar0_resource_flag & IORESOURCE_CACHEABLE) { + bma_pci_dev->kbox_base_addr = + ioremap(bma_pci_dev->kbox_base_phy_addr, + bma_pci_dev->kbox_base_len); + } else { + bma_pci_dev->kbox_base_addr = + IOREMAP(bma_pci_dev->kbox_base_phy_addr, + bma_pci_dev->kbox_base_len); + } + + if (!bma_pci_dev->kbox_base_addr) { + BMA_LOG(DLOG_ERROR, "Cannot map device registers, aborting\n"); + + iounmap(bma_pci_dev->bma_base_addr); + bma_pci_dev->bma_base_addr = NULL; + bma_pci_dev->edma_swap_addr = NULL; + bma_pci_dev->hostrtc_viraddr = NULL; + return -ENOMEM; + } + + return 0; +} + +int pme_pci_enable_msi(struct pci_dev *pdev) +{ + int err = 0; + + pci_set_master(pdev); + +#ifdef CONFIG_PCI_MSI + if (pci_find_capability(pdev, PCI_CAP_ID_MSI) == 0) { + BMA_LOG(DLOG_ERROR, "not support msi\n"); + pci_disable_device(pdev); + return err; + } + + BMA_LOG(DLOG_DEBUG, "support msi\n"); + + err = pci_enable_msi(pdev); + if (err) { + BMA_LOG(DLOG_ERROR, "pci_enable_msi failed\n"); + pci_disable_device(pdev); + return err; + } +#endif + + return err; +} + +int pci_device_init(struct pci_dev *pdev, struct bma_pci_dev_s *bma_pci_dev) +{ + int err = 0; + + if (PME_DEV_CHECK(pdev->device, pdev->vendor)) { + err = bma_devinft_init(bma_pci_dev); + if (err) { + BMA_LOG(DLOG_ERROR, "bma_devinft_init failed\n"); + bma_devinft_cleanup(bma_pci_dev); + iounmap_bar_mem(bma_pci_dev); + g_bma_pci_dev = NULL; + pci_release_regions(pdev); + kfree(bma_pci_dev); + #ifdef CONFIG_PCI_MSI + pci_disable_msi(pdev); + #endif + pci_disable_device(pdev); + + return err; + } + } else { + BMA_LOG(DLOG_DEBUG, "edma is not supported on this pcie\n"); + } + + pci_set_drvdata(pdev, bma_pci_dev); + + return 0; +} + +int pci_device_config(struct pci_dev *pdev) +{ + int err = 0; + struct bma_pci_dev_s *bma_pci_dev = NULL; + + bma_pci_dev = kmalloc(sizeof(*bma_pci_dev), GFP_KERNEL); + if (!bma_pci_dev) { + err = -ENOMEM; + goto err_out_disable_msi; + } + memset(bma_pci_dev, 0, sizeof(*bma_pci_dev)); + + bma_pci_dev->pdev = pdev; + + err = pci_request_regions(pdev, PCI_KBOX_MODULE_NAME); + if (err) { + BMA_LOG(DLOG_ERROR, "Cannot obtain PCI resources, aborting\n"); + goto err_out_free_dev; + } + + err = ioremap_bar_mem(pdev, bma_pci_dev); + if (err) { + BMA_LOG(DLOG_ERROR, "ioremap_edma_io_mem failed\n"); + goto err_out_release_regions; + } + + g_bma_pci_dev = bma_pci_dev; + + if (SET_DMA_MASK(&pdev->dev)) { + BMA_LOG(DLOG_ERROR, + "No usable DMA ,configuration, aborting,goto failed2!!!\n"); + goto err_out_unmap_bar; + } + + g_bma_pci_dev = bma_pci_dev; + + return pci_device_init(pdev, bma_pci_dev); + +err_out_unmap_bar: + iounmap_bar_mem(bma_pci_dev); + g_bma_pci_dev = NULL; +err_out_release_regions: + pci_release_regions(pdev); +err_out_free_dev: + kfree(bma_pci_dev); + bma_pci_dev = NULL; +err_out_disable_msi: +#ifdef CONFIG_PCI_MSI + pci_disable_msi(pdev); +#endif + + pci_disable_device(pdev); + + return err; +} + +static int bma_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int err = 0; + + UNUSED(ent); + + if (g_bma_pci_dev) + return -EPERM; + + err = pci_enable_device(pdev); + if (err) { + BMA_LOG(DLOG_ERROR, "Cannot enable PCI device,aborting\n"); + return err; + } + + if (PME_DEV_CHECK(pdev->device, pdev->vendor)) { + err = pme_pci_enable_msi(pdev); + if (err) + return err; + } + + BMA_LOG(DLOG_DEBUG, "pdev->device = 0x%x\n", pdev->device); + BMA_LOG(DLOG_DEBUG, "pdev->vendor = 0x%x\n", pdev->vendor); + + return pci_device_config(pdev); +} + +static void bma_pci_remove(struct pci_dev *pdev) +{ + struct bma_pci_dev_s *bma_pci_dev = + (struct bma_pci_dev_s *)pci_get_drvdata(pdev); + + g_bma_pci_dev = NULL; + (void)pci_set_drvdata(pdev, NULL); + + if (bma_pci_dev) { + bma_devinft_cleanup(bma_pci_dev); + + iounmap_bar_mem(bma_pci_dev); + + kfree(bma_pci_dev); + } + + pci_release_regions(pdev); + +#ifdef CONFIG_PCI_MSI + pci_disable_msi(pdev); +#endif + pci_disable_device(pdev); +} + +static int bma_pci_suspend(struct pci_dev *pdev, pm_message_t state) +{ + UNUSED(pdev); + UNUSED(state); + + return 0; +} + +static int bma_pci_resume(struct pci_dev *pdev) +{ + UNUSED(pdev); + + return 0; +} + +int __init bma_pci_init(void) +{ + int ret = 0; + + BMA_LOG(DLOG_DEBUG, "\n"); + + ret = pci_register_driver(&bma_driver); + if (ret) + BMA_LOG(DLOG_ERROR, "pci_register_driver failed\n"); + + return ret; +} + +void __exit bma_pci_cleanup(void) +{ + BMA_LOG(DLOG_DEBUG, "\n"); + + pci_unregister_driver(&bma_driver); +} + +MODULE_AUTHOR("HUAWEI TECHNOLOGIES CO., LTD."); +MODULE_DESCRIPTION("HUAWEI EDMA DRIVER"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(BMA_VERSION); +#ifndef _lint + +module_init(bma_pci_init); +module_exit(bma_pci_cleanup); +#endif diff --git a/drivers/net/ethernet/huawei/bma/edma_drv/bma_pci.h b/drivers/net/ethernet/huawei/bma/edma_drv/bma_pci.h new file mode 100644 index 000000000000..92893b6bfd08 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/edma_drv/bma_pci.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _BMA_PCI_H_ +#define _BMA_PCI_H_ + +#include "bma_devintf.h" +#include "bma_include.h" +#include <linux/netdevice.h> + +#define EDMA_SWAP_BASE_OFFSET 0x10000 + +#define HOSTRTC_REG_BASE 0x2f000000 +#define HOSTRTC_REG_SIZE EDMA_SWAP_BASE_OFFSET + +#define EDMA_SWAP_DATA_BASE 0x84810000 +#define EDMA_SWAP_DATA_SIZE 65536 + +#define VETH_SWAP_DATA_BASE 0x84820000 +#define VETH_SWAP_DATA_SIZE 0xdf000 + +#define ATU_VIEWPORT 0x900 +#define ATU_REGION_CTRL1 0x904 +#define ATU_REGION_CTRL2 0x908 +#define ATU_BASE_LOW 0x90C +#define ATU_BASE_HIGH 0x910 +#define ATU_LIMIT 0x914 +#define ATU_TARGET_LOW 0x918 +#define ATU_TARGET_HIGH 0x91C +#define REGION_DIR_OUTPUT (0x0 << 31) +#define REGION_DIR_INPUT (0x1 << 31) +#define REGION_INDEX_MASK 0x7 +#define REGION_ENABLE (0x1 << 31) +#define ATU_CTRL1_DEFAULT 0x0 +struct bma_pci_dev_s { + unsigned long kbox_base_phy_addr; + void __iomem *kbox_base_addr; + unsigned long kbox_base_len; + + unsigned long bma_base_phy_addr; + void __iomem *bma_base_addr; + unsigned long bma_base_len; + + unsigned long hostrtc_phyaddr; + void __iomem *hostrtc_viraddr; + + unsigned long edma_swap_phy_addr; + void __iomem *edma_swap_addr; + unsigned long edma_swap_len; + + unsigned long veth_swap_phy_addr; + void __iomem *veth_swap_addr; + unsigned long veth_swap_len; + + struct pci_dev *pdev; + struct bma_dev_s *bma_dev; +}; + +#ifdef DRV_VERSION +#define BMA_VERSION MICRO_TO_STR(DRV_VERSION) +#else +#define BMA_VERSION "0.3.4" +#endif + +#ifdef CONFIG_ARM64 +#define IOREMAP ioremap_wc +#else +#ifdef ioremap_nocache +#define IOREMAP ioremap_nocache +#else +#define IOREMAP ioremap_wc +#endif +#endif + +extern int debug; + +#define BMA_LOG(level, fmt, args...) \ + do { \ + if (debug >= (level))\ + netdev_alert(0, "edma: %s, %d, " fmt, \ + __func__, __LINE__, ## args); \ + } while (0) + +int edmainfo_show(char *buff); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/edma_drv/edma_host.c b/drivers/net/ethernet/huawei/bma/edma_drv/edma_host.c new file mode 100644 index 000000000000..2d5f4ffd79d9 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/edma_drv/edma_host.c @@ -0,0 +1,1462 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/errno.h> +#include <linux/kthread.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +#include "bma_pci.h" +#include "edma_host.h" + +static struct edma_user_inft_s *g_user_func[TYPE_MAX] = { 0 }; + +static struct bma_dev_s *g_bma_dev; +static int edma_host_dma_interrupt(struct edma_host_s *edma_host); + +int edmainfo_show(char *buf) +{ + struct bma_user_s *user_ptr = NULL; + struct edma_host_s *host_ptr = NULL; + int len = 0; + __kernel_time_t running_time = 0; + static const char * const host_status[] = { + "deregistered", "registered", "lost"}; + + if (!buf) + return 0; + + if (!g_bma_dev) { + len += sprintf(buf, "EDMA IS NOT SUPPORTED"); + return len; + } + + host_ptr = &g_bma_dev->edma_host; + + GET_SYS_SECONDS(running_time); + running_time -= host_ptr->statistics.init_time; + len += sprintf(buf + len, + "============================EDMA_DRIVER_INFO============================\n"); + len += sprintf(buf + len, "version :" BMA_VERSION "\n"); + + len += sprintf(buf + len, "running_time :%luD %02lu:%02lu:%02lu\n", + running_time / SECONDS_PER_DAY, + running_time % SECONDS_PER_DAY / SECONDS_PER_HOUR, + running_time % SECONDS_PER_HOUR / SECONDS_PER_MINUTE, + running_time % SECONDS_PER_MINUTE); + + len += sprintf(buf + len, "remote_status:%s\n", + host_status[host_ptr->statistics.remote_status]); + len += sprintf(buf + len, "lost_count :%d\n", + host_ptr->statistics.lost_count); + len += sprintf(buf + len, "b2h_int :%d\n", + host_ptr->statistics.b2h_int); + len += sprintf(buf + len, "h2b_int :%d\n", + host_ptr->statistics.h2b_int); + len += sprintf(buf + len, "dma_count :%d\n", + host_ptr->statistics.dma_count); + len += sprintf(buf + len, "recv_bytes :%d\n", + host_ptr->statistics.recv_bytes); + len += sprintf(buf + len, "send_bytes :%d\n", + host_ptr->statistics.send_bytes); + len += sprintf(buf + len, "recv_pkgs :%d\n", + host_ptr->statistics.recv_pkgs); + len += sprintf(buf + len, "send_pkgs :%d\n", + host_ptr->statistics.send_pkgs); + len += sprintf(buf + len, "drop_pkgs :%d\n", + host_ptr->statistics.drop_pkgs); + len += sprintf(buf + len, "fail_count :%d\n", + host_ptr->statistics.failed_count); + len += sprintf(buf + len, "debug :%d\n", debug); + len += sprintf(buf + len, + "================================USER_INFO===============================\n"); + + list_for_each_entry_rcu(user_ptr, &g_bma_dev->priv_list, link) { + len += sprintf(buf + len, + "type: %d\nsub type: %d\nopen:%d\nmax recvmsg nums: %d\ncur recvmsg nums: %d\n", + user_ptr->type, user_ptr->sub_type, + host_ptr->local_open_status[user_ptr->type], + user_ptr->max_recvmsg_nums, + user_ptr->cur_recvmsg_nums); + len += sprintf(buf + len, + "========================================================================\n"); + } + + return len; +} + +int is_edma_b2h_int(struct edma_host_s *edma_host) +{ + struct notify_msg *pnm = NULL; + + if (!edma_host) + return -1; + + pnm = (struct notify_msg *)edma_host->edma_flag; + if (!pnm) { + BMA_LOG(DLOG_ERROR, "pnm is 0\n"); + return -1; + } + + if (IS_EDMA_B2H_INT(pnm->int_flag)) { + CLEAR_EDMA_B2H_INT(pnm->int_flag); + return 0; + } + + return -1; +} + +void edma_int_to_bmc(struct edma_host_s *edma_host) +{ + unsigned int data = 0; + + if (!edma_host) + return; + + edma_host->statistics.h2b_int++; + + data = *(unsigned int *)((char *)edma_host->hostrtc_viraddr + + HOSTRTC_INT_OFFSET); + + data |= 0x00000001; + + *(unsigned int *)((char *)edma_host->hostrtc_viraddr + + HOSTRTC_INT_OFFSET) = data; +} + +static void edma_host_int_to_bmc(struct edma_host_s *edma_host) +{ + struct notify_msg *pnm = NULL; + + if (!edma_host) + return; + + pnm = (struct notify_msg *)edma_host->edma_flag; + if (pnm) { + SET_EDMA_H2B_INT(pnm->int_flag); + edma_int_to_bmc(edma_host); + } +} + +static int check_status_dmah2b(struct edma_host_s *edma_host) +{ + unsigned int data = 0; + struct pci_dev *pdev = NULL; + + if (!edma_host) + return 0; + + pdev = edma_host->pdev; + if (!pdev) + return 0; + + (void)pci_read_config_dword(pdev, REG_PCIE1_DMAREAD_STATUS, + (u32 *)&data); + + if (data & (1 << SHIFT_PCIE1_DMAREAD_STATUS)) + return 1; /* ok */ + else + return 0; /* busy */ +} + +static int check_status_dmab2h(struct edma_host_s *edma_host) +{ + unsigned int data = 0; + struct pci_dev *pdev = NULL; + + if (!edma_host) + return 0; + + pdev = edma_host->pdev; + if (!pdev) + return 0; + + (void)pci_read_config_dword(pdev, REG_PCIE1_DMAWRITE_STATUS, + (u32 *)&data); + + if (data & (1 << SHIFT_PCIE1_DMAWRITE_STATUS)) + return 1; /* ok */ + else + return 0; /* busy */ +} + +void clear_int_dmah2b(struct edma_host_s *edma_host) +{ + unsigned int data = 0; + struct pci_dev *pdev = NULL; + + if (!edma_host) + return; + + pdev = edma_host->pdev; + if (!pdev) + return; + + (void)pci_read_config_dword(pdev, REG_PCIE1_DMAREADINT_CLEAR, + (u32 *)&data); + data = data & (~((1 << SHIFT_PCIE1_DMAREADINT_CLEAR))); + data = data | (1 << SHIFT_PCIE1_DMAREADINT_CLEAR); + (void)pci_write_config_dword(pdev, REG_PCIE1_DMAREADINT_CLEAR, data); +} + +void clear_int_dmab2h(struct edma_host_s *edma_host) +{ + unsigned int data = 0; + struct pci_dev *pdev = NULL; + + if (!edma_host) + return; + + pdev = edma_host->pdev; + if (!pdev) + return; + + (void)pci_read_config_dword(pdev, REG_PCIE1_DMAWRITEINT_CLEAR, + (u32 *)&data); + data = data & (~((1 << SHIFT_PCIE1_DMAWRITEINT_CLEAR))); + data = data | (1 << SHIFT_PCIE1_DMAWRITEINT_CLEAR); + (void)pci_write_config_dword(pdev, REG_PCIE1_DMAWRITEINT_CLEAR, data); +} + +int edma_host_check_dma_status(enum dma_direction_e dir) +{ + int ret = 0; + + switch (dir) { + case BMC_TO_HOST: + ret = check_status_dmab2h(&g_bma_dev->edma_host); + if (ret == 1) + clear_int_dmab2h(&g_bma_dev->edma_host); + + break; + + case HOST_TO_BMC: + ret = check_status_dmah2b(&g_bma_dev->edma_host); + if (ret == 1) + clear_int_dmah2b(&g_bma_dev->edma_host); + + break; + + default: + BMA_LOG(DLOG_ERROR, "direction failed, dir = %d\n", dir); + ret = -EFAULT; + break; + } + + return ret; +} + +#ifdef USE_DMA + +static int start_transfer_h2b(struct edma_host_s *edma_host, unsigned int len, + unsigned int src_h, unsigned int src_l, + unsigned int dst_h, unsigned int dst_l) +{ + unsigned long flags = 0; + struct pci_dev *pdev = edma_host->pdev; + + spin_lock_irqsave(&edma_host->reg_lock, flags); + /* read engine enable */ + (void)pci_write_config_dword(pdev, 0x99c, 0x00000001); + /* read ch,ch index 0 */ + (void)pci_write_config_dword(pdev, 0xa6c, 0x80000000); + /* ch ctrl,local int enable */ + (void)pci_write_config_dword(pdev, 0xa70, 0x00000008); + /* size */ + (void)pci_write_config_dword(pdev, 0xa78, len); + /* src lower 32b */ + (void)pci_write_config_dword(pdev, 0xa7c, src_l); + /* src upper 32b */ + (void)pci_write_config_dword(pdev, 0xa80, src_h); + /* dst lower 32b */ + (void)pci_write_config_dword(pdev, 0xa84, dst_l); + /* dst upper 32b */ + (void)pci_write_config_dword(pdev, 0xa88, dst_h); + /* start read dma,ch 0 */ + (void)pci_write_config_dword(pdev, 0x9a0, 0x00000000); + spin_unlock_irqrestore(&edma_host->reg_lock, flags); + return 0; +} + +static int start_transfer_b2h(struct edma_host_s *edma_host, unsigned int len, + unsigned int src_h, unsigned int src_l, + unsigned int dst_h, unsigned int dst_l) +{ + unsigned long flags = 0; + struct pci_dev *pdev = edma_host->pdev; + + BMA_LOG(DLOG_DEBUG, + "len = 0x%8x,src_h = 0x%8x,src_l = 0x%8x,dst_h = 0x%8x,dst_l = 0x%8x\n", + len, src_h, src_l, dst_h, dst_l); + + spin_lock_irqsave(&edma_host->reg_lock, flags); + /* write engine enable */ + (void)pci_write_config_dword(pdev, 0x97c, 0x00000001); + /* write ch,ch index 0 */ + (void)pci_write_config_dword(pdev, 0xa6c, 0x00000000); + /* ch ctrl,local int enable */ + (void)pci_write_config_dword(pdev, 0xa70, 0x00000008); + /* size */ + (void)pci_write_config_dword(pdev, 0xa78, len); + /* src lower 32b */ + (void)pci_write_config_dword(pdev, 0xa7c, src_l); + /* src upper 32b */ + (void)pci_write_config_dword(pdev, 0xa80, src_h); + /* dst lower 32b */ + (void)pci_write_config_dword(pdev, 0xa84, dst_l); + /* dst upper 32b */ + (void)pci_write_config_dword(pdev, 0xa88, dst_h); + /* start write dma,ch 0 */ + (void)pci_write_config_dword(pdev, 0x980, 0x00000000); + spin_unlock_irqrestore(&edma_host->reg_lock, flags); + + return 0; +} +#endif + +static void start_listtransfer_h2b(struct edma_host_s *edma_host, + unsigned int list_h, unsigned int list_l) +{ + unsigned long flags = 0; + struct pci_dev *pdev = NULL; + + if (!edma_host) + return; + + pdev = edma_host->pdev; + if (!pdev) + return; + + spin_lock_irqsave(&edma_host->reg_lock, flags); + + /* write engine enable */ + (void)pci_write_config_dword(pdev, 0x700 + 0x29c, 0x00000001); + /* write list err enable */ + (void)pci_write_config_dword(pdev, 0x700 + 0x334, 0x00010000); + /* write ch,ch index 0 */ + (void)pci_write_config_dword(pdev, 0x700 + 0x36c, 0x80000000); + /* ch ctrl,local int enable */ + (void)pci_write_config_dword(pdev, 0x700 + 0x370, 0x00000300); + /* list lower 32b */ + (void)pci_write_config_dword(pdev, 0x700 + 0x38c, list_l); + /* list upper 32b */ + (void)pci_write_config_dword(pdev, 0x700 + 0x390, list_h); + /* start write dma,ch 0 */ + (void)pci_write_config_dword(pdev, 0x700 + 0x2a0, 0x00000000); + + spin_unlock_irqrestore(&edma_host->reg_lock, flags); +} + +static void start_listtransfer_b2h(struct edma_host_s *edma_host, + unsigned int list_h, unsigned int list_l) +{ + unsigned long flags = 0; + struct pci_dev *pdev = NULL; + + if (!edma_host) + return; + + pdev = edma_host->pdev; + if (!pdev) + return; + + spin_lock_irqsave(&edma_host->reg_lock, flags); + + /* write engine enable */ + (void)pci_write_config_dword(pdev, 0x700 + 0x27c, 0x00000001); + /* write list err enable */ + (void)pci_write_config_dword(pdev, 0x700 + 0x300, 0x00000001); + /* write ch,ch index 0 */ + (void)pci_write_config_dword(pdev, 0x700 + 0x36c, 0x00000000); + /* ch ctrl,local int enable */ + (void)pci_write_config_dword(pdev, 0x700 + 0x370, 0x00000300); + /* list lower 32b */ + (void)pci_write_config_dword(pdev, 0x700 + 0x38c, list_l); + /* list upper 32b */ + (void)pci_write_config_dword(pdev, 0x700 + 0x390, list_h); + /* start write dma,ch 0 */ + (void)pci_write_config_dword(pdev, 0x700 + 0x280, 0x00000000); + + spin_unlock_irqrestore(&edma_host->reg_lock, flags); +} + +int edma_host_dma_start(struct edma_host_s *edma_host, + struct bma_priv_data_s *priv) +{ + struct bma_user_s *puser = NULL; + struct bma_dev_s *bma_dev = NULL; + unsigned long flags = 0; + + if (!edma_host || !priv) + return -EFAULT; + + bma_dev = list_entry(edma_host, struct bma_dev_s, edma_host); + + spin_lock_irqsave(&bma_dev->priv_list_lock, flags); + + list_for_each_entry_rcu(puser, &bma_dev->priv_list, link) { + if (puser->dma_transfer) { + spin_unlock_irqrestore(&bma_dev->priv_list_lock, flags); + BMA_LOG(DLOG_ERROR, "type = %d dma is started\n", + puser->type); + + return -EBUSY; + } + } + + priv->user.dma_transfer = 1; + + spin_unlock_irqrestore(&bma_dev->priv_list_lock, flags); + + return 0; +} + +#ifdef USE_DMA + +static int edma_host_dma_h2b(struct edma_host_s *edma_host, + struct bma_dma_addr_s *host_addr, + struct bma_dma_addr_s *bmc_addr) +{ + int ret = 0; + struct notify_msg *pnm = (struct notify_msg *)edma_host->edma_flag; + unsigned long host_h2b_addr = 0; + unsigned long bmc_h2b_addr = 0; + unsigned int bmc_h2b_size = 0; + unsigned int src_h, src_l, dst_h, dst_l; + + if (!host_addr) { + BMA_LOG(DLOG_ERROR, "host_addr is NULL\n"); + return -EFAULT; + } + + BMA_LOG(DLOG_DEBUG, "host_addr->dma_addr = 0x%llx\n", + host_addr->dma_addr); + + if (host_addr->dma_addr) + host_h2b_addr = (unsigned long)(host_addr->dma_addr); + else + host_h2b_addr = edma_host->h2b_addr.dma_addr; + + bmc_h2b_addr = pnm->h2b_addr; + bmc_h2b_size = pnm->h2b_size; + + BMA_LOG(DLOG_DEBUG, + "host_h2b_addr = 0x%lx, dma_data_len = %d, bmc_h2b_addr = 0x%lx, bmc_h2b_size = %d\n", + host_h2b_addr, host_addr->dma_data_len, bmc_h2b_addr, + bmc_h2b_size); + + if (host_addr->dma_data_len > EDMA_DMABUF_SIZE || + bmc_h2b_addr == 0 || + host_addr->dma_data_len > bmc_h2b_size) { + BMA_LOG(DLOG_ERROR, + "dma_data_len too large = %d, bmc_h2b_size = %d\n", + host_addr->dma_data_len, bmc_h2b_size); + return -EFAULT; + } + + edma_host->h2b_state = H2BSTATE_WAITDMA; + + src_h = (unsigned int)((sizeof(unsigned long) == 8) ? + (host_h2b_addr >> 32) : 0); + src_l = (unsigned int)(host_h2b_addr & 0xffffffff); + dst_h = (unsigned int)((sizeof(unsigned long) == 8) ? + (bmc_h2b_addr >> 32) : 0); + dst_l = (unsigned int)(bmc_h2b_addr & 0xffffffff); + (void)start_transfer_h2b(edma_host, + host_addr->dma_data_len, src_h, + src_l, dst_h, dst_l); + + (void)mod_timer(&edma_host->dma_timer, + jiffies_64 + TIMER_INTERVAL_CHECK); + + ret = wait_event_interruptible_timeout(edma_host->wq_dmah2b, + (edma_host->h2b_state == + H2BSTATE_IDLE), + EDMA_DMA_TRANSFER_WAIT_TIMEOUT); + + if (ret == -ERESTARTSYS) { + BMA_LOG(DLOG_ERROR, "eintr 1\n"); + ret = -EINTR; + goto end; + } else if (ret == 0) { + BMA_LOG(DLOG_ERROR, "timeout 2\n"); + ret = -ETIMEDOUT; + goto end; + } else { + ret = 0; + BMA_LOG(DLOG_ERROR, "h2b dma successful\n"); + } + +end: + + return ret; +} + +static int edma_host_dma_b2h(struct edma_host_s *edma_host, + struct bma_dma_addr_s *host_addr, + struct bma_dma_addr_s *bmc_addr) +{ + int ret = 0; + struct notify_msg *pnm = (struct notify_msg *)edma_host->edma_flag; + unsigned long bmc_b2h_addr = 0; + unsigned long host_b2h_addr = 0; + unsigned int src_h, src_l, dst_h, dst_l; + + if (!bmc_addr) + return -EFAULT; + + if (host_addr->dma_addr) + host_b2h_addr = (unsigned long)(host_addr->dma_addr); + else + host_b2h_addr = edma_host->b2h_addr.dma_addr; + + if (bmc_addr->dma_addr) + bmc_b2h_addr = (unsigned long)(bmc_addr->dma_addr); + else + bmc_b2h_addr = pnm->b2h_addr; + + BMA_LOG(DLOG_DEBUG, + "bmc_b2h_addr = 0x%lx, host_b2h_addr = 0x%lx, dma_data_len = %d\n", + bmc_b2h_addr, host_b2h_addr, bmc_addr->dma_data_len); + + if (bmc_addr->dma_data_len > EDMA_DMABUF_SIZE || + bmc_addr->dma_data_len > edma_host->b2h_addr.len) { + BMA_LOG(DLOG_ERROR, + "dma_data_len too large = %d, b2h_addr = %d\n", + host_addr->dma_data_len, edma_host->b2h_addr.len); + return -EFAULT; + } + + edma_host->b2h_state = B2HSTATE_WAITDMA; + + src_h = (unsigned int)((sizeof(unsigned long) == 8) ? + (bmc_b2h_addr >> 32) : 0); + src_l = (unsigned int)(bmc_b2h_addr & 0xffffffff); + dst_h = (unsigned int)((sizeof(unsigned long) == 8) ? + (host_b2h_addr >> 32) : 0); + dst_l = (unsigned int)(host_b2h_addr & 0xffffffff); + (void)start_transfer_b2h(edma_host, + bmc_addr->dma_data_len, src_h, + src_l, dst_h, dst_l); + + (void)mod_timer(&edma_host->dma_timer, + jiffies_64 + TIMER_INTERVAL_CHECK); + + ret = wait_event_interruptible_timeout(edma_host->wq_dmab2h, + (edma_host->b2h_state == + B2HSTATE_IDLE), + EDMA_DMA_TRANSFER_WAIT_TIMEOUT); + + if (ret == -ERESTARTSYS) { + BMA_LOG(DLOG_ERROR, "eintr 1\n"); + ret = -EINTR; + goto end; + } else if (ret == 0) { + BMA_LOG(DLOG_ERROR, "timeout 2\n"); + ret = -ETIMEDOUT; + goto end; + } else { + BMA_LOG(DLOG_DEBUG, "h2b dma successful\n"); + } + +end: + + return ret; +} +#endif + +void host_dma_transfer_without_list(struct edma_host_s *edma_host, + struct bma_dma_transfer_s *dma_transfer, + int *return_code) +{ +#ifdef USE_DMA + union transfer_u *transfer = &dma_transfer->transfer; + + switch (dma_transfer->dir) { + case BMC_TO_HOST: + *return_code = edma_host_dma_b2h(edma_host, + &transfer->nolist.host_addr, + &transfer->nolist.bmc_addr); + break; + case HOST_TO_BMC: + *return_code = edma_host_dma_h2b(edma_host, + &transfer->nolist.host_addr, + &transfer->nolist.bmc_addr); + break; + default: + BMA_LOG(DLOG_ERROR, "direction failed, dir = %d\n", + dma_transfer->dir); + *return_code = -EFAULT; + break; + } +#endif +} + +void host_dma_transfer_withlist(struct edma_host_s *edma_host, + struct bma_dma_transfer_s *dma_transfer, + int *return_code) +{ + unsigned int list_h = 0; + unsigned int list_l = 0; + union transfer_u *transfer = &dma_transfer->transfer; + + list_h = (unsigned int)((sizeof(unsigned long) == 8) ? + (transfer->list.dma_addr >> 32) : 0); + list_l = (unsigned int)(transfer->list.dma_addr + & 0xffffffff); + + switch (dma_transfer->dir) { + case BMC_TO_HOST: + start_listtransfer_b2h(edma_host, list_h, list_l); + break; + case HOST_TO_BMC: + start_listtransfer_h2b(edma_host, list_h, list_l); + break; + default: + BMA_LOG(DLOG_ERROR, "direction failed, dir = %d\n\n", + dma_transfer->dir); + *return_code = -EFAULT; + break; + } +} + +int edma_host_dma_transfer(struct edma_host_s *edma_host, + struct bma_priv_data_s *priv, + struct bma_dma_transfer_s *dma_transfer) +{ + int ret = 0; + unsigned long flags = 0; + struct bma_dev_s *bma_dev = NULL; + + if (!edma_host || !priv || !dma_transfer) + return -EFAULT; + + bma_dev = list_entry(edma_host, struct bma_dev_s, edma_host); + + spin_lock_irqsave(&bma_dev->priv_list_lock, flags); + + if (priv->user.dma_transfer == 0) { + spin_unlock_irqrestore(&bma_dev->priv_list_lock, flags); + BMA_LOG(DLOG_ERROR, "dma_transfer = %hhd\n", + priv->user.dma_transfer); + return -EFAULT; + } + + spin_unlock_irqrestore(&bma_dev->priv_list_lock, flags); + + edma_host->statistics.dma_count++; + + if (dma_transfer->type == DMA_NOT_LIST) { + host_dma_transfer_without_list(edma_host, + dma_transfer, &ret); + } else if (dma_transfer->type == DMA_LIST) { + host_dma_transfer_withlist(edma_host, dma_transfer, &ret); + } else { + BMA_LOG(DLOG_ERROR, "type failed! type = %d\n", + dma_transfer->type); + return -EFAULT; + } + + return ret; +} + +void edma_host_reset_dma(struct edma_host_s *edma_host, int dir) +{ + u32 data = 0; + u32 reg_addr = 0; + unsigned long flags = 0; + int count = 0; + struct pci_dev *pdev = NULL; + + if (!edma_host) + return; + + pdev = edma_host->pdev; + if (!pdev) + return; + + if (dir == BMC_TO_HOST) + reg_addr = REG_PCIE1_DMA_READ_ENGINE_ENABLE; + else if (dir == HOST_TO_BMC) + reg_addr = REG_PCIE1_DMA_WRITE_ENGINE_ENABLE; + else + return; + + spin_lock_irqsave(&edma_host->reg_lock, flags); + + (void)pci_read_config_dword(pdev, reg_addr, &data); + data &= ~(1 << SHIFT_PCIE1_DMA_ENGINE_ENABLE); + (void)pci_write_config_dword(pdev, reg_addr, data); + + while (count++ < 10) { + (void)pci_read_config_dword(pdev, reg_addr, &data); + + if (0 == (data & (1 << SHIFT_PCIE1_DMA_ENGINE_ENABLE))) { + BMA_LOG(DLOG_DEBUG, "reset dma succesfull\n"); + break; + } + + mdelay(100); + } + + spin_unlock_irqrestore(&edma_host->reg_lock, flags); + BMA_LOG(DLOG_DEBUG, "reset dma reg_addr=0x%x count=%d data=0x%08x\n", + reg_addr, count, data); +} + +int edma_host_dma_stop(struct edma_host_s *edma_host, + struct bma_priv_data_s *priv) +{ + unsigned long flags = 0; + struct bma_dev_s *bma_dev = NULL; + + if (!edma_host || !priv) + return -1; + + bma_dev = list_entry(edma_host, struct bma_dev_s, edma_host); + + spin_lock_irqsave(&bma_dev->priv_list_lock, flags); + priv->user.dma_transfer = 0; + spin_unlock_irqrestore(&bma_dev->priv_list_lock, flags); + + return 0; +} + +static int edma_host_send_msg(struct edma_host_s *edma_host) +{ + void *vaddr = NULL; + unsigned long flags = 0; + struct edma_mbx_hdr_s *send_mbx_hdr = NULL; + static unsigned long last_timer_record; + + if (!edma_host) + return 0; + + send_mbx_hdr = (struct edma_mbx_hdr_s *)edma_host->edma_send_addr; + + if (send_mbx_hdr->mbxlen > 0) { + if (send_mbx_hdr->mbxlen > HOST_MAX_SEND_MBX_LEN) { + /*share memory is disable */ + send_mbx_hdr->mbxlen = 0; + BMA_LOG(DLOG_ERROR, "mbxlen is too long\n"); + return -EFAULT; + } + + if (time_after(jiffies, last_timer_record + 10 * HZ)) { + BMA_LOG(DLOG_ERROR, "no response in 10s,clean msg\n"); + edma_host->statistics.failed_count++; + send_mbx_hdr->mbxlen = 0; + return -EFAULT; + } + + BMA_LOG(DLOG_DEBUG, + "still have msg : mbxlen: %d, msg_send_write: %d\n", + send_mbx_hdr->mbxlen, edma_host->msg_send_write); + + /* resend door bell */ + if (time_after(jiffies, last_timer_record + 5 * HZ)) + edma_host_int_to_bmc(edma_host); + + return -EFAULT; + } + + vaddr = + (void *)((unsigned char *)edma_host->edma_send_addr + + SIZE_OF_MBX_HDR); + + last_timer_record = jiffies; + + spin_lock_irqsave(&edma_host->send_msg_lock, flags); + + if (edma_host->msg_send_write == 0) { + spin_unlock_irqrestore(&edma_host->send_msg_lock, flags); + return 0; + } + + if (edma_host->msg_send_write > + HOST_MAX_SEND_MBX_LEN - SIZE_OF_MBX_HDR) { + BMA_LOG(DLOG_ERROR, + "Length of send message %u is larger than %lu\n", + edma_host->msg_send_write, + HOST_MAX_SEND_MBX_LEN - SIZE_OF_MBX_HDR); + edma_host->msg_send_write = 0; + spin_unlock_irqrestore(&edma_host->send_msg_lock, flags); + return 0; + } + + memcpy(vaddr, edma_host->msg_send_buf, + edma_host->msg_send_write); + + send_mbx_hdr->mbxlen = edma_host->msg_send_write; + edma_host->msg_send_write = 0; + + spin_unlock_irqrestore(&edma_host->send_msg_lock, flags); + + edma_host_int_to_bmc(edma_host); + + BMA_LOG(DLOG_DEBUG, + "vaddr: %p, mbxlen : %d, msg_send_write: %d\n", vaddr, + send_mbx_hdr->mbxlen, edma_host->msg_send_write); + + return -EAGAIN; +} + +#ifdef EDMA_TIMER +#ifdef HAVE_TIMER_SETUP +static void edma_host_timeout(struct timer_list *t) +{ + struct edma_host_s *edma_host = from_timer(edma_host, t, timer); +#else +static void edma_host_timeout(unsigned long data) +{ + struct edma_host_s *edma_host = (struct edma_host_s *)data; +#endif + int ret = 0; + unsigned long flags = 0; + + ret = edma_host_send_msg(edma_host); + if (ret < 0) { + spin_lock_irqsave(&g_bma_dev->edma_host.send_msg_lock, flags); + (void)mod_timer(&edma_host->timer, + jiffies_64 + TIMER_INTERVAL_CHECK); + spin_unlock_irqrestore(&edma_host->send_msg_lock, flags); + } +} + +#ifdef HAVE_TIMER_SETUP +static void edma_host_heartbeat_timer(struct timer_list *t) +{ + struct edma_host_s *edma_host = from_timer(edma_host, t, + heartbeat_timer); +#else +static void edma_host_heartbeat_timer(unsigned long data) +{ + struct edma_host_s *edma_host = (struct edma_host_s *)data; +#endif + struct edma_statistics_s *edma_stats = &edma_host->statistics; + unsigned int *remote_status = &edma_stats->remote_status; + static unsigned int bmc_heartbeat; + struct notify_msg *pnm = (struct notify_msg *)edma_host->edma_flag; + + if (pnm) { + if (pnm->bmc_registered) { + if ((pnm->host_heartbeat & 7) == 0) { + if (bmc_heartbeat != pnm->bmc_heartbeat) { + if (*remote_status != REGISTERED) { + BMA_LOG(DLOG_DEBUG, + "bmc is registered\n"); + *remote_status = REGISTERED; + } + + bmc_heartbeat = pnm->bmc_heartbeat; + } else { + if (*remote_status == REGISTERED) { + *remote_status = LOST; + edma_stats->lost_count++; + BMA_LOG(DLOG_DEBUG, + "bmc is lost\n"); + } + } + } + } else { + if (*remote_status == REGISTERED) + BMA_LOG(DLOG_DEBUG, "bmc is deregistered\n"); + + *remote_status = DEREGISTERED; + } + + pnm->host_heartbeat++; + } + + (void)mod_timer(&edma_host->heartbeat_timer, + jiffies_64 + HEARTBEAT_TIMER_INTERVAL_CHECK); +} + +#ifdef USE_DMA +#ifdef HAVE_TIMER_SETUP +static void edma_host_dma_timeout(struct timer_list *t) +{ + struct edma_host_s *edma_host = from_timer(edma_host, t, dma_timer); +#else +static void edma_host_dma_timeout(unsigned long data) +{ + struct edma_host_s *edma_host = (struct edma_host_s *)data; +#endif + int ret = 0; + + ret = edma_host_dma_interrupt(edma_host); + if (ret < 0) + (void)mod_timer(&edma_host->dma_timer, + jiffies_64 + DMA_TIMER_INTERVAL_CHECK); +} +#endif +#else + +static int edma_host_thread(void *arg) +{ + struct edma_host_s *edma_host = (struct edma_host_s *)arg; + + BMA_LOG(DLOG_ERROR, "edma host thread\n"); + + while (!kthread_should_stop()) { + wait_for_completion_interruptible_timeout(&edma_host->msg_ready, + 1 * HZ); + edma_host_send_msg(edma_host); + (void)edma_host_dma_interrupt(edma_host); + } + + BMA_LOG(DLOG_ERROR, "edma host thread exiting\n"); + + return 0; +} + +#endif + +int edma_host_send_driver_msg(const void *msg, size_t msg_len, int subtype) +{ + int ret = 0; + unsigned long flags = 0; + struct edma_host_s *edma_host = NULL; + struct edma_msg_hdr_s *hdr = NULL; + int total_len = msg_len + SIZE_OF_MSG_HDR; + + if (!msg || !g_bma_dev) + return -1; + + edma_host = &g_bma_dev->edma_host; + if (!edma_host) + return -1; + + spin_lock_irqsave(&edma_host->send_msg_lock, flags); + + if (edma_host->msg_send_write + total_len <= + (HOST_MAX_SEND_MBX_LEN - SIZE_OF_MBX_HDR)) { + hdr = (struct edma_msg_hdr_s *)(edma_host->msg_send_buf + + edma_host->msg_send_write); + hdr->type = TYPE_EDMA_DRIVER; + hdr->sub_type = subtype; + hdr->datalen = msg_len; + + memcpy(hdr->data, msg, msg_len); + + edma_host->msg_send_write += total_len; + + spin_unlock_irqrestore(&edma_host->send_msg_lock, flags); + + (void)mod_timer(&edma_host->timer, jiffies_64); + BMA_LOG(DLOG_DEBUG, "msg_send_write = %d\n", + edma_host->msg_send_write); + } else { + ret = -ENOSPC; + spin_unlock_irqrestore(&edma_host->send_msg_lock, flags); + + BMA_LOG(DLOG_DEBUG, + "msg lost,msg_send_write: %d,msg_len:%d,max_len: %d\n", + edma_host->msg_send_write, total_len, + HOST_MAX_SEND_MBX_LEN); + } + + return ret; +} + +static int edma_host_insert_recv_msg(struct edma_host_s *edma_host, + struct edma_msg_hdr_s *msg_header) +{ + unsigned long flags = 0, msg_flags = 0; + struct bma_dev_s *bma_dev = NULL; + struct bma_priv_data_s *priv = NULL; + struct bma_user_s *puser = NULL; + struct list_head *entry = NULL; + struct edma_recv_msg_s *msg_tmp = NULL; + struct bma_user_s usertmp = { }; + struct edma_recv_msg_s *recv_msg = NULL; + + if (!edma_host || !msg_header || + msg_header->datalen > CDEV_MAX_WRITE_LEN) { + return -EFAULT; + } + + bma_dev = list_entry(edma_host, struct bma_dev_s, edma_host); + + recv_msg = kmalloc(sizeof(*recv_msg) + msg_header->datalen, GFP_ATOMIC); + if (!recv_msg) { + BMA_LOG(DLOG_ERROR, "malloc recv_msg failed\n"); + return -ENOMEM; + } + + recv_msg->msg_len = msg_header->datalen; + memcpy(recv_msg->msg_data, msg_header->data, + msg_header->datalen); + + spin_lock_irqsave(&bma_dev->priv_list_lock, flags); + list_for_each_entry_rcu(puser, &bma_dev->priv_list, link) { + if (puser->type != msg_header->type || + puser->sub_type != msg_header->sub_type) + continue; + + priv = list_entry(puser, struct bma_priv_data_s, user); + + memcpy(&usertmp, puser, + sizeof(struct bma_user_s)); + + spin_lock_irqsave(&priv->recv_msg_lock, msg_flags); + + if (puser->cur_recvmsg_nums >= puser->max_recvmsg_nums || + puser->cur_recvmsg_nums >= MAX_RECV_MSG_NUMS) { + entry = priv->recv_msgs.next; + msg_tmp = + list_entry(entry, struct edma_recv_msg_s, + link); + list_del(entry); + puser->cur_recvmsg_nums--; + kfree(msg_tmp); + } + + if (edma_host->local_open_status[puser->type] + == DEV_OPEN) { + list_add_tail(&recv_msg->link, &priv->recv_msgs); + puser->cur_recvmsg_nums++; + usertmp.cur_recvmsg_nums = + puser->cur_recvmsg_nums; + spin_unlock_irqrestore(&priv->recv_msg_lock, + msg_flags); + + } else { + spin_unlock_irqrestore(&priv->recv_msg_lock, + msg_flags); + break; + } + + wake_up_interruptible(&priv->wait); + spin_unlock_irqrestore(&bma_dev->priv_list_lock, flags); + + BMA_LOG(DLOG_DEBUG, + "find user, type = %d, sub_type = %d, user_id = %d, insert msg\n", + usertmp.type, usertmp.sub_type, + usertmp.user_id); + BMA_LOG(DLOG_DEBUG, + "msg_len = %d, cur_recvmsg_nums: %d, max_recvmsg_nums: %d\n", + recv_msg->msg_len, usertmp.cur_recvmsg_nums, + usertmp.max_recvmsg_nums); + + return 0; + } + + spin_unlock_irqrestore(&bma_dev->priv_list_lock, flags); + kfree(recv_msg); + edma_host->statistics.drop_pkgs++; + BMA_LOG(DLOG_DEBUG, + "insert msg failed! not find user, type = %d, sub_type = %d\n", + msg_header->type, msg_header->sub_type); + + return -EFAULT; +} + +int edma_host_recv_msg(struct edma_host_s *edma_host, + struct bma_priv_data_s *priv, + struct edma_recv_msg_s **msg) +{ + unsigned long flags = 0; + struct list_head *entry = NULL; + struct edma_recv_msg_s *msg_tmp = NULL; + struct bma_dev_s *bma_dev = NULL; + + if (!edma_host || !priv || !msg) + return -EAGAIN; + + bma_dev = list_entry(edma_host, struct bma_dev_s, edma_host); + + spin_lock_irqsave(&bma_dev->priv_list_lock, flags); + + if (list_empty(&priv->recv_msgs)) { + priv->user.cur_recvmsg_nums = 0; + spin_unlock_irqrestore(&bma_dev->priv_list_lock, flags); + BMA_LOG(DLOG_DEBUG, "recv msgs empty\n"); + return -EAGAIN; + } + + entry = priv->recv_msgs.next; + msg_tmp = list_entry(entry, struct edma_recv_msg_s, link); + list_del(entry); + + if (priv->user.cur_recvmsg_nums > 0) + priv->user.cur_recvmsg_nums--; + + spin_unlock_irqrestore(&bma_dev->priv_list_lock, flags); + + *msg = msg_tmp; + + BMA_LOG(DLOG_DEBUG, "msg->msg_len = %d\n", (int)msg_tmp->msg_len); + + return 0; +} + +static int edma_host_msg_process(struct edma_host_s *edma_host, + struct edma_msg_hdr_s *msg_header) +{ + struct bma_user_s *user_ptr = NULL; + char drv_msg[TYPE_MAX * 2 + 1] = { 0 }; + + if (!edma_host || !msg_header) + return 0; + + if (msg_header->type != TYPE_EDMA_DRIVER) + return -1; + + if (msg_header->sub_type != DEV_OPEN_STATUS_REQ) + return 0; + + list_for_each_entry_rcu(user_ptr, &g_bma_dev->priv_list, link) { + drv_msg[drv_msg[0] * 2 + 1] = user_ptr->type; + drv_msg[drv_msg[0] * 2 + 2] = + edma_host->local_open_status[user_ptr->type]; + BMA_LOG(DLOG_DEBUG, + "send DEV_OPEN_STATUS_ANS index=%d type=%d status=%d\n", + drv_msg[0], drv_msg[drv_msg[0] * 2 + 1], + drv_msg[drv_msg[0] * 2 + 2]); + drv_msg[0]++; + } + + if (drv_msg[0]) { + (void)edma_host_send_driver_msg((void *)drv_msg, + drv_msg[0] * 2 + + 1, + DEV_OPEN_STATUS_ANS); + BMA_LOG(DLOG_DEBUG, + "send DEV_OPEN_STATUS_ANS %d\n", + drv_msg[0]); + } + + return 0; +} + +void edma_host_isr_tasklet(unsigned long data) +{ + int result = 0; + u16 len = 0; + u16 off = 0; + u16 msg_cnt = 0; + struct edma_mbx_hdr_s *recv_mbx_hdr = NULL; + struct edma_host_s *edma_host = (struct edma_host_s *)data; + struct edma_msg_hdr_s *msg_header = NULL; + unsigned char *ptr = NULL; + + if (!edma_host) + return; + + recv_mbx_hdr = (struct edma_mbx_hdr_s *)(edma_host->edma_recv_addr); + msg_header = + (struct edma_msg_hdr_s *)((char *)(edma_host->edma_recv_addr) + + SIZE_OF_MBX_HDR + recv_mbx_hdr->mbxoff); + + off = readw((unsigned char *)edma_host->edma_recv_addr + + EDMA_B2H_INT_FLAG); + len = readw((unsigned char *)edma_host->edma_recv_addr) - off; + + BMA_LOG(DLOG_DEBUG, + " edma_host->edma_recv_addr = %p, len = %d, off = %d, mbxlen = %d\n", + edma_host->edma_recv_addr, len, off, recv_mbx_hdr->mbxlen); + edma_host->statistics.recv_bytes += (recv_mbx_hdr->mbxlen - off); + + if (len == 0) { + writel(0, (void *)(edma_host->edma_recv_addr)); + return; + } + + while (recv_mbx_hdr->mbxlen - off) { + if (len == 0) { + BMA_LOG(DLOG_DEBUG, " receive done\n"); + break; + } + + if (len < (SIZE_OF_MSG_HDR + msg_header->datalen)) { + BMA_LOG(DLOG_ERROR, " len too less, is %d\n", len); + break; + } + + edma_host->statistics.recv_pkgs++; + + if (edma_host_msg_process(edma_host, msg_header) == -1) { + result = edma_host_insert_recv_msg(edma_host, + msg_header); + if (result < 0) + BMA_LOG(DLOG_DEBUG, + "edma_host_insert_recv_msg failed\n"); + } + + BMA_LOG(DLOG_DEBUG, "len = %d\n", len); + BMA_LOG(DLOG_DEBUG, "off = %d\n", off); + len -= (msg_header->datalen + SIZE_OF_MSG_HDR); + BMA_LOG(DLOG_DEBUG, + "msg_header->datalen = %d, SIZE_OF_MSG_HDR=%d\n", + msg_header->datalen, (int)SIZE_OF_MSG_HDR); + off += (msg_header->datalen + SIZE_OF_MSG_HDR); + + msg_cnt++; + + ptr = (unsigned char *)msg_header; + msg_header = (struct edma_msg_hdr_s *)(ptr + + (msg_header->datalen + + SIZE_OF_MSG_HDR)); + + if (msg_cnt > 2) { + recv_mbx_hdr->mbxoff = off; + BMA_LOG(DLOG_DEBUG, "len = %d\n", len); + BMA_LOG(DLOG_DEBUG, "off = %d\n", off); + BMA_LOG(DLOG_DEBUG, "off works\n"); + + tasklet_hi_schedule(&edma_host->tasklet); + + break; + } + + if (!len) { + writel(0, (void *)(edma_host->edma_recv_addr)); + recv_mbx_hdr->mbxoff = 0; + } + } +} + +static int edma_host_dma_interrupt(struct edma_host_s *edma_host) +{ + if (!edma_host) + return 0; + + if (check_status_dmah2b(edma_host)) { + clear_int_dmah2b(edma_host); + + edma_host->h2b_state = H2BSTATE_IDLE; + wake_up_interruptible(&edma_host->wq_dmah2b); + return 0; + } + + if (check_status_dmab2h(edma_host)) { + clear_int_dmab2h(edma_host); + + edma_host->b2h_state = B2HSTATE_IDLE; + wake_up_interruptible(&edma_host->wq_dmab2h); + + return 0; + } + + return -EAGAIN; +} + +irqreturn_t edma_host_irq_handle(struct edma_host_s *edma_host) +{ + if (edma_host) { + (void)edma_host_dma_interrupt(edma_host); + + tasklet_hi_schedule(&edma_host->tasklet); + } + + return IRQ_HANDLED; +} + +struct edma_user_inft_s *edma_host_get_user_inft(u32 type) +{ + if (type >= TYPE_MAX) { + BMA_LOG(DLOG_ERROR, "type error %d\n", type); + return NULL; + } + + return g_user_func[type]; +} + +int edma_host_user_register(u32 type, struct edma_user_inft_s *func) +{ + if (type >= TYPE_MAX) { + BMA_LOG(DLOG_ERROR, "type error %d\n", type); + return -EFAULT; + } + + if (!func) { + BMA_LOG(DLOG_ERROR, "func is NULL\n"); + return -EFAULT; + } + + g_user_func[type] = func; + + return 0; +} + +int edma_host_user_unregister(u32 type) +{ + if (type >= TYPE_MAX) { + BMA_LOG(DLOG_ERROR, "type error %d\n", type); + return -EFAULT; + } + + g_user_func[type] = NULL; + + return 0; +} + +int edma_host_init(struct edma_host_s *edma_host) +{ + int ret = 0; + struct bma_dev_s *bma_dev = NULL; + struct notify_msg *pnm = NULL; + + if (!edma_host) + return -1; + + bma_dev = list_entry(edma_host, struct bma_dev_s, edma_host); + g_bma_dev = bma_dev; + + edma_host->pdev = bma_dev->bma_pci_dev->pdev; + +#ifdef EDMA_TIMER + #ifdef HAVE_TIMER_SETUP + timer_setup(&edma_host->timer, edma_host_timeout, 0); + #else + setup_timer(&edma_host->timer, edma_host_timeout, + (unsigned long)edma_host); + #endif + (void)mod_timer(&edma_host->timer, jiffies_64 + TIMER_INTERVAL_CHECK); +#ifdef USE_DMA + #ifdef HAVE_TIMER_SETUP + timer_setup(&edma_host->dma_timer, edma_host_dma_timeout, 0); + + #else + setup_timer(&edma_host->dma_timer, edma_host_dma_timeout, + (unsigned long)edma_host); + #endif + (void)mod_timer(&edma_host->dma_timer, + jiffies_64 + DMA_TIMER_INTERVAL_CHECK); +#endif + +#else + init_completion(&edma_host->msg_ready); + + edma_host->edma_thread = + kthread_run(edma_host_thread, (void *)edma_host, "edma_host_msg"); + + if (IS_ERR(edma_host->edma_thread)) { + BMA_LOG(DLOG_ERROR, "kernel_run edma_host_msg failed\n"); + return PTR_ERR(edma_host->edma_thread); + } +#endif + + edma_host->msg_send_buf = kmalloc(HOST_MAX_SEND_MBX_LEN, GFP_KERNEL); + if (!edma_host->msg_send_buf) { + BMA_LOG(DLOG_ERROR, "malloc msg_send_buf failed!"); + ret = -ENOMEM; + goto failed1; + } + + edma_host->msg_send_write = 0; + + spin_lock_init(&edma_host->send_msg_lock); + + tasklet_init(&edma_host->tasklet, + (void (*)(unsigned long))edma_host_isr_tasklet, + (unsigned long)edma_host); + + edma_host->edma_flag = bma_dev->bma_pci_dev->edma_swap_addr; + + edma_host->edma_send_addr = + (void *)((unsigned char *)bma_dev->bma_pci_dev->edma_swap_addr + + HOST_DMA_FLAG_LEN); + memset(edma_host->edma_send_addr, 0, SIZE_OF_MBX_HDR); + + edma_host->edma_recv_addr = + (void *)((unsigned char *)edma_host->edma_send_addr + + HOST_MAX_SEND_MBX_LEN); + + BMA_LOG(DLOG_DEBUG, + "edma_flag = %p, edma_send_addr = %p, edma_recv_addr = %p\n", + edma_host->edma_flag, edma_host->edma_send_addr, + edma_host->edma_recv_addr); + + edma_host->hostrtc_viraddr = bma_dev->bma_pci_dev->hostrtc_viraddr; + + init_waitqueue_head(&edma_host->wq_dmah2b); + init_waitqueue_head(&edma_host->wq_dmab2h); + + spin_lock_init(&edma_host->reg_lock); + + edma_host->h2b_state = H2BSTATE_IDLE; + edma_host->b2h_state = B2HSTATE_IDLE; + + #ifdef HAVE_TIMER_SETUP + timer_setup(&edma_host->heartbeat_timer, + edma_host_heartbeat_timer, 0); + #else + setup_timer(&edma_host->heartbeat_timer, + edma_host_heartbeat_timer, + (unsigned long)edma_host); + #endif + (void)mod_timer(&edma_host->heartbeat_timer, + jiffies_64 + HEARTBEAT_TIMER_INTERVAL_CHECK); + + pnm = (struct notify_msg *)edma_host->edma_flag; + if (pnm) + pnm->host_registered = REGISTERED; + + GET_SYS_SECONDS(edma_host->statistics.init_time); + +#ifdef EDMA_TIMER + BMA_LOG(DLOG_DEBUG, "timer ok\n"); +#else + BMA_LOG(DLOG_ERROR, "thread ok\n"); +#endif + return 0; + +failed1: +#ifdef EDMA_TIMER + (void)del_timer_sync(&edma_host->timer); +#ifdef USE_DMA + (void)del_timer_sync(&edma_host->dma_timer); +#endif +#else + kthread_stop(edma_host->edma_thread); + complete(&edma_host->msg_ready); +#endif + return ret; +} + +void edma_host_cleanup(struct edma_host_s *edma_host) +{ + struct bma_dev_s *bma_dev = NULL; + struct notify_msg *pnm = NULL; + + if (!edma_host) + return; + + bma_dev = list_entry(edma_host, struct bma_dev_s, edma_host); + (void)del_timer_sync(&edma_host->heartbeat_timer); + pnm = (struct notify_msg *)edma_host->edma_flag; + + if (pnm) + pnm->host_registered = DEREGISTERED; + + tasklet_kill(&edma_host->tasklet); + + kfree(edma_host->msg_send_buf); + edma_host->msg_send_buf = NULL; +#ifdef EDMA_TIMER + (void)del_timer_sync(&edma_host->timer); +#ifdef USE_DMA + (void)del_timer_sync(&edma_host->dma_timer); +#endif + +#else + kthread_stop(edma_host->edma_thread); + + complete(&edma_host->msg_ready); +#endif +} diff --git a/drivers/net/ethernet/huawei/bma/edma_drv/edma_host.h b/drivers/net/ethernet/huawei/bma/edma_drv/edma_host.h new file mode 100644 index 000000000000..9d86b5b0fdd6 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/edma_drv/edma_host.h @@ -0,0 +1,351 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _EDMA_HOST_H_ +#define _EDMA_HOST_H_ + +#include "bma_include.h" +#include "../include/bma_ker_intf.h" + +#define EDMA_TIMER + +#ifndef IN +#define IN +#endif + +#ifndef OUT +#define OUT +#endif + +#ifndef UNUSED +#define UNUSED +#endif + +/* vm_flags in vm_area_struct, see mm_types.h. */ +#define VM_NONE 0x00000000 + +#define VM_READ 0x00000001 /* currently active flags */ +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 + +#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ +#define VM_MAYWRITE 0x00000020 +#define VM_MAYEXEC 0x00000040 +#define VM_MAYSHARE 0x00000080 + +#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +/* Page-ranges managed without "struct page", just pure PFN */ +#define VM_PFNMAP 0x00000400 +#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ + +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ + + /* Used by sys_madvise() */ +#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ +/* App will not benefit from clustered reads */ +#define VM_RAND_READ 0x00010000 + +#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ +#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ +#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ +#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ +#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ +#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ +/* Can contain "struct page" and pure PFN pages */ +#define VM_MIXEDMAP 0x10000000 + +#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ + +#if defined(CONFIG_X86) +/* PAT reserves whole VMA at once (x86) */ +#define VM_PAT VM_ARCH_1 +#elif defined(CONFIG_PPC) +#define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ +#elif defined(CONFIG_PARISC) +#define VM_GROWSUP VM_ARCH_1 +#elif defined(CONFIG_METAG) +#define VM_GROWSUP VM_ARCH_1 +#elif defined(CONFIG_IA64) +#define VM_GROWSUP VM_ARCH_1 +#elif !defined(CONFIG_MMU) +#define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ +#endif + +#ifndef VM_GROWSUP +#define VM_GROWSUP VM_NONE +#endif + +/* Bits set in the VMA until the stack is in its final location */ +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) +#define VM_NORMAL_READ_HINT(v) (!((v)->vm_flags & VM_READHINTMASK)) +#define VM_SEQUENTIAL_READ_HINT(v) ((v)->vm_flags & VM_SEQ_READ) +#define VM_RANDOM_READ_HINT(v) ((v)->vm_flags & VM_RAND_READ) + +#define REG_PCIE1_DMAREAD_ENABLE 0xa18 +#define SHIFT_PCIE1_DMAREAD_ENABLE 0 + +#define REG_PCIE1_DMAWRITE_ENABLE 0x9c4 +#define SHIFT_PCIE1_DMAWRITE_ENABLE 0 + +#define REG_PCIE1_DMAREAD_STATUS 0xa10 +#define SHIFT_PCIE1_DMAREAD_STATUS 0 +#define REG_PCIE1_DMAREADINT_CLEAR 0xa1c +#define SHIFT_PCIE1_DMAREADINT_CLEAR 0 + +#define REG_PCIE1_DMAWRITE_STATUS 0x9bc +#define SHIFT_PCIE1_DMAWRITE_STATUS 0 +#define REG_PCIE1_DMAWRITEINT_CLEAR 0x9c8 +#define SHIFT_PCIE1_DMAWRITEINT_CLEAR 0 + +#define REG_PCIE1_DMA_READ_ENGINE_ENABLE (0x99c) +#define SHIFT_PCIE1_DMA_ENGINE_ENABLE (0) +#define REG_PCIE1_DMA_WRITE_ENGINE_ENABLE (0x97C) + +#define HOSTRTC_INT_OFFSET 0x10 + +#define H2BSTATE_IDLE 0 +#define H2BSTATE_WAITREADY 1 +#define H2BSTATE_WAITDMA 2 +#define H2BSTATE_WAITACK 3 +#define H2BSTATE_ERROR 4 + +#define B2HSTATE_IDLE 0 +#define B2HSTATE_WAITREADY 1 +#define B2HSTATE_WAITRECV 2 +#define B2HSTATE_WAITDMA 3 +#define B2HSTATE_ERROR 4 + +#define PAGE_ORDER 8 +#define EDMA_DMABUF_SIZE (1 << (PAGE_SHIFT + PAGE_ORDER)) + +#define EDMA_DMA_TRANSFER_WAIT_TIMEOUT (10 * HZ) +#define TIMEOUT_WAIT_NOSIGNAL 2 + +#define TIMER_INTERVAL_CHECK (HZ / 10) +#define DMA_TIMER_INTERVAL_CHECK 50 +#define HEARTBEAT_TIMER_INTERVAL_CHECK HZ + +#define EDMA_PCI_MSG_LEN (56 * 1024) + +#define HOST_DMA_FLAG_LEN (64) + +#define HOST_MAX_SEND_MBX_LEN (40 * 1024) +#define BMC_MAX_RCV_MBX_LEN HOST_MAX_SEND_MBX_LEN + +#define HOST_MAX_RCV_MBX_LEN (16 * 1024) +#define BMC_MAX_SEND_MBX_LEN HOST_MAX_RCV_MBX_LEN +#define CDEV_MAX_WRITE_LEN (4 * 1024) + +#define HOST_MAX_MSG_LENGTH 272 + +#define EDMA_MMAP_H2B_DMABUF 0xf1000000 + +#define EDMA_MMAP_B2H_DMABUF 0xf2000000 + +#define EDMA_IOC_MAGIC 'e' + +#define EDMA_H_REGISTER_TYPE _IOW(EDMA_IOC_MAGIC, 100, unsigned long) + +#define EDMA_H_UNREGISTER_TYPE _IOW(EDMA_IOC_MAGIC, 101, unsigned long) + +#define EDMA_H_DMA_START _IOW(EDMA_IOC_MAGIC, 102, unsigned long) + +#define EDMA_H_DMA_TRANSFER _IOW(EDMA_IOC_MAGIC, 103, unsigned long) + +#define EDMA_H_DMA_STOP _IOW(EDMA_IOC_MAGIC, 104, unsigned long) + +#define U64ADDR_H(addr) ((((u64)addr) >> 32) & 0xffffffff) +#define U64ADDR_L(addr) ((addr) & 0xffffffff) + +struct bma_register_dev_type_s { + u32 type; + u32 sub_type; +}; + +struct edma_mbx_hdr_s { + u16 mbxlen; + u16 mbxoff; + u8 reserve[28]; +} __packed; + +#define SIZE_OF_MBX_HDR (sizeof(struct edma_mbx_hdr_s)) + +struct edma_recv_msg_s { + struct list_head link; + u32 msg_len; + unsigned char msg_data[0]; +}; + +struct edma_dma_addr_s { + void *kvaddr; + dma_addr_t dma_addr; + u32 len; +}; + +struct edma_msg_hdr_s { + u32 type; + u32 sub_type; + u8 user_id; + u8 dma_flag; + u8 reserve1[2]; + u32 datalen; + u8 data[0]; +}; + +#define SIZE_OF_MSG_HDR (sizeof(struct edma_msg_hdr_s)) + +#pragma pack(1) + +#define IS_EDMA_B2H_INT(flag) ((flag) & 0x02) +#define CLEAR_EDMA_B2H_INT(flag) ((flag) = (flag) & 0xfffffffd) +#define SET_EDMA_H2B_INT(flag) ((flag) = (flag) | 0x01) +#define EDMA_B2H_INT_FLAG 0x02 + +struct notify_msg { + unsigned int host_registered; + unsigned int host_heartbeat; + unsigned int bmc_registered; + unsigned int bmc_heartbeat; + unsigned int int_flag; + + unsigned int reservrd5; + unsigned int h2b_addr; + unsigned int h2b_size; + unsigned int h2b_rsize; + unsigned int b2h_addr; + unsigned int b2h_size; + unsigned int b2h_rsize; +}; + +#pragma pack() + +struct edma_statistics_s { + unsigned int remote_status; + __kernel_time_t init_time; + unsigned int h2b_int; + unsigned int b2h_int; + unsigned int recv_bytes; + unsigned int send_bytes; + unsigned int send_pkgs; + unsigned int recv_pkgs; + unsigned int failed_count; + unsigned int drop_pkgs; + unsigned int dma_count; + unsigned int lost_count; +}; + +struct edma_host_s { + struct pci_dev *pdev; + + struct tasklet_struct tasklet; + + void __iomem *hostrtc_viraddr; + + void __iomem *edma_flag; + void __iomem *edma_send_addr; + void __iomem *edma_recv_addr; +#ifdef USE_DMA + struct timer_list dma_timer; +#endif + + struct timer_list heartbeat_timer; + +#ifdef EDMA_TIMER + struct timer_list timer; +#else + struct completion msg_ready; /* to sleep thread on */ + struct task_struct *edma_thread; +#endif + /* spinlock for send msg buf */ + spinlock_t send_msg_lock; + unsigned char *msg_send_buf; + unsigned int msg_send_write; + + /* DMA */ + wait_queue_head_t wq_dmah2b; + wait_queue_head_t wq_dmab2h; + + /* spinlock for read pci register */ + spinlock_t reg_lock; + int h2b_state; + int b2h_state; + struct edma_dma_addr_s h2b_addr; + struct edma_dma_addr_s b2h_addr; + + struct proc_dir_entry *proc_edma_dir; + + struct edma_statistics_s statistics; + unsigned char local_open_status[TYPE_MAX]; + unsigned char remote_open_status[TYPE_MAX]; +}; + +struct edma_user_inft_s { + /* register user */ + int (*user_register)(struct bma_priv_data_s *priv); + + /* unregister user */ + void (*user_unregister)(struct bma_priv_data_s *priv); + + /* add msg */ + int (*add_msg)(void *msg, size_t msg_len); +}; + +int is_edma_b2h_int(struct edma_host_s *edma_host); +void edma_int_to_bmc(struct edma_host_s *edma_host); +int edma_host_mmap(struct edma_host_s *edma_hos, struct file *filp, + struct vm_area_struct *vma); +int edma_host_copy_msg(struct edma_host_s *edma_host, void *msg, + size_t msg_len); +int edma_host_add_msg(struct edma_host_s *edma_host, + struct bma_priv_data_s *priv, void *msg, size_t msg_len); +int edma_host_recv_msg(struct edma_host_s *edma_host, + struct bma_priv_data_s *priv, + struct edma_recv_msg_s **msg); +void edma_host_isr_tasklet(unsigned long data); +int edma_host_check_dma_status(enum dma_direction_e dir); +int edma_host_dma_start(struct edma_host_s *edma_host, + struct bma_priv_data_s *priv); +int edma_host_dma_transfer(struct edma_host_s *edma_host, + struct bma_priv_data_s *priv, + struct bma_dma_transfer_s *dma_transfer); +int edma_host_dma_stop(struct edma_host_s *edma_host, + struct bma_priv_data_s *priv); +irqreturn_t edma_host_irq_handle(struct edma_host_s *edma_host); +struct edma_user_inft_s *edma_host_get_user_inft(u32 type); +int edma_host_user_register(u32 type, struct edma_user_inft_s *func); +int edma_host_user_unregister(u32 type); +int edma_host_init(struct edma_host_s *edma_host); +void edma_host_cleanup(struct edma_host_s *edma_host); +int edma_host_send_driver_msg(const void *msg, size_t msg_len, int subtype); +void edma_host_reset_dma(struct edma_host_s *edma_host, int dir); +void clear_int_dmah2b(struct edma_host_s *edma_host); +void clear_int_dmab2h(struct edma_host_s *edma_host); + +enum EDMA_STATUS { + DEREGISTERED = 0, + REGISTERED = 1, + LOST, +}; +#endif diff --git a/drivers/net/ethernet/huawei/bma/include/bma_ker_intf.h b/drivers/net/ethernet/huawei/bma/include/bma_ker_intf.h new file mode 100644 index 000000000000..d1df99b0c9fd --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/include/bma_ker_intf.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _BMA_KER_INTF_H_ +#define _BMA_KER_INTF_H_ + +typedef long __kernel_time_t; +#define BAD_FUNC_ADDR(x) ((0xFFFFFFFF == (x)) || (0 == (x))) + +enum { + /* 0 -127 msg */ + TYPE_LOGIC_PARTITION = 0, + TYPE_UPGRADE = 1, + TYPE_CDEV = 2, + TYPE_VETH = 0x40, + TYPE_MAX = 128, + + TYPE_KBOX = 129, + TYPE_EDMA_DRIVER = 130, + TYPE_UNKNOWN = 0xff, +}; + +enum dma_direction_e { + BMC_TO_HOST = 0, + HOST_TO_BMC = 1, +}; + +enum dma_type_e { + DMA_NOT_LIST = 0, + DMA_LIST = 1, +}; + +enum intr_mod { + INTR_DISABLE = 0, + INTR_ENABLE = 1, +}; + +struct bma_dma_addr_s { + dma_addr_t dma_addr; + u32 dma_data_len; +}; + +struct dma_transfer_s { + struct bma_dma_addr_s host_addr; + struct bma_dma_addr_s bmc_addr; +}; + +struct dmalist_transfer_s { + dma_addr_t dma_addr; +}; + +union transfer_u { + struct dma_transfer_s nolist; + struct dmalist_transfer_s list; +}; + +struct bma_dma_transfer_s { + enum dma_type_e type; + enum dma_direction_e dir; + union transfer_u transfer; +}; + +int bma_intf_register_int_notifier(struct notifier_block *nb); +void bma_intf_unregister_int_notifier(struct notifier_block *nb); +int bma_intf_register_type(u32 type, u32 sub_type, enum intr_mod support_int, + void **handle); +int bma_intf_unregister_type(void **handle); +int bma_intf_check_dma_status(enum dma_direction_e dir); +int bma_intf_start_dma(void *handle, struct bma_dma_transfer_s *dma_transfer); +int bma_intf_int_to_bmc(void *handle); +void bma_intf_set_open_status(void *handle, int s); +int bma_intf_is_link_ok(void); +void bma_intf_reset_dma(enum dma_direction_e dir); +void bma_intf_clear_dma_int(enum dma_direction_e dir); + +int bma_cdev_recv_msg(void *handle, char __user *data, size_t count); +int bma_cdev_add_msg(void *handle, const char __user *msg, size_t msg_len); + +unsigned int bma_cdev_check_recv(void *handle); +void *bma_cdev_get_wait_queue(void *handle); +int bma_intf_check_edma_supported(void); +#endif
From: Qi Deng dengqi7@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4ETXO CVE: NA
-----------------------------------------
The BMA software is a system management software offered by Huawei. It supports the status monitoring, performance monitoring, and event monitoring of various components, including server CPUs, memory, hard disks, NICs, IB cards, PCIe cards, RAID controller cards, and optical modules.
This host_cdev_drv driver is one of the communication driver used by BMA software. It depends on the host_edma_drv driver. The host_cdev_drv driver will create 4 char devices(hwbmc0, hwbmc1, hwbmc2, hwbmc3) once loaded. These char devices offer interfaces, including open, close, read, write and poll, to upper level applications. BMA uses them to send/receive ipmi commons to/from BMC.
Link: https://lkml.org/lkml/2020/6/22/752
Signed-off-by: Qi Deng dengqi7@huawei.com Reviewed-by: Qidong Wang wangqindong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/huawei/bma/Makefile | 1 + .../net/ethernet/huawei/bma/cdev_drv/Makefile | 2 + .../ethernet/huawei/bma/cdev_drv/bma_cdev.c | 369 ++++++++++++++++++ 3 files changed, 372 insertions(+) create mode 100644 drivers/net/ethernet/huawei/bma/cdev_drv/Makefile create mode 100644 drivers/net/ethernet/huawei/bma/cdev_drv/bma_cdev.c
diff --git a/drivers/net/ethernet/huawei/bma/Makefile b/drivers/net/ethernet/huawei/bma/Makefile index 15b3e29e65f9..6309edc99bff 100644 --- a/drivers/net/ethernet/huawei/bma/Makefile +++ b/drivers/net/ethernet/huawei/bma/Makefile @@ -3,3 +3,4 @@ #
obj-$(CONFIG_BMA) += edma_drv/ +obj-$(CONFIG_BMA) += cdev_drv/ diff --git a/drivers/net/ethernet/huawei/bma/cdev_drv/Makefile b/drivers/net/ethernet/huawei/bma/cdev_drv/Makefile new file mode 100644 index 000000000000..a6186ca12038 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/cdev_drv/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_BMA) += host_cdev_drv.o +host_cdev_drv-y := bma_cdev.o \ No newline at end of file diff --git a/drivers/net/ethernet/huawei/bma/cdev_drv/bma_cdev.c b/drivers/net/ethernet/huawei/bma/cdev_drv/bma_cdev.c new file mode 100644 index 000000000000..0348a83005d6 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/cdev_drv/bma_cdev.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/io.h> +#include <linux/module.h> +#include <linux/poll.h> +#include <linux/device.h> +#include <linux/cdev.h> +#include <linux/miscdevice.h> +#include <linux/netdevice.h> +#include "../edma_drv/bma_include.h" +#include "../include/bma_ker_intf.h" + +#define CDEV_NAME_PREFIX "hwibmc" + +#ifdef DRV_VERSION +#define CDEV_VERSION MICRO_TO_STR(DRV_VERSION) +#else +#define CDEV_VERSION "0.3.4" +#endif + +#define CDEV_DEFAULT_NUM 4 +#define CDEV_MAX_NUM 8 + +#define CDEV_NAME_MAX_LEN 32 +#define CDEV_INVALID_ID (0xffffffff) + +struct cdev_statistics_s { + unsigned int recv_bytes; + unsigned int send_bytes; + unsigned int send_pkgs; + unsigned int recv_pkgs; + unsigned int send_failed_count; + unsigned int recv_failed_count; + unsigned int open_status; +}; + +struct cdev_dev { + struct miscdevice dev_struct; + struct cdev_statistics_s s; + char dev_name[CDEV_NAME_MAX_LEN]; + dev_t dev_id; + void *dev_data; + atomic_t open; + int type; +}; + +struct cdev_dev_set { + struct cdev_dev dev_list[CDEV_MAX_NUM]; + int dev_num; + unsigned int init_time; +}; + +int dev_num = CDEV_DEFAULT_NUM; /* the dev num want to create */ +int debug = DLOG_ERROR; /* debug switch */ +module_param(dev_num, int, 0640); +MODULE_PARM_DESC(dev_num, "cdev num you want"); +MODULE_PARM_DESC(debug, "Debug switch (0=close debug, 1=open debug)"); + +#define CDEV_LOG(level, fmt, args...) do {\ + if (debug >= (level)) {\ + netdev_info(0, "edma_cdev: %s, %d, " fmt "\n", \ + __func__, __LINE__, ## args);\ + } \ +} while (0) + +static int cdev_open(struct inode *inode, struct file *filp); +static int cdev_release(struct inode *inode, struct file *filp); +static unsigned int cdev_poll(struct file *file, poll_table *wait); +static ssize_t cdev_read(struct file *filp, char __user *data, size_t count, + loff_t *ppos); +static ssize_t cdev_write(struct file *filp, const char __user *data, + size_t count, loff_t *ppos); + +struct cdev_dev_set g_cdev_set; + +#define GET_PRIVATE_DATA(f) (((struct cdev_dev *)((f)->private_data))->dev_data) + +module_param_call(debug, &edma_param_set_debug, ¶m_get_int, &debug, 0644); + +static int cdev_param_get_statics(char *buf, const struct kernel_param *kp) +{ + int len = 0; + int i = 0; + __kernel_time_t running_time = 0; + + if (!buf) + return 0; + + GET_SYS_SECONDS(running_time); + running_time -= g_cdev_set.init_time; + len += sprintf(buf + len, + "============================CDEV_DRIVER_INFO=======================\n"); + len += sprintf(buf + len, "version :%s\n", CDEV_VERSION); + + len += sprintf(buf + len, "running_time :%luD %02lu:%02lu:%02lu\n", + running_time / (SECONDS_PER_DAY), + running_time % (SECONDS_PER_DAY) / SECONDS_PER_HOUR, + running_time % SECONDS_PER_HOUR / SECONDS_PER_MINUTE, + running_time % SECONDS_PER_MINUTE); + + for (i = 0; i < g_cdev_set.dev_num; i++) { + len += sprintf(buf + len, + "===================================================\n"); + len += sprintf(buf + len, "name :%s\n", + g_cdev_set.dev_list[i].dev_name); + len += + sprintf(buf + len, "dev_id :%08x\n", + g_cdev_set.dev_list[i].dev_id); + len += sprintf(buf + len, "type :%u\n", + g_cdev_set.dev_list[i].type); + len += sprintf(buf + len, "status :%s\n", + g_cdev_set.dev_list[i].s.open_status == + 1 ? "open" : "close"); + len += sprintf(buf + len, "send_pkgs :%u\n", + g_cdev_set.dev_list[i].s.send_pkgs); + len += + sprintf(buf + len, "send_bytes:%u\n", + g_cdev_set.dev_list[i].s.send_bytes); + len += sprintf(buf + len, "send_failed_count:%u\n", + g_cdev_set.dev_list[i].s.send_failed_count); + len += sprintf(buf + len, "recv_pkgs :%u\n", + g_cdev_set.dev_list[i].s.recv_pkgs); + len += sprintf(buf + len, "recv_bytes:%u\n", + g_cdev_set.dev_list[i].s.recv_bytes); + len += sprintf(buf + len, "recv_failed_count:%u\n", + g_cdev_set.dev_list[i].s.recv_failed_count); + } + + return len; +} +module_param_call(statistics, NULL, cdev_param_get_statics, &debug, 0444); +MODULE_PARM_DESC(statistics, "Statistics info of cdev driver,readonly"); + +const struct file_operations g_bma_cdev_fops = { + .owner = THIS_MODULE, + .open = cdev_open, + .release = cdev_release, + .poll = cdev_poll, + .read = cdev_read, + .write = cdev_write, +}; + +static int __init bma_cdev_init(void) +{ + int i = 0; + + int ret = 0; + int err_count = 0; + + if (!bma_intf_check_edma_supported()) + return -ENXIO; + + if (dev_num <= 0 || dev_num > CDEV_MAX_NUM) + return -EINVAL; + + memset(&g_cdev_set, 0, sizeof(struct cdev_dev_set)); + g_cdev_set.dev_num = dev_num; + + for (i = 0; i < dev_num; i++) { + struct cdev_dev *pdev = &g_cdev_set.dev_list[i]; + + sprintf(pdev->dev_name, "%s%d", CDEV_NAME_PREFIX, i); + pdev->dev_struct.name = pdev->dev_name; + pdev->dev_struct.minor = MISC_DYNAMIC_MINOR; + pdev->dev_struct.fops = &g_bma_cdev_fops; + + pdev->dev_id = CDEV_INVALID_ID; + + ret = misc_register(&pdev->dev_struct); + + if (ret) { + CDEV_LOG(DLOG_DEBUG, "misc_register failed %d", i); + err_count++; + continue; + } + + pdev->dev_id = MKDEV(MISC_MAJOR, pdev->dev_struct.minor); + + ret = bma_intf_register_type(TYPE_CDEV + i, 0, INTR_DISABLE, + &pdev->dev_data); + + if (ret) { + CDEV_LOG(DLOG_ERROR, + "cdev %d open failed ,result = %d", + i, ret); + misc_deregister(&pdev->dev_struct); + pdev->dev_id = CDEV_INVALID_ID; + err_count++; + continue; + } else { + pdev->type = TYPE_CDEV + i; + atomic_set(&pdev->open, 1); + } + + CDEV_LOG(DLOG_DEBUG, "%s id is %08x", pdev->dev_struct.name, + pdev->dev_id); + } + + if (err_count == dev_num) { + CDEV_LOG(DLOG_ERROR, "init cdev failed!"); + return -EFAULT; + } + GET_SYS_SECONDS(g_cdev_set.init_time); + return 0; +} + +static void __exit bma_cdev_exit(void) +{ + while (dev_num--) { + struct cdev_dev *pdev = &g_cdev_set.dev_list[dev_num]; + + if (pdev->dev_id != CDEV_INVALID_ID) { + if (pdev->dev_data && pdev->type != 0) + (void)bma_intf_unregister_type(&pdev->dev_data); + + (void)misc_deregister + (&g_cdev_set.dev_list[dev_num].dev_struct); + } + } +} + +int cdev_open(struct inode *inode_prt, struct file *filp) +{ + int i = 0; + struct cdev_dev *pdev = NULL; + + if (!inode_prt) + return -EFAULT; + if (!filp) + return -EFAULT; + + if (dev_num <= 0) { + CDEV_LOG(DLOG_ERROR, "dev_num error"); + return -EFAULT; + } + + for (i = 0; i < dev_num; i++) { + pdev = &g_cdev_set.dev_list[i]; + + if (pdev->dev_id == inode_prt->i_rdev) + break; + } + + if (i == dev_num) { + CDEV_LOG(DLOG_ERROR, "can not find dev id %08x", + inode_prt->i_rdev); + return -ENODEV; + } + /*each device can be opened only onece */ + if (atomic_dec_and_test(&pdev->open) == 0) { + CDEV_LOG(DLOG_ERROR, "%s is already opened", + pdev->dev_name); + atomic_inc(&pdev->open); + return -EBUSY; /* already opened */ + } + + filp->private_data = &g_cdev_set.dev_list[i]; + bma_intf_set_open_status(pdev->dev_data, DEV_OPEN); + ((struct cdev_dev *)filp->private_data)->s.open_status++; + + return 0; +} + +int cdev_release(struct inode *inode_prt, struct file *filp) +{ + struct cdev_dev *pdev = NULL; + + if (!filp) + return 0; + + pdev = (struct cdev_dev *)filp->private_data; + if (pdev) { + ((struct cdev_dev *)filp->private_data)->s.open_status--; + bma_intf_set_open_status(pdev->dev_data, DEV_CLOSE); + atomic_inc(&pdev->open); + filp->private_data = NULL; + } + + return 0; +} + +unsigned int cdev_poll(struct file *filp, poll_table *wait) +{ + unsigned int mask = 0; + wait_queue_head_t *queue_head = NULL; + + if (!filp) + return 0; + queue_head = (wait_queue_head_t *) + bma_cdev_get_wait_queue(GET_PRIVATE_DATA(filp)); + + if (!queue_head) + return 0; + + poll_wait(filp, queue_head, wait); + + if (bma_cdev_check_recv(GET_PRIVATE_DATA(filp))) + mask |= (POLLIN | POLLRDNORM); + + CDEV_LOG(DLOG_DEBUG, "poll return %08x", mask); + + return mask; +} + +ssize_t cdev_read(struct file *filp, char __user *data, size_t count, + loff_t *ppos) +{ + int ret = 0; + + CDEV_LOG(DLOG_DEBUG, "data is %p,count is %u", data, + (unsigned int)count); + + if (!data || count <= 0) + return -EFAULT; + + ret = bma_cdev_recv_msg(GET_PRIVATE_DATA(filp), data, count); + + if (ret > 0) { + ((struct cdev_dev *)filp->private_data)->s.recv_bytes += ret; + ((struct cdev_dev *)filp->private_data)->s.recv_pkgs++; + } else { + ((struct cdev_dev *)filp->private_data)->s.recv_failed_count++; + } + + return ret; +} + +ssize_t cdev_write(struct file *filp, const char __user *data, size_t count, + loff_t *ppos) +{ + int ret = 0; + + if (!data || count <= 0) + return -EFAULT; + + CDEV_LOG(DLOG_DEBUG, "data is %p,count is %u", data, + (unsigned int)count); + ret = bma_cdev_add_msg(GET_PRIVATE_DATA(filp), data, count); + + if (ret > 0) { + ((struct cdev_dev *)filp->private_data)->s.send_bytes += ret; + ((struct cdev_dev *)filp->private_data)->s.send_pkgs++; + } else { + ((struct cdev_dev *)filp->private_data)->s.send_failed_count++; + } + + return ret; +} + +MODULE_AUTHOR("HUAWEI TECHNOLOGIES CO., LTD."); +MODULE_DESCRIPTION("HUAWEI CDEV DRIVER"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(CDEV_VERSION); + +module_init(bma_cdev_init); +module_exit(bma_cdev_exit);
From: Qi Deng dengqi7@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4ETXO CVE: NA
-----------------------------------------
The BMA software is a system management software offered by Huawei. It supports the status monitoring, performance monitoring, and event monitoring of various components, including server CPUs, memory, hard disks, NICs, IB cards, PCIe cards, RAID controller cards, and optical modules.
This host_veth_drv driver is one of the communication drivers used by BMA software. It depends on the host_edma_drv driver. The host_veth_drv driver will create a virtual net device "veth" once loaded. BMA software will use it to send/receive RESTful messages to/from BMC software.
Link: https://lkml.org/lkml/2020/6/22/752
Signed-off-by: Qi Deng dengqi7@huawei.com Reviewed-by: Qidong Wang wangqindong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/huawei/bma/Makefile | 1 + .../net/ethernet/huawei/bma/veth_drv/Makefile | 2 + .../ethernet/huawei/bma/veth_drv/veth_hb.c | 2502 +++++++++++++++++ .../ethernet/huawei/bma/veth_drv/veth_hb.h | 440 +++ 4 files changed, 2945 insertions(+) create mode 100644 drivers/net/ethernet/huawei/bma/veth_drv/Makefile create mode 100644 drivers/net/ethernet/huawei/bma/veth_drv/veth_hb.c create mode 100644 drivers/net/ethernet/huawei/bma/veth_drv/veth_hb.h
diff --git a/drivers/net/ethernet/huawei/bma/Makefile b/drivers/net/ethernet/huawei/bma/Makefile index 6309edc99bff..6f452aa96a69 100644 --- a/drivers/net/ethernet/huawei/bma/Makefile +++ b/drivers/net/ethernet/huawei/bma/Makefile @@ -4,3 +4,4 @@
obj-$(CONFIG_BMA) += edma_drv/ obj-$(CONFIG_BMA) += cdev_drv/ +obj-$(CONFIG_BMA) += veth_drv/ diff --git a/drivers/net/ethernet/huawei/bma/veth_drv/Makefile b/drivers/net/ethernet/huawei/bma/veth_drv/Makefile new file mode 100644 index 000000000000..c9ab07371ef4 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/veth_drv/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_BMA) += host_veth_drv.o +host_veth_drv-y := veth_hb.o \ No newline at end of file diff --git a/drivers/net/ethernet/huawei/bma/veth_drv/veth_hb.c b/drivers/net/ethernet/huawei/bma/veth_drv/veth_hb.c new file mode 100644 index 000000000000..9681ce3bfc7b --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/veth_drv/veth_hb.c @@ -0,0 +1,2502 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/device.h> +#include <linux/kthread.h> + +#include <linux/ethtool.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> + +#include <linux/vmalloc.h> +#include <linux/atomic.h> +#include <linux/proc_fs.h> +#include <linux/mm.h> +#include <linux/gfp.h> +#include <asm/page.h> + +#include <linux/ip.h> + +#include "veth_hb.h" + +#define GET_QUEUE_STAT(node, stat) \ + ((node) ? ((char *)(node) + (stat)->stat_offset) : NULL) + +#define GET_SHM_QUEUE_STAT(node, stat) \ + (((node) && (node)->pshmqhd_v) ? \ + ((char *)(node)->pshmqhd_v + (stat)->stat_offset) : NULL) + +#define GET_STATS_VALUE(ptr, pstat) \ + ((ptr) ? (((pstat)->sizeof_stat == sizeof(u64)) ? \ + (*(u64 *)(ptr)) : (*(u32 *)(ptr))) : 0) + +#define GET_DMA_DIRECTION(type) \ + (((type) == BSPVETH_RX) ? BMC_TO_HOST : HOST_TO_BMC) + +#define CHECK_DMA_QUEUE_EMPTY(type, queue) \ + (((type) == BSPVETH_RX && \ + (queue)->pshmqhd_v->head == (queue)->pshmqhd_v->tail) || \ + ((type) != BSPVETH_RX && (queue)->head == (queue)->tail)) + +#define CHECK_DMA_RXQ_FAULT(queue, type, cnt) \ + ((queue)->dmal_cnt > 1 && (cnt) < ((queue)->work_limit / 2) && \ + (type) == BSPVETH_RX) + +static u32 veth_ethtool_get_link(struct net_device *dev); + +int debug; /* debug switch*/ +module_param_call(debug, &edma_param_set_debug, ¶m_get_int, &debug, 0644); + +MODULE_PARM_DESC(debug, "Debug switch (0=close debug, 1=open debug)"); + +#define VETH_LOG(lv, fmt, args...) \ +do { \ + if (debug < (lv)) \ + continue; \ + if (lv == DLOG_DEBUG) \ + netdev_dbg(g_bspveth_dev.pnetdev, "%s(), %d, " \ + fmt, __func__, __LINE__, ## args); \ + else if (lv == DLOG_ERROR) \ + netdev_err(g_bspveth_dev.pnetdev, "%s(), %d, " \ + fmt, __func__, __LINE__, ## args); \ +} while (0) + +#ifdef __UT_TEST +u32 g_testdma; + +u32 g_testlbk; + +#endif + +struct bspveth_device g_bspveth_dev = {}; + +static int veth_int_handler(struct notifier_block *pthis, unsigned long ev, + void *unuse); + +static struct notifier_block g_veth_int_nb = { + .notifier_call = veth_int_handler, +}; + +static const struct veth_stats veth_gstrings_stats[] = { + {"rx_packets", NET_STATS, VETH_STAT_SIZE(stats.rx_packets), + VETH_STAT_OFFSET(stats.rx_packets)}, + {"rx_bytes", NET_STATS, VETH_STAT_SIZE(stats.rx_bytes), + VETH_STAT_OFFSET(stats.rx_bytes)}, + {"rx_dropped", NET_STATS, VETH_STAT_SIZE(stats.rx_dropped), + VETH_STAT_OFFSET(stats.rx_dropped)}, + {"rx_head", QUEUE_RX_STATS, QUEUE_TXRX_STAT_SIZE(head), + QUEUE_TXRX_STAT_OFFSET(head)}, + {"rx_tail", QUEUE_RX_STATS, QUEUE_TXRX_STAT_SIZE(tail), + QUEUE_TXRX_STAT_OFFSET(tail)}, + {"rx_next_to_fill", QUEUE_RX_STATS, + QUEUE_TXRX_STAT_SIZE(next_to_fill), + QUEUE_TXRX_STAT_OFFSET(next_to_fill)}, + {"rx_shmq_head", SHMQ_RX_STATS, SHMQ_TXRX_STAT_SIZE(head), + SHMQ_TXRX_STAT_OFFSET(head)}, + {"rx_shmq_tail", SHMQ_RX_STATS, SHMQ_TXRX_STAT_SIZE(tail), + SHMQ_TXRX_STAT_OFFSET(tail)}, + {"rx_shmq_next_to_free", SHMQ_RX_STATS, + SHMQ_TXRX_STAT_SIZE(next_to_free), + SHMQ_TXRX_STAT_OFFSET(next_to_free)}, + {"rx_queue_full", QUEUE_RX_STATS, + QUEUE_TXRX_STAT_SIZE(s.q_full), + QUEUE_TXRX_STAT_OFFSET(s.q_full)}, + {"rx_dma_busy", QUEUE_RX_STATS, + QUEUE_TXRX_STAT_SIZE(s.dma_busy), + QUEUE_TXRX_STAT_OFFSET(s.dma_busy)}, + {"rx_dma_failed", QUEUE_RX_STATS, + QUEUE_TXRX_STAT_SIZE(s.dma_failed), + QUEUE_TXRX_STAT_OFFSET(s.dma_failed)}, + + {"tx_packets", NET_STATS, VETH_STAT_SIZE(stats.tx_packets), + VETH_STAT_OFFSET(stats.tx_packets)}, + {"tx_bytes", NET_STATS, VETH_STAT_SIZE(stats.tx_bytes), + VETH_STAT_OFFSET(stats.tx_bytes)}, + {"tx_dropped", NET_STATS, VETH_STAT_SIZE(stats.tx_dropped), + VETH_STAT_OFFSET(stats.tx_dropped)}, + + {"tx_head", QUEUE_TX_STATS, QUEUE_TXRX_STAT_SIZE(head), + QUEUE_TXRX_STAT_OFFSET(head)}, + {"tx_tail", QUEUE_TX_STATS, QUEUE_TXRX_STAT_SIZE(tail), + QUEUE_TXRX_STAT_OFFSET(tail)}, + {"tx_next_to_free", QUEUE_TX_STATS, + QUEUE_TXRX_STAT_SIZE(next_to_free), + QUEUE_TXRX_STAT_OFFSET(next_to_free)}, + {"tx_shmq_head", SHMQ_TX_STATS, SHMQ_TXRX_STAT_SIZE(head), + SHMQ_TXRX_STAT_OFFSET(head)}, + {"tx_shmq_tail", SHMQ_TX_STATS, SHMQ_TXRX_STAT_SIZE(tail), + SHMQ_TXRX_STAT_OFFSET(tail)}, + {"tx_shmq_next_to_free", SHMQ_TX_STATS, + SHMQ_TXRX_STAT_SIZE(next_to_free), + SHMQ_TXRX_STAT_OFFSET(next_to_free)}, + + {"tx_queue_full", QUEUE_TX_STATS, + QUEUE_TXRX_STAT_SIZE(s.q_full), + QUEUE_TXRX_STAT_OFFSET(s.q_full)}, + {"tx_dma_busy", QUEUE_TX_STATS, + QUEUE_TXRX_STAT_SIZE(s.dma_busy), + QUEUE_TXRX_STAT_OFFSET(s.dma_busy)}, + {"tx_dma_failed", QUEUE_TX_STATS, + QUEUE_TXRX_STAT_SIZE(s.dma_failed), + QUEUE_TXRX_STAT_OFFSET(s.dma_failed)}, + + {"recv_int", VETH_STATS, VETH_STAT_SIZE(recv_int), + VETH_STAT_OFFSET(recv_int)}, + {"tobmc_int", VETH_STATS, VETH_STAT_SIZE(tobmc_int), + VETH_STAT_OFFSET(tobmc_int)}, +}; + +#define VETH_GLOBAL_STATS_LEN \ + (sizeof(veth_gstrings_stats) / sizeof(struct veth_stats)) + +static int veth_param_get_statics(char *buf, const struct kernel_param *kp) +{ + int len = 0; + int i = 0, j = 0, type = 0; + struct bspveth_rxtx_q *pqueue = NULL; + __kernel_time_t running_time = 0; + + if (!buf) + return 0; + + GET_SYS_SECONDS(running_time); + + running_time -= g_bspveth_dev.init_time; + + len += sprintf(buf + len, + "================VETH INFO=============\r\n"); + len += sprintf(buf + len, "[version ]:" VETH_VERSION "\n"); + len += sprintf(buf + len, "[link state ]:%d\n", + veth_ethtool_get_link(g_bspveth_dev.pnetdev)); + len += sprintf(buf + len, "[running_time]:%luD %02lu:%02lu:%02lu\n", + running_time / (SECONDS_PER_DAY), + running_time % (SECONDS_PER_DAY) / SECONDS_PER_HOUR, + running_time % SECONDS_PER_HOUR / SECONDS_PER_MINUTE, + running_time % SECONDS_PER_MINUTE); + len += sprintf(buf + len, + "[bspveth_dev ]:MAX_QUEUE_NUM :0x%-16x ", + MAX_QUEUE_NUM); + len += sprintf(buf + len, + "MAX_QUEUE_BDNUM :0x%-16x\r\n", MAX_QUEUE_BDNUM); + len += sprintf(buf + len, + "[bspveth_dev ]:pnetdev :0x%-16p ", + g_bspveth_dev.pnetdev); + len += sprintf(buf + len, + "ppcidev :0x%-16p\r\n", + g_bspveth_dev.ppcidev); + len += sprintf(buf + len, + "[bspveth_dev ]:pshmpool_p:0x%-16p ", + g_bspveth_dev.pshmpool_p); + len += sprintf(buf + len, + "pshmpool_v :0x%-16p\r\n", + g_bspveth_dev.pshmpool_v); + len += sprintf(buf + len, + "[bspveth_dev ]:shmpoolsize:0x%-16x ", + g_bspveth_dev.shmpoolsize); + len += sprintf(buf + len, + "g_veth_dbg_lv :0x%-16x\r\n", debug); + + for (i = 0; i < MAX_QUEUE_NUM; i++) { + for (j = 0, type = BSPVETH_RX; j < 2; j++, type++) { + if (type == BSPVETH_RX) { + pqueue = g_bspveth_dev.prx_queue[i]; + len += sprintf(buf + len, + "=============RXQUEUE STATIS============\r\n"); + } else { + pqueue = g_bspveth_dev.ptx_queue[i]; + len += sprintf(buf + len, + "=============TXQUEUE STATIS============\r\n"); + } + + if (!pqueue) { + len += sprintf(buf + len, "NULL\r\n"); + continue; + } + + len += sprintf(buf + len, + "QUEUE[%d]--[pkt ] :%lld\r\n", i, + pqueue->s.pkt); + len += sprintf(buf + len, + "QUEUE[%d]--[pktbyte ] :%lld\r\n", i, + pqueue->s.pktbyte); + len += sprintf(buf + len, + "QUEUE[%d]--[refill ] :%lld\r\n", i, + pqueue->s.refill); + len += sprintf(buf + len, + "QUEUE[%d]--[freetx ] :%lld\r\n", i, + pqueue->s.freetx); + len += sprintf(buf + len, + "QUEUE[%d]--[dmapkt ] :%lld\r\n", i, + pqueue->s.dmapkt); + len += sprintf(buf + len, + "QUEUE[%d]--[dmapktbyte ] :%lld\r\n", i, + pqueue->s.dmapktbyte); + len += sprintf(buf + len, + "QUEUE[%d]--[next_to_fill ] :%d\r\n", i, + pqueue->next_to_fill); + len += sprintf(buf + len, + "QUEUE[%d]--[next_to_free ] :%d\r\n", i, + pqueue->next_to_free); + len += sprintf(buf + len, + "QUEUE[%d]--[head ] :%d\r\n", i, + pqueue->head); + len += sprintf(buf + len, + "QUEUE[%d]--[tail ] :%d\r\n", i, + pqueue->tail); + len += sprintf(buf + len, + "QUEUE[%d]--[work_limit ] :%d\r\n", i, + pqueue->work_limit); + len += sprintf(buf + len, + "=================SHARE=================\r\n"); + len += sprintf(buf + len, + "QUEUE[%d]--[next_to_fill] :%d\r\n", i, + pqueue->pshmqhd_v->next_to_fill); + len += sprintf(buf + len, + "QUEUE[%d]--[next_to_free] :%d\r\n", i, + pqueue->pshmqhd_v->next_to_free); + len += sprintf(buf + len, + "QUEUE[%d]--[head ] :%d\r\n", i, + pqueue->pshmqhd_v->head); + len += sprintf(buf + len, + "QUEUE[%d]--[tail ] :%d\r\n", i, + pqueue->pshmqhd_v->tail); + len += sprintf(buf + len, + "=======================================\r\n"); + len += sprintf(buf + len, + "QUEUE[%d]--[dropped_pkt] :%d\r\n", i, + pqueue->s.dropped_pkt); + len += sprintf(buf + len, + "QUEUE[%d]--[netifrx_err] :%d\r\n", i, + pqueue->s.netifrx_err); + len += sprintf(buf + len, + "QUEUE[%d]--[null_point ] :%d\r\n", i, + pqueue->s.null_point); + len += sprintf(buf + len, + "QUEUE[%d]--[retry_err ] :%d\r\n", i, + pqueue->s.retry_err); + len += sprintf(buf + len, + "QUEUE[%d]--[allocskb_err ] :%d\r\n", + i, pqueue->s.allocskb_err); + len += sprintf(buf + len, + "QUEUE[%d]--[q_full ] :%d\r\n", i, + pqueue->s.q_full); + len += sprintf(buf + len, + "QUEUE[%d]--[q_emp ] :%d\r\n", i, + pqueue->s.q_emp); + len += sprintf(buf + len, + "QUEUE[%d]--[need_fill ] :%d\r\n", i, + pqueue->s.need_fill); + len += sprintf(buf + len, + "QUEUE[%d]--[need_free ] :%d\r\n", i, + pqueue->s.need_free); + len += sprintf(buf + len, + "QUEUE[%d]--[type_err ] :%d\r\n", i, + pqueue->s.type_err); + len += sprintf(buf + len, + "QUEUE[%d]--[shm_full ] :%d\r\n", i, + pqueue->s.shm_full); + len += sprintf(buf + len, + "QUEUE[%d]--[shm_emp ] :%d\r\n", i, + pqueue->s.shm_emp); + len += sprintf(buf + len, + "QUEUE[%d]--[shmretry_err ] :%d\r\n", i, + pqueue->s.shmretry_err); + len += sprintf(buf + len, + "QUEUE[%d]--[shmqueue_noinit] :%d\r\n", + i, pqueue->s.shmqueue_noinit); + len += sprintf(buf + len, + "QUEUE[%d]--[dma_busy ] :%d\r\n", i, + pqueue->s.dma_busy); + len += sprintf(buf + len, + "QUEUE[%d]--[dma_mapping_err] :%d\r\n", + i, pqueue->s.dma_mapping_err); + len += sprintf(buf + len, + "QUEUE[%d]--[dma_failed ] :%d\r\n", i, + pqueue->s.dma_failed); + len += sprintf(buf + len, + "QUEUE[%d]--[dma_burst ] :%d\r\n", i, + pqueue->s.dma_burst); + len += sprintf(buf + len, + "QUEUE[%d]--[lbk_cnt ] :%d\r\n", i, + pqueue->s.lbk_cnt); + len += sprintf(buf + len, + "QUEUE[%d]--[dma_need_offset] :%d\r\n", + i, pqueue->s.dma_need_offset); + len += sprintf(buf + len, + "QUEUE[%d]--[lbk_txerr ] :%d\r\n", i, + pqueue->s.lbk_txerr); + } + } + + len += sprintf(buf + len, "=============BSPVETH STATIS===========\r\n"); + len += sprintf(buf + len, + "[bspveth_dev]:run_dma_rx_task:0x%-8x(%d)\r\n", + g_bspveth_dev.run_dma_rx_task, + g_bspveth_dev.run_dma_rx_task); + len += sprintf(buf + len, + "[bspveth_dev]:run_dma_tx_task:0x%-8x(%d)\r\n", + g_bspveth_dev.run_dma_tx_task, + g_bspveth_dev.run_dma_tx_task); + len += sprintf(buf + len, + "[bspveth_dev]:run_skb_rx_task:0x%-8x(%d)\r\n", + g_bspveth_dev.run_skb_rx_task, + g_bspveth_dev.run_skb_rx_task); + len += sprintf(buf + len, + "[bspveth_dev]:run_skb_fr_task:0x%-8x(%d)\r\n", + g_bspveth_dev.run_skb_fr_task, + g_bspveth_dev.run_skb_fr_task); + len += sprintf(buf + len, + "[bspveth_dev]:recv_int :0x%-8x(%d)\r\n", + g_bspveth_dev.recv_int, g_bspveth_dev.recv_int); + len += sprintf(buf + len, + "[bspveth_dev]:tobmc_int :0x%-8x(%d)\r\n", + g_bspveth_dev.tobmc_int, + g_bspveth_dev.tobmc_int); + len += sprintf(buf + len, + "[bspveth_dev]:shutdown_cnt :0x%-8x(%d)\r\n", + g_bspveth_dev.shutdown_cnt, + g_bspveth_dev.shutdown_cnt); + + return len; +} + +module_param_call(statistics, NULL, veth_param_get_statics, &debug, 0444); + +MODULE_PARM_DESC(statistics, "Statistics info of veth driver,readonly"); + +static void veth_reset_dma(int type) +{ + if (type == BSPVETH_RX) + bma_intf_reset_dma(BMC_TO_HOST); + else if (type == BSPVETH_TX) + bma_intf_reset_dma(HOST_TO_BMC); + else + return; +} + +s32 bspveth_setup_tx_resources(struct bspveth_device *pvethdev, + struct bspveth_rxtx_q *ptx_queue) +{ + unsigned int size; + + if (!pvethdev || !ptx_queue) + return BSP_ERR_NULL_POINTER; + + ptx_queue->count = MAX_QUEUE_BDNUM; + + size = sizeof(struct bspveth_bd_info) * ptx_queue->count; + ptx_queue->pbdinfobase_v = vmalloc(size); + if (!ptx_queue->pbdinfobase_v) + goto alloc_failed; + + memset(ptx_queue->pbdinfobase_v, 0, size); + + /* round up to nearest 4K */ + ptx_queue->size = ptx_queue->count * sizeof(struct bspveth_bd_info); + ptx_queue->size = ALIGN(ptx_queue->size, 4096); + + /* prepare 4096 send buffer */ + ptx_queue->pbdbase_v = kmalloc(ptx_queue->size, GFP_KERNEL); + if (!ptx_queue->pbdbase_v) { + VETH_LOG(DLOG_ERROR, + "Unable to kmalloc for the receive descriptor ring\n"); + + vfree(ptx_queue->pbdinfobase_v); + ptx_queue->pbdinfobase_v = NULL; + + goto alloc_failed; + } + + ptx_queue->pbdbase_p = (u8 *)(__pa((BSP_VETH_T)(ptx_queue->pbdbase_v))); + + ptx_queue->next_to_fill = 0; + ptx_queue->next_to_free = 0; + ptx_queue->head = 0; + ptx_queue->tail = 0; + ptx_queue->work_limit = BSPVETH_WORK_LIMIT; + + memset(&ptx_queue->s, 0, sizeof(struct bspveth_rxtx_statis)); + + return 0; + +alloc_failed: + return -ENOMEM; +} + +void bspveth_free_tx_resources(struct bspveth_device *pvethdev, + struct bspveth_rxtx_q *ptx_queue) +{ + unsigned int i; + unsigned long size; + struct bspveth_bd_info *pbdinfobase_v = NULL; + struct sk_buff *skb = NULL; + + if (!ptx_queue || !pvethdev) + return; + + pbdinfobase_v = ptx_queue->pbdinfobase_v; + if (!pbdinfobase_v) + return; + + for (i = 0; i < ptx_queue->count; i++) { + skb = pbdinfobase_v[i].pdma_v; + if (skb) + dev_kfree_skb_any(skb); + + pbdinfobase_v[i].pdma_v = NULL; + } + + size = sizeof(struct bspveth_bd_info) * ptx_queue->count; + memset(ptx_queue->pbdinfobase_v, 0, size); + memset(ptx_queue->pbdbase_v, 0, ptx_queue->size); + + ptx_queue->next_to_fill = 0; + ptx_queue->next_to_free = 0; + ptx_queue->head = 0; + ptx_queue->tail = 0; + + vfree(ptx_queue->pbdinfobase_v); + ptx_queue->pbdinfobase_v = NULL; + + kfree(ptx_queue->pbdbase_v); + ptx_queue->pbdbase_v = NULL; + + VETH_LOG(DLOG_DEBUG, "bspveth free tx resources ok, count=%d\n", + ptx_queue->count); +} + +s32 bspveth_setup_all_tx_resources(struct bspveth_device *pvethdev) +{ + int qid = 0; + int i = 0; + int err = 0; + u8 *shmq_head_p = NULL; + struct bspveth_shmq_hd *shmq_head = NULL; + + if (!pvethdev) + return BSP_ERR_NULL_POINTER; + for (qid = 0; qid < MAX_QUEUE_NUM; qid++) { + pvethdev->ptx_queue[qid] = + kmalloc(sizeof(*pvethdev->ptx_queue[qid]), + GFP_KERNEL); + if (!pvethdev->ptx_queue[qid]) { + VETH_LOG(DLOG_ERROR, + "kmalloc failed for ptx_queue[%d]\n", qid); + err = -1; + goto failed; + } + memset(pvethdev->ptx_queue[qid], + 0, sizeof(struct bspveth_rxtx_q)); + shmq_head = (struct bspveth_shmq_hd *)(pvethdev->pshmpool_v + + MAX_SHAREQUEUE_SIZE * (qid)); + pvethdev->ptx_queue[qid]->pshmqhd_v = shmq_head; + shmq_head_p = pvethdev->pshmpool_p + MAX_SHAREQUEUE_SIZE * qid; + pvethdev->ptx_queue[qid]->pshmqhd_p = shmq_head_p; + + pvethdev->ptx_queue[qid]->pshmbdbase_v = + (struct bspveth_dma_shmbd *)((BSP_VETH_T)(shmq_head) + + BSPVETH_SHMBDBASE_OFFSET); + pvethdev->ptx_queue[qid]->pshmbdbase_p = + (u8 *)((BSP_VETH_T)(shmq_head_p) + + BSPVETH_SHMBDBASE_OFFSET); + pvethdev->ptx_queue[qid]->pdmalbase_v = + (struct bspveth_dmal *)((BSP_VETH_T)(shmq_head) + + SHMDMAL_OFFSET); + pvethdev->ptx_queue[qid]->pdmalbase_p = + (u8 *)(u64)(VETH_SHAREPOOL_BASE_INBMC + + MAX_SHAREQUEUE_SIZE * qid + + SHMDMAL_OFFSET); + + memset(pvethdev->ptx_queue[qid]->pdmalbase_v, + 0, MAX_SHMDMAL_SIZE); + + err = bspveth_setup_tx_resources(pvethdev, + pvethdev->ptx_queue[qid]); + if (err) { + pvethdev->ptx_queue[qid]->pshmqhd_v = NULL; + kfree(pvethdev->ptx_queue[qid]); + pvethdev->ptx_queue[i] = NULL; + VETH_LOG(DLOG_ERROR, + "Allocation for Tx Queue %u failed\n", qid); + + goto failed; + } + } + + return 0; +failed: + for (i = 0; i < MAX_QUEUE_NUM; i++) { + bspveth_free_tx_resources(pvethdev, pvethdev->ptx_queue[i]); + kfree(pvethdev->ptx_queue[i]); + pvethdev->ptx_queue[i] = NULL; + } + + return err; +} + +void bspveth_free_all_tx_resources(struct bspveth_device *pvethdev) +{ + int i; + + if (!pvethdev) + return; + + for (i = 0; i < MAX_QUEUE_NUM; i++) { + if (pvethdev->ptx_queue[i]) + bspveth_free_tx_resources(pvethdev, + pvethdev->ptx_queue[i]); + + kfree(pvethdev->ptx_queue[i]); + pvethdev->ptx_queue[i] = NULL; + } +} + +s32 veth_alloc_one_rx_skb(struct bspveth_rxtx_q *prx_queue, int idx) +{ + dma_addr_t dma = 0; + struct sk_buff *skb; + struct bspveth_bd_info *pbdinfobase_v = NULL; + struct bspveth_dma_bd *pbdbase_v = NULL; + + pbdinfobase_v = prx_queue->pbdinfobase_v; + pbdbase_v = prx_queue->pbdbase_v; + + skb = netdev_alloc_skb(g_bspveth_dev.pnetdev, + BSPVETH_SKB_SIZE + BSPVETH_CACHELINE_SIZE); + if (!skb) { + VETH_LOG(DLOG_ERROR, "netdev_alloc_skb failed\n"); + return -ENOMEM; + } + + /* advance the data pointer to the next cache line */ + skb_reserve(skb, PTR_ALIGN(skb->data, + BSPVETH_CACHELINE_SIZE) - skb->data); + + dma = dma_map_single(&g_bspveth_dev.ppcidev->dev, + skb->data, BSPVETH_SKB_SIZE, DMA_FROM_DEVICE); + if (dma_mapping_error(&g_bspveth_dev.ppcidev->dev, dma)) { + VETH_LOG(DLOG_ERROR, "dma_mapping_error failed\n"); + dev_kfree_skb_any(skb); + return -EFAULT; + } + +#ifdef __UT_TEST + if (g_testdma) + VETH_LOG(DLOG_ERROR, + "[refill]:dma=0x%llx,skb=%p,skb->len=%d\r\n", + dma, skb, skb->len); +#endif + + pbdinfobase_v[idx].pdma_v = skb; + pbdinfobase_v[idx].len = BSPVETH_SKB_SIZE; + + pbdbase_v[idx].dma_p = dma; + pbdbase_v[idx].len = BSPVETH_SKB_SIZE; + + return 0; +} + +s32 veth_refill_rxskb(struct bspveth_rxtx_q *prx_queue, int queue) +{ + int i, work_limit; + unsigned int next_to_fill, tail; + int ret = BSP_OK; + + if (!prx_queue) + return BSP_ERR_AGAIN; + + work_limit = prx_queue->work_limit; + next_to_fill = prx_queue->next_to_fill; + tail = prx_queue->tail; + + for (i = 0; i < work_limit; i++) { + if (!JUDGE_RX_QUEUE_SPACE(next_to_fill, tail, 1)) + break; + + ret = veth_alloc_one_rx_skb(prx_queue, next_to_fill); + if (ret) + break; + + g_bspveth_dev.prx_queue[queue]->s.refill++; + next_to_fill = (next_to_fill + 1) & BSPVETH_POINT_MASK; + } + + prx_queue->next_to_fill = next_to_fill; + + tail = prx_queue->tail; + if (JUDGE_RX_QUEUE_SPACE(next_to_fill, tail, 1)) { + VETH_LOG(DLOG_DEBUG, "next_to_fill(%d) != tail(%d)\n", + next_to_fill, tail); + + return BSP_ERR_AGAIN; + } + + return 0; +} + +s32 bspveth_setup_rx_skb(struct bspveth_device *pvethdev, + struct bspveth_rxtx_q *prx_queue) +{ + u32 idx; + int ret = 0; + + if (!pvethdev || !prx_queue) + return BSP_ERR_NULL_POINTER; + + VETH_LOG(DLOG_DEBUG, "waite setup rx skb ,count=%d\n", + prx_queue->count); + + for (idx = 0; idx < prx_queue->count - 1; idx++) { + ret = veth_alloc_one_rx_skb(prx_queue, idx); + if (ret) + break; + } + + if (!idx) /* Can't alloc even one packets */ + return -EFAULT; + + prx_queue->next_to_fill = idx; + + VETH_LOG(DLOG_DEBUG, "prx_queue->next_to_fill=%d\n", + prx_queue->next_to_fill); + + VETH_LOG(DLOG_DEBUG, "setup rx skb ok, count=%d\n", prx_queue->count); + + return BSP_OK; +} + +void bspveth_free_rx_skb(struct bspveth_device *pvethdev, + struct bspveth_rxtx_q *prx_queue) +{ + u32 i = 0; + struct bspveth_bd_info *pbdinfobase_v = NULL; + struct bspveth_dma_bd *pbdbase_v = NULL; + struct sk_buff *skb = NULL; + + if (!pvethdev || !prx_queue) + return; + + pbdinfobase_v = prx_queue->pbdinfobase_v; + pbdbase_v = prx_queue->pbdbase_v; + if (!pbdinfobase_v || !pbdbase_v) + return; + + /* Free all the Rx ring pages */ + for (i = 0; i < prx_queue->count; i++) { + skb = pbdinfobase_v[i].pdma_v; + if (!skb) + continue; + + dma_unmap_single(&g_bspveth_dev.ppcidev->dev, + pbdbase_v[i].dma_p, BSPVETH_SKB_SIZE, + DMA_FROM_DEVICE); + dev_kfree_skb_any(skb); + + pbdinfobase_v[i].pdma_v = NULL; + } + + prx_queue->next_to_fill = 0; +} + +s32 bspveth_setup_all_rx_skb(struct bspveth_device *pvethdev) +{ + int qid, i, err = BSP_OK; + + if (!pvethdev) + return BSP_ERR_NULL_POINTER; + + for (qid = 0; qid < MAX_QUEUE_NUM; qid++) { + err = bspveth_setup_rx_skb(pvethdev, pvethdev->prx_queue[qid]); + if (err) { + VETH_LOG(DLOG_ERROR, "queue[%d]setup RX skb failed\n", + qid); + goto failed; + } + + VETH_LOG(DLOG_DEBUG, "queue[%d] bspveth_setup_rx_skb ok\n", + qid); + } + + return 0; + +failed: + for (i = 0; i < MAX_QUEUE_NUM; i++) + bspveth_free_rx_skb(pvethdev, pvethdev->prx_queue[i]); + + return err; +} + +void bspveth_free_all_rx_skb(struct bspveth_device *pvethdev) +{ + int qid; + + if (!pvethdev) + return; + + /* Free all the Rx ring pages */ + for (qid = 0; qid < MAX_QUEUE_NUM; qid++) + bspveth_free_rx_skb(pvethdev, pvethdev->prx_queue[qid]); +} + +s32 bspveth_setup_rx_resources(struct bspveth_device *pvethdev, + struct bspveth_rxtx_q *prx_queue) +{ + int size; + + if (!pvethdev || !prx_queue) + return BSP_ERR_NULL_POINTER; + + prx_queue->count = MAX_QUEUE_BDNUM; + size = sizeof(*prx_queue->pbdinfobase_v) * prx_queue->count; + prx_queue->pbdinfobase_v = vmalloc(size); + if (!prx_queue->pbdinfobase_v) { + VETH_LOG(DLOG_ERROR, + "Unable to vmalloc for the receive descriptor ring\n"); + + goto alloc_failed; + } + + memset(prx_queue->pbdinfobase_v, 0, size); + + /* Round up to nearest 4K */ + prx_queue->size = prx_queue->count * sizeof(*prx_queue->pbdbase_v); + prx_queue->size = ALIGN(prx_queue->size, 4096); + prx_queue->pbdbase_v = kmalloc(prx_queue->size, GFP_ATOMIC); + if (!prx_queue->pbdbase_v) { + VETH_LOG(DLOG_ERROR, + "Unable to kmalloc for the receive descriptor ring\n"); + + vfree(prx_queue->pbdinfobase_v); + prx_queue->pbdinfobase_v = NULL; + + goto alloc_failed; + } + + prx_queue->pbdbase_p = (u8 *)__pa((BSP_VETH_T) (prx_queue->pbdbase_v)); + + prx_queue->next_to_fill = 0; + prx_queue->next_to_free = 0; + prx_queue->head = 0; + prx_queue->tail = 0; + + prx_queue->work_limit = BSPVETH_WORK_LIMIT; + + memset(&prx_queue->s, 0, sizeof(struct bspveth_rxtx_statis)); + + return 0; + +alloc_failed: + return -ENOMEM; +} + +void bspveth_free_rx_resources(struct bspveth_device *pvethdev, + struct bspveth_rxtx_q *prx_queue) +{ + unsigned long size; + struct bspveth_bd_info *pbdinfobase_v = NULL; + + if (!pvethdev || !prx_queue) + return; + + pbdinfobase_v = prx_queue->pbdinfobase_v; + if (!pbdinfobase_v) + return; + + if (!prx_queue->pbdbase_v) + return; + + size = sizeof(struct bspveth_bd_info) * prx_queue->count; + memset(prx_queue->pbdinfobase_v, 0, size); + + /* Zero out the descriptor ring */ + memset(prx_queue->pbdbase_v, 0, prx_queue->size); + + vfree(prx_queue->pbdinfobase_v); + prx_queue->pbdinfobase_v = NULL; + + kfree(prx_queue->pbdbase_v); + prx_queue->pbdbase_v = NULL; + + VETH_LOG(DLOG_DEBUG, "bspveth free rx resources ok!!count=%d\n", + prx_queue->count); +} + +s32 bspveth_setup_all_rx_resources(struct bspveth_device *pvethdev) +{ + int qid, i, err = 0; + struct bspveth_shmq_hd *shmq_head = NULL; + u8 *shmq_head_p = NULL; + + if (!pvethdev) + return BSP_ERR_NULL_POINTER; + + for (qid = 0; qid < MAX_QUEUE_NUM; qid++) { + pvethdev->prx_queue[qid] = + kmalloc(sizeof(*pvethdev->prx_queue[qid]), GFP_KERNEL); + if (!pvethdev->prx_queue[qid]) { + VETH_LOG(DLOG_ERROR, + "kmalloc failed for prx_queue[%d]\n", qid); + + goto failed; + } + + memset(pvethdev->prx_queue[qid], 0, + sizeof(struct bspveth_rxtx_q)); + + shmq_head = (struct bspveth_shmq_hd *)(pvethdev->pshmpool_v + + MAX_SHAREQUEUE_SIZE * (qid + 1)); + + pvethdev->prx_queue[qid]->pshmqhd_v = shmq_head; + shmq_head_p = + pvethdev->pshmpool_p + MAX_SHAREQUEUE_SIZE * (qid + 1); + pvethdev->prx_queue[qid]->pshmqhd_p = shmq_head_p; + pvethdev->prx_queue[qid]->pshmbdbase_v = + (struct bspveth_dma_shmbd *)((BSP_VETH_T)(shmq_head) + + BSPVETH_SHMBDBASE_OFFSET); + pvethdev->prx_queue[qid]->pshmbdbase_p = + (u8 *)((BSP_VETH_T)(shmq_head_p) + + BSPVETH_SHMBDBASE_OFFSET); + pvethdev->prx_queue[qid]->pdmalbase_v = + (struct bspveth_dmal *)((BSP_VETH_T)(shmq_head) + + SHMDMAL_OFFSET); + pvethdev->prx_queue[qid]->pdmalbase_p = + (u8 *)(u64)(VETH_SHAREPOOL_BASE_INBMC + + MAX_SHAREQUEUE_SIZE * (qid + 1) + + SHMDMAL_OFFSET); + memset(pvethdev->prx_queue[qid]->pdmalbase_v, 0, + MAX_SHMDMAL_SIZE); + + err = bspveth_setup_rx_resources(pvethdev, + pvethdev->prx_queue[qid]); + if (err) { + kfree(pvethdev->prx_queue[qid]); + pvethdev->prx_queue[qid] = NULL; + VETH_LOG(DLOG_ERROR, + "Allocation for Rx Queue %u failed\n", qid); + + goto failed; + } + } + + return 0; +failed: + for (i = 0; i < MAX_QUEUE_NUM; i++) { + bspveth_free_rx_resources(pvethdev, pvethdev->prx_queue[i]); + kfree(pvethdev->prx_queue[i]); + pvethdev->prx_queue[i] = NULL; + } + return err; +} + +void bspveth_free_all_rx_resources(struct bspveth_device *pvethdev) +{ + int i; + + if (!pvethdev) + return; + + for (i = 0; i < MAX_QUEUE_NUM; i++) { + if (pvethdev->prx_queue[i]) { + bspveth_free_rx_resources(pvethdev, + pvethdev->prx_queue[i]); + } + + kfree(pvethdev->prx_queue[i]); + pvethdev->prx_queue[i] = NULL; + } +} + +s32 bspveth_dev_install(void) +{ + int err; + + err = bspveth_setup_all_rx_resources(&g_bspveth_dev); + if (err != BSP_OK) { + err = -1; + goto err_setup_rx; + } + + err = bspveth_setup_all_tx_resources(&g_bspveth_dev); + if (err != BSP_OK) { + err = -1; + goto err_setup_tx; + } + + err = bspveth_setup_all_rx_skb(&g_bspveth_dev); + if (err != BSP_OK) { + err = -1; + goto err_setup_rx_skb; + } + + return BSP_OK; + +err_setup_rx_skb: + bspveth_free_all_tx_resources(&g_bspveth_dev); + +err_setup_tx: + bspveth_free_all_rx_resources(&g_bspveth_dev); + +err_setup_rx: + + return err; +} + +s32 bspveth_dev_uninstall(void) +{ + int err = BSP_OK; + + /* Free all the Rx ring pages */ + bspveth_free_all_rx_skb(&g_bspveth_dev); + + bspveth_free_all_tx_resources(&g_bspveth_dev); + + VETH_LOG(DLOG_DEBUG, "bspveth_free_all_tx_resources ok\n"); + + bspveth_free_all_rx_resources(&g_bspveth_dev); + + VETH_LOG(DLOG_DEBUG, "bspveth_free_all_rx_resources ok\n"); + + return err; +} + +s32 veth_open(struct net_device *pstr_dev) +{ + s32 ret = BSP_OK; + + if (!pstr_dev) + return -1; + + if (!g_bspveth_dev.pnetdev) + g_bspveth_dev.pnetdev = pstr_dev; + + ret = bspveth_dev_install(); + if (ret != BSP_OK) { + ret = -1; + goto failed1; + } + + veth_skbtimer_init(); + + veth_dmatimer_init_H(); + + ret = bma_intf_register_int_notifier(&g_veth_int_nb); + if (ret != BSP_OK) { + ret = -1; + goto failed2; + } + + bma_intf_set_open_status(g_bspveth_dev.bma_priv, DEV_OPEN); + + g_bspveth_dev.prx_queue[0]->pshmqhd_v->tail = + g_bspveth_dev.prx_queue[0]->pshmqhd_v->head; + + bma_intf_int_to_bmc(g_bspveth_dev.bma_priv); + + netif_start_queue(g_bspveth_dev.pnetdev); + netif_carrier_on(pstr_dev); + + return BSP_OK; + +failed2: + veth_dmatimer_close_H(); + + veth_skbtimer_close(); + + (void)bspveth_dev_uninstall(); + +failed1: + return ret; +} + +s32 veth_close(struct net_device *pstr_dev) +{ + (void)bma_intf_unregister_int_notifier(&g_veth_int_nb); + + netif_carrier_off(pstr_dev); + + bma_intf_set_open_status(g_bspveth_dev.bma_priv, DEV_CLOSE); + + netif_stop_queue(g_bspveth_dev.pnetdev); + + (void)veth_dmatimer_close_H(); + (void)veth_skbtimer_close(); + + (void)bspveth_dev_uninstall(); + + return BSP_OK; +} + +s32 veth_config(struct net_device *pstr_dev, struct ifmap *pstr_map) +{ + if (!pstr_dev || !pstr_map) + return BSP_ERR_NULL_POINTER; + + /* can't act on a running interface */ + if (pstr_dev->flags & IFF_UP) + return -EBUSY; + + /* Don't allow changing the I/O address */ + if (pstr_map->base_addr != pstr_dev->base_addr) + return -EOPNOTSUPP; + + /* ignore other fields */ + return BSP_OK; +} + +void bspveth_initstatis(void) +{ + int i; + struct bspveth_rxtx_q *prx_queue = NULL; + struct bspveth_rxtx_q *ptx_queue = NULL; + + for (i = 0; i < MAX_QUEUE_NUM; i++) { + prx_queue = g_bspveth_dev.prx_queue[i]; + ptx_queue = g_bspveth_dev.ptx_queue[i]; + + if (prx_queue && ptx_queue) { + memset(&prx_queue->s, + 0, sizeof(struct bspveth_rxtx_statis)); + + memset(&ptx_queue->s, + 0, sizeof(struct bspveth_rxtx_statis)); + } else { + VETH_LOG(DLOG_ERROR, + "prx_queue OR ptx_queue is NULL\n"); + } + } + + VETH_LOG(DLOG_DEBUG, "bspveth initstatis ok\n"); +} + +s32 veth_ioctl(struct net_device *pstr_dev, struct ifreq *pifr, s32 l_cmd) +{ + return -EFAULT; +} + +struct net_device_stats *veth_stats(struct net_device *pstr_dev) +{ + return &g_bspveth_dev.stats; +} + +s32 veth_mac_set(struct net_device *pstr_dev, void *p_mac) +{ + struct sockaddr *str_addr = NULL; + u8 *puc_mac = NULL; + + if (!pstr_dev || !p_mac) + return BSP_ERR_NULL_POINTER; + + str_addr = (struct sockaddr *)p_mac; + puc_mac = (u8 *)str_addr->sa_data; + + pstr_dev->dev_addr[0] = puc_mac[0]; + pstr_dev->dev_addr[1] = puc_mac[1]; + pstr_dev->dev_addr[2] = puc_mac[2]; + pstr_dev->dev_addr[3] = puc_mac[3]; + pstr_dev->dev_addr[4] = puc_mac[4]; + pstr_dev->dev_addr[5] = puc_mac[5]; + + return BSP_OK; +} + +static u32 veth_ethtool_get_link(struct net_device *dev) +{ + if (!bma_intf_is_link_ok() || !netif_running(g_bspveth_dev.pnetdev)) + return 0; + + if (g_bspveth_dev.ptx_queue[0] && + g_bspveth_dev.ptx_queue[0]->pshmqhd_v) + return (u32)((BSPVETH_SHMQUEUE_INITOK == + g_bspveth_dev.ptx_queue[0]->pshmqhd_v->init) && + netif_carrier_ok(dev)); + + return 0; +} + +static void veth_ethtool_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strlcpy(info->driver, MODULE_NAME, sizeof(info->driver)); + strlcpy(info->version, VETH_VERSION, sizeof(info->version)); + + info->n_stats = VETH_GLOBAL_STATS_LEN; +} + +static void veth_ethtool_get_stats(struct net_device *netdev, + struct ethtool_stats *tool_stats, u64 *data) +{ + unsigned int i = 0; + char *p = NULL; + const struct veth_stats *p_stat = veth_gstrings_stats; + struct bspveth_rxtx_q *ptx_node = g_bspveth_dev.ptx_queue[0]; + struct bspveth_rxtx_q *prx_node = g_bspveth_dev.prx_queue[0]; + char * const pstat_map[] = { + /* QUEUE TX STATS*/ + GET_QUEUE_STAT(ptx_node, p_stat), + /* QUEUE RX STATS*/ + GET_QUEUE_STAT(prx_node, p_stat), + /* VETH STATS */ + (char *)&g_bspveth_dev + p_stat->stat_offset, + /* SHMQ TX STATS */ + GET_SHM_QUEUE_STAT(ptx_node, p_stat), + /* SHMQ RX STATS */ + GET_SHM_QUEUE_STAT(prx_node, p_stat), + /* NET STATS */ + (char *)&g_bspveth_dev + p_stat->stat_offset + }; + + if (!data || !netdev || !tool_stats) + return; + + for (i = 0; i < VETH_GLOBAL_STATS_LEN; i++) { + p = NULL; + + if (p_stat->type > NET_STATS) + break; + + p = pstat_map[p_stat->type]; + + data[i] = GET_STATS_VALUE(p, p_stat); + + p_stat++; + } +} + +static void veth_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +{ + u8 *p = data; + unsigned int i; + + if (!p) + return; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < VETH_GLOBAL_STATS_LEN; i++) { + memcpy(p, veth_gstrings_stats[i].stat_string, + ETH_GSTRING_LEN); + + p += ETH_GSTRING_LEN; + } + + break; + } +} + +static int veth_get_sset_count(struct net_device *netdev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return VETH_GLOBAL_STATS_LEN; + + default: + return -EOPNOTSUPP; + } +} + +const struct ethtool_ops veth_ethtool_ops = { + .get_drvinfo = veth_ethtool_get_drvinfo, + .get_link = veth_ethtool_get_link, + + .get_ethtool_stats = veth_ethtool_get_stats, + .get_strings = veth_get_strings, + .get_sset_count = veth_get_sset_count, + +}; + +static const struct net_device_ops veth_ops = { + .ndo_open = veth_open, + .ndo_stop = veth_close, + .ndo_set_config = veth_config, + .ndo_start_xmit = veth_tx, + .ndo_do_ioctl = veth_ioctl, + .ndo_get_stats = veth_stats, + .ndo_set_mac_address = veth_mac_set, +}; + +void veth_netdev_func_init(struct net_device *dev) +{ + struct tag_pcie_comm_priv *priv = + (struct tag_pcie_comm_priv *)netdev_priv(dev); + + VETH_LOG(DLOG_DEBUG, "eth init start\n"); + + ether_setup(dev); + + dev->netdev_ops = &veth_ops; + + dev->watchdog_timeo = BSPVETH_NET_TIMEOUT; + dev->mtu = BSPVETH_MTU_MAX; + dev->flags = IFF_BROADCAST; + dev->tx_queue_len = BSPVETH_MAX_QUE_DEEP; + dev->ethtool_ops = &veth_ethtool_ops; + + /* Then, initialize the priv field. This encloses the statistics + * and a few private fields. + */ + memset(priv, 0, sizeof(struct tag_pcie_comm_priv)); + strncpy(priv->net_type, MODULE_NAME, NET_TYPE_LEN); + + /*9C:7D:A3:28:6F:F9*/ + dev->dev_addr[0] = 0x9c; + dev->dev_addr[1] = 0x7d; + dev->dev_addr[2] = 0xa3; + dev->dev_addr[3] = 0x28; + dev->dev_addr[4] = 0x6f; + dev->dev_addr[5] = 0xf9; + + VETH_LOG(DLOG_DEBUG, "set veth MAC addr OK\n"); +} + +s32 veth_send_one_pkt(struct sk_buff *skb, int queue) +{ + u32 head, next_to_free; + dma_addr_t dma = 0; + u32 off = 0; + int ret = 0; + int type = BSPVETH_TX; + struct bspveth_bd_info *pbdinfo_v = NULL; + struct bspveth_dma_bd *pbd_v = NULL; + struct bspveth_rxtx_q *ptx_queue = g_bspveth_dev.ptx_queue[queue]; + + if (!skb || !ptx_queue || !ptx_queue->pbdinfobase_v || + !ptx_queue->pbdbase_v) { + INC_STATIS_RXTX(queue, null_point, 1, type); + return BSP_ERR_NULL_POINTER; + } + + if (!bma_intf_is_link_ok() || + ptx_queue->pshmqhd_v->init != BSPVETH_SHMQUEUE_INITOK) + return -1; + + head = ptx_queue->head; + next_to_free = ptx_queue->next_to_free; + + /* stop to send pkt when queue is going to full */ + if (!JUDGE_TX_QUEUE_SPACE(head, next_to_free, 3)) { + netif_stop_subqueue(g_bspveth_dev.pnetdev, queue); + VETH_LOG(DLOG_DEBUG, + "going to full, head: %d, nex to free: %d\n", + head, next_to_free); + } + + if (!JUDGE_TX_QUEUE_SPACE(head, next_to_free, 1)) + return BSP_NETDEV_TX_BUSY; + + if (skb_shinfo(skb)->nr_frags) { + /* We don't support frags */ + ret = skb_linearize(skb); + if (ret) + return -ENOMEM; + } + + dma = dma_map_single(&g_bspveth_dev.ppcidev->dev, skb->data, skb->len, + DMA_TO_DEVICE); + + ret = dma_mapping_error(&g_bspveth_dev.ppcidev->dev, dma); + if (ret != BSP_OK) { + ret = BSP_ERR_DMA_ERR; + g_bspveth_dev.ptx_queue[queue]->s.dma_mapping_err++; + goto failed; + } + + off = dma & 0x3; + if (off) + g_bspveth_dev.ptx_queue[queue]->s.dma_need_offset++; + + pbdinfo_v = &ptx_queue->pbdinfobase_v[head]; + pbdinfo_v->pdma_v = skb; + pbd_v = &ptx_queue->pbdbase_v[head]; + pbd_v->dma_p = dma & (~((u64)0x3)); + pbd_v->off = off; + pbd_v->len = skb->len; + + head = (head + 1) & BSPVETH_POINT_MASK; + ptx_queue->head = head; + + VETH_LOG(DLOG_DEBUG, + "[send]:oridma=0x%llx,skb=%p,skb->data=%p,skb->len=%d,", + (u64)dma, skb, skb->data, skb->len); + VETH_LOG(DLOG_DEBUG, "head=%d,off=%d, alidma0x%llx\n", head, off, + (u64)(dma & (~((u64)0x3)))); + + return BSP_OK; + +failed: + return ret; +} + +int veth_tx(struct sk_buff *skb, struct net_device *pstr_dev) +{ + u32 ul_ret = 0; + int queue = 0; + + VETH_LOG(DLOG_DEBUG, "===============enter==================\n"); + + if (!skb || !pstr_dev) { + g_bspveth_dev.ptx_queue[queue]->s.null_point++; + return NETDEV_TX_OK; + } + + VETH_LOG(DLOG_DEBUG, "skb->data=%p\n", skb->data); + VETH_LOG(DLOG_DEBUG, "skb->len=%d\n", skb->len); + + ul_ret = veth_send_one_pkt(skb, queue); + + if (ul_ret == BSP_OK) { + g_bspveth_dev.ptx_queue[queue]->s.pkt++; + g_bspveth_dev.stats.tx_packets++; + g_bspveth_dev.ptx_queue[queue]->s.pktbyte += skb->len; + g_bspveth_dev.stats.tx_bytes += skb->len; + +#ifndef USE_TASKLET + (void)mod_timer(&g_bspveth_dev.dmatimer, jiffies_64); +#else + tasklet_hi_schedule(&g_bspveth_dev.dma_task); +#endif + + } else { + VETH_LOG(DLOG_DEBUG, "=======exit ret = %d=======\n", ul_ret); + g_bspveth_dev.ptx_queue[queue]->s.dropped_pkt++; + g_bspveth_dev.stats.tx_dropped++; + dev_kfree_skb_any(skb); + } + + return NETDEV_TX_OK; +} + +s32 veth_free_txskb(struct bspveth_rxtx_q *ptx_queue, int queue) +{ + int i, work_limit; + unsigned int tail, next_to_free; + struct bspveth_bd_info *ptx_bdinfo_v = NULL; + struct sk_buff *skb = NULL; + struct bspveth_dma_bd *pbd_v = NULL; + + if (!ptx_queue) + return BSP_ERR_AGAIN; + + work_limit = ptx_queue->work_limit; + tail = ptx_queue->tail; + next_to_free = ptx_queue->next_to_free; + + for (i = 0; i < work_limit; i++) { + if (next_to_free == tail) + break; + + ptx_bdinfo_v = &ptx_queue->pbdinfobase_v[next_to_free]; + + pbd_v = &ptx_queue->pbdbase_v[next_to_free]; + + skb = ptx_bdinfo_v->pdma_v; + + dma_unmap_single(&g_bspveth_dev.ppcidev->dev, + pbd_v->dma_p | pbd_v->off, + pbd_v->len, DMA_TO_DEVICE); + + if (skb) + dev_kfree_skb_any(skb); + else + VETH_LOG(DLOG_ERROR, + "skb is NULL,tail=%d next_to_free=%d\n", + tail, next_to_free); + + ptx_bdinfo_v->pdma_v = NULL; + g_bspveth_dev.ptx_queue[queue]->s.freetx++; + + next_to_free = (next_to_free + 1) & BSPVETH_POINT_MASK; + } + + ptx_queue->next_to_free = next_to_free; + tail = ptx_queue->tail; + + if (next_to_free != tail) { + VETH_LOG(DLOG_DEBUG, "next_to_free(%d) != tail(%d)\n", + next_to_free, tail); + + return BSP_ERR_AGAIN; + } + + return BSP_OK; +} + +s32 veth_recv_pkt(struct bspveth_rxtx_q *prx_queue, int queue) +{ + int ret = BSP_OK, i, work_limit; + u32 tail, head; + struct bspveth_bd_info *prx_bdinfo_v = NULL; + struct bspveth_dma_bd *pbd_v = NULL; + struct sk_buff *skb = NULL; + dma_addr_t dma_map = 0; + u32 off = 0; + + if (!prx_queue) + return BSP_ERR_AGAIN; + + work_limit = prx_queue->work_limit; + tail = prx_queue->tail; + + for (i = 0; i < work_limit; i++) { + head = prx_queue->head; + if (tail == head) + break; + + prx_bdinfo_v = &prx_queue->pbdinfobase_v[tail]; + + skb = prx_bdinfo_v->pdma_v; + if (!skb) { + tail = (tail + 1) & BSPVETH_POINT_MASK; + continue; + } + + prx_bdinfo_v->pdma_v = NULL; + pbd_v = &prx_queue->pbdbase_v[tail]; + + off = pbd_v->off; + if (off) + skb_reserve(skb, off); + + dma_unmap_single(&g_bspveth_dev.ppcidev->dev, pbd_v->dma_p, + BSPVETH_SKB_SIZE, DMA_FROM_DEVICE); + + tail = (tail + 1) & BSPVETH_POINT_MASK; + + skb_put(skb, pbd_v->len); + + skb->protocol = eth_type_trans(skb, g_bspveth_dev.pnetdev); + skb->ip_summed = CHECKSUM_NONE; + + VETH_LOG(DLOG_DEBUG, + "skb->len=%d,skb->protocol=%d\n", + skb->len, skb->protocol); + + VETH_LOG(DLOG_DEBUG, + "dma_p=0x%llx,dma_map=0x%llx,", + pbd_v->dma_p, dma_map); + + VETH_LOG(DLOG_DEBUG, + "skb=%p,skb->data=%p,skb->len=%d,tail=%d,shm_off=%d\n", + skb, skb->data, skb->len, tail, off); + + VETH_LOG(DLOG_DEBUG, + "skb_transport_header=%p skb_mac_header=%p ", + skb_transport_header(skb), skb_mac_header(skb)); + + VETH_LOG(DLOG_DEBUG, + "skb_network_header=%p\n", skb_network_header(skb)); + + VETH_LOG(DLOG_DEBUG, + "skb->data=0x%p skb->tail=%08x skb->len=%08x\n", + skb->data, + (unsigned int)skb->tail, + (unsigned int)skb->len); + + g_bspveth_dev.prx_queue[queue]->s.pkt++; + g_bspveth_dev.stats.rx_packets++; + g_bspveth_dev.prx_queue[queue]->s.pktbyte += skb->len; + g_bspveth_dev.stats.rx_bytes += skb->len; + + ret = netif_rx(skb); + if (ret == NET_RX_DROP) { + g_bspveth_dev.prx_queue[queue]->s.netifrx_err++; + g_bspveth_dev.stats.rx_errors++; + + VETH_LOG(DLOG_DEBUG, "netif_rx failed\n"); + } + } + + prx_queue->tail = tail; + head = prx_queue->head; + + ret = veth_refill_rxskb(prx_queue, queue); + if (ret != BSP_OK) + VETH_LOG(DLOG_DEBUG, "veth_refill_rxskb failed\n"); + + if (tail != head) { + VETH_LOG(DLOG_DEBUG, "tail(%d) != head(%d)\n", tail, head); + + return BSP_ERR_AGAIN; + } + + return BSP_OK; +} + +#if !defined(USE_TASKLET) && defined(HAVE_TIMER_SETUP) +void veth_skbtrtimer_do(struct timer_list *t) +#else +void veth_skbtrtimer_do(unsigned long data) +#endif +{ + int ret = 0; + + ret = veth_skb_tr_task(); + if (ret == BSP_ERR_AGAIN) { +#ifndef USE_TASKLET + (void)mod_timer(&g_bspveth_dev.skbtrtimer, jiffies_64); +#else + tasklet_hi_schedule(&g_bspveth_dev.skb_task); +#endif + } +} + +s32 veth_skbtimer_close(void) +{ +#ifndef USE_TASKLET + (void)del_timer_sync(&g_bspveth_dev.skbtrtimer); +#else + tasklet_kill(&g_bspveth_dev.skb_task); +#endif + + VETH_LOG(DLOG_DEBUG, "veth skbtimer close ok\n"); + + return 0; +} + +void veth_skbtimer_init(void) +{ +#ifndef USE_TASKLET +#ifdef HAVE_TIMER_SETUP + timer_setup(&g_bspveth_dev.skbtrtimer, veth_skbtrtimer_do, 0); +#else + setup_timer(&g_bspveth_dev.skbtrtimer, veth_skbtrtimer_do, + (unsigned long)&g_bspveth_dev); +#endif + (void)mod_timer(&g_bspveth_dev.skbtrtimer, + jiffies_64 + BSPVETH_SKBTIMER_INTERVAL); +#else + tasklet_init(&g_bspveth_dev.skb_task, veth_skbtrtimer_do, + (unsigned long)&g_bspveth_dev); +#endif + + VETH_LOG(DLOG_DEBUG, "veth skbtimer init OK\n"); +} + +void veth_netdev_exit(void) +{ + if (g_bspveth_dev.pnetdev) { + netif_stop_queue(g_bspveth_dev.pnetdev); + unregister_netdev(g_bspveth_dev.pnetdev); + free_netdev(g_bspveth_dev.pnetdev); + + VETH_LOG(DLOG_DEBUG, "veth netdev exit OK.\n"); + } else { + VETH_LOG(DLOG_DEBUG, "veth_dev.pnetdev NULL.\n"); + } +} + +static void veth_shutdown_task(struct work_struct *work) +{ + struct net_device *netdev = g_bspveth_dev.pnetdev; + + VETH_LOG(DLOG_ERROR, "veth is going down, please restart it manual\n"); + + g_bspveth_dev.shutdown_cnt++; + + if (netif_carrier_ok(netdev)) { + (void)bma_intf_unregister_int_notifier(&g_veth_int_nb); + + netif_carrier_off(netdev); + + bma_intf_set_open_status(g_bspveth_dev.bma_priv, DEV_CLOSE); + + /* can't transmit any more */ + netif_stop_queue(g_bspveth_dev.pnetdev); + + (void)veth_skbtimer_close(); + + (void)veth_dmatimer_close_H(); + } +} + +s32 veth_netdev_init(void) +{ + s32 l_ret = 0; + struct net_device *netdev = NULL; + + netdev = alloc_netdev_mq(sizeof(struct tag_pcie_comm_priv), + BSPVETH_DEV_NAME, NET_NAME_UNKNOWN, + veth_netdev_func_init, 1); + + /* register netdev */ + l_ret = register_netdev(netdev); + if (l_ret < 0) { + VETH_LOG(DLOG_ERROR, "register_netdev failed!ret=%d\n", l_ret); + + return -ENODEV; + } + + g_bspveth_dev.pnetdev = netdev; + + VETH_LOG(DLOG_DEBUG, "veth netdev init OK\n"); + + INIT_WORK(&g_bspveth_dev.shutdown_task, veth_shutdown_task); + + netif_carrier_off(netdev); + + return BSP_OK; +} + +int veth_skb_tr_task(void) +{ + int rett = BSP_OK; + int retr = BSP_OK; + int i = 0; + int task_state = BSP_OK; + struct bspveth_rxtx_q *ptx_queue = NULL; + struct bspveth_rxtx_q *prx_queue = NULL; + + for (i = 0; i < MAX_QUEUE_NUM; i++) { + prx_queue = g_bspveth_dev.prx_queue[i]; + if (prx_queue) { + g_bspveth_dev.run_skb_rx_task++; + retr = veth_recv_pkt(prx_queue, i); + } + + ptx_queue = g_bspveth_dev.ptx_queue[i]; + if (ptx_queue) { + g_bspveth_dev.run_skb_fr_task++; + rett = veth_free_txskb(ptx_queue, i); + if (__netif_subqueue_stopped + (g_bspveth_dev.pnetdev, i) && + JUDGE_TX_QUEUE_SPACE + (ptx_queue->head, + ptx_queue->next_to_free, 5)) { + netif_wake_subqueue(g_bspveth_dev.pnetdev, i); + VETH_LOG(DLOG_DEBUG, "queue is free, "); + VETH_LOG(DLOG_DEBUG, + "head: %d, next to free: %d\n", + ptx_queue->head, + ptx_queue->next_to_free); + } + } + + if (rett == BSP_ERR_AGAIN || retr == BSP_ERR_AGAIN) + task_state = BSP_ERR_AGAIN; + } + + return task_state; +} + +static int veth_int_handler(struct notifier_block *pthis, unsigned long ev, + void *unuse) +{ + g_bspveth_dev.recv_int++; + + if (netif_running(g_bspveth_dev.pnetdev)) { +#ifndef USE_TASKLET + (void)mod_timer(&g_bspveth_dev.dmatimer, jiffies_64); +#else + tasklet_schedule(&g_bspveth_dev.dma_task); + +#endif + } else { + VETH_LOG(DLOG_DEBUG, "netif is not running\n"); + } + + return IRQ_HANDLED; +} + +#if !defined(USE_TASKLET) && defined(HAVE_TIMER_SETUP) +void veth_dma_tx_timer_do_H(struct timer_list *t) +#else +void veth_dma_tx_timer_do_H(unsigned long data) +#endif +{ + int txret, rxret; + + txret = veth_dma_task_H(BSPVETH_TX); + + rxret = veth_dma_task_H(BSPVETH_RX); + + if (txret == BSP_ERR_AGAIN || rxret == BSP_ERR_AGAIN) { +#ifndef USE_TASKLET + (void)mod_timer(&g_bspveth_dev.dmatimer, jiffies_64); +#else + tasklet_hi_schedule(&g_bspveth_dev.dma_task); +#endif + } +} + +s32 veth_dmatimer_close_H(void) +{ +#ifndef USE_TASKLET + (void)del_timer_sync(&g_bspveth_dev.dmatimer); +#else + tasklet_kill(&g_bspveth_dev.dma_task); +#endif + + VETH_LOG(DLOG_DEBUG, "bspveth_dmatimer_close RXTX TIMER ok\n"); + + return 0; +} + +void veth_dmatimer_init_H(void) +{ +#ifndef USE_TASKLET +#ifdef HAVE_TIMER_SETUP + timer_setup(&g_bspveth_dev.dmatimer, veth_dma_tx_timer_do_H, 0); +#else + setup_timer(&g_bspveth_dev.dmatimer, veth_dma_tx_timer_do_H, + (unsigned long)&g_bspveth_dev); +#endif + (void)mod_timer(&g_bspveth_dev.dmatimer, + jiffies_64 + BSPVETH_DMATIMER_INTERVAL); +#else + tasklet_init(&g_bspveth_dev.dma_task, veth_dma_tx_timer_do_H, + (unsigned long)&g_bspveth_dev); +#endif + + VETH_LOG(DLOG_DEBUG, "bspveth_dmatimer_init RXTX TIMER OK\n"); +} + +s32 dmacmp_err_deal(struct bspveth_rxtx_q *prxtx_queue, u32 queue, + u32 type) +{ + prxtx_queue->dmacmperr = 0; + prxtx_queue->start_dma = 0; + + (void)veth_reset_dma(type); + + if (type == BSPVETH_RX) { + VETH_LOG(DLOG_DEBUG, + "bmc->host dma time out,dma count:%d,work_limit:%d\n", + prxtx_queue->dmal_cnt, + prxtx_queue->work_limit); + + g_bspveth_dev.prx_queue[queue]->s.dma_failed++; + } else { + VETH_LOG(DLOG_DEBUG, + "host->bmc dma time out,dma count:%d,work_limit:%d\n", + prxtx_queue->dmal_cnt, + prxtx_queue->work_limit); + + g_bspveth_dev.ptx_queue[queue]->s.dma_failed++; + } + + if (prxtx_queue->dmal_cnt > 1) + prxtx_queue->work_limit = (prxtx_queue->dmal_cnt >> 1); + + prxtx_queue->dma_overtime++; + if (prxtx_queue->dma_overtime > BSPVETH_MAX_QUE_DEEP) { + schedule_work(&g_bspveth_dev.shutdown_task); + + return -EFAULT; + } + + return BSP_OK; +} + +s32 veth_check_dma_status(struct bspveth_rxtx_q *prxtx_queue, + u32 queue, u32 type) +{ + int i = 0; + enum dma_direction_e dir; + + dir = GET_DMA_DIRECTION(type); + + for (i = 0; i < BSPVETH_CHECK_DMA_STATUS_TIMES; i++) { + if (bma_intf_check_dma_status(dir) == BSPVETH_DMA_OK) + break; + + cpu_relax(); + + if (i > 20) + udelay(5); + } + + if (i >= BSPVETH_CHECK_DMA_STATUS_TIMES) { + INC_STATIS_RXTX(queue, dma_busy, 1, type); + prxtx_queue->dmacmperr++; + + return -EFAULT; + } + + return BSP_OK; +} + +s32 __check_dmacmp_H(struct bspveth_rxtx_q *prxtx_queue, u32 queue, + u32 type) +{ + u16 start_dma = 0; + u16 dmacmperr = 0; + u32 cnt = 0; + u32 len = 0; + u32 host_head = 0; + u32 host_tail = 0; + u32 shm_head = 0; + u32 shm_tail = 0; + s32 ret = 0; + struct bspveth_shmq_hd *pshmq_head = NULL; + + if (!prxtx_queue || !prxtx_queue->pshmqhd_v) + return BSP_ERR_NULL_POINTER; + + pshmq_head = prxtx_queue->pshmqhd_v; + dmacmperr = prxtx_queue->dmacmperr; + start_dma = prxtx_queue->start_dma; + if (!start_dma) + return BSP_OK; + + if (dmacmperr > BSPVETH_WORK_LIMIT / 4) + return dmacmp_err_deal(prxtx_queue, queue, type); + + ret = veth_check_dma_status(prxtx_queue, queue, type); + if (ret != BSP_OK) + return ret; + + prxtx_queue->start_dma = 0; + prxtx_queue->dma_overtime = 0; + + if (type == BSPVETH_RX) { + cnt = prxtx_queue->dmal_cnt; + len = prxtx_queue->dmal_byte; + + host_head = prxtx_queue->head; + shm_tail = pshmq_head->tail; + + pshmq_head->tail = (shm_tail + cnt) & BSPVETH_POINT_MASK; + prxtx_queue->head = (host_head + cnt) & BSPVETH_POINT_MASK; + + g_bspveth_dev.prx_queue[queue]->s.dmapkt += cnt; + g_bspveth_dev.prx_queue[queue]->s.dmapktbyte += len; + } else { + cnt = prxtx_queue->dmal_cnt; + len = prxtx_queue->dmal_byte; + + host_tail = prxtx_queue->tail; + shm_head = pshmq_head->head; + + prxtx_queue->tail = (host_tail + cnt) & BSPVETH_POINT_MASK; + pshmq_head->head = (shm_head + cnt) & BSPVETH_POINT_MASK; + + g_bspveth_dev.ptx_queue[queue]->s.dmapkt += cnt; + g_bspveth_dev.ptx_queue[queue]->s.dmapktbyte += len; + } + +#ifndef USE_TASKLET + (void)mod_timer(&g_bspveth_dev.skbtrtimer, jiffies_64); +#else + tasklet_hi_schedule(&g_bspveth_dev.skb_task); +#endif + + (void)bma_intf_int_to_bmc(g_bspveth_dev.bma_priv); + + g_bspveth_dev.tobmc_int++; + + return BSP_OK; +} + +s32 __checkspace_H(struct bspveth_rxtx_q *prxtx_queue, u32 queue, + u32 type, u32 *pcnt) +{ + int ret = BSP_OK; + u32 host_head, host_tail, host_nextfill; + u32 shm_head, shm_tail, shm_nextfill; + u32 shm_cnt, host_cnt, cnt_tmp, cnt; + struct bspveth_shmq_hd *pshmq_head = NULL; + + if (!prxtx_queue || !prxtx_queue->pshmqhd_v) + return BSP_ERR_NULL_POINTER; + + pshmq_head = prxtx_queue->pshmqhd_v; + host_head = prxtx_queue->head; + host_tail = prxtx_queue->tail; + host_nextfill = prxtx_queue->next_to_fill; + shm_head = pshmq_head->head; + shm_tail = pshmq_head->tail; + shm_nextfill = pshmq_head->next_to_fill; + + switch (type) { + case BSPVETH_RX: + if (shm_tail == shm_head) { + INC_STATIS_RXTX(queue, shm_emp, 1, type); + ret = BSP_ERR_NOT_TO_HANDLE; + goto failed; + } + + if (!JUDGE_RX_QUEUE_SPACE(host_head, host_nextfill, 1)) + return -EFAULT; + + shm_cnt = (shm_head - shm_tail) & BSPVETH_POINT_MASK; + cnt_tmp = min(shm_cnt, prxtx_queue->work_limit); + + host_cnt = (host_nextfill - host_head) & BSPVETH_POINT_MASK; + cnt = min(cnt_tmp, host_cnt); + + break; + + case BSPVETH_TX: + if (host_tail == host_head) { + INC_STATIS_RXTX(queue, q_emp, 1, type); + ret = BSP_ERR_NOT_TO_HANDLE; + goto failed; + } + + if (!JUDGE_TX_QUEUE_SPACE(shm_head, shm_nextfill, 1)) + return -EFAULT; + + host_cnt = (host_head - host_tail) & BSPVETH_POINT_MASK; + cnt_tmp = min(host_cnt, prxtx_queue->work_limit); + shm_cnt = (shm_nextfill - (shm_head + 1)) & BSPVETH_POINT_MASK; + cnt = min(cnt_tmp, shm_cnt); + + break; + + default: + INC_STATIS_RXTX(queue, type_err, 1, type); + ret = -EFAULT; + goto failed; + } + + if (cnt > (BSPVETH_DMABURST_MAX * 7 / 8)) + INC_STATIS_RXTX(queue, dma_burst, 1, type); + +#ifdef __UT_TEST + if (g_testdma) { + VETH_LOG(DLOG_ERROR, + "[type %d],host_cnt=%d cnt_tmp=%d shm_cnt=%d cnt=%d\n", + type, host_cnt, cnt_tmp, shm_cnt, cnt); + } +#endif + + *pcnt = cnt; + + return BSP_OK; + +failed: + return ret; +} + +int __make_dmalistbd_h2b_H(struct bspveth_rxtx_q *prxtx_queue, + u32 cnt, u32 type) +{ + u32 i = 0; + u32 len = 0; + u32 host_tail = 0; + u32 shm_head = 0; + u32 off = 0; + struct bspveth_dmal *pdmalbase_v = NULL; + struct bspveth_shmq_hd *pshmq_head = NULL; + struct bspveth_bd_info *pbdinfobase_v = NULL; + struct bspveth_dma_bd *pbdbase_v = NULL; + struct bspveth_dma_shmbd *pshmbdbase_v = NULL; + + if (!prxtx_queue) + return BSP_ERR_NULL_POINTER; + + pdmalbase_v = prxtx_queue->pdmalbase_v; + pshmq_head = prxtx_queue->pshmqhd_v; + pbdinfobase_v = prxtx_queue->pbdinfobase_v; + pbdbase_v = prxtx_queue->pbdbase_v; + pshmbdbase_v = prxtx_queue->pshmbdbase_v; + if (!pdmalbase_v || !pshmq_head || !pbdinfobase_v || + !pbdbase_v || !pshmbdbase_v) + return BSP_ERR_NULL_POINTER; + + host_tail = prxtx_queue->tail; + shm_head = pshmq_head->head; + + for (i = 0; i < cnt; i++) { + off = pbdbase_v[QUEUE_MASK(host_tail + i)].off; + + if (i == (cnt - 1)) + pdmalbase_v[i].chl = 0x9; + else + pdmalbase_v[i].chl = 0x0000001; + pdmalbase_v[i].len = + (pbdinfobase_v[QUEUE_MASK(host_tail + i)].pdma_v)->len; + pdmalbase_v[i].slow = + lower_32_bits(pbdbase_v[QUEUE_MASK(host_tail + i)].dma_p); + pdmalbase_v[i].shi = + upper_32_bits(pbdbase_v[QUEUE_MASK(host_tail + i)].dma_p); + pdmalbase_v[i].dlow = + lower_32_bits(pshmbdbase_v[QUEUE_MASK(shm_head + i)].dma_p); + pdmalbase_v[i].dhi = 0; + + pshmbdbase_v[QUEUE_MASK(shm_head + i)].len = pdmalbase_v[i].len; + + pdmalbase_v[i].len += off; + + pshmbdbase_v[QUEUE_MASK(shm_head + i)].off = off; + + len += pdmalbase_v[i].len; + +#ifdef __UT_TEST + if (g_testdma) { + struct sk_buff *skb = + pbdinfobase_v[QUEUE_MASK(host_tail + i)].pdma_v; + + VETH_LOG(DLOG_ERROR, + "[%d][makebd-H2B]:chl=0x%x,len=%d,slow=0x%x,", + i, pdmalbase_v[i].chl, pdmalbase_v[i].len, + pdmalbase_v[i].slow); + VETH_LOG(DLOG_ERROR, + "shi=0x%x,dlow=0x%x,dhi=0x%x,skb=%p,", + pdmalbase_v[i].shi, pdmalbase_v[i].dlow, + pdmalbase_v[i].dhi, skb); + VETH_LOG(DLOG_ERROR, + "skb->data=%p,skb->len=%d,host_tail+i=%d,", + skb->data, skb->len, + QUEUE_MASK(host_tail + i)); + VETH_LOG(DLOG_ERROR, + "shm_head+i=%d,off=%d\n", + QUEUE_MASK(shm_head + i), off); + } +#endif + } + + pdmalbase_v[i].chl = 0x7; + pdmalbase_v[i].len = 0x0; + pdmalbase_v[i].slow = lower_32_bits((u64)prxtx_queue->pdmalbase_p); + pdmalbase_v[i].shi = upper_32_bits((u64)prxtx_queue->pdmalbase_p); + pdmalbase_v[i].dlow = 0; + pdmalbase_v[i].dhi = 0; + + prxtx_queue->dmal_cnt = cnt; + prxtx_queue->dmal_byte = len; + +#ifdef __UT_TEST + if (g_testdma) { + VETH_LOG(DLOG_ERROR, + "[END][makebd-H2B]:chl=0x%x,len=%d,slow=0x%x,", + pdmalbase_v[i].chl, pdmalbase_v[i].len, + pdmalbase_v[i].slow); + VETH_LOG(DLOG_ERROR, + "shi=0x%x,dmal_cnt=%d,dmal_dir=%d,dmal_byte=%d,", + pdmalbase_v[i].shi, cnt, type, len); + VETH_LOG(DLOG_ERROR, "pdmalbase_v=%p\n", pdmalbase_v); + } +#endif + + return 0; +} + +int __make_dmalistbd_b2h_H(struct bspveth_rxtx_q *prxtx_queue, u32 cnt, + u32 type) +{ + u32 i, len = 0, host_head, shm_tail, off; + struct bspveth_dmal *pdmalbase_v = NULL; + struct bspveth_shmq_hd *pshmq_head = NULL; + struct bspveth_bd_info *pbdinfobase_v = NULL; + struct bspveth_dma_bd *pbdbase_v = NULL; + struct bspveth_dma_shmbd *pshmbdbase_v = NULL; + + if (!prxtx_queue) { + VETH_LOG(DLOG_ERROR, + "[END][makebd-B2H]:prxtx_queue NULL!!!\n"); + return BSP_ERR_NULL_POINTER; + } + + pdmalbase_v = prxtx_queue->pdmalbase_v; + pshmq_head = prxtx_queue->pshmqhd_v; + pbdinfobase_v = prxtx_queue->pbdinfobase_v; + pbdbase_v = prxtx_queue->pbdbase_v; + pshmbdbase_v = prxtx_queue->pshmbdbase_v; + if (!pdmalbase_v || !pshmq_head || !pbdinfobase_v || + !pbdbase_v || !pshmbdbase_v) { + VETH_LOG(DLOG_ERROR, + "[END][makebd-B2H]:pdmalbase_v NULL!!!\n"); + return BSP_ERR_NULL_POINTER; + } + + host_head = prxtx_queue->head; + shm_tail = pshmq_head->tail; + + for (i = 0; i < cnt; i++) { + off = pshmbdbase_v[QUEUE_MASK(shm_tail + i)].off; + if (i == (cnt - 1)) + pdmalbase_v[i].chl = 0x9; + else + pdmalbase_v[i].chl = 0x0000001; + pdmalbase_v[i].len = pshmbdbase_v[QUEUE_MASK(shm_tail + i)].len; + pdmalbase_v[i].slow = + lower_32_bits(pshmbdbase_v[QUEUE_MASK(shm_tail + i)].dma_p); + pdmalbase_v[i].shi = 0; + pdmalbase_v[i].dlow = + lower_32_bits(pbdbase_v[QUEUE_MASK(host_head + i)].dma_p); + pdmalbase_v[i].dhi = + upper_32_bits(pbdbase_v[QUEUE_MASK(host_head + i)].dma_p); + pdmalbase_v[i].len += off; + + pbdbase_v[QUEUE_MASK(host_head + i)].off = off; + pbdbase_v[QUEUE_MASK(host_head + i)].len = pdmalbase_v[i].len; + + len += pdmalbase_v[i].len; + +#ifdef __UT_TEST + if (g_testdma) { + struct sk_buff *skb = + pbdinfobase_v[QUEUE_MASK(host_head + i)].pdma_v; + + VETH_LOG(DLOG_ERROR, + "[%d][makebd-B2H]:chl=0x%x,len=%d,slow=0x%x,", + i, pdmalbase_v[i].chl, pdmalbase_v[i].len, + pdmalbase_v[i].slow); + VETH_LOG(DLOG_ERROR, + "shi=0x%x,dlow=0x%x,dhi=0x%x,skb=%p,", + pdmalbase_v[i].shi, pdmalbase_v[i].dlow, + pdmalbase_v[i].dhi, skb); + VETH_LOG(DLOG_ERROR, + "skb->data=%p,skb->len=%d,shm_tail+i=%d,", + skb->data, skb->len, + QUEUE_MASK(shm_tail + i)); + VETH_LOG(DLOG_ERROR, + "host_head+i=%d,off=%d\n", + QUEUE_MASK(host_head + i), off); + } +#endif + } + + pdmalbase_v[i].chl = 0x0000007; + pdmalbase_v[i].len = 0x0; + pdmalbase_v[i].slow = lower_32_bits((u64)prxtx_queue->pdmalbase_p); + pdmalbase_v[i].shi = upper_32_bits((u64)prxtx_queue->pdmalbase_p); + pdmalbase_v[i].dlow = 0; + pdmalbase_v[i].dhi = 0; + + prxtx_queue->dmal_cnt = cnt; + prxtx_queue->dmal_byte = len; + +#ifdef __UT_TEST + if (g_testdma) { + VETH_LOG(DLOG_ERROR, + "[END][makebd-B2H]:chl=0x%x,len=%d,slow=0x%x,", + pdmalbase_v[i].chl, pdmalbase_v[i].len, + pdmalbase_v[i].slow); + VETH_LOG(DLOG_ERROR, + "shi=0x%x,dmal_cnt=%d,dmal_dir=%d,dmal_byte=%d ", + pdmalbase_v[i].shi, cnt, type, len); + VETH_LOG(DLOG_ERROR, "pdmalbase_v=%p\n", pdmalbase_v); + } + +#endif + + return 0; +} + +s32 __start_dmalist_H(struct bspveth_rxtx_q *prxtx_queue, u32 cnt, u32 type) +{ + int ret = BSP_OK; + struct bma_dma_transfer_s dma_transfer = { 0 }; + + if (!prxtx_queue) + return -1; + + switch (type) { + case BSPVETH_RX: + ret = __make_dmalistbd_b2h_H(prxtx_queue, cnt, type); + if (ret) + goto failed; + dma_transfer.dir = BMC_TO_HOST; + + break; + + case BSPVETH_TX: + ret = __make_dmalistbd_h2b_H(prxtx_queue, cnt, type); + if (ret) + goto failed; + dma_transfer.dir = HOST_TO_BMC; + + break; + + default: + ret = -1; + goto failed; + } + + dma_transfer.type = DMA_LIST; + dma_transfer.transfer.list.dma_addr = + (dma_addr_t)prxtx_queue->pdmalbase_p; + + ret = bma_intf_start_dma(g_bspveth_dev.bma_priv, &dma_transfer); + if (ret < 0) + goto failed; + + prxtx_queue->start_dma = 1; + + return BSP_OK; + +failed: + return ret; +} + +int check_dma_queue_fault(struct bspveth_rxtx_q *prxtx_queue, + u32 queue, u32 type, u32 *pcnt) +{ + int ret = BSP_OK; + u32 cnt = 0; + + if (prxtx_queue->dma_overtime > BSPVETH_MAX_QUE_DEEP) + return -EFAULT; + + ret = __check_dmacmp_H(prxtx_queue, queue, type); + if (ret != BSP_OK) + return -EFAULT; + + ret = __checkspace_H(prxtx_queue, queue, type, &cnt); + if (ret != BSP_OK) + return -EFAULT; + + if (CHECK_DMA_RXQ_FAULT(prxtx_queue, type, cnt)) { + udelay(50); + prxtx_queue->dmal_cnt--; + + return -EFAULT; + } + + *pcnt = cnt; + + return BSP_OK; +} + +s32 __dma_rxtx_H(struct bspveth_rxtx_q *prxtx_queue, u32 queue, u32 type) +{ + int ret = BSP_OK; + u32 cnt = 0; + u32 shm_init; + struct bspveth_shmq_hd *pshmq_head = NULL; + + if (!prxtx_queue || !prxtx_queue->pshmqhd_v) + return BSP_ERR_NULL_POINTER; + + pshmq_head = prxtx_queue->pshmqhd_v; + shm_init = pshmq_head->init; + if (shm_init != BSPVETH_SHMQUEUE_INITOK) { + INC_STATIS_RXTX(queue, shmqueue_noinit, 1, type); + return -EFAULT; + } + + if (CHECK_DMA_QUEUE_EMPTY(type, prxtx_queue)) + return BSP_OK; + + ret = check_dma_queue_fault(prxtx_queue, queue, type, &cnt); + if (ret != BSP_OK) + return -EFAULT; + + ret = __start_dmalist_H(prxtx_queue, cnt, type); + if (ret != BSP_OK) + return -EFAULT; + + if (cnt <= 16) { + ret = __check_dmacmp_H(prxtx_queue, queue, type); + if (ret != BSP_OK) + return -EFAULT; + } + + return BSP_OK; +} + +int veth_dma_task_H(u32 type) +{ + int i; + struct bspveth_rxtx_q *prxtx_queue = NULL; + + for (i = 0; i < MAX_QUEUE_NUM; i++) { + if (type == BSPVETH_RX) { + g_bspveth_dev.run_dma_rx_task++; + prxtx_queue = g_bspveth_dev.prx_queue[i]; + } else { + g_bspveth_dev.run_dma_tx_task++; + prxtx_queue = g_bspveth_dev.ptx_queue[i]; + } + + if (prxtx_queue) { + struct bspveth_shmq_hd *pshmq_head = + prxtx_queue->pshmqhd_v; + (void)__dma_rxtx_H(prxtx_queue, i, type); + if ((type == BSPVETH_RX && + pshmq_head->head != pshmq_head->tail) || + (type == BSPVETH_TX && + prxtx_queue->head != prxtx_queue->tail)) + return BSP_ERR_AGAIN; + } + } + + return BSP_OK; +} + +#ifdef __UT_TEST + +s32 __atu_config_H(struct pci_dev *pdev, unsigned int region, + unsigned int hostaddr_h, unsigned int hostaddr_l, + unsigned int bmcaddr_h, unsigned int bmcaddr_l, + unsigned int len) +{ + (void)pci_write_config_dword(pdev, 0x900, + 0x80000000 + (region & 0x00000007)); + (void)pci_write_config_dword(pdev, 0x90c, hostaddr_l); + (void)pci_write_config_dword(pdev, 0x910, hostaddr_h); + (void)pci_write_config_dword(pdev, 0x914, hostaddr_l + len - 1); + (void)pci_write_config_dword(pdev, 0x918, bmcaddr_l); + (void)pci_write_config_dword(pdev, 0x91c, bmcaddr_h); + /* atu ctrl1 reg */ + (void)pci_write_config_dword(pdev, 0x904, 0x00000000); + /* atu ctrl2 reg */ + (void)pci_write_config_dword(pdev, 0x908, 0x80000000); + + return 0; +} + +void bspveth_atu_config_H(void) +{ + __atu_config_H(g_bspveth_dev.ppcidev, + REGION_HOST, + (sizeof(unsigned long) == SIZE_OF_UNSIGNED_LONG) ? + ((u64)(g_bspveth_dev.phostrtc_p) >> ADDR_H_SHIFT) : 0, + ((u64)(g_bspveth_dev.phostrtc_p) & 0xffffffff), + 0, HOSTRTC_REG_BASE, HOSTRTC_REG_SIZE); + + __atu_config_H(g_bspveth_dev.ppcidev, + REGION_BMC, + (sizeof(unsigned long) == SIZE_OF_UNSIGNED_LONG) ? + ((u64)(g_bspveth_dev.pshmpool_p) >> ADDR_H_SHIFT) : 0, + ((u64)(g_bspveth_dev.pshmpool_p) & 0xffffffff), + 0, VETH_SHAREPOOL_BASE_INBMC, VETH_SHAREPOOL_SIZE); +} + +void bspveth_pcie_free_H(void) +{ + struct pci_dev *pdev = g_bspveth_dev.ppcidev; + + if (pdev) + pci_disable_device(pdev); + else + VETH_LOG(DLOG_ERROR, "bspveth_dev.ppcidev IS NULL\n"); + + VETH_LOG(DLOG_DEBUG, "bspveth_pcie_exit_H ok\n"); +} + +#endif + +void bspveth_host_exit_H(void) +{ + int ret = 0; + + ret = bma_intf_unregister_type((void **)&g_bspveth_dev.bma_priv); + if (ret < 0) { + VETH_LOG(DLOG_ERROR, "bma_intf_unregister_type failed\n"); + + return; + } + + VETH_LOG(DLOG_DEBUG, "bspveth host exit H OK\n"); +} + +s32 bspveth_host_init_H(void) +{ + int ret = 0; + struct bma_priv_data_s *bma_priv = NULL; + + ret = bma_intf_register_type(TYPE_VETH, 0, INTR_ENABLE, + (void **)&bma_priv); + if (ret) { + ret = -1; + goto failed; + } + + if (!bma_priv) { + VETH_LOG(DLOG_ERROR, "bma_priv is NULL\n"); + return -1; + } + + VETH_LOG(DLOG_DEBUG, + "bma_intf_register_type pdev = %p, veth_swap_addr = %p, ", + bma_priv->specific.veth.pdev, + bma_priv->specific.veth.veth_swap_addr); + + VETH_LOG(DLOG_DEBUG, + "veth_swap_len = 0x%lx, veth_swap_phy_addr = 0x%lx\n", + bma_priv->specific.veth.veth_swap_len, + bma_priv->specific.veth.veth_swap_phy_addr); + + g_bspveth_dev.bma_priv = bma_priv; + g_bspveth_dev.ppcidev = bma_priv->specific.veth.pdev; + + /*bspveth_dev.phostrtc_p = (u8 *)bar1_base;*/ + /*bspveth_dev.phostrtc_v = (u8 *)bar1_remap;*/ + g_bspveth_dev.pshmpool_p = + (u8 *)bma_priv->specific.veth.veth_swap_phy_addr; + g_bspveth_dev.pshmpool_v = + (u8 *)bma_priv->specific.veth.veth_swap_addr; + g_bspveth_dev.shmpoolsize = bma_priv->specific.veth.veth_swap_len; + + VETH_LOG(DLOG_DEBUG, "bspveth host init H OK\n"); + + return BSP_OK; + +failed: + return ret; +} + +static int __init veth_init(void) +{ + int ret = BSP_OK; + int buf_len = 0; + + if (!bma_intf_check_edma_supported()) + return -ENXIO; + + memset(&g_bspveth_dev, 0, sizeof(g_bspveth_dev)); + + buf_len = snprintf(g_bspveth_dev.name, NET_NAME_LEN, + "%s", BSPVETH_DEV_NAME); + if (buf_len < 0 || ((u32)buf_len >= (NET_NAME_LEN))) { + VETH_LOG(DLOG_ERROR, "BSP_SNPRINTF lRet =0x%x\n", buf_len); + return BSP_ERR_INVALID_STR; + } + + ret = bspveth_host_init_H(); + if (ret != BSP_OK) { + ret = -1; + goto failed1; + } + + ret = veth_netdev_init(); + if (ret != BSP_OK) { + ret = -1; + goto failed2; + } + + GET_SYS_SECONDS(g_bspveth_dev.init_time); + + return BSP_OK; + +failed2: + bspveth_host_exit_H(); + +failed1: + + return ret; +} + +static void __exit veth_exit(void) +{ + veth_netdev_exit(); + + bspveth_host_exit_H(); +} + +MODULE_AUTHOR("HUAWEI TECHNOLOGIES CO., LTD."); +MODULE_DESCRIPTION("HUAWEI VETH DRIVER"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(VETH_VERSION); + +module_init(veth_init); +module_exit(veth_exit); diff --git a/drivers/net/ethernet/huawei/bma/veth_drv/veth_hb.h b/drivers/net/ethernet/huawei/bma/veth_drv/veth_hb.h new file mode 100644 index 000000000000..9a4d699e6421 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/veth_drv/veth_hb.h @@ -0,0 +1,440 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/*Huawei iBMA driver. + *Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + *This program is free software; you can redistribute it and/or + *modify it under the terms of the GNU General Public License + *as published by the Free Software Foundation; either version 2 + *of the License, or (at your option) any later version. + * + *This program is distributed in the hope that it will be useful, + *but WITHOUT ANY WARRANTY; without even the implied warranty of + *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + *GNU General Public License for more details. + * + */ + +#ifndef _VETH_HB_H_ +#define _VETH_HB_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <linux/interrupt.h> + +#define DEP_BMA + +#include "../edma_drv/bma_include.h" +#include "../include/bma_ker_intf.h" + +#ifdef DRV_VERSION +#define VETH_VERSION MICRO_TO_STR(DRV_VERSION) +#else +#define VETH_VERSION "0.3.4" +#endif + +#define MODULE_NAME "veth" +#define BSP_VETH_T u64 + +#define BSP_OK (0) +#define BSP_ERR (0xFFFFFFFF) +#define BSP_NETDEV_TX_BUSY (1) +#define BSP_ERR_INIT_ERR (BSP_NETDEV_TX_BUSY) +#define BSP_ETH_ERR_BASE (0x0FFFF000) +#define BSP_ERR_OUT_OF_MEM (BSP_ETH_ERR_BASE + 1) +#define BSP_ERR_NULL_POINTER (BSP_ETH_ERR_BASE + 2) +#define BSP_ERR_INVALID_STR (BSP_ETH_ERR_BASE + 3) +#define BSP_ERR_INVALID_PARAM (BSP_ETH_ERR_BASE + 4) +#define BSP_ERR_INVALID_DATA (BSP_ETH_ERR_BASE + 5) +#define BSP_ERR_OUT_OF_RANGE (BSP_ETH_ERR_BASE + 6) +#define BSP_ERR_INVALID_CARD (BSP_ETH_ERR_BASE + 7) +#define BSP_ERR_INVALID_GRP (BSP_ETH_ERR_BASE + 8) +#define BSP_ERR_INVALID_ETH (BSP_ETH_ERR_BASE + 9) +#define BSP_ERR_SEND_ERR (BSP_ETH_ERR_BASE + 10) +#define BSP_ERR_DMA_ERR (BSP_ETH_ERR_BASE + 11) +#define BSP_ERR_RECV_ERR (BSP_ETH_ERR_BASE + 12) +#define BSP_ERR_SKB_ERR (BSP_ETH_ERR_BASE + 13) +#define BSP_ERR_DMA_ADDR_ERR (BSP_ETH_ERR_BASE + 14) +#define BSP_ERR_IOREMAP_ERR (BSP_ETH_ERR_BASE + 15) +#define BSP_ERR_LEN_ERR (BSP_ETH_ERR_BASE + 16) +#define BSP_ERR_STAT_ERR (BSP_ETH_ERR_BASE + 17) +#define BSP_ERR_AGAIN (BSP_ETH_ERR_BASE + 18) +#define BSP_ERR_NOT_TO_HANDLE (BSP_ETH_ERR_BASE + 19) + +#define VETH_H2B_IRQ_NO (113) +#define SYSCTL_REG_BASE (0x20000000) +#define SYSCTL_REG_SIZE (0x1000) +#define PCIE1_REG_BASE (0x29000000) +#define PCIE1_REG_SIZE (0x1000) +#define VETH_SHAREPOOL_BASE_INBMC (0x84820000) +#define VETH_SHAREPOOL_SIZE (0xdf000) +#define VETH_SHAREPOOL_OFFSET (0x10000) +#define MAX_SHAREQUEUE_SIZE (0x20000) + +#define BSPVETH_SHMBDBASE_OFFSET (0x80) +#define SHMDMAL_OFFSET (0x10000) +#define MAX_SHMDMAL_SIZE (BSPVETH_DMABURST_MAX * 32) + +#define BSPVETH_DMABURST_MAX 64 +#define BSPVETH_SKBTIMER_INTERVAL (1) +#define BSPVETH_DMATIMER_INTERVAL (1) +#define BSPVETH_CTLTIMER_INTERVAL (10) +#define BSPVETH_HDCMD_CHKTIMER_INTERVAL (10) +#define BSP_DMA_64BIT_MASK (0xffffffffffffffffULL) +#define BSP_DMA_32BIT_MASK (0x00000000ffffffffULL) +#define HOSTRTC_REG_BASE (0x2f000000) +#define HOSTRTC_REG_SIZE (0x10000) +#define REG_SYSCTL_HOSTINT_CLEAR (0x44) +#define SHIFT_SYSCTL_HOSTINT_CLEAR (22) +#define REG_SYSCTL_HOSTINT (0xf4) +#define SHIFT_SYSCTL_HOSTINT (26) + +#define NET_TYPE_LEN (16) + +#define MAX_QUEUE_NUM (1) +#define MAX_QUEUE_BDNUM (128) +#define BSPVETH_MAX_QUE_DEEP (MAX_QUEUE_BDNUM) +#define BSPVETH_POINT_MASK (MAX_QUEUE_BDNUM - 1) +#define BSPVETH_WORK_LIMIT (64) +#define BSPVETH_CHECK_DMA_STATUS_TIMES (120) + +#define REG_PCIE1_DMAREAD_ENABLE (0xa18) +#define SHIFT_PCIE1_DMAREAD_ENABLE (0) +#define REG_PCIE1_DMAWRITE_ENABLE (0x9c4) +#define SHIFT_PCIE1_DMAWRITE_ENABLE (0) +#define REG_PCIE1_DMAREAD_STATUS (0xa10) +#define SHIFT_PCIE1_DMAREAD_STATUS (0) +#define REG_PCIE1_DMAREADINT_CLEAR (0xa1c) +#define SHIFT_PCIE1_DMAREADINT_CLEAR (0) +#define REG_PCIE1_DMAWRITE_STATUS (0x9bc) +#define SHIFT_PCIE1_DMAWRITE_STATUS (0) +#define REG_PCIE1_DMAWRITEINT_CLEAR (0x9c8) +#define SHIFT_PCIE1_DMAWRITEINT_CLEAR (0) + +#define BSPVETH_DMA_OK (1) +#define BSPVETH_DMA_BUSY (0) +#define BSPVETH_RX (2) +#define BSPVETH_TX (3) +#define HOSTRTC_INT_OFFSET (0x10) +#define BSPVETH_DEV_NAME (MODULE_NAME) +#define NET_NAME_LEN (64) + +#ifdef PCI_VENDOR_ID_HUAWEI +#undef PCI_VENDOR_ID_HUAWEI +#endif +#define PCI_VENDOR_ID_HUAWEI (0x19e5) + +#define PCI_DEVICE_ID_KBOX (0x1710) +#define BSPVETH_MTU_MAX (1500) +#define BSPVETH_MTU_MIN (64) +#define BSPVETH_SKB_SIZE (1536) +#define BSPVETH_NET_TIMEOUT (5 * HZ) +#define BSPVETH_QUEUE_TIMEOUT_10MS (100) +#define BSPVETH_SHMQUEUE_INITOK (0x12) +#define BSPVETH_LBK_TYPE (0x800) + +#ifndef VETH_BMC +#define BSPVETH_CACHELINE_SIZE (64) +#else +#define BSPVETH_CACHELINE_SIZE (32) +#endif +#define BSPVETH_HBCMD_WCMP (0x44) +#define BSPVETH_HBCMD_CMP (0x55) +#define BSPVETH_HBCMD_OK (0x66) +#define BSPVETH_HEART_WACK (0x99) +#define BSPVETH_HEART_ACK (0xaa) + +#define BSPVETH_HBCMD_TIMEOUT (1000) + +#define SIZE_OF_UNSIGNED_LONG 8 +#define ADDR_H_SHIFT 32 +#define REGION_HOST 1 +#define REGION_BMC 2 + +enum veth_hb_cmd { + VETH_HBCMD_UNKNOWN = 0x0, + VETH_HBCMD_SETIP, + + VETH_HBCMD_MAX, +}; + +#define USE_TASKLET + +#define BSPVETH_ETHTOOL_BASE 0x89F0 +#define BSPVETH_ETHTOOL_TESTINT (BSPVETH_ETHTOOL_BASE + 1) +#define BSPVETH_ETHTOOL_TESTSHAREMEM (BSPVETH_ETHTOOL_BASE + 2) +#define BSPVETH_ETHTOOL_DUMPSHAREMEM (BSPVETH_ETHTOOL_BASE + 3) +#define BSPVETH_ETHTOOL_TESTDMA (BSPVETH_ETHTOOL_BASE + 4) +#define BSPVETH_ETHTOOL_RWPCIEREG (BSPVETH_ETHTOOL_BASE + 5) +#define BSPVETH_ETHTOOL_TESTLBK (BSPVETH_ETHTOOL_BASE + 6) +#define BSPVETH_ETHTOOL_INITSTATIS (BSPVETH_ETHTOOL_BASE + 7) +#define BSPVETH_HBCMD (BSPVETH_ETHTOOL_BASE + 8) + +struct bspveth_test { + u32 intdirect; /*0--H2B,1--B2H*/ + u32 rwshmcheck; /*0--w,1--r and check*/ + u32 dshmbase; + u32 dshmlen; + u32 testdma; /*0--disable,1---enable*/ + u32 pcierw; /*0--w,1---r*/ + u32 reg; + u32 data; + u32 testlbk; /*0--disable,1---enable*/ +}; + +struct bspveth_hdcmd { + u32 cmd; + u32 stat; + u32 heart; + u32 err; + u32 sequence; + u32 len; + u8 data[256]; +}; + +struct bspveth_rxtx_statis { + u64 pkt; + u64 pktbyte; + u64 refill; + u64 freetx; + u64 dmapkt; + u64 dmapktbyte; + + u32 dropped_pkt; + u32 netifrx_err; + u32 null_point; + u32 retry_err; + u32 dma_mapping_err; + u32 allocskb_err; + u32 q_full; + u32 q_emp; + u32 shm_full; + u32 shm_emp; + u32 dma_busy; + u32 need_fill; + u32 need_free; + u32 dmacmp_err; + u32 type_err; + u32 shmqueue_noinit; + u32 shmretry_err; + u32 dma_earlyint; + u32 clr_dma_earlyint; + u32 clr_dma_int; + u32 dmarx_shmaddr_unalign; + u32 dmarx_hostaddr_unalign; + u32 dmatx_shmaddr_unalign; + u32 dmatx_hostaddr_unalign; + u32 dma_need_offset; + u32 lastdmadir_err; + u32 dma_failed; + u32 dma_burst; + u32 lbk_cnt; + u32 lbk_txerr; +}; + +struct bspveth_bd_info { + struct sk_buff *pdma_v; + u32 len; + unsigned long time_stamp; +}; + +struct bspveth_dma_shmbd { + u32 dma_p; + u32 len; + u32 off; +}; + +struct bspveth_shmq_hd { + u32 count; + u32 size; /*count x sizeof(dmaBD)*/ + u32 next_to_fill; + u32 next_to_free; + u32 head; + u32 tail; + u16 init; /* 1--ok,0--nok*/ +}; + +struct bspveth_dma_bd { + u64 dma_p; + u32 len; + u32 off; +}; + +struct bspveth_dmal { + u32 chl; + u32 len; + u32 slow; + u32 shi; + u32 dlow; + u32 dhi; +}; + +struct bspveth_rxtx_q { +#ifndef VETH_BMC + struct bspveth_dma_bd *pbdbase_v; + u8 *pbdbase_p; +#endif + + struct bspveth_bd_info *pbdinfobase_v; + struct bspveth_shmq_hd *pshmqhd_v; + u8 *pshmqhd_p; + + struct bspveth_dma_shmbd *pshmbdbase_v; + u8 *pshmbdbase_p; + + struct bspveth_dmal *pdmalbase_v; + u8 *pdmalbase_p; + + u32 dmal_cnt; + u32 dmal_byte; + + u32 count; + u32 size; + u32 rx_buf_len; + + u32 next_to_fill; + u32 next_to_free; + u32 head; + u32 tail; + u16 start_dma; + u16 dmacmperr; + + u16 dma_overtime; + + u32 work_limit; + struct bspveth_rxtx_statis s; +}; + +struct bspveth_device { + struct bspveth_rxtx_q *ptx_queue[MAX_QUEUE_NUM]; + struct bspveth_rxtx_q *prx_queue[MAX_QUEUE_NUM]; + struct net_device *pnetdev; + char name[NET_NAME_LEN]; + + struct pci_dev *ppcidev; + u8 *phostrtc_p; + u8 *phostrtc_v; + + u8 *psysctl_v; + u8 *ppcie1_v; + + u8 *pshmpool_p; + u8 *pshmpool_v; + u32 shmpoolsize; + + u32 recv_int; + u32 tobmc_int; + u32 tohost_int; + u32 run_dma_tx_task; + u32 run_dma_rx_task; + u32 run_skb_rx_task; + u32 run_skb_fr_task; + u32 shutdown_cnt; + __kernel_time_t init_time; + + /* spinlock for register */ + spinlock_t reg_lock; +#ifndef USE_TASKLET + struct timer_list skbtrtimer; + struct timer_list dmatimer; +#else + struct tasklet_struct skb_task; + struct tasklet_struct dma_task; +#endif + + struct net_device_stats stats; + struct work_struct shutdown_task; +#ifdef DEP_BMA + struct bma_priv_data_s *bma_priv; +#else + void *edma_priv; +#endif +}; + +struct tag_pcie_comm_priv { + char net_type[NET_TYPE_LEN]; + struct net_device_stats stats; + int status; + int irq_enable; + int pcie_comm_rx_flag; + spinlock_t lock; /* spinlock for priv data */ +}; + +#define QUEUE_MASK(p) ((p) & (BSPVETH_POINT_MASK)) + +#define CHECK_ADDR_ALIGN(addr, statis)\ +do { \ + if ((addr) & 0x3) \ + statis;\ +} while (0) + +#define PROC_P_STATIS(name, statis)\ + PROC_DPRINTK("[%10s]:\t0x%llx", #name, statis) + +#define INC_STATIS_RXTX(queue, name, count, type) \ +do { \ + if (type == BSPVETH_RX)\ + g_bspveth_dev.prx_queue[queue]->s.name += count;\ + else\ + g_bspveth_dev.ptx_queue[queue]->s.name += count;\ +} while (0) + +#define PROC_DPRINTK(fmt, args...) (len += sprintf(buf + len, fmt, ##args)) + +#define JUDGE_TX_QUEUE_SPACE(head, tail, len) \ + (((BSPVETH_MAX_QUE_DEEP + (tail) - (head) - 1) \ + & BSPVETH_POINT_MASK) >= (len)) + +#define JUDGE_RX_QUEUE_SPACE(head, tail, len) \ + (((BSPVETH_MAX_QUE_DEEP + (tail) - (head)) \ + & BSPVETH_POINT_MASK) > (len)) + +#ifndef VETH_BMC +#define BSPVETH_UNMAP_DMA(data, len) \ + dma_unmap_single(&g_bspveth_dev.ppcidev->dev, \ + data, len, DMA_FROM_DEVICE) +#else +#define BSPVETH_UNMAP_DMA(data, len) \ + dma_unmap_single(NULL, data, len, DMA_FROM_DEVICE) +#endif + +int veth_tx(struct sk_buff *pstr_skb, struct net_device *pstr_dev); +int veth_dma_task_H(u32 type); +s32 veth_skbtimer_close(void); +void veth_skbtimer_init(void); +s32 veth_dmatimer_close_H(void); +void veth_dmatimer_init_H(void); +int veth_skb_tr_task(void); + +s32 __dma_rxtx_H(struct bspveth_rxtx_q *prxtx_queue, u32 queue, u32 type); +s32 veth_recv_pkt(struct bspveth_rxtx_q *prx_queue, int queue); +s32 veth_free_txskb(struct bspveth_rxtx_q *ptx_queue, int queue); + +enum { + QUEUE_TX_STATS, + QUEUE_RX_STATS, + VETH_STATS, + SHMQ_TX_STATS, + SHMQ_RX_STATS, + NET_STATS, +}; + +struct veth_stats { + char stat_string[ETH_GSTRING_LEN]; + int type; + int sizeof_stat; + int stat_offset; +}; + +#define VETH_STAT_SIZE(m) sizeof(((struct bspveth_device *)0)->m) +#define VETH_STAT_OFFSET(m) offsetof(struct bspveth_device, m) +#define QUEUE_TXRX_STAT_SIZE(m) sizeof(((struct bspveth_rxtx_q *)0)->m) +#define QUEUE_TXRX_STAT_OFFSET(m) offsetof(struct bspveth_rxtx_q, m) +#define SHMQ_TXRX_STAT_SIZE(m) sizeof(((struct bspveth_shmq_hd *)0)->m) +#define SHMQ_TXRX_STAT_OFFSET(m) offsetof(struct bspveth_shmq_hd, m) + +#ifdef __cplusplus +} +#endif +#endif
From: Qi Deng dengqi7@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4ETXO CVE: NA
-----------------------------------------
The BMA software is a system management software offered by Huawei. It supports the status monitoring, performance monitoring, and event monitoring of various components, including server CPUs, memory, hard disks, NICs, IB cards, PCIe cards, RAID controller cards, and optical modules.
The host_kbox_drv driver serves the function of a black box. When a panic or mce event happen to the system, it will record the event time, system's status and system logs and send them to BMC before the OS shutdown. This driver depends on the host_edms_drv driver.
Link: https://lkml.org/lkml/2020/6/22/752
Signed-off-by: Qi Deng dengqi7@huawei.com Reviewed-by: Qidong Wang wangqindong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/huawei/bma/Makefile | 1 + .../net/ethernet/huawei/bma/kbox_drv/Makefile | 5 + .../ethernet/huawei/bma/kbox_drv/kbox_dump.c | 121 +++ .../ethernet/huawei/bma/kbox_drv/kbox_dump.h | 33 + .../ethernet/huawei/bma/kbox_drv/kbox_hook.c | 101 ++ .../ethernet/huawei/bma/kbox_drv/kbox_hook.h | 33 + .../huawei/bma/kbox_drv/kbox_include.h | 40 + .../ethernet/huawei/bma/kbox_drv/kbox_main.c | 176 ++++ .../ethernet/huawei/bma/kbox_drv/kbox_main.h | 23 + .../ethernet/huawei/bma/kbox_drv/kbox_mce.c | 264 +++++ .../ethernet/huawei/bma/kbox_drv/kbox_mce.h | 23 + .../ethernet/huawei/bma/kbox_drv/kbox_panic.c | 187 ++++ .../ethernet/huawei/bma/kbox_drv/kbox_panic.h | 25 + .../huawei/bma/kbox_drv/kbox_printk.c | 363 +++++++ .../huawei/bma/kbox_drv/kbox_printk.h | 33 + .../huawei/bma/kbox_drv/kbox_ram_drive.c | 188 ++++ .../huawei/bma/kbox_drv/kbox_ram_drive.h | 31 + .../huawei/bma/kbox_drv/kbox_ram_image.c | 135 +++ .../huawei/bma/kbox_drv/kbox_ram_image.h | 84 ++ .../huawei/bma/kbox_drv/kbox_ram_op.c | 986 ++++++++++++++++++ .../huawei/bma/kbox_drv/kbox_ram_op.h | 77 ++ 21 files changed, 2929 insertions(+) create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/Makefile create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_dump.c create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_dump.h create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_hook.c create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_hook.h create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_include.h create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_main.c create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_main.h create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_mce.c create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_mce.h create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_panic.c create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_panic.h create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_printk.c create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_printk.h create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_drive.c create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_drive.h create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_image.c create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_image.h create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_op.c create mode 100644 drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_op.h
diff --git a/drivers/net/ethernet/huawei/bma/Makefile b/drivers/net/ethernet/huawei/bma/Makefile index 6f452aa96a69..40fd3a18c34f 100644 --- a/drivers/net/ethernet/huawei/bma/Makefile +++ b/drivers/net/ethernet/huawei/bma/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_BMA) += edma_drv/ obj-$(CONFIG_BMA) += cdev_drv/ obj-$(CONFIG_BMA) += veth_drv/ +obj-$(CONFIG_BMA) += kbox_drv/ diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/Makefile b/drivers/net/ethernet/huawei/bma/kbox_drv/Makefile new file mode 100644 index 000000000000..7d908d5b7f4b --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_BMA) += host_kbox_drv.o +host_kbox_drv-y := kbox_main.o kbox_ram_drive.o kbox_ram_image.o kbox_ram_op.o kbox_printk.o kbox_dump.o kbox_hook.o kbox_panic.o +ifdef CONFIG_X86 +host_kbox_drv-y += kbox_mce.o +endif \ No newline at end of file diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_dump.c b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_dump.c new file mode 100644 index 000000000000..1f3a73ca9d1f --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_dump.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/spinlock.h> +#include <linux/utsname.h> /* system_utsname */ +#include <linux/rtc.h> /* struct rtc_time */ +#include "kbox_include.h" +#include "kbox_main.h" +#include "kbox_printk.h" +#include "kbox_ram_image.h" +#include "kbox_ram_op.h" +#include "kbox_dump.h" +#include "kbox_panic.h" + +#ifdef CONFIG_X86 +#include "kbox_mce.h" +#endif + +#define THREAD_TMP_BUF_SIZE 256 + +static DEFINE_SPINLOCK(g_dump_lock); + +static const char g_day_in_month[] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + +#define LEAPS_THRU_END_OF(y) ((y) / 4 - (y) / 100 + (y) / 400) +#define LEAP_YEAR(year) \ + ((!((year) % 4) && ((year) % 100)) || !((year) % 400)) +#define MONTH_DAYS(month, year) \ + (g_day_in_month[(month)] + (int)(LEAP_YEAR(year) && (month == 1))) + +static void kbox_show_kernel_version(void) +{ + (void)kbox_dump_painc_info + ("\nOS : %s,\nRelease : %s,\nVersion : %s,\n", + init_uts_ns.name.sysname, + init_uts_ns.name.release, + init_uts_ns.name.version); + (void)kbox_dump_painc_info + ("Machine : %s,\nNodename : %s\n", + init_uts_ns.name.machine, + init_uts_ns.name.nodename); +} + +static void kbox_show_version(void) +{ + (void)kbox_dump_painc_info("\nKBOX_VERSION : %s\n", + KBOX_VERSION); +} + +static void kbox_show_time_stamps(void) +{ + struct rtc_time rtc_time_val = { }; + struct timespec64 uptime; + + ktime_get_coarse_real_ts64(&uptime); + rtc_time64_to_tm(uptime.tv_sec, &rtc_time_val); + + (void)kbox_dump_painc_info + ("Current time : %04d-%02d-%02d %02d:%02d:%02d\n", + rtc_time_val.tm_year + 1900, rtc_time_val.tm_mon + 1, + rtc_time_val.tm_mday, rtc_time_val.tm_hour, + rtc_time_val.tm_min, rtc_time_val.tm_sec); +} + +void kbox_dump_event(enum kbox_error_type_e type, unsigned long event, + const char *msg) +{ + if (!spin_trylock(&g_dump_lock)) + return; + + (void)kbox_dump_painc_info("\n====kbox begin dumping...====\n"); + + switch (type) { +#ifdef CONFIG_X86 + case KBOX_MCE_EVENT: + + kbox_handle_mce_dump(msg); + + break; +#endif + + case KBOX_OPPS_EVENT: + + break; + case KBOX_PANIC_EVENT: + if (kbox_handle_panic_dump(msg) == KBOX_FALSE) + goto end; + + break; + default: + break; + } + + kbox_show_kernel_version(); + + kbox_show_version(); + + kbox_show_time_stamps(); + + (void)kbox_dump_painc_info("\n====kbox end dump====\n"); + + kbox_output_syslog_info(); + kbox_output_printk_info(); + +end: + spin_unlock(&g_dump_lock); +} diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_dump.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_dump.h new file mode 100644 index 000000000000..cba31377fbf3 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_dump.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_DUMP_H_ +#define _KBOX_DUMP_H_ + +#define DUMPSTATE_MCE_RESET 1 +#define DUMPSTATE_OPPS_RESET 2 +#define DUMPSTATE_PANIC_RESET 3 + +enum kbox_error_type_e { + KBOX_MCE_EVENT = 1, + KBOX_OPPS_EVENT, + KBOX_PANIC_EVENT +}; + +int kbox_dump_thread_info(const char *fmt, ...); +void kbox_dump_event(enum kbox_error_type_e type, unsigned long event, + const char *msg); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_hook.c b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_hook.c new file mode 100644 index 000000000000..b2acdf24188b --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_hook.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/notifier.h> +#include "kbox_include.h" +#include "kbox_dump.h" +#include "kbox_hook.h" + +int panic_notify(struct notifier_block *this, + unsigned long event, void *msg); + +static int die_notify(struct notifier_block *self, + unsigned long val, void *data); + +static struct notifier_block g_panic_nb = { + .notifier_call = panic_notify, + .priority = 100, +}; + +static struct notifier_block g_die_nb = { + .notifier_call = die_notify, +}; + +int panic_notify(struct notifier_block *pthis, unsigned long event, void *msg) +{ + UNUSED(pthis); + UNUSED(event); + + kbox_dump_event(KBOX_PANIC_EVENT, DUMPSTATE_PANIC_RESET, + (const char *)msg); + + return NOTIFY_OK; +} + +int die_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct kbox_die_args *args = (struct kbox_die_args *)data; + + if (!args) + return NOTIFY_OK; + + switch (val) { + case 1: + break; + case 5: + if (strcmp(args->str, "nmi") == 0) + return NOTIFY_OK; +#ifdef CONFIG_X86 + kbox_dump_event(KBOX_MCE_EVENT, DUMPSTATE_MCE_RESET, args->str); +#endif + break; + + default: + break; + } + + return NOTIFY_OK; +} + +int kbox_register_hook(void) +{ + int ret = 0; + + ret = atomic_notifier_chain_register(&panic_notifier_list, &g_panic_nb); + if (ret) + KBOX_MSG("atomic_notifier_chain_register g_panic_nb failed!\n"); + + ret = register_die_notifier(&g_die_nb); + if (ret) + KBOX_MSG("register_die_notifier g_die_nb failed!\n"); + + return ret; +} + +void kbox_unregister_hook(void) +{ + int ret = 0; + + ret = + atomic_notifier_chain_unregister(&panic_notifier_list, &g_panic_nb); + if (ret < 0) { + KBOX_MSG + ("atomic_notifier_chain_unregister g_panic_nb failed!\n"); + } + + ret = unregister_die_notifier(&g_die_nb); + if (ret < 0) + KBOX_MSG("unregister_die_notifier g_die_nb failed!\n"); +} diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_hook.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_hook.h new file mode 100644 index 000000000000..00b3deb510b7 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_hook.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_PANIC_HOOK_H_ +#define _KBOX_PANIC_HOOK_H_ + +struct kbox_die_args { + struct pt_regs *regs; + const char *str; + long err; + int trapnr; + int signr; +}; + +int register_die_notifier(struct notifier_block *nb); +int unregister_die_notifier(struct notifier_block *nb); + +int kbox_register_hook(void); +void kbox_unregister_hook(void); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_include.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_include.h new file mode 100644 index 000000000000..ffadf3727734 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_include.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_INCLUDE_H_ +#define _KBOX_INCLUDE_H_ + +#include <linux/kernel.h> +#include <linux/version.h> +#include <linux/netdevice.h> + +#ifdef DRV_VERSION +#define KBOX_VERSION MICRO_TO_STR(DRV_VERSION) +#else +#define KBOX_VERSION "0.3.4" +#endif + +#define UNUSED(x) (x = x) +#define KBOX_FALSE (-1) +#define KBOX_TRUE 0 + +#ifdef KBOX_DEBUG +#define KBOX_MSG(fmt, args...) \ + netdev_notice(0, "kbox: %s(), %d, " fmt, __func__, __LINE__, ## args) +#else +#define KBOX_MSG(fmt, args...) +#endif + +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_main.c b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_main.c new file mode 100644 index 000000000000..32beb9b201a0 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_main.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/processor.h> /* for rdmsr and MSR_IA32_MCG_STATUS */ +#include <linux/fs.h> /* everything... */ +#include <linux/file.h> /* for fput */ +#include <linux/proc_fs.h> +#include <linux/uaccess.h> /* copy_*_user */ +#include <linux/version.h> +#include "kbox_include.h" +#include "kbox_panic.h" +#include "kbox_main.h" +#include "kbox_printk.h" +#include "kbox_ram_image.h" +#include "kbox_ram_op.h" +#include "kbox_dump.h" +#include "kbox_hook.h" +#include "kbox_ram_drive.h" + +#ifdef CONFIG_X86 +#include <asm/msr.h> +#include "kbox_mce.h" +#endif + +#define KBOX_LOADED_FILE ("/proc/kbox") + +#define KBOX_ROOT_ENTRY_NAME ("kbox") + +static int kbox_is_loaded(void) +{ + struct file *fp = NULL; + + #ifdef set_fs + mm_segment_t old_fs; + + old_fs = get_fs(); /* save old flag */ + set_fs(KERNEL_DS); /* mark data from kernel space */ + #endif + + fp = filp_open(KBOX_LOADED_FILE, O_RDONLY, 0); + + if (IS_ERR(fp)) { + #ifdef set_fs + set_fs(old_fs); + #endif + + return KBOX_FALSE; + } + + (void)filp_close(fp, NULL); + + #ifdef set_fs + set_fs(old_fs); /* restore old flag */ + #endif + + return KBOX_TRUE; +} + +static int kbox_printk_proc_init(void) +{ + struct proc_dir_entry *kbox_entry = NULL; + + if (kbox_is_loaded() != KBOX_TRUE) { + kbox_entry = proc_mkdir(KBOX_ROOT_ENTRY_NAME, NULL); + if (!kbox_entry) { + KBOX_MSG("can not create %s entry\n", + KBOX_ROOT_ENTRY_NAME); + return -ENOMEM; + } + } + + return KBOX_TRUE; +} + +int __init kbox_init(void) +{ + int ret = KBOX_TRUE; + int kbox_proc_exist = 0; + + if (!kbox_get_base_phy_addr()) + return -ENXIO; + + ret = kbox_super_block_init(); + if (ret) { + KBOX_MSG("kbox_super_block_init failed!\n"); + return ret; + } + + if (kbox_is_loaded() == KBOX_TRUE) + kbox_proc_exist = 1; + + ret = kbox_printk_init(kbox_proc_exist); + if (ret) + KBOX_MSG("kbox_printk_init failed!\n"); + + ret = kbox_panic_init(); + if (ret) { + KBOX_MSG("kbox_panic_init failed!\n"); + goto fail1; + } + + ret = kbox_register_hook(); + if (ret) { + KBOX_MSG("kbox_register_hook failed!\n"); + goto fail2; + } + +#ifdef CONFIG_X86 + (void)kbox_mce_init(); +#endif + ret = kbox_read_super_block(); + if (ret) { + KBOX_MSG("update super block failed!\n"); + goto fail3; + } + + if (kbox_printk_proc_init() != 0) { + KBOX_MSG("kbox_printk_proc_init failed!\n"); + goto fail4; + } + + ret = kbox_drive_init(); + if (ret) { + KBOX_MSG("kbox_drive_init failed!\n"); + goto fail5; + } + + return KBOX_TRUE; + +fail5: +fail4: +fail3: +#ifdef CONFIG_X86 + kbox_mce_exit(); +#endif + kbox_unregister_hook(); +fail2: + kbox_panic_exit(); +fail1: + kbox_printk_exit(); + + return ret; +} + +void __exit kbox_cleanup(void) +{ + kbox_drive_cleanup(); +#ifdef CONFIG_X86 + kbox_mce_exit(); +#endif + kbox_unregister_hook(); + kbox_panic_exit(); + kbox_printk_exit(); +} + +MODULE_AUTHOR("HUAWEI TECHNOLOGIES CO., LTD."); +MODULE_DESCRIPTION("HUAWEI KBOX DRIVER"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(KBOX_VERSION); +#ifndef _lint +module_init(kbox_init); +module_exit(kbox_cleanup); +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_main.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_main.h new file mode 100644 index 000000000000..2ae02b736529 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_main.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_MAIN_H_ +#define _KBOX_MAIN_H_ + +#include "../edma_drv/bma_include.h" +int kbox_init(void); +void kbox_cleanup(void); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_mce.c b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_mce.c new file mode 100644 index 000000000000..e9bd931b826e --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_mce.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/version.h> +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/smp.h> +#include <linux/notifier.h> +#include <asm/mce.h> +#include <asm/msr.h> + +#include "kbox_include.h" +#include "kbox_mce.h" +#include "kbox_dump.h" +#include "kbox_printk.h" +#include "kbox_panic.h" + +enum context { + KBOX_IN_KERNEL = 1, KBOX_IN_USER = 2 +}; + +enum ser { + KBOX_SER_REQUIRED = 1, KBOX_NO_SER = 2 +}; + +enum severity_level { + KBOX_MCE_NO_SEVERITY, + KBOX_MCE_KEEP_SEVERITY, + KBOX_MCE_SOME_SEVERITY, + KBOX_MCE_AO_SEVERITY, + KBOX_MCE_UC_SEVERITY, + KBOX_MCE_AR_SEVERITY, + KBOX_MCE_PANIC_SEVERITY, +}; + +static struct severity { + u64 kbox_mask; + u64 kbox_result; + unsigned char kbox_sev; + unsigned char kbox_mcgmask; + unsigned char kbox_mcgres; + unsigned char kbox_ser; + unsigned char kbox_context; + unsigned char kbox_covered; + char *kbox_msg; +} kbox_severities[] = { +#define KBOX_KERNEL .kbox_context = KBOX_IN_KERNEL +#define KBOX_USER .kbox_context = KBOX_IN_USER +#define KBOX_SER .kbox_ser = KBOX_SER_REQUIRED +#define KBOX_NOSER .kbox_ser = KBOX_NO_SER +#define KBOX_SEV(s) .kbox_sev = KBOX_MCE_ ## s ## _SEVERITY +#define KBOX_BITCLR(x, s, m, r...) \ + { .kbox_mask = x, .kbox_result = 0, KBOX_SEV(s), .kbox_msg = m, ## r } +#define KBOX_BITSET(x, s, m, r...) \ + { .kbox_mask = x, .kbox_result = x, KBOX_SEV(s), .kbox_msg = m, ## r } +#define KBOX_MCGMASK(x, res, s, m, r...) \ + { .kbox_mcgmask = x, .kbox_mcgres = res, KBOX_SEV(s), \ + .kbox_msg = m, ## r } +#define KBOX_MASK(x, y, s, m, r...) \ + { .kbox_mask = x, .kbox_result = y, KBOX_SEV(s), .kbox_msg = m, ## r } +#define KBOX_MCI_UC_S (MCI_STATUS_UC | MCI_STATUS_S) +#define KBOX_MCI_UC_SAR (MCI_STATUS_UC | MCI_STATUS_S | MCI_STATUS_AR) +#define KBOX_MCACOD 0xffff + +KBOX_BITCLR(MCI_STATUS_VAL, NO, "Invalid"), +KBOX_BITCLR(MCI_STATUS_EN, NO, "Not enabled"), +KBOX_BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), + +KBOX_MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), + +KBOX_MCGMASK(MCG_STATUS_RIPV | MCG_STATUS_EIPV, 0, PANIC, + "Neither restart nor error IP"), +KBOX_MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", + KBOX_KERNEL), +KBOX_BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", KBOX_NOSER), +KBOX_MASK(MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN, MCI_STATUS_UC, SOME, + "Spurious not enabled", KBOX_SER), + +KBOX_MASK(KBOX_MCI_UC_SAR, MCI_STATUS_UC, KEEP, + "Uncorrected no action required", KBOX_SER), +KBOX_MASK(MCI_STATUS_OVER | KBOX_MCI_UC_SAR, MCI_STATUS_UC | MCI_STATUS_AR, + PANIC, "Illegal combination (UCNA with AR=1)", KBOX_SER), +KBOX_MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", KBOX_SER), + +KBOX_MASK(MCI_STATUS_OVER | KBOX_MCI_UC_SAR, MCI_STATUS_OVER | KBOX_MCI_UC_SAR, + PANIC, "Action required with lost events", KBOX_SER), +KBOX_MASK(MCI_STATUS_OVER | KBOX_MCI_UC_SAR | KBOX_MCACOD, KBOX_MCI_UC_SAR, + PANIC, "Action required; unknown MCACOD", KBOX_SER), + +KBOX_MASK(KBOX_MCI_UC_SAR | MCI_STATUS_OVER | 0xfff0, KBOX_MCI_UC_S | 0xc0, + AO, "Action optional: memory scrubbing error", KBOX_SER), +KBOX_MASK(KBOX_MCI_UC_SAR | MCI_STATUS_OVER | KBOX_MCACOD, + KBOX_MCI_UC_S | 0x17a, AO, + "Action optional: last level cache writeback error", KBOX_SER), + +KBOX_MASK(MCI_STATUS_OVER | KBOX_MCI_UC_SAR, KBOX_MCI_UC_S, SOME, + "Action optional unknown MCACOD", KBOX_SER), +KBOX_MASK(MCI_STATUS_OVER | KBOX_MCI_UC_SAR, KBOX_MCI_UC_S | MCI_STATUS_OVER, + SOME, "Action optional with lost events", KBOX_SER), +KBOX_BITSET(MCI_STATUS_UC | MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), +KBOX_BITSET(MCI_STATUS_UC, UC, "Uncorrected"), +KBOX_BITSET(0, SOME, "No match") +}; + +static unsigned int g_kbox_nr_mce_banks; +static unsigned int g_kbox_mce_ser; +static atomic_t g_mce_dump_state = ATOMIC_INIT(0); + +static int kbox_mce_severity(u64 mcgstatus, u64 status) +{ + struct severity *s; + + for (s = kbox_severities;; s++) { + if ((status & s->kbox_mask) != s->kbox_result) + continue; + + if ((mcgstatus & s->kbox_mcgmask) != s->kbox_mcgres) + continue; + + if (s->kbox_ser == KBOX_SER_REQUIRED && !g_kbox_mce_ser) + continue; + + if (s->kbox_ser == KBOX_NO_SER && g_kbox_mce_ser) + continue; + + break; + } + + return s->kbox_sev; +} + +static u64 kbox_mce_rdmsrl(u32 ulmsr) +{ + u64 ullv = 0; + + if (rdmsrl_safe(ulmsr, &ullv)) { + (void)kbox_dump_painc_info("mce: Unable to read msr %d!\n", + ulmsr); + ullv = 0; + } + + return ullv; +} + +static int kbox_intel_machine_check(void) +{ + unsigned int idx = 0; + u64 mcgstatus = 0; + int worst = 0; + + mcgstatus = kbox_mce_rdmsrl(MSR_IA32_MCG_STATUS); + + (void) + kbox_dump_painc_info + ("CPU %d: Machine Check Exception MCG STATUS: 0x%016llx\n", + smp_processor_id(), mcgstatus); + + if (!(mcgstatus & MCG_STATUS_RIPV)) + (void)kbox_dump_painc_info("Unable to continue\n"); + + for (idx = 0; idx < g_kbox_nr_mce_banks; idx++) { + u64 status = 0; + u64 misc = 0; + u64 addr = 0; + int lseverity = 0; + + status = kbox_mce_rdmsrl(MSR_IA32_MCx_STATUS(idx)); + + (void)kbox_dump_painc_info("Bank %d STATUS: 0x%016llx\n", idx, + status); + + if (0 == (status & MCI_STATUS_VAL)) + continue; + + lseverity = kbox_mce_severity(mcgstatus, status); + if (lseverity == KBOX_MCE_KEEP_SEVERITY || + lseverity == KBOX_MCE_NO_SEVERITY) + continue; + + (void)kbox_dump_painc_info("severity = %d\n", lseverity); + + if (status & MCI_STATUS_MISCV) { + misc = kbox_mce_rdmsrl(MSR_IA32_MCx_MISC(idx)); + (void)kbox_dump_painc_info("misc = 0x%016llx\n", misc); + } + + if (status & MCI_STATUS_ADDRV) { + addr = kbox_mce_rdmsrl(MSR_IA32_MCx_ADDR(idx)); + (void)kbox_dump_painc_info("addr = 0x%016llx\n", addr); + } + + (void)kbox_dump_painc_info("\n"); + + if (lseverity > worst) + worst = lseverity; + } + + if (worst >= KBOX_MCE_UC_SEVERITY) + return KBOX_FALSE; + + (void)kbox_dump_painc_info("Attempting to continue.\n"); + + return KBOX_TRUE; +} + +int kbox_handle_mce_dump(const char *msg) +{ + int mce_recoverable = KBOX_FALSE; + + atomic_read(&g_mce_dump_state); + + mce_recoverable = kbox_intel_machine_check(); + if (mce_recoverable != KBOX_TRUE) { + static atomic_t mce_entry_tmp; + int flag = atomic_add_return(1, &mce_entry_tmp); + + if (flag != 1) + return KBOX_FALSE; + } + + atomic_set(&g_mce_dump_state, DUMPSTATE_MCE_RESET); + + if (msg) { + (void) + kbox_dump_painc_info + ("-------[ System may reset by %s! ]-------\n\n", msg); + } + + return KBOX_TRUE; +} + +int kbox_mce_init(void) +{ + u64 cap = 0; + + cap = kbox_mce_rdmsrl(MSR_IA32_MCG_CAP); + g_kbox_nr_mce_banks = cap & MCG_BANKCNT_MASK; + + if (cap & MCG_SER_P) + g_kbox_mce_ser = 1; + + KBOX_MSG("get nr_mce_banks:%d, g_kbox_mce_ser = %d, cap = 0x%016llx\n", + g_kbox_nr_mce_banks, g_kbox_mce_ser, cap); + + return KBOX_TRUE; +} + +void kbox_mce_exit(void) +{ + g_kbox_nr_mce_banks = 0; + g_kbox_mce_ser = 0; +} diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_mce.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_mce.h new file mode 100644 index 000000000000..00d3b787c140 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_mce.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_MCE_H_ +#define _KBOX_MCE_H_ + +int kbox_handle_mce_dump(const char *msg); +int kbox_mce_init(void); +void kbox_mce_exit(void); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_panic.c b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_panic.c new file mode 100644 index 000000000000..0c17cd2bae49 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_panic.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <asm/types.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/err.h> +#include "kbox_include.h" +#include "kbox_panic.h" +#include "kbox_ram_op.h" + +#ifdef CONFIG_X86 +#include <asm/msr.h> +#endif + +#define PANIC_TMP_BUF_SIZE 256 + +static int g_panic_init_ok = KBOX_FALSE; + +static char *g_panic_info_buf_tmp; +static char *g_panic_info_buf; + +static unsigned int g_panic_info_start; + +static unsigned int g_panic_info_end; + +static unsigned int g_panic_info_len; + +static DEFINE_SPINLOCK(g_panic_buf_lock); + +static void kbox_emit_syslog_char(const char c) +{ + if (unlikely(!g_panic_info_buf)) + return; + + *(g_panic_info_buf + (g_panic_info_end % SLOT_LENGTH)) = c; + g_panic_info_end++; + + if (g_panic_info_end > SLOT_LENGTH) + g_panic_info_start++; + + if (g_panic_info_len < SLOT_LENGTH) + g_panic_info_len++; +} + +static int kbox_duplicate_syslog_info(const char *syslog_buf, + unsigned int buf_len) +{ + unsigned int idx = 0; + unsigned long flags = 0; + + if (!syslog_buf) + return 0; + + spin_lock_irqsave(&g_panic_buf_lock, flags); + + for (idx = 0; idx < buf_len; idx++) + kbox_emit_syslog_char(*syslog_buf++); + + spin_unlock_irqrestore(&g_panic_buf_lock, flags); + + return buf_len; +} + +int kbox_dump_painc_info(const char *fmt, ...) +{ + va_list args; + int num = 0; + char tmp_buf[PANIC_TMP_BUF_SIZE] = { }; + + va_start(args, fmt); + + num = vsnprintf(tmp_buf, sizeof(tmp_buf) - 1, fmt, args); + if (num >= 0) + (void)kbox_duplicate_syslog_info(tmp_buf, num); + + va_end(args); + + return num; +} + +void kbox_output_syslog_info(void) +{ + unsigned int start_tmp = 0; + unsigned int end_tmp = 0; + unsigned int len_tmp = 0; + unsigned long flags = 0; + + if (unlikely + (!g_panic_info_buf || !g_panic_info_buf_tmp)) + return; + + spin_lock_irqsave(&g_panic_buf_lock, flags); + if (g_panic_info_len == 0) { + spin_unlock_irqrestore(&g_panic_buf_lock, flags); + return; + } + + start_tmp = (g_panic_info_start % SLOT_LENGTH); + end_tmp = ((g_panic_info_end - 1) % SLOT_LENGTH); + len_tmp = g_panic_info_len; + + if (start_tmp > end_tmp) { + memcpy(g_panic_info_buf_tmp, + (g_panic_info_buf + start_tmp), + len_tmp - start_tmp); + memcpy((g_panic_info_buf_tmp + len_tmp - start_tmp), + g_panic_info_buf, + end_tmp + 1); + } else { + memcpy(g_panic_info_buf_tmp, + (char *)(g_panic_info_buf + start_tmp), + len_tmp); + } + + spin_unlock_irqrestore(&g_panic_buf_lock, flags); + + (void)kbox_write_panic_info(g_panic_info_buf_tmp, len_tmp); +} + +int kbox_panic_init(void) +{ + int ret = KBOX_TRUE; + + g_panic_info_buf = kmalloc(SLOT_LENGTH, GFP_KERNEL); + if (IS_ERR(g_panic_info_buf) || !g_panic_info_buf) { + KBOX_MSG("kmalloc g_panic_info_buf fail!\n"); + ret = -ENOMEM; + goto fail; + } + + memset(g_panic_info_buf, 0, SLOT_LENGTH); + + g_panic_info_buf_tmp = kmalloc(SLOT_LENGTH, GFP_KERNEL); + if (IS_ERR(g_panic_info_buf_tmp) || !g_panic_info_buf_tmp) { + KBOX_MSG("kmalloc g_panic_info_buf_tmp fail!\n"); + ret = -ENOMEM; + goto fail; + } + + memset(g_panic_info_buf_tmp, 0, SLOT_LENGTH); + + g_panic_init_ok = KBOX_TRUE; + + return ret; +fail: + + kfree(g_panic_info_buf); + g_panic_info_buf = NULL; + + kfree(g_panic_info_buf_tmp); + g_panic_info_buf_tmp = NULL; + + return ret; +} + +void kbox_panic_exit(void) +{ + if (g_panic_init_ok != KBOX_TRUE) + return; + + kfree(g_panic_info_buf); + g_panic_info_buf = NULL; + + kfree(g_panic_info_buf_tmp); + g_panic_info_buf_tmp = NULL; +} + +int kbox_handle_panic_dump(const char *msg) +{ + if (msg) + (void)kbox_dump_painc_info("panic string: %s\n", msg); + + return KBOX_TRUE; +} diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_panic.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_panic.h new file mode 100644 index 000000000000..5715b66c0659 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_panic.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_PANIC_H_ +#define _KBOX_PANIC_H_ + +int kbox_handle_panic_dump(const char *msg); +void kbox_output_syslog_info(void); +int kbox_dump_painc_info(const char *fmt, ...); +int kbox_panic_init(void); +void kbox_panic_exit(void); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_printk.c b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_printk.c new file mode 100644 index 000000000000..3b04ba206108 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_printk.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/spinlock.h> +#include <linux/console.h> /* struct console */ +#include <linux/slab.h> +#include <linux/err.h> +#include "kbox_include.h" +#include "kbox_main.h" +#include "kbox_printk.h" +#include "kbox_ram_image.h" +#include "kbox_ram_op.h" + +#define TMP_BUF_SIZE 256 + +static int g_printk_init_ok = KBOX_FALSE; +static char *g_printk_info_buf; +static char *g_printk_info_buf_tmp; +static struct printk_ctrl_block_tmp_s g_printk_ctrl_block_tmp = { }; + +static DEFINE_SPINLOCK(g_printk_buf_lock); + +static void kbox_printk_info_write(struct console *console, + const char *printk_buf, + unsigned int buf_len); + +static struct console g_printk_console = { + .name = "k_prtk", + .flags = CON_ENABLED | CON_PRINTBUFFER, + .write = kbox_printk_info_write, +}; + +static int kbox_printk_format_is_order(struct printk_info_ctrl_block_s * + printk_ctrl_blk_first, + struct printk_info_ctrl_block_s * + printk_ctrl_blk_second) +{ + if (!printk_ctrl_blk_first || !printk_ctrl_blk_second) + return KBOX_FALSE; + + if (!memcmp(printk_ctrl_blk_first->flag, PRINTK_CURR_FLAG, + PRINTK_FLAG_LEN) && + !memcmp(printk_ctrl_blk_second->flag, PRINTK_LAST_FLAG, + PRINTK_FLAG_LEN)) { + return KBOX_TRUE; + } + + return KBOX_FALSE; +} + +static void kbox_printk_format(struct printk_info_ctrl_block_s *printk_ctrl_blk, + const unsigned int len, const char *flag) +{ + if (!printk_ctrl_blk || !flag) + return; + + memset(printk_ctrl_blk, 0, len); + memcpy(printk_ctrl_blk->flag, flag, PRINTK_FLAG_LEN); +} + +static void kbox_printk_init_info_first + (struct image_super_block_s *kbox_super_block) +{ + KBOX_MSG("\n"); + if (kbox_printk_format_is_order(kbox_super_block->printk_ctrl_blk, + kbox_super_block->printk_ctrl_blk + + 1) == KBOX_TRUE) { + memcpy(kbox_super_block->printk_ctrl_blk[0].flag, + PRINTK_LAST_FLAG, PRINTK_FLAG_LEN); + memcpy(kbox_super_block->printk_ctrl_blk[1].flag, + PRINTK_CURR_FLAG, PRINTK_FLAG_LEN); + kbox_super_block->printk_ctrl_blk[1].len = 0; + g_printk_ctrl_block_tmp.printk_region = 1; + g_printk_ctrl_block_tmp.section = KBOX_SECTION_PRINTK2; + (void)kbox_clear_region(KBOX_SECTION_PRINTK2); + } else if (kbox_printk_format_is_order + (kbox_super_block->printk_ctrl_blk + 1, + kbox_super_block->printk_ctrl_blk) == KBOX_TRUE) { + memcpy(kbox_super_block->printk_ctrl_blk[1].flag, + PRINTK_LAST_FLAG, + PRINTK_FLAG_LEN); + memcpy(kbox_super_block->printk_ctrl_blk[0].flag, + PRINTK_CURR_FLAG, + PRINTK_FLAG_LEN); + + kbox_super_block->printk_ctrl_blk[0].len = 0; + g_printk_ctrl_block_tmp.printk_region = 0; + g_printk_ctrl_block_tmp.section = KBOX_SECTION_PRINTK1; + (void)kbox_clear_region(KBOX_SECTION_PRINTK1); + } else { + kbox_printk_format(kbox_super_block->printk_ctrl_blk, + sizeof(struct printk_info_ctrl_block_s), + PRINTK_CURR_FLAG); + kbox_printk_format(kbox_super_block->printk_ctrl_blk + 1, + sizeof(struct printk_info_ctrl_block_s), + PRINTK_LAST_FLAG); + g_printk_ctrl_block_tmp.printk_region = 0; + g_printk_ctrl_block_tmp.section = KBOX_SECTION_PRINTK1; + (void)kbox_clear_region(KBOX_SECTION_PRINTK1); + (void)kbox_clear_region(KBOX_SECTION_PRINTK2); + } + + g_printk_ctrl_block_tmp.start = 0; + g_printk_ctrl_block_tmp.end = 0; + g_printk_ctrl_block_tmp.valid_len = 0; +} + +static void kbox_printk_init_info_not_first + (struct image_super_block_s *kbox_super_block) +{ + KBOX_MSG("\n"); + if (KBOX_TRUE == + kbox_printk_format_is_order(kbox_super_block->printk_ctrl_blk, + kbox_super_block->printk_ctrl_blk + + 1)) { + g_printk_ctrl_block_tmp.printk_region = 0; + g_printk_ctrl_block_tmp.section = KBOX_SECTION_PRINTK1; + + } else if (KBOX_TRUE == + kbox_printk_format_is_order + (kbox_super_block->printk_ctrl_blk + 1, + kbox_super_block->printk_ctrl_blk)) { + g_printk_ctrl_block_tmp.printk_region = 1; + g_printk_ctrl_block_tmp.section = KBOX_SECTION_PRINTK2; + + } else { + kbox_printk_format(kbox_super_block->printk_ctrl_blk, + sizeof(struct printk_info_ctrl_block_s), + PRINTK_CURR_FLAG); + kbox_printk_format(kbox_super_block->printk_ctrl_blk + 1, + sizeof(struct printk_info_ctrl_block_s), + PRINTK_LAST_FLAG); + g_printk_ctrl_block_tmp.printk_region = 0; + g_printk_ctrl_block_tmp.section = KBOX_SECTION_PRINTK1; + (void)kbox_clear_region(KBOX_SECTION_PRINTK1); + (void)kbox_clear_region(KBOX_SECTION_PRINTK2); + } + + g_printk_ctrl_block_tmp.start = 0; +} + +static int kbox_printk_init_info(int kbox_proc_exist) +{ + struct image_super_block_s kbox_super_block = { }; + unsigned int read_len = 0; + unsigned int write_len = 0; + + read_len = + kbox_read_from_ram(SECTION_KERNEL_OFFSET, + (unsigned int)sizeof(struct image_super_block_s), + (char *)&kbox_super_block, KBOX_SECTION_KERNEL); + if (read_len != sizeof(struct image_super_block_s)) { + KBOX_MSG("fail to get superblock data!\n"); + return KBOX_FALSE; + } + + if (kbox_proc_exist) { + kbox_printk_init_info_not_first(&kbox_super_block); + if (KBOX_TRUE != + kbox_read_printk_info(g_printk_info_buf, + &g_printk_ctrl_block_tmp)) { + g_printk_ctrl_block_tmp.end = 0; + g_printk_ctrl_block_tmp.valid_len = 0; + } + } else { + kbox_printk_init_info_first(&kbox_super_block); + } + + kbox_super_block.checksum = 0; + kbox_super_block.checksum = + ~((unsigned char) + kbox_checksum((char *)&kbox_super_block, + (unsigned int)sizeof(kbox_super_block))) + 1; + write_len = + kbox_write_to_ram(SECTION_KERNEL_OFFSET, + (unsigned int)sizeof(struct image_super_block_s), + (char *)&kbox_super_block, KBOX_SECTION_KERNEL); + if (write_len <= 0) { + KBOX_MSG("fail to write superblock data!\n"); + return KBOX_FALSE; + } + + return KBOX_TRUE; +} + +void kbox_output_printk_info(void) +{ + unsigned int start_tmp = 0; + unsigned int end_tmp = 0; + unsigned int len_tmp = 0; + unsigned long flags = 0; + + if (unlikely(!g_printk_info_buf || !g_printk_info_buf_tmp)) + return; + + if (g_printk_init_ok != KBOX_TRUE) + return; + + spin_lock_irqsave(&g_printk_buf_lock, flags); + if (g_printk_ctrl_block_tmp.valid_len == 0) { + spin_unlock_irqrestore(&g_printk_buf_lock, flags); + return; + } + + start_tmp = (g_printk_ctrl_block_tmp.start % SECTION_PRINTK_LEN); + end_tmp = ((g_printk_ctrl_block_tmp.end - 1) % SECTION_PRINTK_LEN); + len_tmp = g_printk_ctrl_block_tmp.valid_len; + + if (start_tmp > end_tmp) { + memcpy(g_printk_info_buf_tmp, + g_printk_info_buf + start_tmp, + len_tmp - start_tmp); + memcpy(g_printk_info_buf_tmp + len_tmp - start_tmp, + g_printk_info_buf, + end_tmp + 1); + } else { + memcpy(g_printk_info_buf_tmp, + g_printk_info_buf + start_tmp, + len_tmp); + } + + spin_unlock_irqrestore(&g_printk_buf_lock, flags); + + (void)kbox_write_printk_info(g_printk_info_buf_tmp, + &g_printk_ctrl_block_tmp); +} + +static void kbox_emit_printk_char(const char c) +{ + if (unlikely(!g_printk_info_buf)) + return; + + *(g_printk_info_buf + + (g_printk_ctrl_block_tmp.end % SECTION_PRINTK_LEN)) = c; + g_printk_ctrl_block_tmp.end++; + + if (g_printk_ctrl_block_tmp.end > SECTION_PRINTK_LEN) + g_printk_ctrl_block_tmp.start++; + + if (g_printk_ctrl_block_tmp.end < SECTION_PRINTK_LEN) + g_printk_ctrl_block_tmp.valid_len++; +} + +static int kbox_duplicate_printk_info(const char *printk_buf, + unsigned int buf_len) +{ + unsigned int idx = 0; + unsigned long flags = 0; + + spin_lock_irqsave(&g_printk_buf_lock, flags); + for (idx = 0; idx < buf_len; idx++) + kbox_emit_printk_char(*printk_buf++); + + spin_unlock_irqrestore(&g_printk_buf_lock, flags); + + return buf_len; +} + +int kbox_dump_printk_info(const char *fmt, ...) +{ + va_list args; + int num = 0; + char tmp_buf[TMP_BUF_SIZE] = { }; + + if (g_printk_init_ok != KBOX_TRUE) + return 0; + + va_start(args, fmt); + num = vsnprintf(tmp_buf, sizeof(tmp_buf) - 1, fmt, args); + if (num >= 0) + (void)kbox_duplicate_printk_info(tmp_buf, num); + + va_end(args); + + return num; +} + +static void kbox_printk_info_write(struct console *pconsole, + const char *printk_buf, unsigned int buf_len) +{ + UNUSED(pconsole); + + if (unlikely(!printk_buf)) + return; + + (void)kbox_duplicate_printk_info(printk_buf, buf_len); +} + +int kbox_printk_init(int kbox_proc_exist) +{ + int ret = KBOX_TRUE; + + g_printk_info_buf = kmalloc(SECTION_PRINTK_LEN, + GFP_KERNEL); + if (IS_ERR(g_printk_info_buf) || !g_printk_info_buf) { + KBOX_MSG("kmalloc g_printk_info_buf fail!\n"); + ret = -ENOMEM; + goto fail; + } + + memset(g_printk_info_buf, 0, SECTION_PRINTK_LEN); + + g_printk_info_buf_tmp = kmalloc(SECTION_PRINTK_LEN, + GFP_KERNEL); + if (IS_ERR(g_printk_info_buf_tmp) || !g_printk_info_buf_tmp) { + KBOX_MSG("kmalloc g_printk_info_buf_tmp fail!\n"); + ret = -ENOMEM; + goto fail; + } + + memset(g_printk_info_buf_tmp, 0, SECTION_PRINTK_LEN); + + ret = kbox_printk_init_info(kbox_proc_exist); + if (ret != KBOX_TRUE) { + KBOX_MSG("kbox_printk_init_info failed!\n"); + goto fail; + } + + register_console(&g_printk_console); + + g_printk_init_ok = KBOX_TRUE; + + return ret; +fail: + + kfree(g_printk_info_buf); + g_printk_info_buf = NULL; + + kfree(g_printk_info_buf_tmp); + g_printk_info_buf_tmp = NULL; + + return ret; +} + +void kbox_printk_exit(void) +{ + int ret = 0; + + if (g_printk_init_ok != KBOX_TRUE) + return; + + kfree(g_printk_info_buf); + g_printk_info_buf = NULL; + + kfree(g_printk_info_buf_tmp); + g_printk_info_buf_tmp = NULL; + + ret = unregister_console(&g_printk_console); + if (ret) + KBOX_MSG("unregister_console failed!\n"); +} diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_printk.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_printk.h new file mode 100644 index 000000000000..cece825626a8 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_printk.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_PRINTK_H_ +#define _KBOX_PRINTK_H_ +#include "kbox_ram_image.h" + +struct printk_ctrl_block_tmp_s { + int printk_region; + enum kbox_section_e section; + unsigned int start; + unsigned int end; + unsigned int valid_len;/* valid length of printk section */ +}; + +int kbox_printk_init(int kbox_proc_exist); +void kbox_output_printk_info(void); +int kbox_dump_printk_info(const char *fmt, ...); +void kbox_printk_exit(void); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_drive.c b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_drive.c new file mode 100644 index 000000000000..829e2a498843 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_drive.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> /* everything... */ +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <asm/ioctls.h> +#include <linux/slab.h> +#include "kbox_include.h" +#include "kbox_ram_drive.h" +#include "kbox_main.h" +#include "kbox_ram_image.h" +#include "kbox_ram_op.h" + +#define KBOX_DEVICE_NAME "kbox" +#define KBOX_DEVICE_MINOR 255 + +static struct kbox_dev_s *g_kbox_dev; +static ssize_t kbox_read(struct file *filp, char __user *data, size_t count, + loff_t *ppos); +static ssize_t kbox_write(struct file *filp, const char __user *data, + size_t count, loff_t *ppos); + +static long kbox_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +static int kbox_mmap(struct file *filp, struct vm_area_struct *vma); +static int kbox_open(struct inode *inode, struct file *filp); +static int kbox_release(struct inode *inode, struct file *filp); + +const struct file_operations kbox_fops = { + .owner = THIS_MODULE, + .read = kbox_read, + .write = kbox_write, + .unlocked_ioctl = kbox_ioctl, + .mmap = kbox_mmap, + .open = kbox_open, + .release = kbox_release, +}; + +static struct miscdevice kbox_device = { + KBOX_DEVICE_MINOR, + KBOX_DEVICE_NAME, + &kbox_fops, +}; + +static ssize_t kbox_read(struct file *filp, char __user *data, size_t count, + loff_t *ppos) +{ + int read_len = 0; + + if (!filp || !data || !ppos) { + KBOX_MSG("input NULL point!\n"); + return -EFAULT; + } + + read_len = kbox_read_op((long long)(*ppos), + count, + data, + KBOX_SECTION_USER); + if (read_len < 0) + return -EFAULT; + + *ppos += read_len; + + return read_len; +} + +static ssize_t kbox_write(struct file *filp, const char __user *data, + size_t count, loff_t *ppos) +{ + int write_len = 0; + + if (!filp || !data || !ppos) { + KBOX_MSG("input NULL point!\n"); + return -EFAULT; + } + + write_len = kbox_write_op((long long)(*ppos), + count, data, KBOX_SECTION_USER); + if (write_len < 0) + return -EFAULT; + + *ppos += write_len; + + return write_len; +} + +static long kbox_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + UNUSED(filp); + + if (kbox_ioctl_detail(cmd, arg) < 0) + return -ENOTTY; + + return 0; +} + +static int kbox_mmap(struct file *filp, struct vm_area_struct *vma) +{ + if (!filp || !vma) { + KBOX_MSG("input NULL point!\n"); + return -EFAULT; + } + + if (kbox_mmap_ram(filp, vma, KBOX_SECTION_USER) < 0) + return -EFAULT; + + return 0; +} + +static int kbox_open(struct inode *pinode, struct file *filp) +{ + UNUSED(pinode); + + if ((g_kbox_dev) && (!atomic_dec_and_test(&g_kbox_dev->au_count))) { + atomic_inc(&g_kbox_dev->au_count); + KBOX_MSG("EBUSY\n"); + return -EBUSY; + } + + filp->private_data = (void *)g_kbox_dev; + + return 0; +} + +int kbox_release(struct inode *pinode, struct file *filp) +{ + struct kbox_dev_s *kbox_dev = (struct kbox_dev_s *)filp->private_data; + + UNUSED(pinode); + + KBOX_MSG("\n"); + + if (kbox_dev) + atomic_inc(&kbox_dev->au_count); + + return 0; +} + +int kbox_drive_init(void) +{ + int ret = 0; + + KBOX_MSG("\n"); + + g_kbox_dev = + kmalloc(sizeof(struct kbox_dev_s), GFP_KERNEL); + if (!g_kbox_dev) + return -ENOMEM; + + ret = misc_register(&kbox_device); + if (ret) + goto fail; + + atomic_set(&g_kbox_dev->au_count, 1); + + KBOX_MSG("ok!\n"); + + return ret; + +fail: + kfree(g_kbox_dev); + g_kbox_dev = NULL; + + return ret; +} + +void kbox_drive_cleanup(void) +{ + if (!g_kbox_dev) + return; + + misc_deregister(&kbox_device); + + kfree(g_kbox_dev); + g_kbox_dev = NULL; +} diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_drive.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_drive.h new file mode 100644 index 000000000000..52707c4b82c5 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_drive.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_RAM_DRIVE_H_ +#define _KBOX_RAM_DRIVE_H_ + +#include <linux/types.h> +#include <linux/atomic.h> + +struct kbox_dev_s { + atomic_t au_count; + + struct kbox_pci_dev_s *kbox_pci_dev; +}; + +int kbox_drive_init(void); +void kbox_drive_cleanup(void); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_image.c b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_image.c new file mode 100644 index 000000000000..f57083261983 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_image.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "kbox_include.h" +#include "kbox_main.h" +#include "kbox_ram_image.h" + +void __iomem *kbox_get_section_addr(enum kbox_section_e kbox_section) +{ + void __iomem *kbox_addr = kbox_get_base_addr(); + unsigned long kbox_len = kbox_get_io_len(); + + if (!kbox_addr || kbox_len == 0) { + KBOX_MSG("get kbox_addr or kbox_len failed!\n"); + return NULL; + } + + switch (kbox_section) { + case KBOX_SECTION_KERNEL: + return kbox_addr; + + case KBOX_SECTION_PANIC: + return kbox_addr + SECTION_KERNEL_LEN; + + case KBOX_SECTION_THREAD: + return kbox_addr + SECTION_KERNEL_LEN + SECTION_PANIC_LEN; + + case KBOX_SECTION_PRINTK1: + return kbox_addr + (kbox_len - (2 * SECTION_PRINTK_LEN) - + SECTION_USER_LEN); + + case KBOX_SECTION_PRINTK2: + return kbox_addr + (kbox_len - SECTION_PRINTK_LEN - + SECTION_USER_LEN); + + case KBOX_SECTION_USER: + return kbox_addr + (kbox_len - SECTION_USER_LEN); + + case KBOX_SECTION_ALL: + return kbox_addr; + + default: + KBOX_MSG("input kbox_section error!\n"); + return NULL; + } +} + +unsigned long kbox_get_section_len(enum kbox_section_e kbox_section) +{ + unsigned long kbox_len = kbox_get_io_len(); + + if (kbox_len == 0) { + KBOX_MSG("get kbox_len failed!\n"); + return 0; + } + + switch (kbox_section) { + case KBOX_SECTION_KERNEL: + return SECTION_KERNEL_LEN; + + case KBOX_SECTION_PANIC: + return SECTION_PANIC_LEN; + + case KBOX_SECTION_THREAD: + return (kbox_len - (2 * SECTION_PRINTK_LEN) - + SECTION_USER_LEN - SECTION_KERNEL_LEN - + SECTION_PANIC_LEN); + + case KBOX_SECTION_PRINTK1: + case KBOX_SECTION_PRINTK2: + return SECTION_PRINTK_LEN; + + case KBOX_SECTION_USER: + return SECTION_USER_LEN; + + case KBOX_SECTION_ALL: + return kbox_len; + + default: + KBOX_MSG("input kbox_section error!\n"); + return 0; + } +} + +unsigned long kbox_get_section_phy_addr(enum kbox_section_e kbox_section) +{ + unsigned long kbox_phy_addr = kbox_get_base_phy_addr(); + unsigned long kbox_len = kbox_get_io_len(); + + if (kbox_phy_addr == 0 || kbox_len == 0) { + KBOX_MSG("get kbox_phy_addr or kbox_len failed!\n"); + return 0; + } + + switch (kbox_section) { + case KBOX_SECTION_KERNEL: + return kbox_phy_addr; + + case KBOX_SECTION_PANIC: + return kbox_phy_addr + SECTION_KERNEL_LEN; + + case KBOX_SECTION_THREAD: + return kbox_phy_addr + SECTION_KERNEL_LEN + SECTION_PANIC_LEN; + + case KBOX_SECTION_PRINTK1: + return kbox_phy_addr + (kbox_len - (2 * SECTION_PRINTK_LEN) - + SECTION_USER_LEN); + + case KBOX_SECTION_PRINTK2: + return kbox_phy_addr + (kbox_len - SECTION_PRINTK_LEN - + SECTION_USER_LEN); + + case KBOX_SECTION_USER: + return kbox_phy_addr + (kbox_len - SECTION_USER_LEN); + + case KBOX_SECTION_ALL: + return kbox_phy_addr; + + default: + KBOX_MSG("input kbox_section error!\n"); + return 0; + } +} diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_image.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_image.h new file mode 100644 index 000000000000..d1b01bd9ea11 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_image.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_RAM_IMAGE_H_ +#define _KBOX_RAM_IMAGE_H_ + +enum kbox_section_e { + KBOX_SECTION_KERNEL = 1, + KBOX_SECTION_PANIC = 2, + KBOX_SECTION_THREAD = 3, + KBOX_SECTION_PRINTK1 = 4, + KBOX_SECTION_PRINTK2 = 5, + KBOX_SECTION_USER = 6, + KBOX_SECTION_ALL = 7 +}; + +#define KBOX_BIG_ENDIAN (0x2B) +#define KBOX_LITTLE_ENDIAN (0xB2) +#define IMAGE_VER (0x0001) +#define IMAGE_MAGIC (0xB202C086) +#define VALID_IMAGE(x) (IMAGE_MAGIC == (x)->magic_flag) +#define SLOT_NUM (8) +#define SLOT_LENGTH (16 * 1024) +#define MAX_RECORD_NO (0xFF) +#define MAX_USE_NUMS (0xFF) + +#define PRINTK_NUM (2) +#define PRINTK_CURR_FLAG ("curr") +#define PRINTK_LAST_FLAG ("last") +#define PRINTK_FLAG_LEN (4) + +struct panic_ctrl_block_s { + unsigned char use_nums; + unsigned char number; + unsigned short len; + unsigned int time; +}; + +struct thread_info_ctrl_block_s { + unsigned int thread_info_len; +}; + +struct printk_info_ctrl_block_s { + unsigned char flag[PRINTK_FLAG_LEN]; + unsigned int len; +}; + +struct image_super_block_s { + unsigned char byte_order; + unsigned char checksum; + unsigned short version; + unsigned int magic_flag; + unsigned int panic_nums; + struct panic_ctrl_block_s panic_ctrl_blk[SLOT_NUM]; + struct printk_info_ctrl_block_s printk_ctrl_blk[PRINTK_NUM]; + struct thread_info_ctrl_block_s thread_ctrl_blk; +}; + +#define SECTION_KERNEL_LEN (sizeof(struct image_super_block_s)) +#define SECTION_PANIC_LEN (8 * SLOT_LENGTH) +#define SECTION_PRINTK_LEN (512 * 1024) +#define SECTION_USER_LEN (2 * 1024 * 1024) + +#define SECTION_KERNEL_OFFSET (0) +#define SECTION_PANIC_OFFSET SECTION_KERNEL_LEN +#define SECTION_THREAD_OFFSET (SECTION_KERNEL_LEN + SECTION_PANIC_LEN) + +void __iomem *kbox_get_section_addr(enum kbox_section_e kbox_section); +unsigned long kbox_get_section_len(enum kbox_section_e kbox_section); +unsigned long kbox_get_section_phy_addr(enum kbox_section_e kbox_section); + +#endif diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_op.c b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_op.c new file mode 100644 index 000000000000..49690bab1cef --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_op.c @@ -0,0 +1,986 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/version.h> +#include <linux/semaphore.h> +#include <linux/slab.h> +#include <linux/capability.h> +#include <linux/uaccess.h> /* copy_*_user */ +#include <linux/delay.h> /* udelay */ +#include <linux/mm.h> +#include "kbox_include.h" +#include "kbox_main.h" +#include "kbox_ram_image.h" +#include "kbox_ram_op.h" + +#ifndef VM_RESERVED +#define VM_RESERVED 0x00080000 +#endif + +static DEFINE_SPINLOCK(g_kbox_super_block_lock); +static DEFINE_SEMAPHORE(user_sem); + +union char_int_transfer_u { + int data_int; + char data_char[KBOX_RW_UNIT]; +}; + +static struct image_super_block_s g_kbox_super_block = { }; + +void kbox_write_to_pci(void __iomem *dest, const void *src, int len, + unsigned long offset) +{ + union char_int_transfer_u transfer = { }; + int idx = 0; + int j = 0; + int four_byte_len = 0; + int left_len = 0; + char *src_temp = (char *)src; + char *dest_temp = (char *)dest; + int first_write_num = 0; + + if ((offset % KBOX_RW_UNIT) != 0) { + transfer.data_int = + *(int *)(dest_temp + offset - (offset % KBOX_RW_UNIT)); + + rmb();/* memory barriers. */ + first_write_num = + ((len + (offset % KBOX_RW_UNIT)) > + KBOX_RW_UNIT) ? (KBOX_RW_UNIT - + (offset % KBOX_RW_UNIT)) : len; + for (idx = (int)(offset % KBOX_RW_UNIT); + idx < (int)(first_write_num + (offset % KBOX_RW_UNIT)); + idx++) { + if (!src_temp) + return; + + transfer.data_char[idx] = *src_temp; + src_temp++; + } + *(int *)(dest_temp + offset - (offset % KBOX_RW_UNIT)) = + transfer.data_int; + wmb();/* memory barriers. */ + len -= first_write_num; + offset += first_write_num; + } + + four_byte_len = (len / KBOX_RW_UNIT); + left_len = (len % KBOX_RW_UNIT); + for (idx = 0; idx < four_byte_len; idx++) { + for (j = 0; j < KBOX_RW_UNIT; j++) { + if (!src_temp) + return; + + transfer.data_char[j] = *src_temp; + src_temp++; + } + *(int *)(dest_temp + offset) = transfer.data_int; + wmb();/* memory barriers. */ + offset += KBOX_RW_UNIT; + } + + if (left_len != 0) { + transfer.data_int = *(int *)(dest_temp + offset); + rmb();/* memory barriers. */ + for (idx = 0; idx < left_len; idx++) { + if (!src_temp) + return; + + transfer.data_char[idx] = *src_temp; + src_temp++; + } + *(int *)(dest_temp + offset) = transfer.data_int; + wmb();/* memory barriers. */ + } + + udelay(1); +} + +void kbox_read_from_pci(void *dest, void __iomem *src, int len, + unsigned long offset) +{ + union char_int_transfer_u transfer = { }; + int idx = 0; + int j = 0; + int four_byte_len = 0; + int left_len = 0; + char *dest_temp = (char *)dest; + char *src_temp = (char *)src; + int first_read_num = 0; + + if ((offset % KBOX_RW_UNIT) != 0) { + transfer.data_int = + *(int *)(src_temp + offset - (offset % KBOX_RW_UNIT)); + first_read_num = + ((len + (offset % KBOX_RW_UNIT)) > + KBOX_RW_UNIT) ? (KBOX_RW_UNIT - + (offset % KBOX_RW_UNIT)) : len; + rmb();/* memory barriers. */ + for (idx = (int)(offset % KBOX_RW_UNIT); + idx < (int)(first_read_num + (offset % KBOX_RW_UNIT)); + idx++) { + if (!dest_temp) + return; + + *dest_temp = transfer.data_char[idx]; + dest_temp++; + } + len -= first_read_num; + offset += first_read_num; + } + + four_byte_len = (len / KBOX_RW_UNIT); + left_len = (len % KBOX_RW_UNIT); + for (idx = 0; idx < four_byte_len; idx++) { + transfer.data_int = *(int *)(src_temp + offset); + rmb();/* memory barriers. */ + for (j = 0; j < KBOX_RW_UNIT; j++) { + if (!dest_temp) + return; + + *dest_temp = transfer.data_char[j]; + dest_temp++; + } + offset += KBOX_RW_UNIT; + } + + if (left_len != 0) { + transfer.data_int = *(int *)(src_temp + offset); + rmb();/* memory barriers. */ + for (idx = 0; idx < left_len; idx++) { + if (!dest_temp) + return; + + *dest_temp = transfer.data_char[idx]; + dest_temp++; + } + } +} + +void kbox_memset_pci(void __iomem *dest, const char set_byte, int len, + unsigned long offset) +{ + union char_int_transfer_u transfer = { }; + int idx = 0; + int four_byte_len = 0; + int left_len = 0; + char *dest_temp = (char *)dest; + int first_memset_num = 0; + + if ((offset % KBOX_RW_UNIT) != 0) { + transfer.data_int = + *(int *)(dest_temp + offset - (offset % KBOX_RW_UNIT)); + rmb();/* memory barriers. */ + first_memset_num = + ((len + (offset % KBOX_RW_UNIT)) > + KBOX_RW_UNIT) ? (KBOX_RW_UNIT - + (offset % KBOX_RW_UNIT)) : len; + for (idx = (int)(offset % KBOX_RW_UNIT); + idx < (int)(first_memset_num + (offset % KBOX_RW_UNIT)); + idx++) { + transfer.data_char[idx] = set_byte; + } + *(int *)(dest_temp + offset - (offset % KBOX_RW_UNIT)) = + transfer.data_int; + wmb();/* memory barriers. */ + len -= first_memset_num; + offset += first_memset_num; + } + + four_byte_len = (len / KBOX_RW_UNIT); + left_len = (len % KBOX_RW_UNIT); + for (idx = 0; idx < KBOX_RW_UNIT; idx++) + transfer.data_char[idx] = set_byte; + + for (idx = 0; idx < four_byte_len; idx++) { + *(int *)(dest_temp + offset) = transfer.data_int; + wmb();/* memory barriers. */ + offset += KBOX_RW_UNIT; + } + + if (left_len != 0) { + transfer.data_int = *(int *)(dest_temp + offset); + rmb();/* memory barriers. */ + for (idx = 0; idx < left_len; idx++) + transfer.data_char[idx] = set_byte; + + *(int *)(dest_temp + offset) = transfer.data_int; + wmb();/* memory barriers. */ + } + + udelay(1); +} + +int kbox_read_from_ram(unsigned long offset, unsigned int count, char *data, + enum kbox_section_e section) +{ + unsigned int read_len_total = count; + unsigned long offset_temp = offset; + void __iomem *kbox_section_addr = kbox_get_section_addr(section); + unsigned long kbox_section_len = kbox_get_section_len(section); + unsigned int read_len_real = 0; + + if (!data) { + KBOX_MSG("input NULL point!\n"); + return -EFAULT; + } + + if (!kbox_section_addr || kbox_section_len == 0) { + KBOX_MSG("get kbox_section_addr or kbox_section_len failed!\n"); + return -EFAULT; + } + + if (offset >= kbox_section_len) { + KBOX_MSG("input offset is error!\n"); + return -EFAULT; + } + + if ((offset + count) > kbox_section_len) + read_len_total = (unsigned int)(kbox_section_len - offset); + + while (1) { + unsigned int read_bytes = 0; + + if (read_len_real >= count) + break; + + read_bytes = + (read_len_total > + TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : read_len_total; + + kbox_read_from_pci(data, kbox_section_addr, read_bytes, + offset_temp); + + read_len_total -= read_bytes; + read_len_real += read_bytes; + data += read_bytes; + offset_temp += read_bytes; + } + + return (int)read_len_real; +} + +int kbox_write_to_ram(unsigned long offset, unsigned int count, + const char *data, enum kbox_section_e section) +{ + unsigned int write_len_total = count; + unsigned long offset_temp = offset; + void __iomem *kbox_section_addr = kbox_get_section_addr(section); + unsigned long kbox_section_len = kbox_get_section_len(section); + unsigned int write_len_real = 0; + + if (!data) { + KBOX_MSG("input NULL point!\n"); + return -EFAULT; + } + + if (!kbox_section_addr || kbox_section_len == 0) { + KBOX_MSG("get kbox_section_addr or kbox_section_len failed!\n"); + return -EFAULT; + } + + if (offset >= kbox_section_len) { + KBOX_MSG("input offset is error!\n"); + return -EFAULT; + } + + if ((offset + count) > kbox_section_len) + write_len_total = (unsigned int)(kbox_section_len - offset); + + KBOX_MSG("struct image_super_block_s = %x\n", count); + while (1) { + unsigned int write_bytes = 0; + + if (write_len_real >= count) { + KBOX_MSG("write_len_real = %x\n", write_len_real); + break; + } + KBOX_MSG("write_len_total = %x\n", write_len_total); + + write_bytes = + (write_len_total > + TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : write_len_total; + KBOX_MSG("write_bytes = %x\n", write_bytes); + + kbox_write_to_pci(kbox_section_addr, data, write_bytes, + offset_temp); + + write_len_total -= write_bytes; + write_len_real += write_bytes; + data += write_bytes; + offset_temp += write_bytes; + } + + return (int)write_len_real; +} + +int kbox_memset_ram(unsigned long offset, unsigned int count, + const char set_byte, enum kbox_section_e section) +{ + unsigned int memset_len = count; + void __iomem *kbox_section_addr = kbox_get_section_addr(section); + unsigned long kbox_section_len = kbox_get_section_len(section); + + if (!kbox_section_addr || kbox_section_len == 0) { + KBOX_MSG("get kbox_section_addr or kbox_section_len failed!\n"); + return -EFAULT; + } + + if (offset >= kbox_section_len) { + KBOX_MSG("input offset is error!\n"); + return -EFAULT; + } + + if ((offset + count) > kbox_section_len) + memset_len = (unsigned int)(kbox_section_len - offset); + + kbox_memset_pci(kbox_section_addr, set_byte, memset_len, offset); + + return KBOX_TRUE; +} + +int kbox_read_op(long long offset, unsigned int count, char __user *data, + enum kbox_section_e section) +{ + unsigned int read_bytes = 0; + unsigned int read_len = 0; + unsigned int left_len = count; + char *user_buf = data; + char *temp_buf_char = NULL; + unsigned long offset_tmp = offset; + + if (!data) { + KBOX_MSG("input NULL point!\n"); + return -EFAULT; + } + + if (down_interruptible(&user_sem) != 0) + return KBOX_FALSE; + + temp_buf_char = kmalloc(TEMP_BUF_DATA_SIZE, GFP_KERNEL); + if (!temp_buf_char) { + KBOX_MSG("kmalloc temp_buf_char fail!\n"); + up(&user_sem); + return -ENOMEM; + } + + memset((void *)temp_buf_char, 0, TEMP_BUF_DATA_SIZE); + + while (1) { + if (read_len >= count) + break; + + read_bytes = + (left_len > + TEMP_BUF_DATA_SIZE) ? TEMP_BUF_DATA_SIZE : left_len; + + if (kbox_read_from_ram + (offset_tmp, read_bytes, temp_buf_char, section) < 0) { + KBOX_MSG("kbox_read_from_ram fail!\n"); + break; + } + + if (copy_to_user(user_buf, temp_buf_char, read_bytes)) { + KBOX_MSG("copy_to_user fail!\n"); + break; + } + + left_len -= read_bytes; + read_len += read_bytes; + user_buf += read_bytes; + + offset_tmp += read_bytes; + memset((void *)temp_buf_char, 0, TEMP_BUF_DATA_SIZE); + + msleep(20); + } + + kfree(temp_buf_char); + + up(&user_sem); + + return (int)read_len; +} + +int kbox_write_op(long long offset, unsigned int count, + const char __user *data, enum kbox_section_e section) +{ + unsigned int write_len = 0; + unsigned int left_len = count; + const char *user_buf = data; + char *temp_buf_char = NULL; + unsigned long offset_tmp = offset; + + if (!data) { + KBOX_MSG("input NULL point!\n"); + return -EFAULT; + } + + if (down_interruptible(&user_sem) != 0) + return KBOX_FALSE; + + temp_buf_char = kmalloc(TEMP_BUF_DATA_SIZE, GFP_KERNEL); + if (!temp_buf_char || IS_ERR(temp_buf_char)) { + KBOX_MSG("kmalloc temp_buf_char fail!\n"); + up(&user_sem); + return -ENOMEM; + } + + memset((void *)temp_buf_char, 0, TEMP_BUF_DATA_SIZE); + + while (1) { + unsigned int write_bytes = 0; + + if (write_len >= count) + break; + + write_bytes = + (left_len > + TEMP_BUF_DATA_SIZE) ? TEMP_BUF_DATA_SIZE : left_len; + + if (copy_from_user(temp_buf_char, user_buf, write_bytes)) { + KBOX_MSG("copy_from_user fail!\n"); + break; + } + + if (kbox_write_to_ram + (offset_tmp, write_bytes, temp_buf_char, section) < 0) { + KBOX_MSG("kbox_write_to_ram fail!\n"); + break; + } + + left_len -= write_bytes; + write_len += write_bytes; + user_buf += write_bytes; + + offset_tmp += write_bytes; + memset((void *)temp_buf_char, 0, TEMP_BUF_DATA_SIZE); + + msleep(20); + } + + kfree(temp_buf_char); + + up(&user_sem); + + return (int)write_len; +} + +char kbox_checksum(const char *input_buf, unsigned int len) +{ + unsigned int idx = 0; + char checksum = 0; + + for (idx = 0; idx < len; idx++) + checksum += input_buf[idx]; + + return checksum; +} + +static int kbox_update_super_block(void) +{ + int write_len = 0; + + g_kbox_super_block.checksum = 0; + g_kbox_super_block.checksum = + ~((unsigned char) + kbox_checksum((char *)&g_kbox_super_block, + (unsigned int)sizeof(g_kbox_super_block))) + 1; + write_len = + kbox_write_to_ram(SECTION_KERNEL_OFFSET, + (unsigned int)sizeof(struct image_super_block_s), + (char *)&g_kbox_super_block, KBOX_SECTION_KERNEL); + if (write_len <= 0) { + KBOX_MSG("fail to write superblock data!\n"); + return KBOX_FALSE; + } + + return KBOX_TRUE; +} + +int kbox_read_super_block(void) +{ + int read_len = 0; + + read_len = + kbox_read_from_ram(SECTION_KERNEL_OFFSET, + (unsigned int)sizeof(struct image_super_block_s), + (char *)&g_kbox_super_block, + KBOX_SECTION_KERNEL); + if (read_len != sizeof(struct image_super_block_s)) { + KBOX_MSG("fail to get superblock data!\n"); + return KBOX_FALSE; + } + + return KBOX_TRUE; +} + +static unsigned char kbox_get_byte_order(void) +{ + unsigned short data_short = 0xB22B; + unsigned char *data_char = (unsigned char *)&data_short; + + return (unsigned char)((*data_char == 0xB2) ? KBOX_BIG_ENDIAN : + KBOX_LITTLE_ENDIAN); +} + +int kbox_super_block_init(void) +{ + int ret = 0; + + ret = kbox_read_super_block(); + if (ret != KBOX_TRUE) { + KBOX_MSG("kbox_read_super_block fail!\n"); + return ret; + } + + if (!VALID_IMAGE(&g_kbox_super_block) || + kbox_checksum((char *)&g_kbox_super_block, + (unsigned int)sizeof(g_kbox_super_block)) != 0) { + if (!VALID_IMAGE(&g_kbox_super_block)) { + memset((void *)&g_kbox_super_block, 0x00, + sizeof(struct image_super_block_s)); + } + + g_kbox_super_block.byte_order = kbox_get_byte_order(); + g_kbox_super_block.version = IMAGE_VER; + g_kbox_super_block.magic_flag = IMAGE_MAGIC; + } + + g_kbox_super_block.thread_ctrl_blk.thread_info_len = 0; + + if (kbox_update_super_block() != KBOX_TRUE) { + KBOX_MSG("kbox_update_super_block failed!\n"); + return KBOX_FALSE; + } + + return KBOX_TRUE; +} + +static unsigned char kbox_get_write_slot_num(void) +{ + struct panic_ctrl_block_s *panic_ctrl_block = NULL; + unsigned int idx = 0; + unsigned char slot_num = 0; + unsigned char min_use_nums = 0; + + panic_ctrl_block = g_kbox_super_block.panic_ctrl_blk; + min_use_nums = panic_ctrl_block->use_nums; + + for (idx = 1; idx < SLOT_NUM; idx++) { + panic_ctrl_block++; + if (panic_ctrl_block->use_nums < min_use_nums) { + min_use_nums = panic_ctrl_block->use_nums; + slot_num = (unsigned char)idx; + } + } + + if (min_use_nums == MAX_USE_NUMS) { + panic_ctrl_block = g_kbox_super_block.panic_ctrl_blk; + for (idx = 0; idx < SLOT_NUM; idx++) { + panic_ctrl_block->use_nums = 1; + panic_ctrl_block++; + } + } + + return slot_num; +} + +static unsigned char kbox_get_new_record_number(void) +{ + struct panic_ctrl_block_s *panic_ctrl_block = NULL; + unsigned int idx = 0; + unsigned char max_number = 0; + + panic_ctrl_block = g_kbox_super_block.panic_ctrl_blk; + for (idx = 0; idx < SLOT_NUM; idx++) { + if (panic_ctrl_block->number >= max_number) + max_number = panic_ctrl_block->number; + + panic_ctrl_block++; + } + + return (unsigned char)((max_number + 1) % MAX_RECORD_NO); +} + +int kbox_write_panic_info(const char *input_data, unsigned int data_len) +{ + int write_len = 0; + unsigned int offset = 0; + struct panic_ctrl_block_s *panic_ctrl_block = NULL; + unsigned long time = get_seconds(); + unsigned char slot_num = 0; + unsigned long flags = 0; + + if (!input_data || data_len == 0) { + KBOX_MSG("input parameter error!\n"); + return KBOX_FALSE; + } + + if (data_len > SLOT_LENGTH) + data_len = SLOT_LENGTH; + + spin_lock_irqsave(&g_kbox_super_block_lock, flags); + + slot_num = kbox_get_write_slot_num(); + + panic_ctrl_block = &g_kbox_super_block.panic_ctrl_blk[slot_num]; + panic_ctrl_block->use_nums++; + + panic_ctrl_block->number = kbox_get_new_record_number(); + panic_ctrl_block->len = 0; + panic_ctrl_block->time = (unsigned int)time; + + g_kbox_super_block.panic_nums++; + + spin_unlock_irqrestore(&g_kbox_super_block_lock, flags); + + offset = slot_num * SLOT_LENGTH; + write_len = + kbox_write_to_ram(offset, data_len, input_data, KBOX_SECTION_PANIC); + if (write_len <= 0) { + KBOX_MSG("fail to save panic information!\n"); + return KBOX_FALSE; + } + + spin_lock_irqsave(&g_kbox_super_block_lock, flags); + + panic_ctrl_block->len += (unsigned short)write_len; + + if (kbox_update_super_block() != KBOX_TRUE) { + KBOX_MSG("kbox_update_super_block failed!\n"); + spin_unlock_irqrestore(&g_kbox_super_block_lock, flags); + return KBOX_FALSE; + } + + spin_unlock_irqrestore(&g_kbox_super_block_lock, flags); + + return KBOX_TRUE; +} + +int kbox_write_thread_info(const char *input_data, unsigned int data_len) +{ + int write_len = 0; + unsigned int offset = 0; + unsigned long flags = 0; + unsigned int date_len_tmp = data_len; + + if (!input_data || date_len_tmp == 0) { + KBOX_MSG("input parameter error!\n"); + return KBOX_FALSE; + } + + spin_lock_irqsave(&g_kbox_super_block_lock, flags); + + offset = g_kbox_super_block.thread_ctrl_blk.thread_info_len; + write_len = + kbox_write_to_ram(offset, date_len_tmp, input_data, + KBOX_SECTION_THREAD); + if (write_len <= 0) { + KBOX_MSG("fail to save thread information!\n"); + spin_unlock_irqrestore(&g_kbox_super_block_lock, flags); + return KBOX_FALSE; + } + + g_kbox_super_block.thread_ctrl_blk.thread_info_len += write_len; + + if (kbox_update_super_block() != KBOX_TRUE) { + KBOX_MSG("kbox_update_super_block failed!\n"); + spin_unlock_irqrestore(&g_kbox_super_block_lock, flags); + return KBOX_FALSE; + } + + spin_unlock_irqrestore(&g_kbox_super_block_lock, flags); + + return KBOX_TRUE; +} + +int kbox_read_printk_info(char *input_data, + struct printk_ctrl_block_tmp_s *printk_ctrl_block_tmp) +{ + int read_len = 0; + int printk_region = printk_ctrl_block_tmp->printk_region; + unsigned int len = 0; + + if (!input_data) { + KBOX_MSG("input parameter error!\n"); + return KBOX_FALSE; + } + + len = g_kbox_super_block.printk_ctrl_blk[printk_region].len; + if (len <= 0) { + printk_ctrl_block_tmp->end = 0; + printk_ctrl_block_tmp->valid_len = 0; + return KBOX_TRUE; + } + + read_len = + kbox_read_from_ram(0, len, input_data, + printk_ctrl_block_tmp->section); + if (read_len < 0) { + KBOX_MSG("fail to read printk information!(1)\n"); + return KBOX_FALSE; + } + + printk_ctrl_block_tmp->end = len; + printk_ctrl_block_tmp->valid_len = len; + + return KBOX_TRUE; +} + +int kbox_write_printk_info(const char *input_data, + struct printk_ctrl_block_tmp_s * + printk_ctrl_block_tmp) +{ + int write_len = 0; + int printk_region = printk_ctrl_block_tmp->printk_region; + unsigned long flags = 0; + unsigned int len = 0; + + if (!input_data) { + KBOX_MSG("input parameter error!\n"); + return KBOX_FALSE; + } + + len = printk_ctrl_block_tmp->valid_len; + write_len = + kbox_write_to_ram(0, len, input_data, + printk_ctrl_block_tmp->section); + if (write_len <= 0) { + KBOX_MSG("fail to save printk information!(1)\n"); + return KBOX_FALSE; + } + + spin_lock_irqsave(&g_kbox_super_block_lock, flags); + + g_kbox_super_block.printk_ctrl_blk[printk_region].len = len; + + if (kbox_update_super_block() != KBOX_TRUE) { + KBOX_MSG("kbox_update_super_block failed!\n"); + spin_unlock_irqrestore(&g_kbox_super_block_lock, flags); + return KBOX_FALSE; + } + + spin_unlock_irqrestore(&g_kbox_super_block_lock, flags); + + return KBOX_TRUE; +} + +static int kbox_read_region(unsigned long arg) +{ + unsigned int read_len = 0; + struct kbox_region_arg_s region_arg = { }; + + if (copy_from_user + ((void *)®ion_arg, (void __user *)arg, + sizeof(struct kbox_region_arg_s))) { + KBOX_MSG("fail to copy_from_user!\n"); + return KBOX_FALSE; + } + + read_len = kbox_read_op((long long)region_arg.offset, region_arg.count, + (char __user *)region_arg.data, + KBOX_SECTION_ALL); + if (read_len <= 0) { + KBOX_MSG("fail to get kbox data!\n"); + return KBOX_FALSE; + } + + if (copy_to_user + ((void __user *)arg, (void *)®ion_arg, + sizeof(struct kbox_region_arg_s))) { + KBOX_MSG("fail to copy_to_user!\n"); + return KBOX_FALSE; + } + + return KBOX_TRUE; +} + +static int kbox_writer_region(unsigned long arg) +{ + unsigned int write_len = 0; + struct kbox_region_arg_s region_arg = { }; + + if (copy_from_user + ((void *)®ion_arg, (void __user *)arg, + sizeof(struct kbox_region_arg_s))) { + KBOX_MSG("fail to copy_from_user!\n"); + return KBOX_FALSE; + } + + write_len = kbox_write_op((long long)region_arg.offset, + region_arg.count, + (char __user *)region_arg.data, + KBOX_SECTION_ALL); + if (write_len <= 0) { + KBOX_MSG("fail to write kbox data!\n"); + return KBOX_FALSE; + } + + if (copy_to_user + ((void __user *)arg, (void *)®ion_arg, + sizeof(struct kbox_region_arg_s))) { + KBOX_MSG("fail to copy_to_user!\n"); + return KBOX_FALSE; + } + + return KBOX_TRUE; +} + +int kbox_clear_region(enum kbox_section_e section) +{ + int ret = KBOX_TRUE; + unsigned long kbox_section_len = kbox_get_section_len(section); + + if (kbox_section_len == 0) { + KBOX_MSG("get kbox_section_len failed!\n"); + return -EFAULT; + } + + ret = kbox_memset_ram(0, (unsigned int)kbox_section_len, 0, section); + if (ret != KBOX_TRUE) { + KBOX_MSG("kbox_memset_ram failed!\n"); + return -EFAULT; + } + + return KBOX_TRUE; +} + +static int kbox_get_image_len(unsigned long arg) +{ + unsigned long __user *ptr = (unsigned long __user *)arg; + unsigned long kbox_len = 0; + + kbox_len = kbox_get_section_len(KBOX_SECTION_ALL); + if (kbox_len == 0) { + KBOX_MSG("kbox_get_section_len section all fail!\n"); + return -EFAULT; + } + + return put_user(kbox_len, ptr); +} + +static int kbox_get_user_region_len(unsigned long arg) +{ + unsigned long __user *ptr = (unsigned long __user *)arg; + unsigned long kbox_user_region_len = 0; + + kbox_user_region_len = kbox_get_section_len(KBOX_SECTION_USER); + if (kbox_user_region_len == 0) { + KBOX_MSG("kbox_get_section_len section user fail!\n"); + return -EFAULT; + } + + return put_user(kbox_user_region_len, ptr); +} + +static int kbox_ioctl_verify_cmd(unsigned int cmd, unsigned long arg) +{ + if (arg == 0 || (_IOC_TYPE(cmd) != KBOX_IOC_MAGIC)) + return KBOX_FALSE; + + if (_IOC_NR(cmd) > KBOX_IOC_MAXNR) + return KBOX_FALSE; + + if (!capable(CAP_SYS_ADMIN)) { + KBOX_MSG("permit error\n"); + return KBOX_FALSE; + } + + return KBOX_TRUE; +} + +int kbox_ioctl_detail(unsigned int cmd, unsigned long arg) +{ + if (kbox_ioctl_verify_cmd(cmd, arg) != KBOX_TRUE) + return -EFAULT; + + switch (cmd) { + case GET_KBOX_TOTAL_LEN: + return kbox_get_image_len(arg); + + case GET_KBOX_REGION_USER_LEN: + return kbox_get_user_region_len(arg); + + case KBOX_REGION_READ: + return kbox_read_region(arg); + + case KBOX_REGION_WRITE: + return kbox_writer_region(arg); + + case CLEAR_KBOX_REGION_ALL: + return kbox_clear_region(KBOX_SECTION_ALL); + + case CLEAR_KBOX_REGION_USER: + return kbox_clear_region(KBOX_SECTION_USER); + + default: + return -ENOTTY; + } +} + +int kbox_mmap_ram(struct file *pfile, struct vm_area_struct *vma, + enum kbox_section_e section) +{ + unsigned long kbox_section_phy_addr = + kbox_get_section_phy_addr(section); + unsigned long kbox_section_len = kbox_get_section_len(section); + unsigned long offset = 0; + unsigned long length = 0; + unsigned long vm_size = 0; + int ret = 0; + + UNUSED(pfile); + + if (kbox_section_phy_addr == 0 || kbox_section_len == 0) { + KBOX_MSG + ("get kbox_section_phy_addr or kbox_section_len failed!\n"); + return -EFAULT; + } + + offset = vma->vm_pgoff << PAGE_SHIFT; + vm_size = vma->vm_end - vma->vm_start; + + if (offset >= kbox_section_len) { + KBOX_MSG("vma offset is invalid!\n"); + return -ESPIPE; + } + + if (vma->vm_flags & VM_LOCKED) { + KBOX_MSG("vma is locked!\n"); + return -EPERM; + } + + length = kbox_section_len - offset; + if (vm_size > length) { + KBOX_MSG("vm_size is invalid!\n"); + return -ENOSPC; + } + + vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_IO; + + ret = remap_pfn_range(vma, + vma->vm_start, + (unsigned long)(kbox_section_phy_addr >> + PAGE_SHIFT), vm_size, + vma->vm_page_prot); + if (ret) { + KBOX_MSG("remap_pfn_range failed! ret = %d\n", ret); + return -EAGAIN; + } + + return 0; +} diff --git a/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_op.h b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_op.h new file mode 100644 index 000000000000..4a92c87de139 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/kbox_drv/kbox_ram_op.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2017, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _KBOX_RAM_OP_H_ +#define _KBOX_RAM_OP_H_ + +#include <asm/ioctls.h> +#include <linux/fs.h> +#include "kbox_printk.h" + +#define KBOX_IOC_MAGIC (0xB2) + +#define GET_KBOX_TOTAL_LEN _IOR(KBOX_IOC_MAGIC, 0, unsigned long) + +#define GET_KBOX_REGION_USER_LEN _IOR(KBOX_IOC_MAGIC, 1, unsigned long) + +#define CLEAR_KBOX_REGION_ALL _IO(KBOX_IOC_MAGIC, 2) + +#define CLEAR_KBOX_REGION_USER _IO(KBOX_IOC_MAGIC, 3) + +#define KBOX_REGION_READ _IOR(KBOX_IOC_MAGIC, 4, struct kbox_region_arg_s) + +#define KBOX_REGION_WRITE _IOW(KBOX_IOC_MAGIC, 5, struct kbox_region_arg_s) + +#define KBOX_IOC_MAXNR 6 + +#define TEMP_BUF_SIZE (32 * 1024) +#define TEMP_BUF_DATA_SIZE (128 * 1024) +#define KBOX_RW_UNIT 4 + +struct kbox_region_arg_s { + unsigned long offset; + unsigned int count; + char *data; +}; + +enum kbox_section_e; + +int kbox_read_op(long long offset, unsigned int count, char __user *data, + enum kbox_section_e section); +int kbox_write_op(long long offset, unsigned int count, + const char __user *data, enum kbox_section_e section); +int kbox_read_super_block(void); +int kbox_super_block_init(void); +int kbox_write_panic_info(const char *input_data, unsigned int data_len); +int kbox_write_thread_info(const char *input_data, unsigned int data_len); +int kbox_write_printk_info(const char *input_data, + struct printk_ctrl_block_tmp_s + *printk_ctrl_block_tmp); +int kbox_read_printk_info(char *input_data, + struct printk_ctrl_block_tmp_s + *printk_ctrl_block_tmp); +int kbox_ioctl_detail(unsigned int cmd, unsigned long arg); +int kbox_mmap_ram(struct file *file, struct vm_area_struct *vma, + enum kbox_section_e section); +char kbox_checksum(const char *input_buf, unsigned int len); +int kbox_write_to_ram(unsigned long offset, unsigned int count, + const char *data, enum kbox_section_e section); +int kbox_read_from_ram(unsigned long offset, unsigned int count, char *data, + enum kbox_section_e section); +int kbox_clear_region(enum kbox_section_e section); +int kbox_memset_ram(unsigned long offset, unsigned int count, + const char set_byte, enum kbox_section_e section); + +#endif
From: Qi Deng dengqi7@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4ETXO CVE: NA
-----------------------------------------
The BMA software is a system management software offered by Huawei. It supports the status monitoring, performance monitoring, and event monitoring of various components, including server CPUs, memory, hard disks, NICs, IB cards, PCIe cards, RAID controller cards, and optical modules.
This cdev_veth_drv driver is one of the communication drivers used by BMA software. It depends on the host_edma_drv driver. It will create a char device once loaded, offering interfaces (open, close, read, write and poll) to BMA to send/receive RESTful messages between BMC software. When the message is longer than 1KB, it will be cut into packets of 1KB. The other side, BMC's cdev_veth driver, will assemble these packets back into original mesages.
Link: https://lkml.org/lkml/2020/6/22/752
Signed-off-by: Qi Deng dengqi7@huawei.com Reviewed-by: Qidong Wang wangqindong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ethernet/huawei/bma/Makefile | 1 + .../huawei/bma/cdev_veth_drv/Makefile | 2 + .../bma/cdev_veth_drv/virtual_cdev_eth_net.c | 1862 +++++++++++++++++ .../bma/cdev_veth_drv/virtual_cdev_eth_net.h | 299 +++ 4 files changed, 2164 insertions(+) create mode 100644 drivers/net/ethernet/huawei/bma/cdev_veth_drv/Makefile create mode 100644 drivers/net/ethernet/huawei/bma/cdev_veth_drv/virtual_cdev_eth_net.c create mode 100644 drivers/net/ethernet/huawei/bma/cdev_veth_drv/virtual_cdev_eth_net.h
diff --git a/drivers/net/ethernet/huawei/bma/Makefile b/drivers/net/ethernet/huawei/bma/Makefile index 40fd3a18c34f..9de8424ceba0 100644 --- a/drivers/net/ethernet/huawei/bma/Makefile +++ b/drivers/net/ethernet/huawei/bma/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_BMA) += edma_drv/ obj-$(CONFIG_BMA) += cdev_drv/ obj-$(CONFIG_BMA) += veth_drv/ obj-$(CONFIG_BMA) += kbox_drv/ +obj-$(CONFIG_BMA) += cdev_veth_drv/ diff --git a/drivers/net/ethernet/huawei/bma/cdev_veth_drv/Makefile b/drivers/net/ethernet/huawei/bma/cdev_veth_drv/Makefile new file mode 100644 index 000000000000..b3e748d577ac --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/cdev_veth_drv/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_BMA) += cdev_veth_drv.o +cdev_veth_drv-y := virtual_cdev_eth_net.o \ No newline at end of file diff --git a/drivers/net/ethernet/huawei/bma/cdev_veth_drv/virtual_cdev_eth_net.c b/drivers/net/ethernet/huawei/bma/cdev_veth_drv/virtual_cdev_eth_net.c new file mode 100644 index 000000000000..04ea55f4e0a2 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/cdev_veth_drv/virtual_cdev_eth_net.c @@ -0,0 +1,1862 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Huawei iBMA driver. + * Copyright (c) 2019, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "virtual_cdev_eth_net.h" + +static struct edma_eth_dev_s g_eth_edmaprivate; +static struct edma_packet_node_s g_edma_recv_packet_tmp = {0, NULL}; +static struct edma_cut_packet_node_s *g_edma_send_cut_packet; +static unsigned int g_last_token = TK_START_END; +static unsigned int g_device_opened = CDEV_CLOSED; +static unsigned int g_last_number; +static unsigned int g_peer_not_ready; +static unsigned int g_read_pos; +static unsigned int g_delay_ms; +static int g_write_count; + +static const int NO_SPACE_RETRY = 10; +static const int NO_SPACE_WAIT_MS = 2000; +static const int CUT_PKG_SLEEP_MS = 1; +static const int CUT_PKG_LIMIT_COUNT = 30; +static const int SEND_INT_PKG_COUNT = 50; + +static int cdev_open(struct inode *inode_ptr, struct file *filp); +static int cdev_release(struct inode *inode_ptr, struct file *filp); +static unsigned int cdev_poll(struct file *file, poll_table *wait); +static ssize_t cdev_read(struct file *filp, char __user *data, + size_t count, loff_t *ppos); +static ssize_t cdev_write(struct file *filp, const char __user *data, + size_t count, loff_t *ppos); + +#define IS_CDEV_IN_OPEN_STATE() (g_device_opened != CDEV_CLOSED) +#define SET_CDEV_OPEN_STATE(x) (g_device_opened = (x)) + +int debug = DLOG_ERROR; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "Debug switch (0=close debug, 1=open debug)"); + +#define GET_PRIVATE_DATA(f) (((struct cdev_dev_s *)((f)->private_data))->priv) + +const struct file_operations g_eth_edma_cdev_fops = { + .owner = THIS_MODULE, + .open = cdev_open, + .release = cdev_release, + .poll = cdev_poll, + .read = cdev_read, + .write = cdev_write, +}; + +void dump_global_info(void) +{ + struct edma_shmq_hd_s *pshmqhd_v = NULL; + + if (!debug) + return; + + LOG(DLOG_DEBUG, "\r\n=================VETH INFO=================\r\n"); + + pshmqhd_v = g_eth_edmaprivate.ptx_queue->pshmqhd_v; + LOG(DLOG_DEBUG, "TX head/tail: %u/%u ------------", + pshmqhd_v->head, pshmqhd_v->tail); + + pshmqhd_v = g_eth_edmaprivate.prx_queue->pshmqhd_v; + LOG(DLOG_DEBUG, "RX head/tail: %u/%u ------------", + pshmqhd_v->head, pshmqhd_v->tail); +} + +static inline int edma_is_queue_ready(struct edma_rxtx_q_s *prxtx_queue) +{ + if (!prxtx_queue) + return 0; + + return (prxtx_queue->pshmqhd_v->init == BSPVETH_SHMQUEUE_INITOK_V2); +} + +static inline void edma_veth_host_addr_init(void *priv) +{ + struct bma_priv_data_s *edma_priv = (struct bma_priv_data_s *)priv; + + g_eth_edmaprivate.pshmpool_p = + (u8 *)edma_priv->specific.veth.veth_swap_phy_addr; + g_eth_edmaprivate.pshmpool_v = + (u8 *)edma_priv->specific.veth.veth_swap_addr; + g_eth_edmaprivate.shmpoolsize = + (u32)edma_priv->specific.veth.veth_swap_len; +} + +void edma_veth_free_tx_resources(struct edma_rxtx_q_s *ptx_queue) +{ + struct edma_bd_info_s *pbdinfobase_v = NULL; + + if (!ptx_queue || !ptx_queue->pbdinfobase_v) + return; + + pbdinfobase_v = ptx_queue->pbdinfobase_v; + ptx_queue->pbdinfobase_v = NULL; + ptx_queue->pshmqhd_v = NULL; + + vfree(pbdinfobase_v); + + LOG(DLOG_DEBUG, "%s ok. count=%d", __func__, ptx_queue->count); +} + +void edma_veth_free_all_tx_resources(struct edma_eth_dev_s *edma_eth) +{ + if (edma_eth && edma_eth->ptx_queue) { + edma_veth_free_tx_resources(edma_eth->ptx_queue); + kfree(edma_eth->ptx_queue); + edma_eth->ptx_queue = NULL; + } +} + +int edma_veth_setup_tx_resources(struct edma_rxtx_q_s *ptx_queue) +{ + int size; + + ptx_queue->count = MAX_QUEUE_BDNUM; + size = sizeof(struct edma_bd_info_s) * ptx_queue->count; + + ptx_queue->pbdinfobase_v = vmalloc(size); + if (!ptx_queue->pbdinfobase_v) { + LOG(DLOG_ERROR, "Failed to alloc memory for the TX queue."); + return -ENOMEM; + } + + memset(ptx_queue->pbdinfobase_v, 0, size); + + /* round up to nearest 4K */ + size = sizeof(struct edma_dma_shmbd_s) * ptx_queue->count; + ptx_queue->size = ALIGN(size, ALIGN_MASK); + + ptx_queue->work_limit = BSPVETH_WORK_LIMIT; + + return 0; +} + +int edma_veth_setup_all_tx_resources(struct edma_eth_dev_s *edma_eth) +{ + int err; + u8 *shmq_head = NULL; + u8 *shmq_head_p = NULL; + struct edma_rxtx_q_s *tx_queue = NULL; + + tx_queue = (struct edma_rxtx_q_s *) + kmalloc(sizeof(struct edma_rxtx_q_s), GFP_KERNEL); + if (!tx_queue) { + LOG(DLOG_ERROR, "Failed to alloc TX queue."); + return -ENOMEM; + } + + memset(tx_queue, 0, sizeof(struct edma_rxtx_q_s)); + + shmq_head = edma_eth->pshmpool_v + (MAX_SHAREQUEUE_SIZE * 0); + shmq_head_p = edma_eth->pshmpool_p + (MAX_SHAREQUEUE_SIZE * 0); + + tx_queue->pshmqhd_v = (struct edma_shmq_hd_s *)shmq_head; + tx_queue->pshmqhd_p = shmq_head_p; + + tx_queue->pshmbdbase_v = (struct edma_dma_shmbd_s *) + (shmq_head + BSPVETH_SHMBDBASE_OFFSET); + tx_queue->pshmbdbase_p = shmq_head_p + BSPVETH_SHMBDBASE_OFFSET; + + tx_queue->pdmalbase_v = (struct edma_dmal_s *) + (shmq_head + SHMDMAL_OFFSET); + tx_queue->pdmalbase_p = (u8 *)(VETH_SHAREPOOL_BASE_INBMC + + (MAX_SHAREQUEUE_SIZE * 0) + SHMDMAL_OFFSET); + + memset(tx_queue->pdmalbase_v, 0, MAX_SHMDMAL_SIZE); + + err = edma_veth_setup_tx_resources(tx_queue); + if (err) { + kfree(tx_queue); + return err; + } + + edma_eth->ptx_queue = tx_queue; + + return 0; +} + +int edma_veth_setup_rx_resources(struct edma_rxtx_q_s *prx_queue) +{ + int size; + + prx_queue->count = MAX_QUEUE_BDNUM; + size = sizeof(struct edma_bd_info_s) * prx_queue->count; + + prx_queue->pbdinfobase_v = vmalloc(size); + if (!prx_queue->pbdinfobase_v) { + LOG(DLOG_ERROR, "Failed to alloc memory for the RX queue."); + return -ENOMEM; + } + + memset(prx_queue->pbdinfobase_v, 0, size); + + /* Round up to nearest 4K */ + size = sizeof(struct edma_dma_shmbd_s) * prx_queue->count; + prx_queue->size = ALIGN(size, ALIGN_MASK); + + prx_queue->work_limit = BSPVETH_WORK_LIMIT; + + return 0; +} + +int edma_veth_setup_all_rx_resources(struct edma_eth_dev_s *edma_eth) +{ + int err; + u8 *shmq_head = NULL; + u8 *shmq_head_p = NULL; + struct edma_rxtx_q_s *rx_queue = NULL; + + rx_queue = (struct edma_rxtx_q_s *) + kmalloc(sizeof(struct edma_rxtx_q_s), GFP_KERNEL); + if (!rx_queue) { + LOG(DLOG_ERROR, "Failed to alloc RX queue."); + return -ENOMEM; + } + + memset(rx_queue, 0, sizeof(struct edma_rxtx_q_s)); + + shmq_head = edma_eth->pshmpool_v + MAX_SHAREQUEUE_SIZE; + shmq_head_p = edma_eth->pshmpool_p + MAX_SHAREQUEUE_SIZE; + rx_queue->pshmqhd_v = (struct edma_shmq_hd_s *)shmq_head; + rx_queue->pshmqhd_p = shmq_head_p; + + rx_queue->pshmbdbase_v = (struct edma_dma_shmbd_s *)(shmq_head + + BSPVETH_SHMBDBASE_OFFSET); + rx_queue->pshmbdbase_p = shmq_head_p + BSPVETH_SHMBDBASE_OFFSET; + + /* DMA address list (only used in host). */ + rx_queue->pdmalbase_v = (struct edma_dmal_s *) + (shmq_head + SHMDMAL_OFFSET); + rx_queue->pdmalbase_p = (u8 *)(VETH_SHAREPOOL_BASE_INBMC + + MAX_SHAREQUEUE_SIZE + SHMDMAL_OFFSET); + memset(rx_queue->pdmalbase_v, 0, MAX_SHMDMAL_SIZE); + + err = edma_veth_setup_rx_resources(rx_queue); + if (err) { + kfree(rx_queue); + return err; + } + + edma_eth->prx_queue = rx_queue; + + return 0; +} + +void edma_veth_free_rx_resources(struct edma_rxtx_q_s *prx_queue) +{ + struct edma_bd_info_s *pbdinfobase_v = NULL; + + if (!prx_queue || !prx_queue->pbdinfobase_v) + return; + + pbdinfobase_v = prx_queue->pbdinfobase_v; + prx_queue->pbdinfobase_v = NULL; + prx_queue->pshmqhd_v = NULL; + + /* Free all the Rx ring pages */ + vfree(pbdinfobase_v); + + LOG(DLOG_DEBUG, "%s ok. count=%d", __func__, prx_queue->count); +} + +void edma_veth_free_all_rx_resources(struct edma_eth_dev_s *edma_eth) +{ + if (edma_eth && edma_eth->prx_queue) { + edma_veth_free_rx_resources(edma_eth->prx_queue); + kfree(edma_eth->prx_queue); + edma_eth->prx_queue = NULL; + } +} + +int edma_veth_setup_all_rxtx_queue(struct edma_eth_dev_s *edma_eth) +{ + void *buf = NULL; + unsigned int i; + unsigned int j; + + dma_addr_t dmaaddr; + struct edma_bd_info_s *pbdinfobase_v = NULL; + + struct edma_rxtx_q_s *ptx_queue = NULL; + struct edma_rxtx_q_s *prx_queue = NULL; + + struct bma_priv_data_s *priv = NULL; + struct device *dev = NULL; + + priv = (struct bma_priv_data_s *)edma_eth->edma_priv; + dev = &priv->specific.veth.pdev->dev; + + ptx_queue = edma_eth->ptx_queue; + prx_queue = edma_eth->prx_queue; + + edma_eth->pages_tx = 0; + + pbdinfobase_v = ptx_queue->pbdinfobase_v; + + for (i = 0; i < MAX_QUEUE_BDNUM; i++) { + buf = kmalloc(NODE_SIZE, GFP_KERNEL | GFP_DMA); + if (!buf) { + for (j = 0; j < i; j++) + kfree((void *)pbdinfobase_v[j].pdma_v); + + LOG(DLOG_ERROR, "Fail to alloc tx buf."); + return -ENOMEM; + } + + dmaaddr = dma_map_single(dev, buf, NODE_SIZE, DMA_TO_DEVICE); + if (dma_mapping_error(dev, dmaaddr)) { + LOG(DLOG_ERROR, "Failed to map tx DMA address."); + kfree(buf); + return -EIO; + } + + memset(buf, 0xFF, NODE_SIZE); + + pbdinfobase_v[i].pdma_v = (u8 *)(buf); + pbdinfobase_v[i].dma_p = dmaaddr; + pbdinfobase_v[i].len = NODE_SIZE; + } + + LOG(DLOG_DEBUG, "set tx done."); + + edma_eth->pages_rx = 0; + + pbdinfobase_v = prx_queue->pbdinfobase_v; + + for (i = 0; i < MAX_QUEUE_BDNUM; i++) { + buf = kmalloc(NODE_SIZE, GFP_KERNEL | GFP_DMA); + if (!buf) { + for (j = 0; j < i; j++) + kfree((void *)pbdinfobase_v[j].pdma_v); + + LOG(DLOG_ERROR, "Fail to alloc rx buf."); + return -ENOMEM; + } + + dmaaddr = dma_map_single(dev, buf, NODE_SIZE, DMA_FROM_DEVICE); + if (dma_mapping_error(dev, dmaaddr)) { + LOG(DLOG_ERROR, "Failed to map rx DMA address."); + kfree(buf); + return -EIO; + } + + memset(buf, 0xFF, NODE_SIZE); + + pbdinfobase_v[i].pdma_v = (u8 *)(buf); + pbdinfobase_v[i].dma_p = dmaaddr; + pbdinfobase_v[i].len = NODE_SIZE; + } + + LOG(DLOG_DEBUG, "set rx done."); + + return 0; +} + +void edma_veth_dump(void) +{ + struct edma_eth_dev_s *edma_eth = &g_eth_edmaprivate; + struct edma_rxtx_q_s *ptx_queue = edma_eth->ptx_queue; + struct edma_rxtx_q_s *prx_queue = edma_eth->prx_queue; + struct edma_shmq_hd_s *pshmq_head = NULL; + + if (!debug) + return; + + pshmq_head = prx_queue->pshmqhd_v; + + LOG(DLOG_DEBUG, + "RX host_head:%u, host_tail:%u, shm_head:%u, shm_tail:%u, ", + prx_queue->head, prx_queue->tail, + pshmq_head->head, pshmq_head->tail); + LOG(DLOG_DEBUG, "count: %u, total: %u, init: %u.", + pshmq_head->count, pshmq_head->total, pshmq_head->init); + + pshmq_head = ptx_queue->pshmqhd_v; + + LOG(DLOG_DEBUG, + "TX host_head:%u, host_tail:%u, shm_head:%u, shm_tail:%u, ", + ptx_queue->head, ptx_queue->tail, + pshmq_head->head, pshmq_head->tail); + LOG(DLOG_DEBUG, "count: %u, total: %u, init: %u.", + pshmq_head->count, pshmq_head->total, pshmq_head->init); +} + +int edma_veth_setup_resource(struct edma_eth_dev_s *edma_eth) +{ + int err; + + err = edma_veth_setup_all_rx_resources(edma_eth); + if (err < 0) + return err; + + err = edma_veth_setup_all_tx_resources(edma_eth); + if (err < 0) + goto FREE_RX; + + err = edma_veth_setup_all_rxtx_queue(edma_eth); + if (err < 0) + goto FREE_TX; + + return 0; + +FREE_TX: + edma_veth_free_all_tx_resources(edma_eth); +FREE_RX: + edma_veth_free_all_rx_resources(edma_eth); + + return err; +} + +int edma_veth_free_rxtx_queue(struct edma_eth_dev_s *edma_eth) +{ + int i; + struct edma_rxtx_q_s *ptx_queue = NULL; + struct edma_rxtx_q_s *prx_queue = NULL; + + struct bma_priv_data_s *priv = NULL; + struct device *dev = NULL; + + struct edma_bd_info_s *pbdinfobase_v = NULL; + + if (!edma_eth || !edma_eth->edma_priv) + return 0; + + priv = (struct bma_priv_data_s *)edma_eth->edma_priv; + dev = &priv->specific.veth.pdev->dev; + + ptx_queue = edma_eth->ptx_queue; + prx_queue = edma_eth->prx_queue; + + pbdinfobase_v = ptx_queue->pbdinfobase_v; + + for (i = 0; i < MAX_QUEUE_BDNUM; i++) { + dma_unmap_single(dev, pbdinfobase_v[i].dma_p, + NODE_SIZE, DMA_TO_DEVICE); + kfree(pbdinfobase_v[i].pdma_v); + } + + pbdinfobase_v = prx_queue->pbdinfobase_v; + + for (i = 0; i < MAX_QUEUE_BDNUM; i++) { + dma_unmap_single(dev, pbdinfobase_v[i].dma_p, + NODE_SIZE, DMA_FROM_DEVICE); + kfree(pbdinfobase_v[i].pdma_v); + } + + return 0; +} + +void edma_veth_free_resource(struct edma_eth_dev_s *edma_eth) +{ + edma_veth_free_rxtx_queue(edma_eth); + LOG(DLOG_DEBUG, "edma_veth_free_rxtx_queue done."); + + edma_veth_free_all_rx_resources(edma_eth); + LOG(DLOG_DEBUG, "edma_veth_free_all_rx_resources done."); + + edma_veth_free_all_tx_resources(edma_eth); + LOG(DLOG_DEBUG, "edma_veth_free_all_tx_resources done."); +} + +int edma_veth_send_one_pkt(struct edma_cut_packet_node_s *cut_packet_node) +{ + u32 head, tail, i; + struct edma_bd_info_s *pbdinfo_v = NULL; + struct edma_rxtx_q_s *ptx_queue = g_eth_edmaprivate.ptx_queue; + struct bma_priv_data_s *priv = NULL; + struct device *dev = NULL; + + if (!cut_packet_node || !ptx_queue || !ptx_queue->pshmbdbase_v) { + LOG(DLOG_ERROR, "Invalid packet node."); + return -EFAULT; + } + + priv = (struct bma_priv_data_s *)(g_eth_edmaprivate.edma_priv); + dev = &priv->specific.veth.pdev->dev; + + if (!bma_intf_is_link_ok()) { + LOG(DLOG_ERROR, "EDMA link is not ready."); + return -EIO; + } + + for (i = 0; i < NO_SPACE_RETRY; i++) { + head = ptx_queue->head; + tail = ptx_queue->tail; + + LOG(DLOG_DEBUG, "TX queue, before: head/tail: %u/%u", head, tail); + + if (JUDGE_RING_QUEUE_SPACE(head, tail, 1)) + break; + + if (i == NO_SPACE_RETRY - 1) { + LOG(DLOG_ERROR, "EDMA queue has no space."); + return -EBUSY; + } + + tasklet_hi_schedule(&g_eth_edmaprivate.dma_task); + msleep(NO_SPACE_WAIT_MS); + } + + ptx_queue->head = (head + 1) & BSPVETH_POINT_MASK; + + pbdinfo_v = ptx_queue->pbdinfobase_v + head; + + pbdinfo_v->len = NODE_TO_PACKET_SIZE(cut_packet_node); + (void)memcpy(pbdinfo_v->pdma_v, cut_packet_node, pbdinfo_v->len); + + /* Force sync data from CPU to device. */ + dma_sync_single_for_device(dev, pbdinfo_v->dma_p, + pbdinfo_v->len, DMA_TO_DEVICE); + + LOG(DLOG_DEBUG, "TX queue, after: head/tail: %u -> %u\n", + ptx_queue->head, ptx_queue->tail); + + return 0; +} + +static inline unsigned int edma_veth_get_ring_buf_count(unsigned int head, + unsigned int tail, + unsigned int size) +{ + return (tail + size - head) % size; +} + +static inline void edma_veth_flush_ring_node(struct edma_packet_node_s *node, + unsigned int ring_len) +{ + unsigned int i; + + for (i = 0; i < ring_len; i++) { + kfree(node[i].packet); + node[i].packet = NULL; + } +} + +static int get_peer_queue_stress(struct edma_rxtx_q_s *queue) +{ + int stress; + + if (++g_write_count < RL_MAX_PACKET) { + /* not enough packets, use the last delay. */ + return -1; + } + + g_write_count = 0; + + /* check peer rx queue stress. */ + if (!queue || queue->pshmqhd_v->total == 0) { + /* no rate limit allowed. */ + return 0; + } + + stress = (int)((queue->pshmqhd_v->count * STRESS_FACTOR) / + queue->pshmqhd_v->total); + + return stress; +} + +static void do_queue_rate_limit(struct edma_rxtx_q_s *queue) +{ + unsigned long delay_jiffies; + int stress = get_peer_queue_stress(queue); + + LOG(DLOG_DEBUG, "count: %u, total: %u, stress: %d", + queue->pshmqhd_v->count, queue->pshmqhd_v->total, stress); + + if (stress >= RL_STRESS_HIGH) + g_delay_ms = RL_DELAY_MS_HIGH; + else if (stress >= RL_STRESS_LOW) + g_delay_ms = RL_DELAY_MS_LOW; + else if (stress >= 0) + g_delay_ms = 0; + + if (g_delay_ms) { + delay_jiffies = msecs_to_jiffies(g_delay_ms); + schedule_timeout_killable(delay_jiffies); + } +} + +static int edma_veth_cut_tx_packet_send(struct edma_eth_dev_s *eth_dev, + const char __user *data, size_t len) +{ + int ret = 0; + struct edma_cut_packet_node_s *tx_cut_pkt = g_edma_send_cut_packet; + unsigned int length = len; + unsigned int already_read_len = 0; + unsigned int count = 0; + + if (!tx_cut_pkt) + return -EFAULT; + + do_queue_rate_limit(eth_dev->ptx_queue); + + while (length > 0) { + LOG(DLOG_DEBUG, "length: %u/%lu", length, len); + + if (length > BSPPACKET_MTU_MAX) { + /* fragment. */ + if (copy_from_user(tx_cut_pkt->cut_packet, + data + already_read_len, + BSPPACKET_MTU_MAX)) { + LOG(DLOG_DEBUG, "Failed to copy user data."); + return -EFAULT; + } + tx_cut_pkt->number = count++; + length = length - BSPPACKET_MTU_MAX; + + if (tx_cut_pkt->number == 0) { + tx_cut_pkt->token = TK_START_PACKET; + tx_cut_pkt->cut_packet_len = BSPPACKET_MTU_MAX; + } else { + tx_cut_pkt->token = TK_MIDDLE_PACKET; + tx_cut_pkt->cut_packet_len = BSPPACKET_MTU_MAX; + } + } else { + if (copy_from_user(tx_cut_pkt->cut_packet, + data + already_read_len, length)) { + LOG(DLOG_DEBUG, "Failed to copy user data."); + return -EFAULT; + } + tx_cut_pkt->number = count++; + if (len > BSPPACKET_MTU_MAX) + tx_cut_pkt->token = TK_END_PACKET; + else + tx_cut_pkt->token = TK_START_END; + + tx_cut_pkt->cut_packet_len = length; + length = 0; + } + + already_read_len += tx_cut_pkt->cut_packet_len; + ret = edma_veth_send_one_pkt(tx_cut_pkt); + if (ret < 0) { + LOG(DLOG_DEBUG, "edma_veth_send_one_pkt failed, %d.", + ret); + return ret; + } + if (length > 0 && count > CUT_PKG_LIMIT_COUNT) { + LOG(DLOG_DEBUG, "middle pkg: %d, need sleep.", + count); + msleep(CUT_PKG_SLEEP_MS); + /* send a interrupt to BMC for recv package */ + if (count % SEND_INT_PKG_COUNT == 0) + tasklet_hi_schedule(&g_eth_edmaprivate.dma_task); + } + } + + LOG(DLOG_DEBUG, "send done, length: %u", length); + + return 0; +} + +static int edma_veth_copy_full_packet(struct edma_eth_dev_s *eth_dev, + u8 *packet, u32 len) +{ + unsigned int count = 0; + u8 *ptr = NULL; + + LOG(DLOG_DEBUG, "Recv full packet, len %u.", len); + + ptr = kmalloc(len, GFP_ATOMIC); + if (ptr) { + /* lock the queue. */ + spin_lock(ð_dev->rx_queue_lock); + + count = edma_veth_get_ring_buf_count(eth_dev->rx_packet_head, + eth_dev->rx_packet_tail, + MAX_RXTX_PACKET_LEN); + if (count >= (MAX_RXTX_PACKET_LEN - 1)) { + LOG(DLOG_DEBUG, "The rx queue is full."); + spin_unlock(ð_dev->rx_queue_lock); + kfree(ptr); + return -EBUSY; + } + + (void)memcpy(ptr, packet, len); + eth_dev->rx_packet[eth_dev->rx_packet_tail].packet = ptr; + eth_dev->rx_packet[eth_dev->rx_packet_tail].len = len; + eth_dev->rx_packet_tail = (eth_dev->rx_packet_tail + 1) % + MAX_RXTX_PACKET_LEN; + + spin_unlock(ð_dev->rx_queue_lock); + + return 0; + } + + return -ENOMEM; +} + +static int edma_veth_cut_rx_packet_recv(struct edma_eth_dev_s *eth_dev, + u8 *packet, u32 len) +{ + int ret = 0; + struct edma_cut_packet_node_s *node = + (struct edma_cut_packet_node_s *)packet; + struct edma_packet_node_s *g_packet = &g_edma_recv_packet_tmp; + unsigned int copy_back = 0; + + if (node->cut_packet_len && len > NODE_TO_PACKET_SIZE(node)) + len = NODE_TO_PACKET_SIZE(node); + + LOG(DLOG_DEBUG, + "cut_packet_len: %u, token: %u/%u, number: %u, real length: %u.", + node->cut_packet_len, node->token, g_last_token, node->number, len); + + if (node->cut_packet_len > BSPPACKET_MTU_MAX || + ((g_packet->len + node->cut_packet_len) > MAX_PACKET_LEN)) { + LOG(DLOG_ERROR, "This packet is too long, packet length %u/%u", + node->cut_packet_len, g_packet->len); + ret = -EINVAL; + goto fail; + } + + if (g_last_token == TK_START_END || g_last_token == TK_END_PACKET) { + /* This should be a new packet. */ + if (node->token == TK_START_PACKET || + node->token == TK_START_END) { + (void)memcpy(g_packet->packet, node->cut_packet, + node->cut_packet_len); + g_packet->len = node->cut_packet_len; + + if (node->token == TK_START_END) { + /* A full packet, increase tail. */ + copy_back = 1; + } else { + LOG(DLOG_DEBUG, + "Add middle packet with length %u", + node->cut_packet_len); + } + } else { + LOG(DLOG_ERROR, "The rx packet is out-of-order"); + LOG(DLOG_ERROR, "token: %d, len: %u, number: %u", + node->token, node->cut_packet_len, node->number); + ret = -EINVAL; + goto fail; + } + } else { + /* Fragments, last token: TK_MIDDLE_PACKET/TK_START_PACKET. */ + if (g_last_number != (node->number - 1)) { + LOG(DLOG_ERROR, "The number is not correct (%u/%u)", + g_last_number, node->number); + ret = -EINVAL; + goto fail; + } + + if (node->token == TK_MIDDLE_PACKET) { + (void)memcpy(g_packet->packet + g_packet->len, + node->cut_packet, node->cut_packet_len); + g_packet->len = g_packet->len + node->cut_packet_len; + LOG(DLOG_DEBUG, "Add middle packet with length %u", + node->cut_packet_len); + } else if (node->token == TK_END_PACKET) { + (void)memcpy(g_packet->packet + g_packet->len, + node->cut_packet, node->cut_packet_len); + g_packet->len = g_packet->len + node->cut_packet_len; + copy_back = 1; + } else { + LOG(DLOG_ERROR, "Unexpected token: %u", node->token); + ret = -EINVAL; + goto fail; + } + } + + if (copy_back) { + ret = edma_veth_copy_full_packet(eth_dev, g_packet->packet, + g_packet->len); + g_packet->len = 0; + } + + g_last_token = node->token; + g_last_number = node->number; + + LOG(DLOG_DEBUG, "rx_packet_head:%u, rx_packet_tail: %u", + eth_dev->rx_packet_head, eth_dev->rx_packet_tail); + + return copy_back; + +fail: + g_last_token = TK_START_END; + g_last_number = 0; + memset(g_packet->packet, 0, MAX_PACKET_LEN); + g_packet->len = 0; + + return ret; +} + +int edma_veth_recv_pkt(struct edma_rxtx_q_s *prx_queue, + struct bma_priv_data_s *priv) +{ + int ret = BSP_OK; + + u32 i, work_limit; + u32 tail, head; + + struct edma_bd_info_s *prx_bdinfo_v = NULL; + struct device *dev = NULL; + + u8 *packet = NULL; + u32 len; + u32 off; + + wait_queue_head_t *queue_head = NULL; + u8 do_wake_up = 0; + + if (!priv) + return BSP_OK; + + dev = &priv->specific.veth.pdev->dev; + + work_limit = prx_queue->work_limit; + tail = prx_queue->tail; + + for (i = 0; i < work_limit; i++) { + head = prx_queue->head; + + if (tail == head) + break; + + LOG(DLOG_DEBUG, "===== enter ===== [%u/%u] ======", head, tail); + prx_bdinfo_v = prx_queue->pbdinfobase_v + tail; + + len = prx_bdinfo_v->len; + off = prx_bdinfo_v->off; + packet = prx_bdinfo_v->pdma_v; + + LOG(DLOG_DEBUG, "off:%u, len: %u.", off, len); + + if (!IS_CDEV_IN_OPEN_STATE()) { + LOG(DLOG_DEBUG, + "Local char device is not opened, drop packet"); + tail = BD_QUEUE_MASK(tail + 1); + continue; + } + + dma_sync_single_for_cpu(dev, prx_bdinfo_v->dma_p, + len + off, DMA_FROM_DEVICE); + + if (off) + packet += off; + + ret = edma_veth_cut_rx_packet_recv(&g_eth_edmaprivate, + packet, len); + if (ret < 0) + LOG(DLOG_DEBUG, "recv rx pkt fail, ret: %d", ret); + else if (ret != 0) + do_wake_up = 1; + + tail = BD_QUEUE_MASK(tail + 1); + } + + prx_queue->tail = tail; + head = prx_queue->head; + + if (tail != head) { + /* check if more processing is needed. */ + return BSP_ERR_AGAIN; + } else if (do_wake_up) { + queue_head = (wait_queue_head_t *)bma_cdev_get_wait_queue(priv); + /* finish reciving pkt, wake up the waiting process. */ + if (queue_head && waitqueue_active(queue_head)) { + LOG(DLOG_DEBUG, "Wake up queue."); + wake_up(queue_head); + } + } + + return BSP_OK; +} + +void edma_task_do_packet_recv(unsigned long data) +{ + int ret = BSP_OK; + struct edma_rxtx_q_s *prx_queue = NULL; + struct bma_priv_data_s *priv = NULL; + struct tasklet_struct *t = (struct tasklet_struct *)data; + + priv = (struct bma_priv_data_s *)g_eth_edmaprivate.edma_priv; + prx_queue = g_eth_edmaprivate.prx_queue; + + if (prx_queue) { + g_eth_edmaprivate.run_skb_RX_task++; + + ret = edma_veth_recv_pkt(prx_queue, priv); + } + + if (ret == BSP_ERR_AGAIN) + tasklet_hi_schedule(t); +} + +static inline void edma_veth_reset_dma(int type) +{ + bma_intf_reset_dma(GET_DMA_DIRECTION(type)); +} + +int __dmacmp_err_deal_2(struct edma_rxtx_q_s *prxtx_queue, u32 type) +{ + prxtx_queue->dmacmperr = 0; + prxtx_queue->start_dma = 0; + + (void)edma_veth_reset_dma(type); + + if (type == BSPVETH_RX) { + LOG(DLOG_DEBUG, + "bmc->host dma time out, dma count:%d, work_limit:%d\n", + prxtx_queue->dmal_cnt, + prxtx_queue->work_limit); + + prxtx_queue->s.dma_failed++; + } else { + LOG(DLOG_DEBUG, + "host->bmc dma time out, dma count:%d, work_limit:%d\n", + prxtx_queue->dmal_cnt, + prxtx_queue->work_limit); + + prxtx_queue->s.dma_failed++; + } + + if (prxtx_queue->dmal_cnt > 1) + prxtx_queue->work_limit = (prxtx_queue->dmal_cnt >> 1); + + prxtx_queue->dma_overtime++; + if (prxtx_queue->dma_overtime > BSPVETH_MAX_QUE_DEEP) + return BSPVETH_DMA_BUSY; + + return BSP_OK; +} + +int edma_veth_check_dma_status(struct edma_rxtx_q_s *prxtx_queue, u32 type) +{ + int i = 0; + enum dma_direction_e dir = GET_DMA_DIRECTION(type); + + for (i = 0; i < BSPVETH_CHECK_DMA_STATUS_TIMES; i++) { + if (bma_intf_check_dma_status(dir) == BSPVETH_DMA_OK) + return BSP_OK; + + cpu_relax(); + + if (i > DMA_STATUS_CHECK_DELAY_LIMIT) + udelay(DMA_STATUS_CHECK_DELAY_MS); + } + + prxtx_queue->s.dma_busy++; + prxtx_queue->dmacmperr++; + + return BSPVETH_DMA_BUSY; +} + +int __check_dmacmp_H_2(struct edma_rxtx_q_s *prxtx_queue, u32 type) +{ + u16 start_dma; + u16 dmacmperr; + u32 cnt = 0; + u32 len = 0; + u32 host_head = 0; + u32 host_tail = 0; + u32 shm_head = 0; + u32 shm_tail = 0; + s32 ret; + struct edma_shmq_hd_s *pshmq_head = NULL; + + if (!prxtx_queue || !prxtx_queue->pshmqhd_v) + return BSP_ERR_NULL_POINTER; + + start_dma = prxtx_queue->start_dma; + if (!start_dma) + return BSP_OK; + + pshmq_head = prxtx_queue->pshmqhd_v; + dmacmperr = prxtx_queue->dmacmperr; + + if (dmacmperr > BSPVETH_WORK_LIMIT / DMACMP_ERR_FACTOR) + return __dmacmp_err_deal_2(prxtx_queue, type); + + ret = edma_veth_check_dma_status(prxtx_queue, type); + if (ret != BSP_OK) + return ret; + + prxtx_queue->start_dma = 0; + prxtx_queue->dma_overtime = 0; + + if (type == BSPVETH_RX) { + cnt = prxtx_queue->dmal_cnt; + len = prxtx_queue->dmal_byte; + + host_head = prxtx_queue->head; + shm_tail = pshmq_head->tail; + + pshmq_head->tail = BD_QUEUE_MASK(shm_tail + cnt); + prxtx_queue->head = BD_QUEUE_MASK(host_head + cnt); + + LOG(DLOG_DEBUG, "RX:host_head:%u, host_tail:%u, ", + prxtx_queue->head, prxtx_queue->tail); + + LOG(DLOG_DEBUG, "shm_head:%u, shm_tail:%u, inc: %u.", + pshmq_head->head, pshmq_head->tail, cnt); + + prxtx_queue->s.dmapkt += cnt; + prxtx_queue->s.dmapktbyte += len; + } else { + cnt = prxtx_queue->dmal_cnt; + len = prxtx_queue->dmal_byte; + + host_tail = prxtx_queue->tail; + shm_head = pshmq_head->head; + + prxtx_queue->tail = BD_QUEUE_MASK(host_tail + cnt); + pshmq_head->head = BD_QUEUE_MASK(shm_head + cnt); + + LOG(DLOG_DEBUG, "TX:host_head:%u, host_tail:%u, ", + prxtx_queue->head, prxtx_queue->tail); + + LOG(DLOG_DEBUG, "shm_head:%u, shm_tail:%u, inc: %u.", + pshmq_head->head, pshmq_head->tail, cnt); + + prxtx_queue->s.dmapkt += cnt; + prxtx_queue->s.dmapktbyte += len; + } + + tasklet_hi_schedule(&g_eth_edmaprivate.skb_task); + + (void)bma_intf_int_to_bmc(g_eth_edmaprivate.edma_priv); + + g_eth_edmaprivate.tobmc_int++; + + return BSP_OK; +} + +int __checkspace_H_2(struct edma_rxtx_q_s *prxtx_queue, u32 type, u32 *pcnt) +{ + u32 host_head, host_tail; + u32 shm_head, shm_tail; + u32 shm_cnt, host_cnt, cnt_tmp, cnt; + struct edma_shmq_hd_s *pshmq_head = NULL; + + if (!prxtx_queue || !prxtx_queue->pshmqhd_v) + return -EFAULT; + + pshmq_head = prxtx_queue->pshmqhd_v; + + host_head = prxtx_queue->head; + host_tail = prxtx_queue->tail; + shm_head = pshmq_head->head; + shm_tail = pshmq_head->tail; + + LOG(DLOG_DEBUG, "host_head:%u, host_tail:%u, shm_head:%u, shm_tail:%u.", + host_head, host_tail, shm_head, shm_tail); + + switch (type) { + case BSPVETH_RX: + if (shm_head == shm_tail) { + prxtx_queue->s.shm_empty++; + return BSP_ERR_NOT_TO_HANDLE; + } + + if (!JUDGE_RING_QUEUE_SPACE(host_head, host_tail, 1)) + return -EFAULT; + + shm_cnt = GET_BD_RING_QUEUE_COUNT(shm_head, shm_tail); + cnt_tmp = min(shm_cnt, prxtx_queue->work_limit); + + host_cnt = GET_BD_RING_QUEUE_SPACE(host_tail, host_head); + cnt = min(cnt_tmp, host_cnt); + + LOG(DLOG_DEBUG, + "RX, host_cnt: %u, shm_cnt: %u, cnt_tmp: %u, cnt: %u", + host_cnt, shm_cnt, cnt_tmp, cnt); + + break; + + case BSPVETH_TX: + if (host_tail == host_head) { + prxtx_queue->s.q_empty++; + return BSP_ERR_NOT_TO_HANDLE; + } + + host_cnt = GET_BD_RING_QUEUE_COUNT(host_head, host_tail); + cnt_tmp = min(host_cnt, prxtx_queue->work_limit); + + shm_cnt = GET_BD_RING_QUEUE_SPACE(shm_head, shm_tail); + cnt = min(cnt_tmp, shm_cnt); + + LOG(DLOG_DEBUG, + "TX, host_cnt: %u, shm_cnt: %u, cnt_tmp: %u, cnt: %u", + host_cnt, shm_cnt, cnt_tmp, cnt); + + break; + + default: + prxtx_queue->s.type_err++; + return -EFAULT; + } + + if (cnt > ((BSPVETH_DMABURST_MAX * DMABURST_FACTOR) / + (DMABURST_FACTOR + 1))) + prxtx_queue->s.dma_burst++; + + *pcnt = cnt; + + return BSP_OK; +} + +int __make_dmalistbd_h2b_H_2(struct edma_rxtx_q_s *prxtx_queue, u32 cnt) +{ + u32 i = 0; + u32 len = 0; + u32 off = 0; + struct edma_dmal_s *pdmalbase_v = NULL; + struct edma_shmq_hd_s *pshmq_head = NULL; + struct edma_bd_info_s *pbdinfobase_v = NULL; + struct edma_dma_shmbd_s *pshmbdbase_v = NULL; + + unsigned long addr; + + u32 host_tail; + u32 shm_head; + + if (!prxtx_queue) + return -EFAULT; + + if (cnt == 0) + return 0; + + pdmalbase_v = prxtx_queue->pdmalbase_v; + pbdinfobase_v = prxtx_queue->pbdinfobase_v; + pshmbdbase_v = prxtx_queue->pshmbdbase_v; + + pshmq_head = prxtx_queue->pshmqhd_v; + + host_tail = prxtx_queue->tail; + shm_head = pshmq_head->head; + + for (i = 0; i < cnt; i++) { + LOG(DLOG_DEBUG, "TX DMA, HOST: %u -> BMC: %u", + host_tail, shm_head); + + pdmalbase_v[i].chl = 0x1; + + addr = EDMA_ADDR_ALIGNED(pbdinfobase_v[host_tail].dma_p); + off = EDMA_ADDR_OFFSET(addr); + + /* src: veth_send_one_pkt. */ + pdmalbase_v[i].slow = lower_32_bits(addr); + pdmalbase_v[i].shi = upper_32_bits(addr); + + /* dst: bmc dma, in shared memory. */ + pdmalbase_v[i].dlow = + lower_32_bits(pshmbdbase_v[shm_head].dma_p); + pdmalbase_v[i].dhi = 0; + + /* len: len + offset caused by alignment */ + pdmalbase_v[i].len = pbdinfobase_v[host_tail].len + off; + + LOG(DLOG_DEBUG, + "TX DMA %08x%08x -> %08x%08x, off: %u, len: %u.", + pdmalbase_v[i].shi, pdmalbase_v[i].slow, + pdmalbase_v[i].dhi, pdmalbase_v[i].dlow, + off, pbdinfobase_v[host_tail].len); + + pshmbdbase_v[shm_head].len = pbdinfobase_v[host_tail].len; + pshmbdbase_v[shm_head].off = off; + + len += pdmalbase_v[i].len; + + /* ready for the next round. */ + host_tail = BD_QUEUE_MASK(host_tail + 1); + shm_head = BD_QUEUE_MASK(shm_head + 1); + } + + pdmalbase_v[i - 1].chl = 0x9; + + pdmalbase_v[i].chl = 0x7; + pdmalbase_v[i].len = 0x0; + pdmalbase_v[i].slow = lower_32_bits((u64)prxtx_queue->pdmalbase_p); + pdmalbase_v[i].shi = upper_32_bits((u64)prxtx_queue->pdmalbase_p); + pdmalbase_v[i].dlow = 0; + pdmalbase_v[i].dhi = 0; + + prxtx_queue->dmal_cnt = cnt; + prxtx_queue->dmal_byte = len; + + return 0; +} + +int __make_dmalistbd_b2h_H_2(struct edma_rxtx_q_s *prxtx_queue, u32 cnt) +{ + u32 i; + u32 len = 0; + + struct edma_dmal_s *pdmalbase_v = NULL; + struct edma_shmq_hd_s *pshmq_head = NULL; + struct edma_bd_info_s *pbdinfobase_v = NULL; + struct edma_dma_shmbd_s *pshmbdbase_v = NULL; + + u32 host_head; + u32 shm_tail; + + if (!prxtx_queue) + return -EFAULT; + + if (cnt == 0) + return -EFAULT; + + pdmalbase_v = prxtx_queue->pdmalbase_v; + pbdinfobase_v = prxtx_queue->pbdinfobase_v; + pshmbdbase_v = prxtx_queue->pshmbdbase_v; + + pshmq_head = prxtx_queue->pshmqhd_v; + + host_head = prxtx_queue->head; + shm_tail = pshmq_head->tail; + + for (i = 0; i < cnt; i++) { + LOG(DLOG_DEBUG, "RX DMA, BMC: %u -> HOST: %u", + shm_tail, host_head); + + pbdinfobase_v[host_head].off = pshmbdbase_v[shm_tail].off; + pbdinfobase_v[host_head].len = pshmbdbase_v[shm_tail].len; + + pdmalbase_v[i].chl = 0x1; + + /* src: bmc set in shared memory. */ + pdmalbase_v[i].slow = + lower_32_bits(pshmbdbase_v[shm_tail].dma_p); + pdmalbase_v[i].shi = 0; + + /* dst: edma_veth_setup_all_rxtx_queue. */ + pdmalbase_v[i].dlow = + lower_32_bits(pbdinfobase_v[host_head].dma_p); + pdmalbase_v[i].dhi = + upper_32_bits(pbdinfobase_v[host_head].dma_p); + + pdmalbase_v[i].len = pshmbdbase_v[shm_tail].len + + pshmbdbase_v[shm_tail].off; + + LOG(DLOG_DEBUG, + "RX DMA %08x%08x -> %08x%08x, off: %u, len: %u, total: %u.", + pdmalbase_v[i].shi, pdmalbase_v[i].slow, + pdmalbase_v[i].dhi, pdmalbase_v[i].dlow, + pshmbdbase_v[shm_tail].off, pshmbdbase_v[shm_tail].len, + pdmalbase_v[i].len); + + len += pdmalbase_v[i].len; + + /* ready for the next round. */ + host_head = BD_QUEUE_MASK(host_head + 1); + shm_tail = BD_QUEUE_MASK(shm_tail + 1); + } + + pdmalbase_v[i - 1].chl = 0x9; + + pdmalbase_v[i].chl = 0x7; + pdmalbase_v[i].len = 0x0; + pdmalbase_v[i].slow = lower_32_bits((u64)prxtx_queue->pdmalbase_p); + pdmalbase_v[i].shi = upper_32_bits((u64)prxtx_queue->pdmalbase_p); + pdmalbase_v[i].dlow = 0; + pdmalbase_v[i].dhi = 0; + + prxtx_queue->dmal_cnt = cnt; + prxtx_queue->dmal_byte = len; + + return 0; +} + +int __start_dmalist_H_2(struct edma_rxtx_q_s *prxtx_queue, u32 type, u32 cnt) +{ + int ret = BSP_OK; + struct bma_dma_transfer_s dma_transfer = { 0 }; + struct edma_shmq_hd_s *pshmq_head = NULL; + + if (!prxtx_queue) + return -1; + + pshmq_head = prxtx_queue->pshmqhd_v; + + LOG(DLOG_DEBUG, "before -> %u/%u/%u/%u.", + prxtx_queue->head, prxtx_queue->tail, + pshmq_head->head, pshmq_head->tail); + + if (type == BSPVETH_RX) { + dma_transfer.dir = BMC_TO_HOST; + ret = __make_dmalistbd_b2h_H_2(prxtx_queue, cnt); + } else { + dma_transfer.dir = HOST_TO_BMC; + ret = __make_dmalistbd_h2b_H_2(prxtx_queue, cnt); + } + + if (ret < 0) + return ret; + + dma_transfer.type = DMA_LIST; + dma_transfer.transfer.list.dma_addr = + (dma_addr_t)prxtx_queue->pdmalbase_p; + + ret = bma_intf_start_dma(g_eth_edmaprivate.edma_priv, &dma_transfer); + LOG(DLOG_DEBUG, "after -> %u/%u/%u/%u, ret: %d", + prxtx_queue->head, prxtx_queue->tail, + pshmq_head->head, pshmq_head->tail, + ret); + + if (ret < 0) + return ret; + + prxtx_queue->start_dma = 1; + + return BSP_OK; +} + +int check_dma_queue_fault_2(struct edma_rxtx_q_s *prxtx_queue, + u32 type, u32 *pcnt) +{ + int ret; + u32 cnt = 0; + + if (prxtx_queue->dma_overtime > BSPVETH_MAX_QUE_DEEP) + return -EFAULT; + + ret = __check_dmacmp_H_2(prxtx_queue, type); + if (ret != BSP_OK) + return -EFAULT; + + ret = __checkspace_H_2(prxtx_queue, type, &cnt); + if (ret != BSP_OK) + return -EFAULT; + + if (CHECK_DMA_RXQ_FAULT(prxtx_queue, type, cnt)) { + udelay(DMA_RXQ_FAULT_DELAY); + + prxtx_queue->dmal_cnt--; + + return -EFAULT; + } + + *pcnt = cnt; + + return BSP_OK; +} + +int __dma_rxtx_H_2(struct edma_rxtx_q_s *prxtx_queue, u32 type) +{ + int ret; + u32 cnt = 0; + + if (!prxtx_queue || !prxtx_queue->pshmqhd_v) + return -EFAULT; + + if (CHECK_DMA_QUEUE_EMPTY(type, prxtx_queue)) { + LOG(DLOG_DEBUG, "Queue (type: %u) is empty.", type); + return BSP_OK; + } + + ret = check_dma_queue_fault_2(prxtx_queue, type, &cnt); + if (ret != BSP_OK) { + LOG(DLOG_DEBUG, "check_dma_queue_fault_2 (ret: %d).", ret); + return -EFAULT; + } + + if (cnt == 0) + return BSP_OK; + + ret = __start_dmalist_H_2(prxtx_queue, type, cnt); + if (ret != BSP_OK) { + LOG(DLOG_DEBUG, "__start_dmalist_H_2 returns %d", ret); + return -EFAULT; + } + + if (cnt <= DMA_QUEUE_FAULT_LIMIT) { + ret = __check_dmacmp_H_2(prxtx_queue, type); + if (ret != BSP_OK) { + LOG(DLOG_DEBUG, "__check_dmacmp_H_2 returns %d", ret); + return -EFAULT; + } + } + + return BSP_OK; +} + +inline int veth_dma_task_H_2(u32 type) +{ + struct edma_rxtx_q_s *prxtx_queue = NULL; + + if (type == BSPVETH_RX) { + g_eth_edmaprivate.run_dma_RX_task++; + prxtx_queue = g_eth_edmaprivate.prx_queue; + } else { + g_eth_edmaprivate.run_dma_TX_task++; + prxtx_queue = g_eth_edmaprivate.ptx_queue; + } + + if (prxtx_queue) { + if (!edma_is_queue_ready(prxtx_queue)) { + LOG(DLOG_DEBUG, "queue is not ready, init flag: %u.", + prxtx_queue->pshmqhd_v->init); + return BSP_OK; + } + + (void)__dma_rxtx_H_2(prxtx_queue, type); + + if (!CHECK_DMA_QUEUE_EMPTY(type, prxtx_queue)) + return BSP_ERR_AGAIN; + } + + return BSP_OK; +} + +void edma_task_do_data_transmit(unsigned long data) +{ + struct tasklet_struct *t = (struct tasklet_struct *)data; + int txret, rxret; + + LOG(DLOG_DEBUG, "host_head/host_tail/shm_head/shm_tail - "); + LOG(DLOG_DEBUG, "rx:%u/%u/%u/%u, tx:%u/%u/%u/%u.", + g_eth_edmaprivate.prx_queue->head, + g_eth_edmaprivate.prx_queue->tail, + g_eth_edmaprivate.prx_queue->pshmqhd_v->head, + g_eth_edmaprivate.prx_queue->pshmqhd_v->tail, + g_eth_edmaprivate.ptx_queue->head, + g_eth_edmaprivate.ptx_queue->tail, + g_eth_edmaprivate.ptx_queue->pshmqhd_v->head, + g_eth_edmaprivate.ptx_queue->pshmqhd_v->tail); + + txret = veth_dma_task_H_2(BSPVETH_TX); + + rxret = veth_dma_task_H_2(BSPVETH_RX); + + LOG(DLOG_DEBUG, "host_head/host_tail/shm_head/shm_tail - "); + LOG(DLOG_DEBUG, "rx:%u/%u/%u/%u, tx:%u/%u/%u/%u.\n", + g_eth_edmaprivate.prx_queue->head, + g_eth_edmaprivate.prx_queue->tail, + g_eth_edmaprivate.prx_queue->pshmqhd_v->head, + g_eth_edmaprivate.prx_queue->pshmqhd_v->tail, + g_eth_edmaprivate.ptx_queue->head, + g_eth_edmaprivate.ptx_queue->tail, + g_eth_edmaprivate.ptx_queue->pshmqhd_v->head, + g_eth_edmaprivate.ptx_queue->pshmqhd_v->tail); + + if (txret == BSP_ERR_AGAIN || rxret == BSP_ERR_AGAIN) { + /* restart transmission. */ + tasklet_hi_schedule(t); + } +} + +int edma_tasklet_setup(struct edma_eth_dev_s *dev, u8 **rx_buf, + struct edma_cut_packet_node_s **tx_cut_pkt_buf) +{ + u8 *rx_pkt_buf; + struct edma_packet_node_s *rx_packet = NULL; + struct edma_cut_packet_node_s *tx_cut_buf = NULL; + size_t rx_size = + sizeof(struct edma_packet_node_s) * MAX_RXTX_PACKET_LEN; + + rx_pkt_buf = kmalloc(MAX_PACKET_LEN, GFP_KERNEL); + if (!rx_pkt_buf) + return -ENOMEM; + + tx_cut_buf = (struct edma_cut_packet_node_s *) + kmalloc(sizeof(*tx_cut_buf), GFP_KERNEL); + if (!tx_cut_buf) { + kfree(rx_pkt_buf); + return -ENOMEM; + } + + rx_packet = kmalloc(rx_size, GFP_KERNEL); + if (!rx_packet) { + kfree(rx_pkt_buf); + kfree(tx_cut_buf); + return -ENOMEM; + } + + memset(rx_pkt_buf, 0, MAX_PACKET_LEN); + memset(tx_cut_buf, 0, sizeof(*tx_cut_buf)); + memset(rx_packet, 0, rx_size); + + *rx_buf = rx_pkt_buf; + *tx_cut_pkt_buf = tx_cut_buf; + dev->rx_packet = rx_packet; + + spin_lock_init(&dev->rx_queue_lock); + + tasklet_init(&dev->skb_task, + edma_task_do_packet_recv, + (unsigned long)&dev->skb_task); + + tasklet_init(&dev->dma_task, + edma_task_do_data_transmit, + (unsigned long)&dev->dma_task); + + return 0; +} + +void edma_tasklet_free(struct edma_eth_dev_s *dev, u8 **rx_buf, + struct edma_cut_packet_node_s **tx_cut_pkt_buf) +{ + if (!*rx_buf) + return; + + /* stop task before releasing resource. */ + tasklet_kill(&dev->dma_task); + tasklet_kill(&dev->skb_task); + + kfree(*rx_buf); + kfree(*tx_cut_pkt_buf); + + /* flush the ring buf. */ + edma_veth_flush_ring_node(dev->rx_packet, MAX_RXTX_PACKET_LEN); + kfree(dev->rx_packet); + + *rx_buf = NULL; + *tx_cut_pkt_buf = NULL; + dev->rx_packet = NULL; +} + +static int edma_veth_int_handler(struct notifier_block *nb, + unsigned long ev, void *unuse) +{ + g_eth_edmaprivate.recv_int++; + + if (g_eth_edmaprivate.dma_task.func) + tasklet_hi_schedule(&g_eth_edmaprivate.dma_task); + + return IRQ_HANDLED; +} + +static struct notifier_block g_edma_veth_int_nb = { + .notifier_call = edma_veth_int_handler, +}; + +static int comm_init_dev(struct edma_eth_dev_s *edma, + const struct file_operations *fops) +{ + struct cdev_dev_s *dev = &edma->cdev; + int ret; + + dev->priv = edma->edma_priv; + dev->dev.minor = MISC_DYNAMIC_MINOR; + dev->dev.name = CDEV_VETH_NAME; + dev->dev.fops = fops; + + ret = misc_register(&dev->dev); + if (ret < 0) { + LOG(DLOG_ERROR, "Failed to alloc major number, %d", ret); + return ret; + } + + return 0; +} + +static inline void comm_cleanup_dev(struct edma_eth_dev_s *edma) +{ + struct cdev_dev_s *dev = &edma->cdev; + + misc_deregister(&dev->dev); +} + +static int __init edma_cdev_init(void) +{ + int ret; + + g_write_count = 0; + g_delay_ms = 0; + g_last_number = 0; + g_peer_not_ready = 0; + + LOG(DLOG_DEBUG, "Module init."); + + if (!bma_intf_check_edma_supported()) + return -ENXIO; + + (void)memset(&g_eth_edmaprivate, 0, sizeof(g_eth_edmaprivate)); + + /* register EDMA sub-subyem. */ + ret = bma_intf_register_type(TYPE_VETH, 0, INTR_ENABLE, + &g_eth_edmaprivate.edma_priv); + if (ret < 0) { + LOG(DLOG_ERROR, "Failed to register EDMA interface."); + goto failed; + } + + /* initialize host DMA address. */ + edma_veth_host_addr_init(g_eth_edmaprivate.edma_priv); + + /* setup TX/RX resource */ + ret = edma_veth_setup_resource(&g_eth_edmaprivate); + if (ret < 0) { + LOG(DLOG_ERROR, "Failed to setup resource."); + goto failed1; + } + + /* setup resource for user packets. */ + ret = edma_tasklet_setup(&g_eth_edmaprivate, + &g_edma_recv_packet_tmp.packet, + &g_edma_send_cut_packet); + if (ret < 0) + goto failed2; + + /* register char device. */ + ret = comm_init_dev(&g_eth_edmaprivate, &g_eth_edma_cdev_fops); + if (ret != 0) { + LOG(DLOG_ERROR, "Failed to register cdev device."); + goto failed3; + } + + /* register EDMA INT notifier. */ + ret = bma_intf_register_int_notifier(&g_edma_veth_int_nb); + if (ret < 0) { + LOG(DLOG_ERROR, "Failed to register INT notifier."); + goto failed4; + } + + dump_global_info(); + + GET_SYS_SECONDS(g_eth_edmaprivate.init_time); + + return 0; + +failed4: + comm_cleanup_dev(&g_eth_edmaprivate); +failed3: + edma_tasklet_free(&g_eth_edmaprivate, + &g_edma_recv_packet_tmp.packet, + &g_edma_send_cut_packet); +failed2: + edma_veth_free_resource(&g_eth_edmaprivate); +failed1: + (void)bma_intf_unregister_type(&g_eth_edmaprivate.edma_priv); +failed: + return ret; +} + +static void __exit edma_cdev_exit(void) +{ + LOG(DLOG_DEBUG, "Module exit."); + + bma_intf_unregister_int_notifier(&g_edma_veth_int_nb); + + comm_cleanup_dev(&g_eth_edmaprivate); + + edma_tasklet_free(&g_eth_edmaprivate, + &g_edma_recv_packet_tmp.packet, + &g_edma_send_cut_packet); + + edma_veth_free_resource(&g_eth_edmaprivate); + + bma_intf_unregister_type(&g_eth_edmaprivate.edma_priv); +} + +static inline int cdev_check_ring_recv(void) +{ + unsigned int count; + + count = edma_veth_get_ring_buf_count(g_eth_edmaprivate.rx_packet_head, + g_eth_edmaprivate.rx_packet_tail, + MAX_RXTX_PACKET_LEN); + return (count > 0 ? 1 : 0); +} + +static ssize_t cdev_copy_packet_to_user(struct edma_eth_dev_s *dev, + char __user *data, size_t count) +{ + unsigned char *packet = NULL; + unsigned char *start = NULL; + unsigned int free_packet = 0; + ssize_t length = (ssize_t)count; + ssize_t left; + + LOG(DLOG_DEBUG, "rx_packet_head:%u, rx_packet_tail: %u", + dev->rx_packet_head, dev->rx_packet_tail); + + spin_lock(&dev->rx_queue_lock); + + if (!cdev_check_ring_recv()) { + spin_unlock(&dev->rx_queue_lock); + return -EAGAIN; + } + + left = (ssize_t)(dev->rx_packet[dev->rx_packet_head].len) - g_read_pos; + start = dev->rx_packet[dev->rx_packet_head].packet + g_read_pos; + + LOG(DLOG_DEBUG, + "User needs %ld bytes, pos: %u, total len: %u, left: %ld.", + count, g_read_pos, dev->rx_packet[dev->rx_packet_head].len, left); + if (left <= 0) { + /* No more data in this message, retry. */ + length = -EAGAIN; + free_packet = 1; + } else if (length > left) { + /* A full message is returned. */ + length = left; + free_packet = 1; + } else { + /* Update pos. */ + g_read_pos += length; + } + + if (free_packet) { + g_read_pos = 0; + packet = dev->rx_packet[dev->rx_packet_head].packet; + dev->rx_packet[dev->rx_packet_head].packet = NULL; + dev->rx_packet_head = (dev->rx_packet_head + 1) % + MAX_RXTX_PACKET_LEN; + } + + spin_unlock(&dev->rx_queue_lock); + + if (length > 0 && copy_to_user(data, start, length)) { + LOG(DLOG_DEBUG, "Failed to copy to user, skip this message."); + length = -EFAULT; + g_read_pos = 0; + } + + LOG(DLOG_DEBUG, + "Copied bytes: %ld, pos: %d, buf len: %lu, free_packet: %d.", + length, g_read_pos, count, free_packet); + + if (packet) { + /* Free the packet as needed. */ + kfree(packet); + } + + return length; +} + +int cdev_open(struct inode *inode_ptr, struct file *filp) +{ + struct cdev_dev_s *dev = &g_eth_edmaprivate.cdev; + + LOG(DLOG_DEBUG, "Open device."); + + if (!inode_ptr || !filp) + return -EFAULT; + + /* only one instance is allowed. */ + if (IS_CDEV_IN_OPEN_STATE()) + return -EBUSY; + + LOG(DLOG_DEBUG, "Init flag, rx: %d, tx:%d", + g_eth_edmaprivate.prx_queue->pshmqhd_v->init, + g_eth_edmaprivate.ptx_queue->pshmqhd_v->init); + + /* save to private data. */ + filp->private_data = dev; + SET_CDEV_OPEN_STATE(CDEV_OPENED); + g_read_pos = 0; + + return 0; +} + +int cdev_release(struct inode *inode_ptr, struct file *filp) +{ + LOG(DLOG_DEBUG, "Close device."); + + if (!filp) + return 0; + + filp->private_data = NULL; + + SET_CDEV_OPEN_STATE(CDEV_CLOSED); + + return 0; +} + +unsigned int cdev_poll(struct file *filp, poll_table *wait) +{ + unsigned int mask = 0; + wait_queue_head_t *queue_head = NULL; + + if (!filp) + return 0; + + edma_veth_dump(); + + queue_head = (wait_queue_head_t *) + bma_cdev_get_wait_queue(GET_PRIVATE_DATA(filp)); + if (!queue_head) + return 0; + + /* check or add to wait queue. */ + poll_wait(filp, queue_head, wait); + + if (!edma_is_queue_ready(g_eth_edmaprivate.prx_queue)) + return 0; + + if (cdev_check_ring_recv() > 0) + mask = (POLLIN | POLLRDNORM); + + return mask; +} + +ssize_t cdev_read(struct file *filp, char __user *data, + size_t count, loff_t *ppos) +{ + struct edma_eth_dev_s *dev = &g_eth_edmaprivate; + ssize_t length = 0; + + if (!data || count >= MAX_PACKET_LEN) + return -EFAULT; + + LOG(DLOG_DEBUG, "read begin, count: %ld, pos: %u.", count, g_read_pos); + + length = cdev_copy_packet_to_user(dev, data, count); + + LOG(DLOG_DEBUG, "read done, length: %ld, pos: %u.", length, g_read_pos); + + return length; +} + +ssize_t cdev_write(struct file *filp, const char __user *data, + size_t count, loff_t *ppos) +{ + int ret = 0; + struct edma_eth_dev_s *pdev = &g_eth_edmaprivate; + + if (!data || count <= 0 || count > MAX_PACKET_LEN) + return -EINVAL; + + if (!edma_is_queue_ready(pdev->ptx_queue)) { + if (g_peer_not_ready == 0 && pdev->ptx_queue) { + LOG(DLOG_ERROR, "Peer rx queue is not ready (%u).", + pdev->ptx_queue->pshmqhd_v->init); + g_peer_not_ready = 1; + } + return -EPERM; + } else if (g_peer_not_ready) { + LOG(DLOG_ERROR, "Peer rx queue becomes ready."); + g_peer_not_ready = 0; + } + + LOG(DLOG_DEBUG, "data length is %lu, pos: %u (%u/%u)", + count, g_read_pos, + pdev->ptx_queue->pshmqhd_v->count, + pdev->ptx_queue->pshmqhd_v->total); + + ret = edma_veth_cut_tx_packet_send(pdev, data, count); + if (ret < 0) { + LOG(DLOG_ERROR, "Failed to send packet, return code: %d.", ret); + } else { + tasklet_hi_schedule(&g_eth_edmaprivate.dma_task); + ret = count; + } + + return ret; +} + +MODULE_VERSION(MICRO_TO_STR(CDEV_VETH_VERSION)); +MODULE_AUTHOR("HUAWEI TECHNOLOGIES CO., LTD."); +MODULE_DESCRIPTION("HUAWEI CDEV DRIVER"); +MODULE_LICENSE("GPL"); + +module_init(edma_cdev_init); +module_exit(edma_cdev_exit); \ No newline at end of file diff --git a/drivers/net/ethernet/huawei/bma/cdev_veth_drv/virtual_cdev_eth_net.h b/drivers/net/ethernet/huawei/bma/cdev_veth_drv/virtual_cdev_eth_net.h new file mode 100644 index 000000000000..44abe56dd787 --- /dev/null +++ b/drivers/net/ethernet/huawei/bma/cdev_veth_drv/virtual_cdev_eth_net.h @@ -0,0 +1,299 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Huawei iBMA driver. + * Copyright (c) 2019, Huawei Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _VETH_CDEV_NET_H_ +#define _VETH_CDEV_NET_H_ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/spinlock.h> +#include <linux/dma-mapping.h> +#include <linux/miscdevice.h> +#include <linux/netdevice.h> +#include <linux/poll.h> +#include <linux/delay.h> + +#include "../edma_drv/bma_include.h" +#include "../include/bma_ker_intf.h" + +#define BSP_OK (0) +#define BSP_ERR (0xFFFFFFFF) +#define BSP_NETDEV_TX_BUSY (1) +#define BSP_ERR_INIT_ERR (BSP_NETDEV_TX_BUSY) +#define BSP_ETH_ERR_BASE (0x0FFFF000) +#define BSP_ERR_OUT_OF_MEM (BSP_ETH_ERR_BASE + 1) +#define BSP_ERR_NULL_POINTER (BSP_ETH_ERR_BASE + 2) +#define BSP_ERR_INVALID_STR (BSP_ETH_ERR_BASE + 3) +#define BSP_ERR_INVALID_PARAM (BSP_ETH_ERR_BASE + 4) +#define BSP_ERR_INVALID_DATA (BSP_ETH_ERR_BASE + 5) +#define BSP_ERR_OUT_OF_RANGE (BSP_ETH_ERR_BASE + 6) +#define BSP_ERR_INVALID_CARD (BSP_ETH_ERR_BASE + 7) +#define BSP_ERR_INVALID_GRP (BSP_ETH_ERR_BASE + 8) +#define BSP_ERR_INVALID_ETH (BSP_ETH_ERR_BASE + 9) +#define BSP_ERR_SEND_ERR (BSP_ETH_ERR_BASE + 10) +#define BSP_ERR_DMA_ERR (BSP_ETH_ERR_BASE + 11) +#define BSP_ERR_RECV_ERR (BSP_ETH_ERR_BASE + 12) +#define BSP_ERR_SKB_ERR (BSP_ETH_ERR_BASE + 13) +#define BSP_ERR_DMA_ADDR_ERR (BSP_ETH_ERR_BASE + 14) +#define BSP_ERR_IOREMAP_ERR (BSP_ETH_ERR_BASE + 15) +#define BSP_ERR_LEN_ERR (BSP_ETH_ERR_BASE + 16) +#define BSP_ERR_STAT_ERR (BSP_ETH_ERR_BASE + 17) +#define BSP_ERR_AGAIN (BSP_ETH_ERR_BASE + 18) +#define BSP_ERR_NOT_TO_HANDLE (BSP_ETH_ERR_BASE + 19) + +#define VETH_SHAREPOOL_BASE_INBMC (0x84820000) +#define VETH_SHAREPOOL_SIZE (0xdf000) +#define VETH_SHAREPOOL_OFFSET (0x10000) +#define MAX_SHAREQUEUE_SIZE (0x20000) + +#define BSPVETH_DMABURST_MAX (64) +#define BSPVETH_SHMBDBASE_OFFSET (0x80) +#define SHMDMAL_OFFSET (0x10000) +#define MAX_SHMDMAL_SIZE (BSPVETH_DMABURST_MAX * 32) +#define MAX_QUEUE_NUM (1) +#define MAX_QUEUE_BDNUM (128) +#define BSPVETH_MAX_QUE_DEEP (MAX_QUEUE_BDNUM) +#define BSPVETH_POINT_MASK (MAX_QUEUE_BDNUM - 1) +#define BSPVETH_WORK_LIMIT (64) +#define BSPVETH_CHECK_DMA_STATUS_TIMES (512) + +#define BSPPACKET_MTU_MAX (1500) + +#define BSPVETH_DMA_OK (1) +#define BSPVETH_DMA_BUSY (0) +#define BSPVETH_RX (2) +#define BSPVETH_TX (3) +#define BSPVETH_SHMQUEUE_INITOK (0x12) +#define BSPVETH_SHMQUEUE_INITOK_V2 (0x16) + +#define MAX_PACKET_LEN (128 * BSPPACKET_MTU_MAX) +#define MAX_RXTX_PACKET_LEN 64 +#define RESERVE_SPACE 24 + +/* device name. */ +#define CDEV_VETH_NAME "net_cdev" +#define CDEV_OPENED (1) +#define CDEV_CLOSED (0) + +#ifndef GET_SYS_SECONDS +#define GET_SYS_SECONDS(t) do { \ + struct timespec _uptime; \ + get_monotonic_boottime(&_uptime); \ + t = _uptime.tv_sec; \ +} while (0) +#endif + +struct edma_packet_node_s { + u32 len; + u8 *packet; +}; + +struct edma_cut_packet_node_s { + u32 token; + u32 number; + u32 cut_packet_len; + u8 cut_packet[BSPPACKET_MTU_MAX]; + u8 resv[RESERVE_SPACE]; +}; + +#define TK_MIDDLE_PACKET 0 +#define TK_START_PACKET 1 +#define TK_END_PACKET 2 +#define TK_START_END 3 + +/* EDMA transfer requires an alignment of 4. */ +#define EDMA_ADDR_ALIGNMENT (4UL) +#define EDMA_ADDR_ALIGN_MASK (EDMA_ADDR_ALIGNMENT - 1) +#define EDMA_ADDR_ALIGNED(dma_p) (((unsigned long)(dma_p)) & \ + (~(EDMA_ADDR_ALIGN_MASK))) +#define EDMA_ADDR_OFFSET(dma_p) (((unsigned long)(dma_p)) & \ + (EDMA_ADDR_ALIGN_MASK)) + +#define NODE_SIZE (sizeof(struct edma_cut_packet_node_s)) +#define NODE_TO_PACKET_SIZE(n) (n->cut_packet_len + (3 * sizeof(u32))) +#define NODE_PER_PAGE (PAGE_SIZE / (NODE_SIZE)) + +#define ALIGN_MASK 4096 +#define STRESS_FACTOR 100 +#define DMA_STATUS_CHECK_DELAY_LIMIT 20 +#define DMA_STATUS_CHECK_DELAY_MS 5 +#define DMA_RXQ_FAULT_DELAY 50 +#define DMA_QUEUE_FAULT_LIMIT 16 +#define DMACMP_ERR_FACTOR 4 +#define DMABURST_FACTOR 7 + +struct cdev_dev_s { + struct miscdevice dev; + void *priv; +}; + +struct edma_rxtx_statistics { + u64 dmapkt; + u64 dmapktbyte; + + u32 q_empty; + u32 shm_empty; + u32 dma_busy; + u32 type_err; + + u32 dma_need_offset; + u32 dma_failed; + u32 dma_burst; +}; + +struct edma_bd_info_s { + u8 *pdma_v; + dma_addr_t dma_p; + u32 len; + u32 off; +}; + +struct edma_dma_shmbd_s { + u32 dma_p; + u32 len; + u32 off; +}; + +struct edma_shmq_hd_s { + u32 count; + u32 total; + u32 next_to_fill; + u32 next_to_free; + u32 resv1; + u32 resv2; + u32 init; + u32 head; + u32 tail; +}; + +struct edma_dmal_s { + u32 chl; + u32 len; + u32 slow; + u32 shi; + u32 dlow; + u32 dhi; +}; + +struct edma_rxtx_q_s { + struct edma_bd_info_s *pbdinfobase_v; + + struct edma_shmq_hd_s *pshmqhd_v; + u8 *pshmqhd_p; + + struct edma_dma_shmbd_s *pshmbdbase_v; + u8 *pshmbdbase_p; + + struct edma_dmal_s *pdmalbase_v; + u8 *pdmalbase_p; + + u32 dmal_cnt; + u32 dmal_byte; + + u32 count; + u32 size; + + u32 head; + u32 tail; + + u16 start_dma; + u16 dmacmperr; + u16 dma_overtime; + + u32 work_limit; + + struct edma_rxtx_statistics s; +}; + +struct edma_eth_dev_s { + struct edma_rxtx_q_s *ptx_queue; + struct edma_rxtx_q_s *prx_queue; + + struct edma_packet_node_s *rx_packet; + spinlock_t rx_queue_lock; /* spinlock for rx queue */ + + u32 rx_packet_head; + u32 rx_packet_tail; + + unsigned long pages_tx; + unsigned long pages_rx; + + u8 *pshmpool_p; + u8 *pshmpool_v; + u32 shmpoolsize; + + u32 recv_int; + u32 tobmc_int; + u32 run_dma_TX_task; + u32 run_dma_RX_task; + u32 run_skb_RX_task; + + struct tasklet_struct skb_task; + struct tasklet_struct dma_task; + + struct cdev_dev_s cdev; + __kernel_time_t init_time; + + void *edma_priv; +}; + +#ifndef LOG +#define LOG(level, fmt, ...) do {\ + if (debug >= (level)) {\ + netdev_err(0, "[%s,%d] -> " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__); \ + } \ +} while (0) +#endif + +#define BD_QUEUE_MASK(p) ((p) & (BSPVETH_POINT_MASK)) + +#define GET_BD_RING_QUEUE_COUNT(head, tail) \ + ((BSPVETH_MAX_QUE_DEEP + (head) - (tail)) & BSPVETH_POINT_MASK) +#define GET_BD_RING_QUEUE_SPACE(head, tail) \ + ((BSPVETH_MAX_QUE_DEEP - 1 + (tail) - (head)) & BSPVETH_POINT_MASK) +#define JUDGE_RING_QUEUE_SPACE(head, tail, len) \ + (GET_BD_RING_QUEUE_SPACE(head, tail) >= (len)) + +#define CHECK_DMA_QUEUE_EMPTY(type, queue) \ + (((type) == BSPVETH_RX && \ + (queue)->pshmqhd_v->head == (queue)->pshmqhd_v->tail) || \ + ((type) == BSPVETH_TX && (queue)->head == (queue)->tail)) + +#define CHECK_DMA_RXQ_FAULT(queue, type, cnt) \ + ((type) == BSPVETH_RX && (queue)->dmal_cnt > 1 && \ + (cnt) < ((queue)->work_limit / 2)) + +#define GET_DMA_DIRECTION(type) \ + (((type) == BSPVETH_RX) ? BMC_TO_HOST : HOST_TO_BMC) + +/******* rate limit *********/ +#define RL_MAX_PACKET 10 +#define RL_STRESS_LOW 50 +#define RL_STRESS_HIGH 80 +#define RL_DELAY_MS_LOW 20 +#define RL_DELAY_MS_HIGH 100 + +void veth_dma_task_H(u32 type); +void veth_skbtimer_close(void); +int veth_skbtimer_init(void); +int veth_dmatimer_close_H(void); +int veth_dmatimer_init_H(void); +int veth_skb_tr_task(unsigned long data); + +#endif \ No newline at end of file
From: Qi Deng dengqi7@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4ETXO CVE: NA
-----------------------------------------
Make CONFIG_BMA=m, except euleros_defconfig.
Link: https://lkml.org/lkml/2020/6/22/752
Signed-off-by: Qi Deng dengqi7@huawei.com Reviewed-by: Qidong Wang wangqindong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index cb9f3645d84c..7e6902f64c66 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -2649,6 +2649,7 @@ CONFIG_HNS3_HCLGEVF=m CONFIG_HNS3_ENET=m CONFIG_NET_VENDOR_HUAWEI=y CONFIG_HINIC=m +CONFIG_BMA=m # CONFIG_NET_VENDOR_I825XX is not set CONFIG_NET_VENDOR_INTEL=y # CONFIG_E100 is not set diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 11e0d90b5f06..b25d908dc7a1 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -2676,6 +2676,7 @@ CONFIG_NET_VENDOR_GOOGLE=y # CONFIG_GVE is not set CONFIG_NET_VENDOR_HUAWEI=y CONFIG_HINIC=m +CONFIG_BMA=m CONFIG_NET_VENDOR_NETSWIFT=y CONFIG_TXGBE=m # CONFIG_NET_VENDOR_I825XX is not set
From: l00525341 liuqi115@huawei.com
driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4A1WQ
------------------------------------------------------------------------
Build HISI PMU drivers as modules.
Signed-off-by: Qi Liu liuqi115@huawei.com Reviewed-by: Shaokun Zhang zhangshaokun@hisilicon.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 7e6902f64c66..76d6a118330d 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -5996,7 +5996,7 @@ CONFIG_QCOM_L3_PMU=y CONFIG_THUNDERX2_PMU=m CONFIG_XGENE_PMU=y CONFIG_ARM_SPE_PMU=y -CONFIG_HISI_PMU=y +CONFIG_HISI_PMU=m # end of Performance monitor support
CONFIG_RAS=y
From: Joakim Zhang qiangqing.zhang@nxp.com
mainline inclusion from mainline-v5.14-rc1 commit 0a4355c2b7f8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H7E6 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
CLKOUT is enabled by default after PHY hardware reset, this patch adds "realtek,clkout-disable" property for user to disable CLKOUT clock to save PHY power.
Per RTL8211F guide, a PHY reset should be issued after setting these bits in PHYCR2 register. After this patch, CLKOUT clock output to be disabled.
Signed-off-by: Joakim Zhang qiangqing.zhang@nxp.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com (fix conflicts: adjust the location of RTL8211F_CLKOUT_EN) (fix conflicts: replace rtl8211e_config_init by rtl821x_resume) (fix conflicts: replace .read_status by .ack_interrupt) Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/phy/realtek.c | 42 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-)
diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index b4879306bb8a..847d19fac630 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -8,6 +8,7 @@ * Copyright (c) 2004 Freescale Semiconductor, Inc. */ #include <linux/bitops.h> +#include <linux/of.h> #include <linux/phy.h> #include <linux/module.h> #include <linux/delay.h> @@ -27,6 +28,7 @@ #define RTL821x_PAGE_SELECT 0x1f
#define RTL8211F_PHYCR1 0x18 +#define RTL8211F_PHYCR2 0x19 #define RTL8211F_INSR 0x1d
#define RTL8211F_TX_DELAY BIT(8) @@ -40,6 +42,8 @@ #define RTL8211E_TX_DELAY BIT(12) #define RTL8211E_RX_DELAY BIT(11)
+#define RTL8211F_CLKOUT_EN BIT(0) + #define RTL8201F_ISR 0x1e #define RTL8201F_IER 0x13
@@ -62,6 +66,10 @@ MODULE_DESCRIPTION("Realtek PHY driver"); MODULE_AUTHOR("Johnson Leung"); MODULE_LICENSE("GPL");
+struct rtl821x_priv { + u16 phycr2; +}; + static int rtl821x_read_page(struct phy_device *phydev) { return __phy_read(phydev, RTL821x_PAGE_SELECT); @@ -72,6 +80,28 @@ static int rtl821x_write_page(struct phy_device *phydev, int page) return __phy_write(phydev, RTL821x_PAGE_SELECT, page); }
+static int rtl821x_probe(struct phy_device *phydev) +{ + struct device *dev = &phydev->mdio.dev; + struct rtl821x_priv *priv; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->phycr2 = phy_read_paged(phydev, 0xa43, RTL8211F_PHYCR2); + if (priv->phycr2 < 0) + return priv->phycr2; + + priv->phycr2 &= RTL8211F_CLKOUT_EN; + if (of_property_read_bool(dev->of_node, "realtek,clkout-disable")) + priv->phycr2 &= ~RTL8211F_CLKOUT_EN; + + phydev->priv = priv; + + return 0; +} + static int rtl8201_ack_interrupt(struct phy_device *phydev) { int err; @@ -180,6 +210,7 @@ static int rtl8211c_config_init(struct phy_device *phydev)
static int rtl8211f_config_init(struct phy_device *phydev) { + struct rtl821x_priv *priv = phydev->priv; struct device *dev = &phydev->mdio.dev; u16 val_txdly, val_rxdly; u16 val; @@ -243,7 +274,15 @@ static int rtl8211f_config_init(struct phy_device *phydev) val_rxdly ? "enabled" : "disabled"); }
- return 0; + ret = phy_modify_paged(phydev, 0xa43, RTL8211F_PHYCR2, + RTL8211F_CLKOUT_EN, priv->phycr2); + if (ret < 0) { + dev_err(dev, "clkout configuration failed: %pe\n", + ERR_PTR(ret)); + return ret; + } + + return genphy_soft_reset(phydev); }
static int rtl821x_resume(struct phy_device *phydev) @@ -633,6 +672,7 @@ static struct phy_driver realtek_drvs[] = { }, { PHY_ID_MATCH_EXACT(0x001cc916), .name = "RTL8211F Gigabit Ethernet", + .probe = rtl821x_probe, .config_init = &rtl8211f_config_init, .ack_interrupt = &rtl8211f_ack_interrupt, .config_intr = &rtl8211f_config_intr,
From: Joakim Zhang qiangqing.zhang@nxp.com
mainline inclusion from mainline-v5.14-rc1 commit d90db36a9e74 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4H7E6 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
If enable Advance Link Down Power Saving (ALDPS) mode, it will change crystal/clock behavior, which cause RXC clock stop for dozens to hundreds of miliseconds. This is comfirmed by Realtek engineer. For some MACs, it needs RXC clock to support RX logic, after this patch, PHY can generate continuous RXC clock during auto-negotiation.
ALDPS default is disabled after hardware reset, it's more reasonable to add a property to enable this feature, since ALDPS would introduce side effect. This patch adds dt property "realtek,aldps-enable" to enable ALDPS mode per users' requirement.
Jisheng Zhang enables this feature, changes the default behavior. Since mine patch breaks the rule that new implementation should not break existing design, so Cc'ed let him know to see if it can be accepted.
Cc: Jisheng Zhang Jisheng.Zhang@synaptics.com Signed-off-by: Joakim Zhang qiangqing.zhang@nxp.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/phy/realtek.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-)
diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index 847d19fac630..f5095fc6c904 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -67,6 +67,7 @@ MODULE_AUTHOR("Johnson Leung"); MODULE_LICENSE("GPL");
struct rtl821x_priv { + u16 phycr1; u16 phycr2; };
@@ -89,6 +90,14 @@ static int rtl821x_probe(struct phy_device *phydev) if (!priv) return -ENOMEM;
+ priv->phycr1 = phy_read_paged(phydev, 0xa43, RTL8211F_PHYCR1); + if (priv->phycr1 < 0) + return priv->phycr1; + + priv->phycr1 &= (RTL8211F_ALDPS_PLL_OFF | RTL8211F_ALDPS_ENABLE | RTL8211F_ALDPS_XTAL_OFF); + if (of_property_read_bool(dev->of_node, "realtek,aldps-enable")) + priv->phycr1 |= RTL8211F_ALDPS_PLL_OFF | RTL8211F_ALDPS_ENABLE | RTL8211F_ALDPS_XTAL_OFF; + priv->phycr2 = phy_read_paged(phydev, 0xa43, RTL8211F_PHYCR2); if (priv->phycr2 < 0) return priv->phycr2; @@ -213,11 +222,16 @@ static int rtl8211f_config_init(struct phy_device *phydev) struct rtl821x_priv *priv = phydev->priv; struct device *dev = &phydev->mdio.dev; u16 val_txdly, val_rxdly; - u16 val; int ret;
- val = RTL8211F_ALDPS_ENABLE | RTL8211F_ALDPS_PLL_OFF | RTL8211F_ALDPS_XTAL_OFF; - phy_modify_paged_changed(phydev, 0xa43, RTL8211F_PHYCR1, val, val); + ret = phy_modify_paged_changed(phydev, 0xa43, RTL8211F_PHYCR1, + RTL8211F_ALDPS_PLL_OFF | RTL8211F_ALDPS_ENABLE | RTL8211F_ALDPS_XTAL_OFF, + priv->phycr1); + if (ret < 0) { + dev_err(dev, "aldps mode configuration failed: %pe\n", + ERR_PTR(ret)); + return ret; + }
switch (phydev->interface) { case PHY_INTERFACE_MODE_RGMII:
From: Colin Ian King colin.king@canonical.com
mainline inclusion from mainline-v5.14-rc1 commit f25247d88708 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4H7E6 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
The comparisons of the u16 values priv->phycr1 and priv->phycr2 to less than zero always false because they are unsigned. Fix this by using an int for the assignment and less than zero check.
Addresses-Coverity: ("Unsigned compared against 0") Fixes: 0a4355c2b7f8 ("net: phy: realtek: add dt property to disable CLKOUT clock") Fixes: d90db36a9e74 ("net: phy: realtek: add dt property to enable ALDPS mode") Signed-off-by: Colin Ian King colin.king@canonical.com Signed-off-by: David S. Miller davem@davemloft.net Reviewed-by: Yongxin Li liyongxin1@huawei.com Signed-off-by: Junxin Chen chenjunxin1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/phy/realtek.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index f5095fc6c904..d231c35aa10a 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -85,24 +85,25 @@ static int rtl821x_probe(struct phy_device *phydev) { struct device *dev = &phydev->mdio.dev; struct rtl821x_priv *priv; + int ret;
priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM;
- priv->phycr1 = phy_read_paged(phydev, 0xa43, RTL8211F_PHYCR1); - if (priv->phycr1 < 0) - return priv->phycr1; + ret = phy_read_paged(phydev, 0xa43, RTL8211F_PHYCR1); + if (ret < 0) + return ret;
- priv->phycr1 &= (RTL8211F_ALDPS_PLL_OFF | RTL8211F_ALDPS_ENABLE | RTL8211F_ALDPS_XTAL_OFF); + priv->phycr1 = ret & (RTL8211F_ALDPS_PLL_OFF | RTL8211F_ALDPS_ENABLE | RTL8211F_ALDPS_XTAL_OFF); if (of_property_read_bool(dev->of_node, "realtek,aldps-enable")) priv->phycr1 |= RTL8211F_ALDPS_PLL_OFF | RTL8211F_ALDPS_ENABLE | RTL8211F_ALDPS_XTAL_OFF;
- priv->phycr2 = phy_read_paged(phydev, 0xa43, RTL8211F_PHYCR2); - if (priv->phycr2 < 0) - return priv->phycr2; + ret = phy_read_paged(phydev, 0xa43, RTL8211F_PHYCR2); + if (ret < 0) + return ret;
- priv->phycr2 &= RTL8211F_CLKOUT_EN; + priv->phycr2 = ret & RTL8211F_CLKOUT_EN; if (of_property_read_bool(dev->of_node, "realtek,clkout-disable")) priv->phycr2 &= ~RTL8211F_CLKOUT_EN;