hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/ICDF44?from=project-issue

----------------------------------------

Reproduction code for the EEVDF NULL pointer dereference. Build it with
null_reproduction_test/make.sh, then run null_reproduction_test/test.sh
to drive the reproduction.

Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
Signed-off-by: wulibin163 <wulibin163@126.com>
---
 kernel/sched/fair.c                 | 101 +++++++++++++++++++-
 null_reproduction_test/Makefile     |   9 ++
 null_reproduction_test/fullcpu.c    |  12 +++
 null_reproduction_test/make.sh      |  17 ++++
 null_reproduction_test/test.sh      |  34 +++++++
 null_reproduction_test/test_sched.c | 141 ++++++++++++++++++++++++++++
 6 files changed, 313 insertions(+), 1 deletion(-)
 create mode 100644 null_reproduction_test/Makefile
 create mode 100644 null_reproduction_test/fullcpu.c
 create mode 100755 null_reproduction_test/make.sh
 create mode 100755 null_reproduction_test/test.sh
 create mode 100644 null_reproduction_test/test_sched.c

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c4c3afa6e7b4..ebfbdc1a4ce4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -75,6 +75,17 @@
 #endif
 #include <linux/sched/grid_qos.h>
 
+static int se_schedule_pid; /* pid of the task `test_sched_0` started by test_sched.c */
+module_param(se_schedule_pid, int, 0644);
+static int qzc_vlag_switch; /* controls the vlag forced onto test_sched_0 in place_entity() */
+module_param(qzc_vlag_switch, int, 0644);
+static int qzc_fixed_switch; /* apply the old fix patch instead of the zero_vruntime patch */
+module_param(qzc_fixed_switch, int, 0644);
+#define __FILENAME__ (__builtin_strrchr(__FILE__, '/') ? __builtin_strrchr(__FILE__, '/') + 1 : __FILE__)
+#define ENQUEUE_ENTITY_NONE	0
+#define ENQUEUE_ENTITY_BEGIN	1
+#define ENQUEUE_ENTITY_END	2
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -3930,6 +3941,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	bool curr = cfs_rq->curr == se;
 	u64 avruntime;
 
+	if (qzc_fixed_switch && curr && se->on_rq && cfs_rq->nr_running == 1 &&
+	    se->vruntime < cfs_rq->min_vruntime) {
+		s64 rel_deadline = se->deadline - se->vruntime;
+
+		se->vruntime = cfs_rq->min_vruntime;
+		se->deadline = se->vruntime + rel_deadline;
+	}
+
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		update_curr(cfs_rq);
@@ -4106,7 +4125,7 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
  * Recomputes the group entity based on the current state of its group
  * runqueue.
  */
-static void update_cfs_group(struct sched_entity *se)
+static void __update_cfs_group(struct sched_entity *se, int flag)
 {
 	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 	long shares;
@@ -4126,10 +4145,21 @@ static void update_cfs_group(struct sched_entity *se)
 #else
 	shares = calc_group_shares(gcfs_rq);
 #endif
+
+	if (flag == ENQUEUE_ENTITY_BEGIN)	/* enqueue begin */
+		shares = 111616;
+	else if (flag == ENQUEUE_ENTITY_END)	/* enqueue end */
+		shares = 395264;
+
 	if (unlikely(se->load.weight != shares))
 		reweight_entity(cfs_rq_of(se), se, shares);
 }
 
+static void update_cfs_group(struct sched_entity *se)
+{
+	__update_cfs_group(se, ENQUEUE_ENTITY_NONE);
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 static inline void update_cfs_group(struct sched_entity *se)
 {
@@ -5352,6 +5382,16 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	struct sched_entity *curr = cfs_rq->curr;
 	unsigned long load;
 
+	/*
+	 * Make avg_vruntime() and cfs_rq->avg_vruntime sink lower and lower:
+	 * the original scenario migrates a huge number of test_sched_0-style
+	 * tasks, each with a large positive vlag, one by one onto a specific
+	 * cfs_rq. That is difficult to control from user space, so we simulate
+	 * it directly here.
+	 */
+	if (qzc_vlag_switch != 0 && se_schedule_pid > 0 && entity_is_task(se) && (task_of(se)->pid == se_schedule_pid))
+		se->vlag = qzc_vlag_switch == 1 ? calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se) : qzc_vlag_switch;
+
 	lag = se->vlag;
 
 	/*
@@ -5442,6 +5482,19 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	bool curr = cfs_rq->curr == se;
 
+	/*
+	 * At the very beginning and at the very end of enqueueing the task
+	 * `test_sched_0`, we want to adjust the weight/shares of cfs_rq->curr
+	 * at the same time; with test.sh this is usually the task `fullcpu`.
+	 *
+	 * That is quite challenging to control from user space, so we simulate
+	 * the behavior here instead.
+	 */
+	if (se_schedule_pid > 0 && entity_is_task(se) && (task_of(se)->pid == se_schedule_pid)) {
+		if (cfs_rq->curr)
+			__update_cfs_group(cfs_rq->curr, ENQUEUE_ENTITY_BEGIN);
+	}
+
 	/*
 	 * If we're the current task, we must renormalise before calling
 	 * update_curr().
@@ -5509,6 +5562,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 #endif
 		}
 	}
+
+	if (se_schedule_pid > 0 && entity_is_task(se) && (task_of(se)->pid == se_schedule_pid)) {
+		if (cfs_rq->curr)
+			__update_cfs_group(cfs_rq->curr, ENQUEUE_ENTITY_END);
+	}
 }
 
 static void __clear_buddies_next(struct sched_entity *se)
@@ -14819,6 +14877,15 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
 	u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
+	/*
+	 * We suppose the original intent of (u64)(-(1LL << 20)) was to force
+	 * cfs_rq->min_vruntime to overflow as early as possible, exposing
+	 * related overflow issues already during the kernel's initial phase.
+	 *
+	 * To accelerate the reproduction of these issues, we temporarily
+	 * modify the initial value of cfs_rq->min_vruntime.
+	 */
+	cfs_rq->min_vruntime = (u64)(4596393947272479);
 #ifdef CONFIG_SMP
 	raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
@@ -15269,3 +15336,35 @@ __init void init_sched_fair_class(void)
 
 #endif /* SMP */
 }
+u64 sched_debug_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->min_vruntime;
+}
+EXPORT_SYMBOL(sched_debug_min_vruntime);
+
+void sched_debug_cfs_rq_info(struct cfs_rq *cfs_rq)
+{
+	u64 qzc_avruntime = avg_vruntime(cfs_rq);
+
+	printk("%s:%s:%d, cfs_rq=[%p]\tcfs_rq->nr_running=[%u]\tcfs_rq->avg_vruntime=[%lld]\tcfs_rq->min_vruntime=[%llu]\tcfs_rq->avg_load=[%llu]\tavg_vruntime(cfs_rq)=[%llu]\n",
+		__FILENAME__, __func__, __LINE__,
+		cfs_rq, cfs_rq->nr_running, cfs_rq->avg_vruntime, cfs_rq->min_vruntime, cfs_rq->avg_load, qzc_avruntime);
+
+	if (cfs_rq->curr) {
+		printk("%s:%s:%d, curr=[%p]\tpid=[%d]\ttgid=[%d]\tcurr->vruntime=[%llu]\tcurr->load.weight=[%lu]\tcurr->vlag=[%lld]\tcurr->slice=[%llu]\tcurr->deadline=[%llu]\tcurr->my_q=[%p]\treal_vlag=[%lld]\tvruntime_eligible=[%d]\n",
+			__FILENAME__, __func__, __LINE__,
+			cfs_rq->curr, entity_is_task(cfs_rq->curr) ? task_of(cfs_rq->curr)->pid : -1, entity_is_task(cfs_rq->curr) ? task_of(cfs_rq->curr)->tgid : -1,
+			cfs_rq->curr->vruntime, cfs_rq->curr->load.weight, cfs_rq->curr->vlag, cfs_rq->curr->slice, cfs_rq->curr->deadline, cfs_rq->curr->my_q, entity_lag(qzc_avruntime, cfs_rq->curr), vruntime_eligible(cfs_rq, cfs_rq->curr->vruntime));
+	}
+
+	struct rb_node *node = rb_first_cached(&cfs_rq->tasks_timeline);
+
+	for (; node; node = rb_next(node)) {
+		struct sched_entity *rb_se = __node_2_se(node);
+		printk("%s:%s:%d, rb_se=[%p]\tpid=[%d]\ttgid=[%d]\trb_se->vruntime=[%llu]\trb_se->load.weight=[%lu]\trb_se->vlag=[%lld]\trb_se->slice=[%llu]\trb_se->deadline=[%llu]\trb_se->my_q=[%p]\treal_vlag=[%lld]\tvruntime_eligible=[%d]\n",
+			__FILENAME__, __func__, __LINE__,
+			rb_se, entity_is_task(rb_se) ? task_of(rb_se)->pid : -1, entity_is_task(rb_se) ? task_of(rb_se)->tgid : -1,
+			rb_se->vruntime, rb_se->load.weight, rb_se->vlag, rb_se->slice, rb_se->deadline, rb_se->my_q, entity_lag(qzc_avruntime, rb_se), vruntime_eligible(cfs_rq, rb_se->vruntime));
+	}
+}
+EXPORT_SYMBOL(sched_debug_cfs_rq_info);
diff --git a/null_reproduction_test/Makefile b/null_reproduction_test/Makefile
new file mode 100644
index 000000000000..48feb459e5ff
--- /dev/null
+++ b/null_reproduction_test/Makefile
@@ -0,0 +1,9 @@
+obj-m += test_sched.o
+KDIR := /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+all:
+	$(MAKE) -C $(KDIR) M=$(PWD) modules
+
+clean:
+	$(MAKE) -C $(KDIR) M=$(PWD) clean
\ No newline at end of file
diff --git a/null_reproduction_test/fullcpu.c b/null_reproduction_test/fullcpu.c
new file mode 100644
index 000000000000..136c73671035
--- /dev/null
+++ b/null_reproduction_test/fullcpu.c
@@ -0,0 +1,12 @@
+#include <string.h>
+#include <unistd.h>
+
+int main(void)
+{
+	volatile unsigned int a = 9;	/* volatile/unsigned: keep the spin loop, avoid signed-overflow UB */
+	while (1) {
+		a *= 9;
+	}
+
+	return 0;
+}
\ No newline at end of file
diff --git a/null_reproduction_test/make.sh b/null_reproduction_test/make.sh
new file mode 100755
index 000000000000..002385d17046
--- /dev/null
+++ b/null_reproduction_test/make.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+make clean
+
+cd ..
+
+make modules_prepare
+
+cd ./null_reproduction_test
+
+make -C ../ M=$(pwd)
+
+gcc fullcpu.c -o fullcpu
+
+echo "===================="
+echo 'please run test.sh'
+echo "===================="
\ No newline at end of file
diff --git a/null_reproduction_test/test.sh b/null_reproduction_test/test.sh
new file mode 100755
index 000000000000..a6cac6d2d7c2
--- /dev/null
+++ b/null_reproduction_test/test.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+test() {
+	cpu=$1
+	cgroup=test0
+
+	mkdir /sys/fs/cgroup/cpu/$cgroup/
+	mkdir /sys/fs/cgroup/memory/$cgroup/
+	echo 10000000 > /sys/fs/cgroup/memory/$cgroup/memory.limit_in_bytes
+
+	taskset -c $cpu ./fullcpu &
+	pid=$!
+
+	echo $pid > /sys/fs/cgroup/cpu/$cgroup/tasks
+	echo $pid > /sys/fs/cgroup/memory/$cgroup/tasks
+
+	# move every other test_sched kthread into the cgroup as well
+	for pid in $(ps -auxf | grep test_sched | grep -v grep | grep -v test_sched_0 | grep -v test_sched_1 | awk '{print($2)}'); do
+		echo $pid > /sys/fs/cgroup/cpu/$cgroup/tasks
+	done
+}

+killall fullcpu
+rmmod test_sched
+insmod ./test_sched.ko bind_cpu=1 test_count=15
+
+pid0=$(ps -auxf | grep 'test_sched_0' | grep -v grep | awk '{print($2)}')
+echo $pid0 > /sys/module/fair/parameters/se_schedule_pid
+
+# echo 1 > /sys/module/fair/parameters/qzc_fixed_switch
+
+echo 1 > /sys/module/fair/parameters/qzc_vlag_switch
+
+test 1
diff --git a/null_reproduction_test/test_sched.c b/null_reproduction_test/test_sched.c
new file mode 100644
index 000000000000..7a33fa77c923
--- /dev/null
+++ b/null_reproduction_test/test_sched.c
@@ -0,0 +1,141 @@
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/cpumask.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+
+static DECLARE_COMPLETION(comp);
+
+#define THREAD_NUM 100000
+static struct task_struct *schedule_threads[THREAD_NUM];
+static int bind_cpu = 0;
+module_param(bind_cpu, int, 0644);
+MODULE_PARM_DESC(bind_cpu, "CPU core to bind the thread to");
+
+static int test_count = 1;
+module_param(test_count, int, 0644);
+MODULE_PARM_DESC(test_count, "test thread count (default: 1)");
+
+static int sched_debug_cfs_rq_info_print_cnt;
+
+static int thread_function(void *data);
+static void start_one_thread(int id, int cpu);
+
+static int __init schedule_driver_init(void)
+{
+	printk(KERN_INFO "Schedule driver: Initializing\n");
+
+	start_one_thread(0, bind_cpu);
+	start_one_thread(1, bind_cpu);
+	for (int i = 2; i < test_count; i++)
+		start_one_thread(i, -1);
+
+	return 0;
+}
+
+struct thread_data {
+	int id;
+};
+
+static void start_one_thread(int id, int cpu)
+{
+	char name[255];
+	snprintf(name, sizeof(name), "test_sched_%d/%d", id, cpu);
+
+	struct thread_data *tdata = kmalloc(sizeof(struct thread_data), GFP_KERNEL);
+	tdata->id = id;
+
+	// create the kthread but do not run it yet
+	schedule_threads[id] = kthread_create(thread_function, tdata, "%s", name);
+	if (IS_ERR(schedule_threads[id])) {
+		printk("Failed to create %s, %ld\n", name, PTR_ERR(schedule_threads[id]));
+		schedule_threads[id] = NULL;	// clear only AFTER printing the error code
+		return;
+	}
+
+	if (cpu >= 0)	// -1 means "do not bind"; CPU 0 is a valid target
+		kthread_bind(schedule_threads[id], cpu);
+	// run the kthread
+	wake_up_process(schedule_threads[id]);
+
+	printk(KERN_INFO "create %s success\n", name);
+	return;
+}
+
+u64 sched_debug_min_vruntime(struct cfs_rq *cfs);
+void sched_debug_cfs_rq_info(struct cfs_rq *cfs_rq);
+
+static int thread_function(void *data)
+{
+	printk(KERN_INFO "Schedule thread: Started on CPU %d\n", smp_processor_id());
+	struct task_struct *task = current;
+
+	set_current_state(TASK_RUNNING);
+
+	struct thread_data *tdata = data;
+	// test_sched_1 waits until test_sched_0 signals it
+	if (tdata->id == 1) {
+		set_user_nice(task, 8);
+		wait_for_completion_interruptible(&comp);
+	}
+
+	while (!kthread_should_stop()) {
+		// test_sched_0 checks the trigger condition
+		if (tdata->id == 0) {
+			struct sched_entity *se = &task->se;
+			struct cfs_rq *cfs = se->cfs_rq;
+			u64 vruntime = se->vruntime;
+			u64 min_vruntime = sched_debug_min_vruntime(cfs);
+
+			if (sched_debug_cfs_rq_info_print_cnt % 10000 == 0) {
+				sched_debug_cfs_rq_info(cfs);
+			}
+			sched_debug_cfs_rq_info_print_cnt += 1;
+
+			if (-102743846405689LL > (s64)(vruntime - min_vruntime)) {
+				int old_nice = task_nice(task);
+				set_user_nice(task, -20);
+
+				complete(&comp); // wake up test_sched_1
+				printk("vruntime: %llu, min_vruntime: %llu, renice: %d->%d\n",
+					vruntime, min_vruntime, old_nice, -20);
+			}
+		} else if (tdata->id == 1) {
+			int a = 1;
+			for (int i = 0; i < 1000000; i++) {
+				a += tdata->id;
+			}
+		}
+
+		if (tdata->id == 1)
+			cond_resched();
+		else {
+			schedule_timeout_uninterruptible(1);
+		}
+	}
+
+	printk(KERN_INFO "Schedule thread: Exiting from CPU %d\n", smp_processor_id());
+	return 0;
+}
+
+static void __exit schedule_driver_exit(void)
+{
+	for (int i = 0; i < THREAD_NUM; i++) {	// threads 0 and 1 exist even if test_count < 2
+		if (schedule_threads[i]) {
+			kthread_stop(schedule_threads[i]);
+			printk(KERN_INFO "Schedule driver: Thread stopped\n");
+		}
+	}
+}
+
+module_init(schedule_driver_init);
+module_exit(schedule_driver_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Zicheng Qu <quzicheng@huawei.com>");
+MODULE_DESCRIPTION("A driver that creates threads calling schedule() in a loop with CPU binding");
+MODULE_VERSION("1.0");
-- 
2.34.1
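
Note on the mechanism (our reading of the patch's own comments and debug
output; not confirmed upstream): the hard-coded shares, the injected positive
vlag, and the large initial min_vruntime all push cfs_rq->avg_vruntime, the
load-scaled sum of entity keys, toward s64 overflow. Once that sum wraps,
vruntime_eligible() presumably misjudges every queued entity as ineligible and
pick_eevdf() can return NULL, which its caller then dereferences. A minimal
userspace sketch of the wraparound, assuming gcc/clang for
__builtin_add_overflow(); the key magnitude is hypothetical, the weight is the
one the patch hard-codes, scaled down as scale_load_down() would on 64-bit:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* hypothetical entity key (vruntime - min_vruntime), ~ -1e14 ns */
		int64_t key = -102743846405689LL;
		int64_t weight = 111616 >> 10;	/* scale_load_down(111616) = 109 */
		int64_t term = key * weight;	/* ~ -1.1e16, still fits in s64 */
		int64_t avg = 0;

		/* accumulate up to THREAD_NUM (test_sched.c) such entities */
		for (int i = 0; i < 100000; i++) {
			if (__builtin_add_overflow(avg, term, &avg)) {
				printf("avg_vruntime-style sum wraps after %d entities\n", i + 1);
				return 0;
			}
		}
		printf("no overflow\n");
		return 0;
	}

With these numbers the accumulator wraps after roughly 824 entities, long
before the 100000 kthreads the module can create.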
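The watchdog in thread_function() fires on the signed comparison
-102743846405689 > (s64)(vruntime - min_vruntime). A brief sketch of why the
cast makes the check wraparound-safe; the constants come from the patch, while
the assumed lag of 2e14 ns is made up for illustration:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* initial min_vruntime from the patched init_cfs_rq() */
		uint64_t min_vruntime = 4596393947272479ULL;
		/* hypothetical task that has fallen ~2e14 ns of vruntime behind */
		uint64_t vruntime = min_vruntime - 200000000000000ULL;

		/* comparing the raw u64 values breaks as soon as either counter
		 * wraps; the signed difference stays correct while the two are
		 * within 2^63 of each other */
		if (-102743846405689LL > (int64_t)(vruntime - min_vruntime))
			printf("trigger: task is > ~28.5 hours of vruntime behind min_vruntime\n");
		return 0;
	}

The threshold of 102743846405689 ns is about 28.5 hours of virtual runtime,
so the renice-to--20 trigger only fires once the queue state is already far
out of range.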
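The qzc_fixed_switch path in reweight_entity() pulls a lone current entity's
vruntime up to min_vruntime while keeping its deadline the same distance away.
A toy illustration of that clamp with made-up values:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* made-up values: the only runnable entity lags behind min_vruntime */
		uint64_t vruntime = 1000, deadline = 4000, min_vruntime = 3000;
		int64_t rel_deadline = (int64_t)(deadline - vruntime);	/* 3000 */

		vruntime = min_vruntime;		/* clamp up to min_vruntime */
		deadline = vruntime + rel_deadline;	/* 6000: relative deadline kept */

		printf("vruntime=%llu deadline=%llu rel_deadline=%lld\n",
		       (unsigned long long)vruntime, (unsigned long long)deadline,
		       (long long)rel_deadline);
		return 0;
	}

Preserving the relative deadline keeps the entity's remaining slice intact, so
the clamp only shifts the entity forward in virtual time without changing how
soon EEVDF considers its request due.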