From 40fbbe466f421bc8534325e60b4369dcff7626ac Mon Sep 17 00:00:00 2001
From: zhoukang <zhoukang7@huawei.com>
Date: Sat, 20 Mar 2021 07:07:58 +0000
Subject: [PATCH] cpu: add cpuload for debug cpu usage
cpuload calculates the CPU usage and shows which tasks are using up the CPU.
It displays the top N tasks when the usage of a CPU exceeds P%, and recalculates
every T ms.
This works by tracing the sched switch events using tracepoints.
Since this uses BPF, only the root user can use this tool.
optional arguments:
-h, --help show this help message and exit
-t TIME, --time TIME interval to calculate, default 1000
-n NUMBER, --number NUMBER
maximum tasks to print, default 3
-p PERCENT_LIMIT, --percent_limit PERCENT_LIMIT
minimum percent to print, default 90
example:
[root@localhost ~]# ./cpuload.py -p 50 -n 2 -t 100
Tracing task switch. Output when cpu is overload. Ctrl-C to end.
DATE COMM PID CPU TIME(ms) %CPU
2021-01-27 10:40:39 stress-ng-cpu 33179 1 100.529 96.68%
2021-01-27 10:40:39 cpuload.py 395575 1 3.363 03.23%
2021-01-27 10:40:39 stress-ng-cpu 33175 3 107.704 99.73%
2021-01-27 10:40:39 sshd 2259 3 0.226 00.21%
2021-01-27 10:40:39 stress-ng-cpu 33176 0 131.978 99.99%
2021-01-27 10:40:39 kworker/0:0 388650 0 0.017 00.01%
2021-01-27 10:40:39 stress-ng-cpu 33178 2 183.987 99.99%
2021-01-27 10:40:39 kworker/2:0 391880 2 0.011 00.01%
Signed-off-by: Liu Chao <liuchao173@huawei.com>
---
doc/cpuload.en.md | 43 ++++++++++
doc/cpuload.md | 13 +++
src/cpu/cpuload.py | 197 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 253 insertions(+)
create mode 100644 doc/cpuload.en.md
create mode 100644 doc/cpuload.md
create mode 100755 src/cpu/cpuload.py
diff --git a/doc/cpuload.en.md b/doc/cpuload.en.md
new file mode 100644
index 0000000..2c37567
--- /dev/null
+++ b/doc/cpuload.en.md
@@ -0,0 +1,43 @@
+# cpuload
+
+The CPU usage detection tool cpuload can be used to print processes with high CPU usage.
+Implementation principle: Use the bcc tool to accurately trace scheduling tracks and collect statistics on processes with high CPU usage.
+python /usr/share/bcc/tools/cpuload
+-t Interval for calculating the CPU usage. The value ranges from 0 to 60000, in milliseconds. If the value is 0, thread information is printed each time a context switch occurs. The default value is 1000.
+-n Number of top CPU-consuming tasks to display. The default value is 3.
+-p Sets the CPU usage threshold. Task information is displayed when the usage of a CPU exceeds the threshold. The value ranges from 0 to 100. The default value is 90.
+-m Sets the size of the circular buffer for recording scheduling tracks. The value ranges from 1000 to 1000000. The default value is 10000.
+
+
+cpuload calculates the CPU usage and shows which tasks are using up the CPU.
+
+It displays the top N tasks when the usage of a CPU exceeds P%, and recalculates
+every T ms.
+
+This works by tracing the sched switch events using tracepoints.
+
+Since this uses BPF, only the root user can use this tool.
+
+optional arguments:
+ -h, --help show this help message and exit
+ -t TIME, --time TIME interval to calculate, default 1000
+ -n NUMBER, --number NUMBER
+ maximum tasks to print, default 3
+ -p PERCENT_LIMIT, --percent_limit PERCENT_LIMIT
+ minimum percent to print, default 90
+
+example:
+[root@localhost ~]# ./cpuload.py -p 50 -n 2 -t 100
+Tracing task switch. Output when cpu is overload. Ctrl-C to end.
+DATE COMM PID CPU TIME(ms) %CPU
+2021-01-27 10:40:39 stress-ng-cpu 33179 1 100.529 96.68%
+2021-01-27 10:40:39 cpuload.py 395575 1 3.363 03.23%
+2021-01-27 10:40:39 stress-ng-cpu 33175 3 107.704 99.73%
+2021-01-27 10:40:39 sshd 2259 3 0.226 00.21%
+2021-01-27 10:40:39 stress-ng-cpu 33176 0 131.978 99.99%
+2021-01-27 10:40:39 kworker/0:0 388650 0 0.017 00.01%
+2021-01-27 10:40:39 stress-ng-cpu 33178 2 183.987 99.99%
+2021-01-27 10:40:39 kworker/2:0 391880 2 0.011 00.01%
+
+
+
diff --git a/doc/cpuload.md b/doc/cpuload.md
new file mode 100644
index 0000000..c56af1f
--- /dev/null
+++ b/doc/cpuload.md
@@ -0,0 +1,13 @@
+# cpuload
+
+CPU冲高检测工具cpuload，使用该工具能够将CPU使用率高的进程打印出来。
+实现原理：通过bcc工具精确trace调度轨迹，统计分析CPU占用率高的进程。
+
+```shell
+python /usr/share/bcc/tools/cpuload
+-t 计算CPU使用率的周期。单位为毫秒,取值为0~60000,取0则每次发生调度都会打印出线程的信息,默认值为1000。
+-n 打印CPU使用率top。默认值为3。
+-p 设置CPU使用率的水线,超过时打印。0~100,默认值为90。
+-m 设置记录调度轨迹的循环缓冲区大小。1000~1000000,默认值为10000。
+```
+
diff --git a/src/cpu/cpuload.py b/src/cpu/cpuload.py
new file mode 100755
index 0000000..47062e9
--- /dev/null
+++ b/src/cpu/cpuload.py
@@ -0,0 +1,197 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# cpuload Display the top N tasks of a cpu whose usage exceeds P percent,
+# recalculated every T ms.
+#
+# USAGE: cpuload [-h] [-t time] [-n number] [-p percent_limit] [-m max_entry]
+#
+# This uses in-kernel eBPF maps to cache task details (PID and comm) by
+# sched_switch, as well as a running time for calculating cpu usage.
+
+from __future__ import print_function
+from bcc import BPF
+from bcc.utils import printb
+import argparse
+from datetime import datetime
+
+# usage examples; handed to argparse below as the epilog of the help text
+examples = """examples:
+ ./cpuload # display tasks when cpu overload
+ ./cpuload -t 100 # calculate cpu usage every 100 ms
+ ./cpuload -n 5 # display top 5 tasks details
+ ./cpuload -p 30 # display tasks when cpu usage exceeds 30%
+ ./cpuload -m 10000 # set the maximum number of entry to 10,000
+"""
+parser = argparse.ArgumentParser(
+    description="display tasks when cpu overload",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--time", default=1000, type=int,
+    help="interval for calculating the CPU usage, in milliseconds(0 - 60000), default 1000")
+parser.add_argument("-n", "--number", default=3, type=int,
+    help="display top n tasks with high cpu usage, default 3")
+parser.add_argument("-p", "--percent_limit", default=90, type=int,
+    help="display when the usage of a cpu exceeds percent_limit(0 - 100), default 90")
+parser.add_argument("-m", "--max_entry", default=10000, type=int,
+    help="size of the cyclic buffer for recording the scheduling track(1000 - 1000000), default 10000")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+# type=int: argparse reports a non-numeric value as a usage error, not a traceback
+args = parser.parse_args()
+time_ms = args.time
+time_ns = time_ms * 1000000  # the same interval in nanoseconds
+number = args.number
+percent_limit = args.percent_limit
+max_entry = args.max_entry
+debug = 0  # set to 1 to print the generated BPF program before it is loaded
+
+# reject values outside the documented ranges before they reach the BPF text
+if not 0 <= time_ms <= 60000:
+    print("time invalid")
+    exit(1)
+if not 0 <= percent_limit <= 100:
+    print("percent_limit invalid")
+    exit(1)
+if not 1000 <= max_entry <= 1000000:
+    print("max_entry invalid")
+    exit(1)
+
+# BPF program: per-cpu ring of (pid, comm, run time) filled on sched_switch; emits an event per overloaded window
+bpf_text = """
+#include <linux/sched.h>
+
+#define MAX_TIME """ + str(time_ns) + """
+#define THRESHOLD """ + str(percent_limit) + """
+#define MAX_ENTRY """ + str(max_entry) + """
+
+struct cpu_data_t {
+ u32 index;
+ u32 number;
+ u64 prev_time;
+ u64 busy_time;
+ u64 total_time;
+};
+
+struct task_data_t {
+ u32 pid;
+ char comm[TASK_COMM_LEN];
+ u64 delta;
+};
+
+struct data_t {
+ u32 index;
+ u32 number;
+ u64 total_time;
+};
+
+BPF_PERCPU_ARRAY(cpu_data, struct cpu_data_t, 1);
+
+BPF_PERCPU_ARRAY(task_data, struct task_data_t, MAX_ENTRY);
+
+BPF_PERF_OUTPUT(events);
+TRACEPOINT_PROBE(sched, sched_switch) {
+ u32 index = 0;
+ u64 now = bpf_ktime_get_ns(), delta;
+ struct data_t data = {};
+ struct cpu_data_t *cpu = cpu_data.lookup(&index);
+ struct task_data_t *task;
+
+ if (cpu == NULL)
+ return 0;
+
+ if (cpu->prev_time == 0) {
+ cpu->prev_time = now;
+ return 0;
+ }
+
+ index = (cpu->index + cpu->number) % MAX_ENTRY;
+ task = task_data.lookup(&index);
+ if (task == NULL)
+ return 0;
+
+ delta = now - cpu->prev_time;
+ if (args->prev_pid != 0) {
+ cpu->busy_time += delta;
+ task->pid = args->prev_pid;
+ __builtin_memcpy(&task->comm, &args->prev_comm, sizeof(task->comm));
+ task->delta = now - cpu->prev_time;
+ cpu->number++;
+ }
+
+ cpu->prev_time = now;
+ cpu->total_time += delta;
+
+ if (cpu->total_time > MAX_TIME || cpu->number == MAX_ENTRY) {
+ if (cpu->busy_time * 100 > cpu->total_time * THRESHOLD) {
+ data.index = cpu->index;
+ data.number = cpu->number;
+ data.total_time = cpu->total_time;
+ events.perf_submit(args, &data, sizeof(data));
+ cpu->index = (index + 1) % MAX_ENTRY;
+ }
+ cpu->number = 0;
+ cpu->busy_time = 0;
+ cpu->total_time = 0;
+ cpu->prev_time = now;
+ }
+
+ return 0;
+}
+"""
+
+# dump the generated program when debugging; --ebpf prints it and stops there
+if args.ebpf or debug:
+    print(bpf_text)
+if args.ebpf:
+    exit()
+
+# initialize BPF from the generated source
+b = BPF(text=bpf_text)
+print("Tracing task switch. Output when cpu is overload. Ctrl-C to end.")
+
+header = ("DATE", "COMM", "PID", "CPU", "TIME(ms)", "%CPU")
+print("%-19s %-14s %-7s %-4s %-8s %-5s" % header)
+
+# process event
+def print_event(cpu, data, size):
+    """Perf-buffer callback: print the top `number` tasks of an overloaded cpu.
+
+    cpu indexes the per-cpu task ring and is shown in the CPU column; data is
+    the raw data_t record (index, number, total_time); size is unused.
+    """
+    date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    data = b["events"].event(data)
+    tasks = b["task_data"]
+    # a window shorter than the interval means the ring filled up first
+    if data.total_time < time_ns:
+        print("max_entry is too small, please set more than %d" %
+              (max_entry * time_ns // data.total_time))
+    # sum the run time per pid over the ring slots of this window; the comm
+    # of the first slot seen for a pid is the one that gets printed
+    usage = {}
+    for i in range(data.index, data.index + data.number):
+        task = tasks[i % max_entry][cpu]
+        comm, delta = usage.get(task.pid, (task.comm, 0))
+        usage[task.pid] = (comm, delta + task.delta)
+
+    top = sorted(usage.items(), key=lambda kv: kv[1][1], reverse=True)
+    for pid, (comm, delta) in top[:max(number, 0)]:
+        # float() keeps the fractional part of %CPU under Python 2 as well
+        print("%s %-14.14s %-7s %-4s %-8.3f %05.2f%%" % (
+            date,
+            comm.decode("utf-8", "replace"),
+            pid,
+            cpu,
+            float(delta) / 1000000,
+            float(delta) * 100 / data.total_time))
+    print("---------------------------------------------------------------")
+
+# hand every perf event to print_event until the user presses Ctrl-C
+b["events"].open_perf_buffer(print_event)
+while True:
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
+
--
2.29.2