From 40fbbe466f421bc8534325e60b4369dcff7626ac Mon Sep 17 00:00:00 2001
From: zhoukang <zhoukang7(a)huawei.com>
Date: Sat, 20 Mar 2021 07:07:58 +0000
Subject: [PATCH] cpu: add cpuload for debug cpu usage
cpuload calculates the cpu usage, showing which tasks use up the cpu resource.
It displays the top N tasks when the cpu usage exceeds P%, and recalculates
every T ms.
This works by tracing the sched switch events using tracepoints.
Since this uses BPF, only the root user can use this tool.
optional arguments:
  -h, --help            show this help message and exit
  -t TIME, --time TIME  interval to calculate, default 1000
  -n NUMBER, --number NUMBER
                        maximum tasks to print, default 3
  -p PERCENT_LIMIT, --percent_limit PERCENT_LIMIT
                        minimum percent to print, default 90
example:
[root@localhost ~]# ./cpuload.py -p 50 -n 2 -t 100
Tracing task switch. Output when cpu is overload. Ctrl-C to end.
DATE                COMM           PID     CPU  TIME(ms) %CPU
2021-01-27 10:40:39 stress-ng-cpu  33179   1    100.529  96.68%
2021-01-27 10:40:39 cpuload.py     395575  1    3.363    03.23%
2021-01-27 10:40:39 stress-ng-cpu  33175   3    107.704  99.73%
2021-01-27 10:40:39 sshd           2259    3    0.226    00.21%
2021-01-27 10:40:39 stress-ng-cpu  33176   0    131.978  99.99%
2021-01-27 10:40:39 kworker/0:0    388650  0    0.017    00.01%
2021-01-27 10:40:39 stress-ng-cpu  33178   2    183.987  99.99%
2021-01-27 10:40:39 kworker/2:0    391880  2    0.011    00.01%
Signed-off-by: Liu Chao <liuchao173(a)huawei.com>
---
 doc/cpuload.en.md  |  43 ++++++++++
 doc/cpuload.md     |  13 +++
 src/cpu/cpuload.py | 197 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 253 insertions(+)
 create mode 100644 doc/cpuload.en.md
 create mode 100644 doc/cpuload.md
 create mode 100755 src/cpu/cpuload.py
diff --git a/doc/cpuload.en.md b/doc/cpuload.en.md
new file mode 100644
index 0000000..2c37567
--- /dev/null
+++ b/doc/cpuload.en.md
@@ -0,0 +1,43 @@
+# cpuload
+
+The CPU usage detection tool cpuload can be used to print processes with high CPU usage.
+Implementation principle: Use the bcc tool to accurately trace scheduling tracks and collect statistics on processes with high CPU usage.
+`python /usr/share/bcc/tools/cpuload`
+- `-t` Interval for calculating the CPU usage. The value ranges from 0 to 60000, in milliseconds. If the value is 0, thread information is printed each time scheduling occurs. The default value is 1000.
+- `-n` Number of tasks with the highest CPU usage to display. The default value is 3.
+- `-p` Sets the CPU usage threshold. Information is displayed when the CPU usage exceeds the threshold. The value ranges from 0 to 100. The default value is 90.
+- `-m` Sets the size of the circular buffer for recording scheduling tracks. The value ranges from 1000 to 1000000. The default value is 10000.
+
+
+cpuload calculates the cpu usage, showing which tasks use up the cpu resource.
+
+It displays the top N tasks when the cpu usage exceeds P%, and recalculates
+every T ms.
+
+This works by tracing the sched switch events using tracepoints.
+
+Since this uses BPF, only the root user can use this tool.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -t TIME, --time TIME  interval to calculate, default 1000
+  -n NUMBER, --number NUMBER
+                        maximum tasks to print, default 3
+  -p PERCENT_LIMIT, --percent_limit PERCENT_LIMIT
+                        minimum percent to print, default 90
+
+example:
+[root@localhost ~]# ./cpuload.py -p 50 -n 2 -t 100
+Tracing task switch. Output when cpu is overload. Ctrl-C to end.
+DATE                COMM           PID     CPU  TIME(ms) %CPU
+2021-01-27 10:40:39 stress-ng-cpu  33179   1    100.529  96.68%
+2021-01-27 10:40:39 cpuload.py     395575  1    3.363    03.23%
+2021-01-27 10:40:39 stress-ng-cpu  33175   3    107.704  99.73%
+2021-01-27 10:40:39 sshd           2259    3    0.226    00.21%
+2021-01-27 10:40:39 stress-ng-cpu  33176   0    131.978  99.99%
+2021-01-27 10:40:39 kworker/0:0    388650  0    0.017    00.01%
+2021-01-27 10:40:39 stress-ng-cpu  33178   2    183.987  99.99%
+2021-01-27 10:40:39 kworker/2:0    391880  2    0.011    00.01%
+
+
+
diff --git a/doc/cpuload.md b/doc/cpuload.md
new file mode 100644
index 0000000..c56af1f
--- /dev/null
+++ b/doc/cpuload.md
@@ -0,0 +1,13 @@
+# cpuload
+
+CPU冲高检测工具cpuload,使用该工具能够将CPU使用率高的进程打印出来。
+实现原理, 通过bcc工具精确trace调度轨迹, 统计分析CPU占用率高的进程;
+
+```shell
+python /usr/share/bcc/tools/cpuload
+-t 计算CPU使用率的周期。单位为毫秒,取值为0~60000,取0则每次发生调度都会打印出线程的信息,默认值为1000。
+-n 打印CPU使用率top。默认值为3。
+-p 设置CPU使用率的水线,超过时打印。0~100,默认值为90。
+-m 设置记录调度轨迹的循环缓冲区大小。1000~1000000,默认值为10000。
+```
+
diff --git a/src/cpu/cpuload.py b/src/cpu/cpuload.py
new file mode 100755
index 0000000..47062e9
--- /dev/null
+++ b/src/cpu/cpuload.py
@@ -0,0 +1,197 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# cpuload   Display the top N tasks of a cpu when its usage exceeds P percent
+#           over a calculation interval of T ms.
+#
+# USAGE: cpuload [-h] [-t time] [-n number] [-p percent_limit] [-m max_entry]
+#
+# This uses in-kernel eBPF maps to cache task details (PID and comm) by
+# sched_switch, as well as a running time for calculating cpu usage.
+
+from __future__ import print_function
+from bcc import BPF
+from bcc.utils import printb
+import argparse
+from datetime import datetime
+
+# arguments (type=int makes argparse reject non-numeric values with a usage error)
+examples = """examples:
+    ./cpuload                # display tasks when cpu overload
+    ./cpuload -t 100         # calculate cpu usage every 100 ms
+    ./cpuload -n 5           # display top 5 tasks details
+    ./cpuload -p 30          # display tasks when cpu usage exceeds 30%
+    ./cpuload -m 10000       # set the maximum number of entry to 10,000
+"""
+parser = argparse.ArgumentParser(
+    description="display tasks when cpu overload",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--time", type=int, default=1000,
+    help="interval for calculating the CPU usage, in milliseconds(0 - 60000), default 1000")
+parser.add_argument("-n", "--number", type=int, default=3,
+    help="display top n tasks with high cpu usage, default 3")
+parser.add_argument("-p", "--percent_limit", type=int, default=90,
+    help="display when the usage of a cpu exceeds percent_limit(0 - 100), default 90")
+parser.add_argument("-m", "--max_entry", type=int, default=10000,
+    help="size of the cyclic buffer for recording the scheduling track(1000 - 1000000), default 10000")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+time_ms = args.time
+time_ns = time_ms * 1000000  # becomes MAX_TIME, compared against ns deltas in the BPF program
+number = args.number
+percent_limit = args.percent_limit
+max_entry = args.max_entry
+debug = 0
+
+if time_ms > 60000 or time_ms < 0:
+    print("time invalid")
+    exit(1)
+
+if percent_limit > 100 or percent_limit < 0:
+    print("percent_limit invalid")
+    exit(1)
+
+if max_entry > 1000000 or max_entry < 1000:
+    print("max_entry invalid")
+    exit(1)
+
+# define BPF program: each sched_switch logs the switched-out task (pid != 0) and its run time per cpu
+bpf_text = """
+#include <linux/sched.h>
+
+#define MAX_TIME """ + str(time_ns) + """
+#define THRESHOLD """ + str(percent_limit) + """
+#define MAX_ENTRY """ + str(max_entry) + """
+
+struct cpu_data_t {
+    u32 index;
+    u32 number;
+    u64 prev_time;
+    u64 busy_time;
+    u64 total_time;
+};
+
+struct task_data_t {
+    u32 pid;
+    char comm[TASK_COMM_LEN];
+    u64 delta;
+};
+
+struct data_t {
+    u32 index;
+    u32 number;
+    u64 total_time;
+};
+
+BPF_PERCPU_ARRAY(cpu_data, struct cpu_data_t, 1);
+
+BPF_PERCPU_ARRAY(task_data, struct task_data_t, MAX_ENTRY);
+
+BPF_PERF_OUTPUT(events);
+TRACEPOINT_PROBE(sched, sched_switch) {
+    u32 index = 0;
+    u64 now = bpf_ktime_get_ns(), delta;
+    struct data_t data = {};
+    struct cpu_data_t *cpu = cpu_data.lookup(&index);
+    struct task_data_t *task;
+
+    if (cpu == NULL)
+        return 0;
+
+    if (cpu->prev_time == 0) {
+        cpu->prev_time = now;
+        return 0;
+    }
+
+    index = (cpu->index + cpu->number) % MAX_ENTRY;
+    task = task_data.lookup(&index);
+    if (task == NULL)
+        return 0;
+
+    delta = now - cpu->prev_time;
+    if (args->prev_pid != 0) {
+        cpu->busy_time += delta;
+        task->pid = args->prev_pid;
+        __builtin_memcpy(&task->comm, &args->prev_comm, sizeof(task->comm));
+        task->delta = now - cpu->prev_time;
+        cpu->number++;
+    }
+
+    cpu->prev_time = now;
+    cpu->total_time += delta;
+
+    if (cpu->total_time > MAX_TIME || cpu->number == MAX_ENTRY) {
+        if (cpu->busy_time * 100 > cpu->total_time * THRESHOLD) {
+            data.index = cpu->index;
+            data.number = cpu->number;
+            data.total_time = cpu->total_time;
+            events.perf_submit(args, &data, sizeof(data));
+            cpu->index = (index + 1) % MAX_ENTRY;
+        }
+        cpu->number = 0;
+        cpu->busy_time = 0;
+        cpu->total_time = 0;
+        cpu->prev_time = now;
+    }
+
+    return 0;
+}
+"""
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:  # --ebpf only dumps the generated program; BPF() is never reached
+        exit()
+
+# initialize BPF from the generated program text
+b = BPF(text=bpf_text)
+
+print("Tracing task switch. Output when cpu is overload. Ctrl-C to end.")
+# column header; field widths mirror the row format used in print_event
+print("%-19s %-14s %-7s %-4s %-8s %-5s" %
+        ("DATE", "COMM", "PID", "CPU", "TIME(ms)", "%CPU"))
+
+# process event
+def print_event(cpu, data, size):
+    """Print the top `number` tasks of one over-threshold window on `cpu`.
+
+    cpu: index used to pick this cpu's slot of the per-cpu task_data entries
+    data: raw data_t event (index, number, total_time); size: unused
+    """
+    date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    data = b["events"].event(data)
+    dic = {}
+    tasks = b["task_data"]
+    # a window shorter than time_ns was flushed early by a full per-cpu ring
+    if data.total_time < time_ns:
+        print("max_entry is too small, please set more than %d" %
+            (max_entry * time_ns / data.total_time))
+    for i in range(data.index, data.number + data.index):
+        task = tasks[i % max_entry][cpu]
+        entry = dic.get(task.pid)
+        if entry is not None:
+            entry.delta += task.delta  # merge run time recorded for the same pid
+        else:
+            dic[task.pid] = task
+    top = sorted(dic.values(), key=lambda t: t.delta, reverse=True)
+    for task in top[:max(number, 0)]:
+        u = float(task.delta) * 100 / data.total_time  # float: no py2 truncation
+        print("%s %-14.14s %-7s %-4s %-8.3f %05.2f%%" % (
+            date,
+            task.comm.decode("utf-8", "replace"),
+            task.pid,
+            cpu,
+            float(task.delta) / 1000000,
+            u))
+    print("---------------------------------------------------------------")
+
+# deliver perf buffer events to print_event until interrupted with Ctrl-C
+b["events"].open_perf_buffer(print_event)
+try:
+    while True:
+        b.perf_buffer_poll()
+except KeyboardInterrupt:
+    exit()
+
-- 
2.29.2