criu uses `ptrace(PTRACE_SYSCALL)` to check whether the tracee has reached the expected state, but it isn't necessary to stop the tracee at every syscall. Therefore, a customized `ptrace(PTRACE_SYSCALL_NR)` request that makes the tracee stop only at a specific syscall can save time (1000 threads consume about 140ms).
ptrace syntax: long ptrace(PTRACE_SYSCALL_NR, pid_t pid, void *addr, void *data);
The `addr` argument is unused in the original `ptrace(PTRACE_SYSCALL)`. Here, `ptrace(PTRACE_SYSCALL_NR)` uses the `addr` parameter to pass the specific syscall number (sysno) that should be traced.
Use `criu check` to generate `/run/criu.kdat` before the first use of criu, or let the check run automatically during `criu {dump, restore}`.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/25
Signed-off-by: fu.lin fulin10@huawei.com --- compel/Makefile | 1 + compel/arch/aarch64/src/lib/infect.c | 2 +- compel/include/uapi/bisect.h | 30 +++++ compel/include/uapi/infect.h | 15 ++- compel/src/lib/bisect.c | 92 +++++++++++++++ compel/src/lib/infect.c | 169 +++++++++++++++++++++++++-- criu/cgroup-props.c | 6 +- criu/cgroup.c | 12 +- criu/cr-dump.c | 10 +- criu/cr-restore.c | 97 ++++++++++++++- criu/eventfd.c | 2 +- criu/eventpoll.c | 4 +- criu/files-reg.c | 4 +- criu/files.c | 16 +-- criu/include/kerndat.h | 1 + criu/kerndat.c | 67 ++++++++++- criu/lsm.c | 4 +- criu/mount.c | 4 +- criu/sk-packet.c | 2 +- criu/sk-unix.c | 2 +- 20 files changed, 487 insertions(+), 53 deletions(-) create mode 100644 compel/include/uapi/bisect.h create mode 100644 compel/src/lib/bisect.c
diff --git a/compel/Makefile b/compel/Makefile index de9318c..eea93a7 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -27,6 +27,7 @@ lib-y += src/lib/infect-rpc.o lib-y += src/lib/infect-util.o lib-y += src/lib/infect.o lib-y += src/lib/ptrace.o +lib-y += src/lib/bisect.o
# handle_elf() has no support of ELF relocations on ARM (yet?) ifneq ($(filter arm aarch64,$(ARCH)),) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 4b59390..c897b52 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -67,7 +67,7 @@ int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, user_fpregs_struct_t fpsimd; int ret;
- pr_info("Dumping GP/FPU registers for %d\n", pid); + pr_debug("Dumping GP/FPU registers for %d\n", pid);
iov.iov_base = regs; iov.iov_len = sizeof(user_regs_struct_t); diff --git a/compel/include/uapi/bisect.h b/compel/include/uapi/bisect.h new file mode 100644 index 0000000..55ebcbd --- /dev/null +++ b/compel/include/uapi/bisect.h @@ -0,0 +1,30 @@ +#ifndef __COMPEL_BISECT_H__ +#define __COMPEL_BISECT_H__ + +#include <sys/types.h> + +enum tf { + TRACE_INTERRUPT, + TRACE_SYSCALL_ENTER, + TRACE_SYSCALL_EXIT, +}; + +struct trace_flag { + pid_t key; + enum tf flag; +}; + +struct bisect_meta { + int size; + int used; + void *data; /* data pointer array */ + void *__data; /* data array */ +}; + +struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key); +struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key); +int tf_create(struct bisect_meta *meta, int len); +void tf_destroy(struct bisect_meta *meta); +void tf_clear(struct bisect_meta *meta); + +#endif /* __COMPEL_BISECT_H__ */ diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 257658a..9b356ef 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -8,6 +8,7 @@ #include <compel/ksigset.h> #include <compel/handle-elf.h> #include <compel/task-state.h> +#include <compel/bisect.h>
#include "common/compiler.h"
@@ -41,7 +42,7 @@ extern int __must_check compel_infect(struct parasite_ctl *ctl, extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *);
-extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); +extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl, bool customize); extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); extern int __must_check compel_cure_local(struct parasite_ctl *ctl); extern int __must_check compel_cure(struct parasite_ctl *ctl); @@ -90,6 +91,14 @@ extern int __must_check compel_stop_pie(pid_t pid, void *addr,
extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr);
+extern int __must_check compel_stop_on_syscall_customize(int tasks, + const int sys_nr, const int exit_sys_nr, struct bisect_meta *meta); + +extern int __must_check compel_stop_pie_customize(pid_t pid, + const int sys_nr, struct trace_flag *tf); + +extern int __must_check compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr); + extern int compel_mode_native(struct parasite_ctl *ctl);
extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl); @@ -173,4 +182,8 @@ extern unsigned long compel_task_size(void); extern uint64_t compel_get_leader_sp(struct parasite_ctl *ctl); extern uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl);
+#ifndef PTRACE_SYSCALL_NR +#define PTRACE_SYSCALL_NR 0xff00 +#endif + #endif diff --git a/compel/src/lib/bisect.c b/compel/src/lib/bisect.c new file mode 100644 index 0000000..807a5a9 --- /dev/null +++ b/compel/src/lib/bisect.c @@ -0,0 +1,92 @@ +#include <stddef.h> + +#include "log.h" +#include "common/xmalloc.h" +#include "bisect.h" + +struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key) +{ + struct trace_flag **tfs = meta->data; + int lo = 0, hi = meta->used, mid; + + if (meta->used <= 0) + return NULL; + + while (lo < hi) { + mid = (int)((lo + hi) / 2); + if (tfs[mid]->key == key) { + return tfs[mid]; + } else if (tfs[mid]->key > key) { + hi = mid; + } else { + lo = mid + 1; + } + } + + return NULL; +} + +/* used in cr-restore */ +struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key) +{ + struct trace_flag **tfs = meta->data; + struct trace_flag *tf = &((struct trace_flag *)meta->__data)[meta->used]; + int i = 0, j = 0; + + if (meta->used == meta->size) + return NULL; + + for (i = 0; i < meta->used; i++) { + if (tfs[i]->key >= key) /* impossible condition: `tfs[i]->key == key` */ + break; + } + + j = meta->used; + meta->used += 1; + + while (j > i) { + tfs[j] = tfs[j-1]; + j -= 1; + } + + tfs[i] = tf; + tf->key = key; + + return tf; +} + +int tf_create(struct bisect_meta *meta, int len) +{ + struct trace_flag *tfs; + struct trace_flag **tfs_ptr; + + tfs = xzalloc(sizeof(*tfs) * len); + if (tfs == NULL) + return -1; + + tfs_ptr = xmalloc(sizeof(*tfs_ptr) * len); + if (tfs_ptr == NULL) + goto err; + + meta->size = len; + meta->used = 0; + meta->__data = tfs; + meta->data = tfs_ptr; + + return 0; +err: + xfree(tfs); + return -1; +} + +void tf_destroy(struct bisect_meta *meta) +{ + xfree(meta->__data); + xfree(meta->data); +} + +void tf_clear(struct bisect_meta *meta) +{ + meta->used = 0; + __builtin_memset(meta->data, 0, sizeof(struct trace_flag **)*meta->size); +} diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 
38846c2..6b1d445 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -442,7 +442,7 @@ static int restore_child_handler(struct parasite_ctl *ctl) }
static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, - user_regs_struct_t *regs, struct thread_ctx *octx) + user_regs_struct_t *regs, struct thread_ctx *octx, void *addr) { k_rtsigset_t block;
@@ -458,7 +458,7 @@ static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, goto err_regs; }
- if (ptrace(cmd, pid, NULL, NULL)) { + if (ptrace(cmd, pid, addr, NULL)) { pr_perror("Can't run parasite at %d", pid); goto err_cont; } @@ -565,7 +565,7 @@ int compel_execute_syscall(struct parasite_ctl *ctl, return -1; }
- err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); + err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig, NULL); if (!err) err = parasite_trap(ctl, pid, regs, &ctl->orig);
@@ -583,7 +583,7 @@ int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t user_regs_struct_t regs = ctl->orig.regs; int ret;
- ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig); + ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig, NULL); if (!ret) ret = parasite_trap(ctl, ctl->rpid, ret_regs ? ret_regs : ®s, &ctl->orig); return ret; @@ -632,7 +632,7 @@ static int parasite_init_daemon(struct parasite_ctl *ctl) goto err;
regs = ctl->orig.regs; - if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig)) + if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig, NULL)) goto err;
futex_wait_while_eq(&args->daemon_connected, 0); @@ -1272,7 +1272,7 @@ static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) addr < ctl->remote_map + ctl->map_length; }
-static int parasite_fini_seized(struct parasite_ctl *ctl) +static int parasite_fini_seized(struct parasite_ctl *ctl, bool customize) { pid_t pid = ctl->rpid; user_regs_struct_t regs; @@ -1317,9 +1317,37 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) if (ret) return -1;
+ /* use customize ptrace */ + if (customize) { + struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; + struct trace_flag *tf_ptr[] = { &tf }; + struct bisect_meta meta = { + .size = 1, + .used = 1, + .__data = &tf, + .data = tf_ptr, + }; + + ret = compel_stop_pie_customize(pid, __NR(rt_sigreturn, 0), &tf); + if (ret < 0) + return ret; + + /* The process is going to execute the required syscall, the + * original syscall should be forgot(set `-1`) in + * `syscall_trace_enter()` handler in kernel when no other + * else operation in tracer. + * + * Note: -1 means NO_SYSCALL which is defined in + * `arch/arm64/include/asm/ptrace.h`. + */ + return compel_stop_on_syscall_customize(1, + __NR(rt_sigreturn, 0), + -1, &meta); + } + /* Go to sigreturn as closer as we can */ ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, - ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret;
@@ -1339,7 +1367,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return 0; }
-int compel_stop_daemon(struct parasite_ctl *ctl) +int compel_stop_daemon(struct parasite_ctl *ctl, bool customize) { if (ctl->daemonized) { /* @@ -1349,7 +1377,7 @@ int compel_stop_daemon(struct parasite_ctl *ctl) if (ctl->tsock < 0) return -1;
- if (parasite_fini_seized(ctl)) { + if (parasite_fini_seized(ctl, customize)) { close_safe(&ctl->tsock); return -1; } @@ -1365,7 +1393,7 @@ int compel_cure_remote(struct parasite_ctl *ctl) long ret; int err;
- if (compel_stop_daemon(ctl)) + if (compel_stop_daemon(ctl, false)) return -1;
if (!ctl->remote_map) @@ -1434,7 +1462,7 @@ int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd)
*ctl->cmd = cmd;
- ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx); + ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx, NULL); if (ret == 0) ret = parasite_trap(ctl, pid, ®s, octx); if (ret == 0) @@ -1457,7 +1485,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) pid_t pid = ctl->rpid; int ret = -1;
- ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig); + ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig, NULL); if (ret) goto err;
@@ -1470,6 +1498,45 @@ err: return ret; }
+int compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr) +{ + user_regs_struct_t regs = ctl->orig.regs; + pid_t pid = ctl->rpid; + int ret = -1; + struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; + struct trace_flag *tf_ptr[] = { &tf }; + struct bisect_meta meta = { + .size = 1, + .used = 1, + .__data = &tf, + .data = tf_ptr, + }; + + /* + * Here it parasite code. Unlike trap code `compel_stop_pie()`, it + * won't let tracee forget the original syscall. In such way, tracer + * just trace the syscall called by tracee. The log likes the following + * if tracee forget syscall: + * + * [ 817.638332] set pid 1877 ptrace sysno 215 + * [ 817.638343] syscall_trace_enter: pid 1877 ptrace_sysno 0 current_sysno 215 + * [ 817.638363] (00.006280) Error (compel/src/lib/infect.c:1582): 1877 (native) is going to execute the syscall 215, required is 215 + * [ 817.638368] set pid 1877 ptrace sysno 0 + * [ 817.638402] syscall_trace_exit: pid 1877 ptrace_sysno 0 current_sysno 215 + */ + ret = parasite_run(pid, PTRACE_SYSCALL_NR, addr, ctl->rstack, ®s, + &ctl->orig, (void *)(long)__NR(munmap, 0)); + if (ret) + goto err; + + ret = compel_stop_on_syscall_customize(1, __NR(munmap, 0), 0, &meta); + + if (restore_thread_ctx(pid, &ctl->orig)) + ret = -1; +err: + return ret; +} + int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) { int ret; @@ -1505,6 +1572,17 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) return 0; }
+int compel_stop_pie_customize(pid_t pid, const int sys_nr, struct trace_flag *tf) +{ + if (ptrace(PTRACE_SYSCALL_NR, pid, sys_nr, NULL)) { + pr_perror("Unable to restart the %d process", pid); + return -1; + } + + tf->flag = TRACE_SYSCALL_ENTER; + return 0; +} + static bool task_is_trapped(int status, pid_t pid) { if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) @@ -1617,6 +1695,73 @@ goon: return 0; }
+int compel_stop_on_syscall_customize(int tasks, const int sys_nr, + const int exit_sys_nr, struct bisect_meta *meta) +{ + struct trace_flag *tf; + user_regs_struct_t regs; + int status, ret; + pid_t pid; + + while (tasks) { + pid = wait4(-1, &status, __WALL, NULL); + if (pid == -1) { + pr_perror("wait4 failed"); + return -1; + } + + tf = tf_bisect(meta, pid); + if (tf == NULL) { + pr_warn("Unexpected task %d, state %d signal %d: %s\n", + pid, WEXITSTATUS(status), + WTERMSIG(status), strsignal(WTERMSIG(status))); + continue; + } + + if (!task_is_trapped(status, pid)) + return -1; + + switch (tf->flag) { + case TRACE_SYSCALL_ENTER: + pr_debug("%d was trapped\n", pid); + pr_debug("`- Expecting exit\n"); + + ret = ptrace_get_regs(pid, ®s); + if (ret) { + pr_perror("ptrace"); + return -1; + } + + if (is_required_syscall(®s, pid, sys_nr, sys_nr)) { + ret = ptrace(PTRACE_SYSCALL_NR, pid, exit_sys_nr, NULL); + if (ret) { + pr_perror("ptrace"); + return -1; + } + tf->flag = TRACE_SYSCALL_EXIT; + } else { + pr_warn("Impossible condition, check the system, try our best to restore...\n"); + ret = ptrace(PTRACE_SYSCALL_NR, pid, sys_nr, NULL); + if (ret) { + pr_perror("ptrace"); + return -1; + } + } + break; + case TRACE_SYSCALL_EXIT: + pr_debug("%d was stopped\n", pid); + tasks--; + break; + + default: + pr_err("pid %d invalid status: %d\n", pid, tf->flag); + return -1; + } + } + + return 0; +} + int compel_mode_native(struct parasite_ctl *ctl) { return user_regs_native(&ctl->orig.regs); diff --git a/criu/cgroup-props.c b/criu/cgroup-props.c index 2f628f4..4f0458d 100644 --- a/criu/cgroup-props.c +++ b/criu/cgroup-props.c @@ -245,7 +245,7 @@ static int cgp_parse_stream(char *stream, size_t len) goto err_parse; }
- pr_info("Parsing controller "%s"\n", p); + pr_debug("Parsing controller "%s"\n", p);
cgp_entry = xzalloc(sizeof(*cgp_entry)); if (cgp_entry) { @@ -287,7 +287,7 @@ static int cgp_parse_stream(char *stream, size_t len) goto err_parse; }
- pr_info("\tStrategy "%s"\n", p); + pr_debug("\tStrategy "%s"\n", p); xfree(p);
if (!eat_symbols(&stream, &len, "\n - ", 4, true)) { @@ -324,7 +324,7 @@ static int cgp_parse_stream(char *stream, size_t len) }
cgp_entry->cgp.props[cgp_entry->cgp.nr_props++] = p; - pr_info("\tProperty "%s"\n", p); + pr_debug("\tProperty "%s"\n", p);
if (!eat_symbol(&stream, &len, ',', true)) { if (stream[0] == ']') { diff --git a/criu/cgroup.c b/criu/cgroup.c index e7e15bc..4088b08 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -417,7 +417,7 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const prop->value = new; }
- pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); + pr_debug("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; } @@ -455,7 +455,7 @@ static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag) if (typeflag == FTW_D) { int mtype;
- pr_info("adding cgroup %s\n", fpath); + pr_debug("adding cgroup %s\n", fpath);
ncd = xmalloc(sizeof(*ncd)); if (!ncd) @@ -737,7 +737,7 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ else pid = getpid();
- pr_info("Dumping cgroups for %d\n", pid); + pr_debug("Dumping cgroups for %d\n", pid); if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) return -1;
@@ -748,17 +748,17 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ if (!item) { BUG_ON(criu_cgset); criu_cgset = cs; - pr_info("Set %d is criu one\n", cs->id); + pr_debug("Set %d is criu one\n", cs->id); } else { if (item == root_item) { BUG_ON(root_cgset); root_cgset = cs; - pr_info("Set %d is root one\n", cs->id); + pr_debug("Set %d is root one\n", cs->id); } else { struct cg_ctl *root, *stray;
BUG_ON(!root_cgset); - pr_info("Set %d is a stray\n", cs->id); + pr_debug("Set %d is a stray\n", cs->id);
/* Copy the cgns prefix from the root cgset for each * controller. This is ok because we know that there is diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 0c212a8..cbd40a9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -158,7 +158,7 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) return -1; }
- pr_info("%d has %d sched policy\n", pid, ret); + pr_debug("%d has %d sched policy\n", pid, ret); tc->has_sched_policy = true; tc->sched_policy = ret;
@@ -186,18 +186,18 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) return -1; }
- pr_info("\tdumping %d nice for %d\n", ret, pid); + pr_debug("\tdumping %d nice for %d\n", ret, pid); tc->has_sched_nice = true; tc->sched_nice = ret;
- pr_info("\tdumping allowed cpus for %d\n", pid); + pr_debug("\tdumping allowed cpus for %d\n", pid); ret = syscall(__NR_sched_getaffinity, pid, sizeof(cpumask), &cpumask); if (ret < 0) { pr_perror("Can't get sched affinity for %d", pid); return -1; } memcpy(tc->allowed_cpus->cpumask, &cpumask, sizeof(cpu_set_t)); - pr_info("\t 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", + pr_debug("\t 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", (unsigned long long)tc->allowed_cpus->cpumask[3], (unsigned long long)tc->allowed_cpus->cpumask[2], (unsigned long long)tc->allowed_cpus->cpumask[1], @@ -1428,7 +1428,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; }
- ret = compel_stop_daemon(parasite_ctl); + ret = compel_stop_daemon(parasite_ctl, kdat.has_customize_ptrace); if (ret) { pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); goto err_cure; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 3049e07..ccb2690 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2171,6 +2171,64 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return 0; }
+static int cache_tasks_customize(bool root_seized, struct bisect_meta *meta) +{ + struct pstree_item *item; + struct trace_flag *tf; + + for_each_pstree_item(item) { + int status, i, ret; + pid_t pid; + + if (!task_alive(item)) + continue; + + if (item->nr_threads == 1) { + item->threads[0].real = item->pid->real; + } else { + if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + pid = item->threads[i].real; + + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Can't interrupt the %d task", pid); + return -1; + } + + tf = tf_insert(meta, pid); + if (tf == NULL) { + pr_err("Can't find trace flag for %d, used %d\n", + pid, meta->used); + return -1; + } + tf->flag = TRACE_INTERRUPT; + } + + for (i = 0; i < item->nr_threads; i++) { + pid = wait4(-1, &status, __WALL, NULL); + + tf = tf_bisect(meta, pid); + if (tf == NULL) { + pr_err("Can't find trace flag for %d, used %d\n", + pid, meta->used); + return -1; + } + + ret = compel_stop_pie_customize(pid, + __NR(rt_sigreturn, 0), + tf); + if (ret < 0) + return -1; + + } + } + + return 0; +} + static int clear_breakpoints(void) { struct pstree_item *item; @@ -2197,6 +2255,7 @@ static void finalize_restore(void) pid_t pid = item->pid->real; struct parasite_ctl *ctl; unsigned long restorer_addr; + int retval;
if (!task_alive(item)) continue; @@ -2207,7 +2266,12 @@ static void finalize_restore(void) continue;
restorer_addr = (unsigned long)rsti(item)->munmap_restorer; - if (compel_unmap(ctl, restorer_addr)) + if (!kdat.has_customize_ptrace) + retval = compel_unmap(ctl, restorer_addr); + else + retval = compel_unmap_customize(ctl, restorer_addr); + + if (retval) pr_err("Failed to unmap restorer from %d\n", pid);
xfree(ctl); @@ -2312,11 +2376,18 @@ static int write_restored_pid(void)
static int restore_root_task(struct pstree_item *init) { + struct bisect_meta tfs_meta; enum trace_flags flag = TRACE_ALL; int ret, fd, mnt_ns_fd = -1; int root_seized = 0; struct pstree_item *item;
+ if (kdat.has_customize_ptrace + && tf_create(&tfs_meta, task_entries->nr_threads) != 0) { + pr_err("Can't alloc memory, tf_create failed\n"); + return -1; + } + ret = run_scripts(ACT_PRE_RESTORE); if (ret != 0) { pr_err("Aborting restore due to pre-restore script ret code %d\n", ret); @@ -2521,7 +2592,12 @@ skip_ns_bouncing:
timing_stop(TIME_RESTORE);
- if (catch_tasks(root_seized, &flag)) { + if (!kdat.has_customize_ptrace) + ret = catch_tasks(root_seized, &flag); + else + ret = cache_tasks_customize(root_seized, &tfs_meta); + + if (ret) { pr_err("Can't catch all tasks\n"); goto out_kill_network_unlocked; } @@ -2531,8 +2607,14 @@ skip_ns_bouncing:
__restore_switch_stage(CR_STATE_COMPLETE);
- ret = compel_stop_on_syscall(task_entries->nr_threads, - __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + if (!kdat.has_customize_ptrace) { + ret = compel_stop_on_syscall(task_entries->nr_threads, + __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + } else { + ret = compel_stop_on_syscall_customize(task_entries->nr_threads, + __NR(rt_sigreturn, 0), + -1, &tfs_meta); + } if (ret) { pr_err("Can't stop all tasks on rt_sigreturn\n"); goto out_kill_network_unlocked; @@ -2575,6 +2657,9 @@ skip_ns_bouncing: if (!opts.restore_detach && !opts.exec_cmd) wait(NULL);
+ if (kdat.has_customize_ptrace) + tf_destroy(&tfs_meta); + return 0;
out_kill_network_unlocked: @@ -2608,6 +2693,10 @@ out: stop_usernsd(); __restore_switch_stage(CR_STATE_FAIL); pr_err("Restoring FAILED.\n"); + + if (kdat.has_customize_ptrace) + tf_destroy(&tfs_meta); + return -1; }
diff --git a/criu/eventfd.c b/criu/eventfd.c index da31ce9..17cbceb 100644 --- a/criu/eventfd.c +++ b/criu/eventfd.c @@ -38,7 +38,7 @@ int is_eventfd_link(char *link)
static void pr_info_eventfd(char *action, EventfdFileEntry *efe) { - pr_info("%s: id %#08x flags %#04x counter %#016"PRIx64"\n", + pr_debug("%s: id %#08x flags %#04x counter %#016"PRIx64"\n", action, efe->id, efe->flags, efe->counter); }
diff --git a/criu/eventpoll.c b/criu/eventpoll.c index 6097e42..d8c8166 100644 --- a/criu/eventpoll.c +++ b/criu/eventpoll.c @@ -67,13 +67,13 @@ int is_eventpoll_link(char *link)
static void pr_info_eventpoll_tfd(char *action, uint32_t id, EventpollTfdEntry *e) { - pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64" ignore %d\n", + pr_debug("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64" ignore %d\n", action, id, e->tfd, e->events, e->data, e->ignore); }
static void pr_info_eventpoll(char *action, EventpollFileEntry *e) { - pr_info("%seventpoll: id %#08x flags %#04x\n", action, e->id, e->flags); + pr_debug("%seventpoll: id %#08x flags %#04x\n", action, e->id, e->flags); }
static int queue_dinfo(FileEntry **fe, EventpollFileEntry **e, toff_t **toff, const struct fd_parms *p) diff --git a/criu/files-reg.c b/criu/files-reg.c index 01e0895..4752085 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1718,7 +1718,7 @@ static bool store_validation_data(RegFileEntry *rfe, return false;
if (!result) - pr_info("Only file size could be stored for validation for file %s\n", + pr_debug("Only file size could be stored for validation for file %s\n", rfe->name); return true; } @@ -1768,7 +1768,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) rfe.has_mnt_id = true; }
- pr_info("Dumping path for %d fd via self %d [%s], id: %d\n", + pr_debug("Dumping path for %d fd via self %d [%s], id: %d\n", p->fd, lfd, &link->name[1], id);
/* diff --git a/criu/files.c b/criu/files.c index 0e5be91..b4382fd 100644 --- a/criu/files.c +++ b/criu/files.c @@ -334,7 +334,7 @@ int do_dump_gen_file(struct fd_parms *p, int lfd, e->fd = p->fd; e->flags = p->fd_flags;
- pr_info("fdinfoEntry fd: %d\n", e->fd); + pr_debug("fdinfoEntry fd: %d\n", e->fd); ret = fd_id_generate(p->pid, e, p); if (ret == 1) /* new ID generated */ ret = ops->dump(lfd, e->id, p); @@ -422,7 +422,7 @@ static int fill_fd_params(struct pid *owner_pid, int fd, int lfd,
fown_entry__init(&p->fown);
- pr_info("%d fdinfo %d: pos: %#16"PRIx64" flags: %16o/%#x\n", + pr_debug("%d fdinfo %d: pos: %#16"PRIx64" flags: %16o/%#x\n", owner_pid->real, fd, p->pos, p->flags, (int)p->fd_flags);
if (p->flags & O_PATH) @@ -513,7 +513,7 @@ static int dump_chr_file(int lfd, u32 id, const struct fd_parms *p) } else link = p->link;
- pr_info("Dumping chr-file fd %d with lfd %d with id %d, name: %s\n", p->fd, lfd, id, link->name); + pr_debug("Dumping chr-file fd %d with lfd %d with id %d, name: %s\n", p->fd, lfd, id, link->name);
if (strstr(link->name, "(deleted)") != NULL) { pr_err("char device '%s' is deleted\n", link->name); @@ -727,9 +727,9 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, int i, ret = -1; int off, nr_fds = min((int) PARASITE_MAX_FDS, dfds->nr_fds);
- pr_info("\n"); - pr_info("Dumping opened files (pid: %d)\n", item->pid->real); - pr_info("----------------------------------------\n"); + pr_debug("\n"); + pr_debug("Dumping opened files (pid: %d)\n", item->pid->real); + pr_debug("----------------------------------------\n");
lfds = xmalloc(nr_fds * sizeof(int)); if (!lfds) @@ -762,7 +762,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, break;
e.flags |= need_reuse_flag; - pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); + pr_debug("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); ret = pb_write_one(img, &e, PB_FDINFO); if (ret) break; @@ -772,7 +772,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, close(lfds[i]); }
- pr_info("----------------------------------------\n"); + pr_debug("----------------------------------------\n"); err: if (img) close_image(img); diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 665051d..76fe342 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -69,6 +69,7 @@ struct kerndat_s { bool has_clone3_set_tid; bool has_timens; bool has_unix_sk_repair; + bool has_customize_ptrace; };
extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index cf9187a..7e26740 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -13,6 +13,9 @@ #include <arpa/inet.h> /* for sockaddr_in and inet_ntoa() */ #include <sys/prctl.h> #include <sys/inotify.h> +#include <sys/ptrace.h> +#include <sys/wait.h> +#include <linux/ptrace.h>
#include "common/config.h" @@ -1082,6 +1085,66 @@ static void kerndat_has_unix_sk_repair(void) return; }
+static void kerndat_has_customize_ptrace(void) +{ + pid_t tracee = fork(); + int status; + int retval; + + if (tracee == 0) { + /* ensure */ + prctl(PR_SET_PDEATHSIG, SIGKILL); + + while (true) + sleep(1); + } else if (tracee > 0) { + pr_debug("fork task %d as tracee\n", tracee); + retval = ptrace(PTRACE_ATTACH, tracee, 0, 0); + if (retval < 0) { + pr_perror("Unexpect error from ptrace(PTRACE_ATTACH)"); + return; + } + + retval = wait4(-1, &status, __WALL, NULL); + if (retval == -1) + pr_perror("Unexpect error from wait"); + else if (retval != tracee || !(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)) + pr_err("Task %d (expect %d) is unexpect, status: %d," + " stoped: %d signal: %d(%s)\n", + retval, tracee, status, + WIFSTOPPED(status), WSTOPSIG(status), + strsignal(WTERMSIG(status))); + else { + retval = ptrace(PTRACE_SYSCALL_NR, tracee, 0, 0); + if (retval == 0) + kdat.has_customize_ptrace = true; + else + pr_perror("Unexpect error from ptrace(PTRACE_SYSCALL_NR)"); + } + + if (kill(tracee, SIGKILL) != 0) { + pr_perror("kill tracee %d failed", tracee); + return; + } + + /* + * To prevent wait4 unexpect task when criu.kdat is generated + * in dump process. + */ + retval = waitpid(tracee, &status, 0); + if (retval == -1) + pr_err("waitpid() failed"); + else + pr_debug("tracee %d exited, status %d, signal %d(%s)\n", + WEXITSTATUS(status), WTERMSIG(status), + WTERMSIG(status), strsignal(WTERMSIG(status))); + } else { + pr_perror("Unexpected error from fork\n"); + } + + return; +} + int kerndat_init(void) { int ret; @@ -1095,8 +1158,7 @@ int kerndat_init(void) memset(&kdat, 0, sizeof(kdat));
preload_socket_modules(); - if (!opts.use_nft) - preload_netfilter_modules(); + preload_netfilter_modules();
if (check_pagemap()) { pr_err("check_pagemap failed when initializing kerndat.\n"); @@ -1218,6 +1280,7 @@ int kerndat_init(void) }
kerndat_has_unix_sk_repair(); + kerndat_has_customize_ptrace();
kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/lsm.c b/criu/lsm.c index 6713ca7..9d9d38e 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -265,9 +265,9 @@ int collect_lsm_profile(pid_t pid, CredsEntry *ce) }
if (ce->lsm_profile) - pr_info("%d has lsm profile %s\n", pid, ce->lsm_profile); + pr_debug("%d has lsm profile %s\n", pid, ce->lsm_profile); if (ce->lsm_sockcreate) - pr_info("%d has lsm sockcreate label %s\n", pid, ce->lsm_sockcreate); + pr_debug("%d has lsm sockcreate label %s\n", pid, ce->lsm_sockcreate);
return ret; } diff --git a/criu/mount.c b/criu/mount.c index 25ef7f0..124c9c8 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -449,13 +449,13 @@ static void mnt_tree_show(struct mount_info *tree, int off) { struct mount_info *m;
- pr_info("%*s[%s](%d->%d)\n", off, "", + pr_debug("%*s[%s](%d->%d)\n", off, "", tree->mountpoint, tree->mnt_id, tree->parent_mnt_id);
list_for_each_entry(m, &tree->children, siblings) mnt_tree_show(m, off + 1);
- pr_info("%*s<--\n", off, ""); + pr_debug("%*s<--\n", off, ""); }
/* Returns -1 on error, 1 if external mount resolved, 0 otherwise */ diff --git a/criu/sk-packet.c b/criu/sk-packet.c index 0abe840..82b6b2c 100644 --- a/criu/sk-packet.c +++ b/criu/sk-packet.c @@ -262,7 +262,7 @@ int packet_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) m = NLMSG_DATA(hdr); nlmsg_parse(hdr, sizeof(struct packet_diag_msg), tb, PACKET_DIAG_MAX, NULL); - pr_info("Collect packet sock %u %u\n", m->pdiag_ino, (unsigned int)m->pdiag_num); + pr_debug("Collect packet sock %u %u\n", m->pdiag_ino, (unsigned int)m->pdiag_num);
if (!tb[PACKET_DIAG_INFO]) { pr_err("No packet sock info in nlm\n"); diff --git a/criu/sk-unix.c b/criu/sk-unix.c index b4c24ed..3d9af75 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -584,7 +584,7 @@ dump: if (dump_socket_opts(lfd, skopts)) goto err;
- pr_info("Dumping unix socket at %d\n", p->fd); + pr_debug("Dumping unix socket at %d\n", p->fd); show_one_unix("Dumping", sk);
sk->ue = ue;