Jingxian He (21): build: add secure compilation options mm: add pin memory method for criu pid: add pid recover method for criu notifier: add notifier calling method for checkpoint and restore cred: provide cred checkpoint restore method block-device: dump block device as reguler file anon-inode: add support for anon inode fd char_dev: add support for char device dump and restore socket: fix connect error of invalid param criu: eventpollfd fix for improper usage in appdata task_exit_notify: add task exit notify mask method for criu looser file mode and size check add O_REPAIR flag to vma fd add reuse file method for recover deleted file state fix share sockets repair problem remove sigaction handler register in restorer fix fds list restore and rollback problem improve char dev fd check and repair method optimization: parallel collecting vmas net: improve nft table set clear method add exec file mapping pin method
Liu Chao (3): save and restore sigev_notify_thread_id unlock network when restore fails save src ports to ip_local_reserved_ports when dump tasks and restore it when restore tasks
Luo Longjun (1): unix socket: add support for unix stream socket
Sang Yan (3): file-lock: add repair mode to dump file locks net: add shared socket recover method for criu clean repair res when dump fail
Xiaoguang Li (2): selinux: fix selinux context label check add netlink repair modes
Zhuling (2): fix dump fail problem with null seek op fix dump fail problem with no access to get socket filter
anatasluo (1): vdso: fix segmentation fault caused by char pointer array
fu.lin (34): criu: dump and restore cpu affinity of each thread tty: fix NULL pointer access in tty namespaces: drop func address print to make someone happy zdtm: fix zdtm/static/maps00 case in arm64 test: flush ipt rules after program exits zdtm: fix cleaning step of zdtm_netns sysvshm: add dump/restore sysv-shm in host ipc ns proc parse: fix vma offset value for the sysfs file of pci devices nftables: add mnl api nftables: implement nft api for tcp nftables: implement nft api for lock net ns criu: switch to nftables api mmap: restore /dev/hisi_sec2* deivce vma log: print error log to /dev/kmsg unix sk: improve dgram robustness sk: ignore the bind error for icmp socket infiniband: fix the infiniband fd conflict ptrace: trace specific syscall notifier: rollback when open img failed detach: don't kill task when `ptrace(PTRACE_DETACH)` return ESRCH zdtm: unlink kdat before testing sysvshm: fix bug caused by sscanf zdtm: add host ns sysvshm ipc case zdtm: add pinmem testcase zdtm: init notifier testcase zdtm: print errno info when accessing *.out failure zdtm: print more info for fs.c zdtm: add chardev testcase zdtm: add infiniband testcase zdtm: add share port testcase zdtm: tmp test script mod: add criu-indepent test zdtm: fix py bug kabichk: add KABI check code
lingsheng (3): Fix crit encode TypeError Fix crit info struct unpack error Fix crit x UnicodeDecodeError
root (1): add clear pin mem and init page map option
Makefile | 4 + compel/Makefile | 1 + compel/arch/aarch64/src/lib/infect.c | 2 +- .../arch/arm/plugins/std/syscalls/syscall.def | 1 + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + .../plugins/std/syscalls/syscall-s390.tbl | 1 + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + compel/include/uapi/bisect.h | 30 + compel/include/uapi/infect.h | 15 +- compel/src/lib/bisect.c | 92 ++ compel/src/lib/infect.c | 169 +++- criu/Makefile | 2 +- criu/Makefile.crtools | 6 + criu/Makefile.packages | 7 + criu/arch/aarch64/include/asm/vdso.h | 17 +- criu/arch/arm/include/asm/vdso.h | 9 +- criu/arch/ppc64/include/asm/vdso.h | 34 +- criu/arch/s390/include/asm/vdso.h | 17 +- criu/arch/x86/include/asm/vdso.h | 23 +- criu/cgroup-props.c | 6 +- criu/cgroup.c | 12 +- criu/char.c | 68 ++ criu/config.c | 32 +- criu/cr-dump.c | 170 +++- criu/cr-restore.c | 373 ++++++- criu/crtools.c | 51 + criu/devname.c | 130 +++ criu/eventfd.c | 2 +- criu/eventpoll.c | 18 +- criu/file-lock.c | 10 + criu/files-reg.c | 251 ++++- criu/files.c | 281 +++++- criu/include/char.h | 17 + criu/include/cr_options.h | 17 + criu/include/fcntl.h | 11 + criu/include/files-reg.h | 26 + criu/include/files.h | 10 + criu/include/image-desc.h | 2 + criu/include/image.h | 3 + criu/include/kerndat.h | 2 + criu/include/log.h | 2 + criu/include/mem.h | 3 + criu/include/net.h | 9 +- criu/include/netfilter.h | 7 +- criu/include/nftables.h | 168 +++ criu/include/parasite-syscall.h | 2 + criu/include/parasite.h | 10 + criu/include/posix-timer.h | 1 + criu/include/prctl.h | 4 + criu/include/protobuf-desc.h | 2 + criu/include/pstree.h | 4 + criu/include/restorer.h | 48 + criu/include/sk-inet.h | 9 +- criu/include/sockets.h | 2 + criu/include/taskqueue.h | 60 ++ criu/include/util.h | 60 ++ criu/include/vma.h | 12 + criu/kerndat.c | 96 ++ criu/kmsg.c | 16 + criu/log.c | 4 + criu/lsm.c | 16 +- criu/mem.c | 171 +++- criu/mnl.c | 165 +++ criu/mount.c | 4 +- criu/namespaces.c | 13 +- 
criu/net.c | 43 +- criu/netfilter.c | 14 +- criu/nftables.c | 954 ++++++++++++++++++ criu/parasite-syscall.c | 34 + criu/pie/Makefile | 1 + criu/pie/Makefile.library | 2 + criu/pie/parasite.c | 50 + criu/pie/restorer.c | 417 +++++++- criu/pie/util-vdso.c | 2 + criu/pie/util.c | 91 ++ criu/proc_parse.c | 98 +- criu/pstree.c | 15 + criu/seize.c | 39 +- criu/sk-inet.c | 186 +++- criu/sk-netlink.c | 50 +- criu/sk-packet.c | 2 +- criu/sk-tcp.c | 99 +- criu/sk-unix.c | 174 +++- criu/sockets.c | 9 +- criu/taskqueue.c | 169 ++++ criu/tty.c | 5 + images/Makefile | 1 + images/chr.proto | 15 + images/core.proto | 5 + images/eventpoll.proto | 3 + images/fdinfo.proto | 5 + images/sk-unix.proto | 1 + images/timer.proto | 1 + images/vma.proto | 1 + include/common/lock.h | 4 + lib/Makefile | 1 + lib/c/Makefile | 2 +- lib/py/cli.py | 7 +- lib/py/images/images.py | 2 +- scripts/nmk/scripts/build.mk | 5 +- test/jenkins/criu-lib.sh | 2 +- test/jenkins/criu-test.sh | 26 + test/modules/Makefile | 21 + test/modules/idr.c | 79 ++ test/modules/jump_table.c | 107 ++ test/modules/var_kern.c | 72 ++ test/modules/var_user.py | 40 + test/modules/workqueue_kern.c | 130 +++ test/zdtm.py | 133 ++- test/zdtm/Makefile | 2 +- test/zdtm/customization/Makefile | 75 ++ test/zdtm/customization/chardev00.c | 65 ++ test/zdtm/customization/chardev00.desc | 1 + test/zdtm/customization/get_smaps_bits.c | 127 +++ test/zdtm/customization/get_smaps_bits.h | 6 + .../customization/infiniband_with_unix_sk.c | 55 + .../infiniband_with_unix_sk.desc | 1 + test/zdtm/customization/ipc.c | 202 ++++ test/zdtm/customization/ipc.desc | 1 + test/zdtm/customization/maps00.c | 271 +++++ test/zdtm/customization/maps00.desc | 1 + test/zdtm/customization/maps007.c | 178 ++++ test/zdtm/customization/maps007.desc | 1 + test/zdtm/customization/maps008.c | 514 ++++++++++ test/zdtm/customization/maps008.desc | 1 + test/zdtm/customization/maps01.c | 183 ++++ test/zdtm/customization/maps01.desc | 1 + test/zdtm/customization/maps02.c | 
111 ++ test/zdtm/customization/maps02.desc | 1 + test/zdtm/customization/maps04.c | 57 ++ test/zdtm/customization/maps04.desc | 1 + test/zdtm/customization/maps05.c | 91 ++ test/zdtm/customization/maps05.desc | 1 + test/zdtm/customization/maps06.c | 70 ++ test/zdtm/customization/maps06.desc | 1 + test/zdtm/customization/maps_file_prot.c | 53 + test/zdtm/customization/maps_file_prot.desc | 1 + test/zdtm/customization/notifier00.c | 68 ++ test/zdtm/customization/notifier00.desc | 1 + test/zdtm/customization/tcp00.c | 101 ++ test/zdtm/customization/tcp00.desc | 1 + test/zdtm/lib/fs.c | 11 +- test/zdtm/lib/test.c | 4 +- test/zdtm/mod/.gitignore | 163 +++ test/zdtm/mod/Makefile | 34 + test/zdtm/mod/anon_inode.c | 148 +++ test/zdtm/mod/infiniband_kern.c | 121 +++ test/zdtm/mod/notifier.c | 145 +++ test/zdtm/static/Makefile | 1 + test/zdtm/static/cpu-affinity0.c | 42 + test/zdtm/static/cpu-affinity0.desc | 1 + test/zdtm/static/maps00.c | 8 +- test/zdtm/static/socket-tcp-nfconntrack.desc | 2 +- test/zdtm/static/socket-tcp.c | 13 + test/zdtm_ct.c | 13 +- upgchk/Makefile | 23 + upgchk/lib/modsym.c | 268 +++++ upgchk/lib/modsym.h | 39 + upgchk/setup.py | 20 + upgchk/upgchk/__init__.py | 11 + upgchk/upgchk/kabichk.py | 163 +++ 162 files changed, 9156 insertions(+), 247 deletions(-) create mode 100644 compel/include/uapi/bisect.h create mode 100644 compel/src/lib/bisect.c create mode 100644 criu/char.c create mode 100644 criu/devname.c create mode 100644 criu/include/char.h create mode 100644 criu/include/nftables.h create mode 100644 criu/include/taskqueue.h create mode 100644 criu/kmsg.c create mode 100644 criu/mnl.c create mode 100644 criu/nftables.c create mode 100644 criu/taskqueue.c create mode 100644 images/chr.proto create mode 100644 test/jenkins/criu-test.sh create mode 100644 test/modules/Makefile create mode 100644 test/modules/idr.c create mode 100644 test/modules/jump_table.c create mode 100644 test/modules/var_kern.c create mode 100644 test/modules/var_user.py 
create mode 100644 test/modules/workqueue_kern.c create mode 100644 test/zdtm/customization/Makefile create mode 100644 test/zdtm/customization/chardev00.c create mode 100644 test/zdtm/customization/chardev00.desc create mode 100644 test/zdtm/customization/get_smaps_bits.c create mode 100644 test/zdtm/customization/get_smaps_bits.h create mode 100644 test/zdtm/customization/infiniband_with_unix_sk.c create mode 100644 test/zdtm/customization/infiniband_with_unix_sk.desc create mode 100644 test/zdtm/customization/ipc.c create mode 100644 test/zdtm/customization/ipc.desc create mode 100644 test/zdtm/customization/maps00.c create mode 100644 test/zdtm/customization/maps00.desc create mode 100644 test/zdtm/customization/maps007.c create mode 100644 test/zdtm/customization/maps007.desc create mode 100644 test/zdtm/customization/maps008.c create mode 100644 test/zdtm/customization/maps008.desc create mode 100644 test/zdtm/customization/maps01.c create mode 100644 test/zdtm/customization/maps01.desc create mode 100644 test/zdtm/customization/maps02.c create mode 100644 test/zdtm/customization/maps02.desc create mode 100644 test/zdtm/customization/maps04.c create mode 100644 test/zdtm/customization/maps04.desc create mode 100644 test/zdtm/customization/maps05.c create mode 100644 test/zdtm/customization/maps05.desc create mode 100644 test/zdtm/customization/maps06.c create mode 100644 test/zdtm/customization/maps06.desc create mode 100644 test/zdtm/customization/maps_file_prot.c create mode 100644 test/zdtm/customization/maps_file_prot.desc create mode 100644 test/zdtm/customization/notifier00.c create mode 100644 test/zdtm/customization/notifier00.desc create mode 100644 test/zdtm/customization/tcp00.c create mode 100644 test/zdtm/customization/tcp00.desc create mode 100644 test/zdtm/mod/.gitignore create mode 100644 test/zdtm/mod/Makefile create mode 100644 test/zdtm/mod/anon_inode.c create mode 100644 test/zdtm/mod/infiniband_kern.c create mode 100644 
test/zdtm/mod/notifier.c create mode 100644 test/zdtm/static/cpu-affinity0.c create mode 100644 test/zdtm/static/cpu-affinity0.desc create mode 100644 upgchk/Makefile create mode 100644 upgchk/lib/modsym.c create mode 100644 upgchk/lib/modsym.h create mode 100644 upgchk/setup.py create mode 100644 upgchk/upgchk/__init__.py create mode 100644 upgchk/upgchk/kabichk.py
From: lingsheng lingsheng@huawei.com
--- lib/py/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/py/cli.py b/lib/py/cli.py index 966dd4e..f7bda23 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -25,7 +25,7 @@ def outf(opts):
def dinf(opts, name): - return open(os.path.join(opts['dir'], name)) + return open(os.path.join(opts['dir'], name), 'rb')
def decode(opts):
From: lingsheng lingsheng@huawei.com
--- lib/py/cli.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/lib/py/cli.py b/lib/py/cli.py index da34302..966dd4e 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -16,7 +16,10 @@ def inf(opts):
def outf(opts): if opts['out']: - return open(opts['out'], 'w+') + if getattr(opts['func'], '__name__') == 'encode': + return open(opts['out'], 'wb+') + else: + return open(opts['out'], 'w+') else: return sys.stdout
CRIU should dump and restore the CPU affinity of every thread and process.
Add a thread_allowedcpus_entry field to thread_core_entry to save the CPU affinity information.
Restore it after the threads have been restored but before they start running.
Add the --with-cpu-affinity option to enable this feature at restore time.
Signed-off-by: Sang Yan sangyan@huawei.com Signed-off-by: fu.lin fu.lin10@huawei.com --- .../arch/arm/plugins/std/syscalls/syscall.def | 1 + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + .../plugins/std/syscalls/syscall-s390.tbl | 1 + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + criu/config.c | 1 + criu/cr-dump.c | 14 +++++++ criu/cr-restore.c | 29 +++++++++++++ criu/crtools.c | 2 + criu/include/cr_options.h | 2 + criu/include/restorer.h | 3 ++ criu/pie/restorer.c | 38 +++++++++++++++++ criu/pstree.c | 7 ++++ images/core.proto | 5 +++ test/zdtm/static/Makefile | 1 + test/zdtm/static/cpu-affinity0.c | 42 +++++++++++++++++++ test/zdtm/static/cpu-affinity0.desc | 1 + 17 files changed, 150 insertions(+) create mode 100644 test/zdtm/static/cpu-affinity0.c create mode 100644 test/zdtm/static/cpu-affinity0.desc
diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index f7ebc85..d577373 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -116,3 +116,4 @@ fsopen 430 430 (char *fsname, unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) clone3 435 435 (struct clone_args *uargs, size_t size) +sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index 1afaf1e..fa64545 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -112,3 +112,4 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index ae6fdb5..16f1994 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -112,3 +112,4 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_sched_setaffinity 239 sys_sched_setaffinity 
(int fd, size_t cpusetsize, const cpu_set_t *mask) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index 7a48711..29c13e3 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -63,6 +63,7 @@ __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char * __NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior) __NR_gettid 224 sys_gettid (void) __NR_futex 240 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_sched_setaffinity 241 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info) __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info) __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 6667c07..74f5482 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -73,6 +73,7 @@ __NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsign __NR_umount2 166 sys_umount2 (char *name, int flags) __NR_gettid 186 sys_gettid (void) __NR_futex 202 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_sched_setaffinity 203 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) __NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info) __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) diff --git a/criu/config.c b/criu/config.c index 08606fb..5a53256 100644 --- a/criu/config.c +++ b/criu/config.c @@ 
-541,6 +541,7 @@ int parse_options(int argc, char **argv, bool *usage_error, { "cgroup-yard", required_argument, 0, 1096 }, { "pre-dump-mode", required_argument, 0, 1097}, { "file-validation", required_argument, 0, 1098 }, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), { }, };
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index b9d2914..f078c27 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -140,6 +140,7 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) { int ret; struct sched_param sp; + cpu_set_t cpumask;
BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */
@@ -185,6 +186,19 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) tc->has_sched_nice = true; tc->sched_nice = ret;
+ pr_info("\tdumping allowed cpus for %d\n", pid); + ret = syscall(__NR_sched_getaffinity, pid, sizeof(cpumask), &cpumask); + if (ret < 0) { + pr_perror("Can't get sched affinity for %d", pid); + return -1; + } + memcpy(tc->allowed_cpus->cpumask, &cpumask, sizeof(cpu_set_t)); + pr_info("\t 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", + (unsigned long long)tc->allowed_cpus->cpumask[3], + (unsigned long long)tc->allowed_cpus->cpumask[2], + (unsigned long long)tc->allowed_cpus->cpumask[1], + (unsigned long long)tc->allowed_cpus->cpumask[0]); + return 0; }
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 589087f..1374a69 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -118,6 +118,7 @@ static int prepare_restorer_blob(void); static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); +static int prepare_allowed_cpus(int pid, struct task_restore_args *ta, CoreEntry *leader_core);
/* * Architectures can overwrite this function to restore registers that are not @@ -922,6 +923,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (prepare_signals(pid, ta, core)) return -1;
+ if (prepare_allowed_cpus(pid, ta, core)) + return -1; + if (prepare_posix_timers(pid, ta, core)) return -1;
@@ -3196,6 +3200,30 @@ out: return ret; }
+static int prepare_allowed_cpus(int pid, struct task_restore_args *ta, CoreEntry *leader_core) +{ + int i; + int *need_cpu_affinity; + cpu_set_t *cpumaks; + + ta->allowed_cpus = (char *)rst_mem_align_cpos(RM_PRIVATE); + + need_cpu_affinity = rst_mem_alloc(sizeof(int), RM_PRIVATE); + if (need_cpu_affinity == NULL) + return -1; + + *need_cpu_affinity = opts.with_cpu_affinity; + + for (i = 0; i < current->nr_threads; i++) { + cpumaks = rst_mem_alloc(sizeof(cpu_set_t), RM_PRIVATE); + if (!cpumaks) + return -1; + + memcpy(cpumaks, current->core[i]->thread_core->allowed_cpus->cpumask, sizeof(cpu_set_t)); + } + return 0; +} + extern void __gcov_flush(void) __attribute__((weak)); void __gcov_flush(void) {}
@@ -3655,6 +3683,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns RST_MEM_FIXUP_PPTR(task_args->timerfd); RST_MEM_FIXUP_PPTR(task_args->posix_timers); RST_MEM_FIXUP_PPTR(task_args->siginfo); + RST_MEM_FIXUP_PPTR(task_args->allowed_cpus); RST_MEM_FIXUP_PPTR(task_args->rlims); RST_MEM_FIXUP_PPTR(task_args->helpers); RST_MEM_FIXUP_PPTR(task_args->zombies); diff --git a/criu/crtools.c b/criu/crtools.c index 2eb5dba..949dc9f 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -441,6 +441,8 @@ usage: " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 'buildid' (default).\n" +" --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" +" same cpu quantity.\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index ac1c9e9..fda54a4 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -174,6 +174,8 @@ struct cr_options {
/* This stores which method to use for file validation. */ int file_validation_method; + /* restore cpu affinity */ + int with_cpu_affinity; };
extern struct cr_options opts; diff --git a/criu/include/restorer.h b/criu/include/restorer.h index dfb4e6b..bd6ef6a 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -1,6 +1,7 @@ #ifndef __CR_RESTORER_H__ #define __CR_RESTORER_H__
+#include <sched.h> #include <signal.h> #include <limits.h> #include <sys/resource.h> @@ -162,6 +163,8 @@ struct task_restore_args { siginfo_t *siginfo; unsigned int siginfo_n;
+ char *allowed_cpus; + struct rst_tcp_sock *tcp_socks; unsigned int tcp_socks_n;
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index b3d7e2b..c63f96b 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -432,6 +432,40 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group) return 0; }
+static int restore_cpu_affinity(struct task_restore_args *args) +{ + int i; + int pid; + int ret; + int *need_cpu_affinity; + cpu_set_t *cpumask; + cpu_set_t *allowed_cpus; + + need_cpu_affinity = (int *)args->allowed_cpus; + if (!*need_cpu_affinity) { + pr_debug("No need to restore cpu affinity.\n"); + return 0; + } + + allowed_cpus = (cpu_set_t *)(args->allowed_cpus + sizeof(int)); + for (i = 0; i < args->nr_threads; i++) { + pid = args->thread_args[i].pid; + cpumask = &allowed_cpus[i]; + pr_info("Restoring %d allowed_cpus %llx, %llx, %llx, %llx\n", pid, + (unsigned long long)cpumask->__bits[3], + (unsigned long long)cpumask->__bits[2], + (unsigned long long)cpumask->__bits[1], + (unsigned long long)cpumask->__bits[0]); + ret = sys_sched_setaffinity(pid, sizeof(cpu_set_t), cpumask); + if (ret) { + pr_err("\t Restore %d cpumask failed.\n", pid); + return ret; + } + } + + return 0; +} + static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args) { unsigned int flags = args->seccomp_force_tsync ? SECCOMP_FILTER_FLAG_TSYNC : 0; @@ -1900,6 +1934,10 @@ long __export_restore_task(struct task_restore_args *args) if (ret) goto core_restore_end;
+ ret = restore_cpu_affinity(args); + if (ret) + goto core_restore_end; + restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
rst_tcp_socks_all(args); diff --git a/criu/pstree.c b/criu/pstree.c index a876615..f0d7622 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -58,11 +58,13 @@ CoreEntry *core_entry_alloc(int th, int tsk) CredsEntry *ce = NULL;
sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry); + sz += sizeof(ThreadAllowedcpusEntry);
sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); + sz += sizeof(cpu_set_t); /* * @groups are dynamic and allocated * on demand. @@ -127,6 +129,11 @@ CoreEntry *core_entry_alloc(int th, int tsk) ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0]));
+ core->thread_core->allowed_cpus = xptr_pull(&m, ThreadAllowedcpusEntry); + thread_allowedcpus_entry__init(core->thread_core->allowed_cpus); + core->thread_core->allowed_cpus->n_cpumask = sizeof(cpu_set_t) / sizeof(uint64_t); + core->thread_core->allowed_cpus->cpumask = xptr_pull_s(&m, sizeof(cpu_set_t)); + if (arch_alloc_thread_info(core)) { xfree(core); core = NULL; diff --git a/images/core.proto b/images/core.proto index 9e9e393..2981120 100644 --- a/images/core.proto +++ b/images/core.proto @@ -81,6 +81,10 @@ message thread_sas_entry { required uint32 ss_flags = 3; }
+message thread_allowedcpus_entry { + repeated uint64 cpumask = 1; +} + message thread_core_entry { required uint64 futex_rla = 1; required uint32 futex_rla_len = 2; @@ -99,6 +103,7 @@ message thread_core_entry {
optional string comm = 13; optional uint64 blk_sigset_extended = 14; + required thread_allowedcpus_entry allowed_cpus = 15; }
message task_rlimits_entry { diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index aae4983..ad8fc6a 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -235,6 +235,7 @@ TST_NOFILE := \ timens_nested \ timens_for_kids \ zombie_leader \ + cpu-affinity0 \ # jobctl00 \
pkg-config-check = $(shell sh -c 'pkg-config $(1) && echo y') diff --git a/test/zdtm/static/cpu-affinity0.c b/test/zdtm/static/cpu-affinity0.c new file mode 100644 index 0000000..83dee19 --- /dev/null +++ b/test/zdtm/static/cpu-affinity0.c @@ -0,0 +1,42 @@ +#include <errno.h> +#include <stdlib.h> +#include <sched.h> + +#include "zdtmtst.h" + +const char *test_doc = "Check that with-cpu-affinity option can restore cpu affinity"; +const char *test_author = "Sang Yan sangyan@huawei.com"; + +int main(int argc, char **argv) +{ + cpu_set_t old; + cpu_set_t new; + + test_init(argc, argv); + + CPU_ZERO(&old); + CPU_ZERO(&new); + + /* test only 0 core because of CI test env limited */ + CPU_SET(0, &old); + + if (sched_setaffinity(getpid(), sizeof(old), &old) < 0) { + pr_perror("Can't set old cpu affinity! errno: %d", errno); + exit(1); + } + + test_daemon(); + test_waitsig(); + + if (sched_getaffinity(getpid(), sizeof(new), &new) < 0) { + pr_perror("Can't get new cpu affinity! errno: %d", errno); + exit(1); + } + + if (memcmp(&old, &new, sizeof(cpu_set_t))) + fail("Cpu affinity restore failed."); + else + pass(); + + return 0; +} diff --git a/test/zdtm/static/cpu-affinity0.desc b/test/zdtm/static/cpu-affinity0.desc new file mode 100644 index 0000000..0d0b8ae --- /dev/null +++ b/test/zdtm/static/cpu-affinity0.desc @@ -0,0 +1 @@ +{'dopts': '', 'ropts': '--with-cpu-affinity', 'flags': 'reqrst '}
From: anatasluo luolongjuna@gmail.com
When I compile criu with "make DEBUG=1" and use it to restore my program, criu crashes with a segmentation fault.
On aarch64, when criu is built with the "-O0" compile flag, the code running in the PIE cannot access the contents of ARCH_VDSO_SYMBOLS. To work around this, define these strings as local variables so that they live on the stack.
Signed-off-by: anatasluo luolongjuna@gmail.com --- criu/arch/aarch64/include/asm/vdso.h | 17 +++++++------- criu/arch/arm/include/asm/vdso.h | 9 +++++--- criu/arch/ppc64/include/asm/vdso.h | 34 +++++++++++++++++++--------- criu/arch/s390/include/asm/vdso.h | 17 +++++++++----- criu/arch/x86/include/asm/vdso.h | 23 +++++++++++++------ criu/pie/util-vdso.c | 2 ++ 6 files changed, 67 insertions(+), 35 deletions(-)
diff --git a/criu/arch/aarch64/include/asm/vdso.h b/criu/arch/aarch64/include/asm/vdso.h index 8a65e09..97a2440 100644 --- a/criu/arch/aarch64/include/asm/vdso.h +++ b/criu/arch/aarch64/include/asm/vdso.h @@ -16,15 +16,16 @@ * Workaround for VDSO array symbol table's relocation. * XXX: remove when compel/piegen will support aarch64. */ -static const char* __maybe_unused aarch_vdso_symbol1 = "__kernel_clock_getres"; -static const char* __maybe_unused aarch_vdso_symbol2 = "__kernel_clock_gettime"; -static const char* __maybe_unused aarch_vdso_symbol3 = "__kernel_gettimeofday"; -static const char* __maybe_unused aarch_vdso_symbol4 = "__kernel_rt_sigreturn"; +#define ARCH_VDSO_SYMBOLS_LIST \ + const char* aarch_vdso_symbol1 = "__kernel_clock_getres"; \ + const char* aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ + const char* aarch_vdso_symbol3 = "__kernel_gettimeofday"; \ + const char* aarch_vdso_symbol4 = "__kernel_rt_sigreturn";
-#define ARCH_VDSO_SYMBOLS \ - aarch_vdso_symbol1, \ - aarch_vdso_symbol2, \ - aarch_vdso_symbol3, \ +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, \ + aarch_vdso_symbol2, \ + aarch_vdso_symbol3, \ aarch_vdso_symbol4
extern void write_intraprocedure_branch(unsigned long to, unsigned long from); diff --git a/criu/arch/arm/include/asm/vdso.h b/criu/arch/arm/include/asm/vdso.h index f57790a..e96514e 100644 --- a/criu/arch/arm/include/asm/vdso.h +++ b/criu/arch/arm/include/asm/vdso.h @@ -11,8 +11,11 @@ */ #define VDSO_SYMBOL_MAX 2 #define VDSO_SYMBOL_GTOD 1 -#define ARCH_VDSO_SYMBOLS \ - "__vdso_clock_gettime", \ - "__vdso_gettimeofday" +#define ARCH_VDSO_SYMBOLS_LIST \ + const char* aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ + const char* aarch_vdso_symbol2 = "__vdso_gettimeofday"; +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, \ + aarch_vdso_symbol2,
#endif /* __CR_ASM_VDSO_H__ */ diff --git a/criu/arch/ppc64/include/asm/vdso.h b/criu/arch/ppc64/include/asm/vdso.h index 6c92348..fe04336 100644 --- a/criu/arch/ppc64/include/asm/vdso.h +++ b/criu/arch/ppc64/include/asm/vdso.h @@ -14,16 +14,28 @@ */ #define VDSO_SYMBOL_MAX 10 #define VDSO_SYMBOL_GTOD 5 -#define ARCH_VDSO_SYMBOLS \ - "__kernel_clock_getres", \ - "__kernel_clock_gettime", \ - "__kernel_get_syscall_map", \ - "__kernel_get_tbfreq", \ - "__kernel_getcpu", \ - "__kernel_gettimeofday", \ - "__kernel_sigtramp_rt64", \ - "__kernel_sync_dicache", \ - "__kernel_sync_dicache_p5", \ - "__kernel_time" +#define ARCH_VDSO_SYMBOLS_LIST \ + const char* aarch_vdso_symbol1 = "__kernel_clock_getres"; \ + const char* aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ + const char* aarch_vdso_symbol3 = "__kernel_get_syscall_map"; \ + const char* aarch_vdso_symbol4 = "__kernel_get_tbfreq"; \ + const char* aarch_vdso_symbol5 = "__kernel_getcpu"; \ + const char* aarch_vdso_symbol6 = "__kernel_gettimeofday"; \ + const char* aarch_vdso_symbol7 = "__kernel_sigtramp_rt64"; \ + const char* aarch_vdso_symbol8 = "__kernel_sync_dicache"; \ + const char* aarch_vdso_symbol9 = "__kernel_sync_dicache_p5"; \ + const char* aarch_vdso_symbol10 = "__kernel_time"; + +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, \ + aarch_vdso_symbol2, \ + aarch_vdso_symbol3, \ + aarch_vdso_symbol4, \ + aarch_vdso_symbol5, \ + aarch_vdso_symbol6, \ + aarch_vdso_symbol7, \ + aarch_vdso_symbol8, \ + aarch_vdso_symbol9, \ + aarch_vdso_symbol10
#endif /* __CR_ASM_VDSO_H__ */ diff --git a/criu/arch/s390/include/asm/vdso.h b/criu/arch/s390/include/asm/vdso.h index c54d848..ac71f59 100644 --- a/criu/arch/s390/include/asm/vdso.h +++ b/criu/arch/s390/include/asm/vdso.h @@ -12,13 +12,18 @@ #define VDSO_SYMBOL_GTOD 0
/* - * This definition is used in pie/util-vdso.c to initialize the vdso symbol + * These definitions are used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' */ -#define ARCH_VDSO_SYMBOLS \ - "__kernel_gettimeofday", \ - "__kernel_clock_gettime", \ - "__kernel_clock_getres", \ - "__kernel_getcpu" +#define ARCH_VDSO_SYMBOLS_LIST \ + const char* aarch_vdso_symbol1 = "__kernel_gettimeofday"; \ + const char* aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ + const char* aarch_vdso_symbol3 = "__kernel_clock_getres"; \ + const char* aarch_vdso_symbol4 = "__kernel_getcpu"; +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, \ + aarch_vdso_symbol2, \ + aarch_vdso_symbol3, \ + aarch_vdso_symbol4
#endif /* __CR_ASM_VDSO_H__ */ diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h index 28ae2d1..54d1fba 100644 --- a/criu/arch/x86/include/asm/vdso.h +++ b/criu/arch/x86/include/asm/vdso.h @@ -35,13 +35,22 @@ * vsyscall will be patched again when addressing: * https://github.com/checkpoint-restore/criu/issues/512 */ -#define ARCH_VDSO_SYMBOLS \ - "__vdso_clock_gettime", \ - "__vdso_getcpu", \ - "__vdso_gettimeofday", \ - "__vdso_time", \ - "__kernel_sigreturn", \ - "__kernel_rt_sigreturn" + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char* aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ + const char* aarch_vdso_symbol2 = "__vdso_getcpu"; \ + const char* aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ + const char* aarch_vdso_symbol4 = "__vdso_time"; \ + const char* aarch_vdso_symbol5 = "__kernel_sigreturn"; \ + const char* aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, \ + aarch_vdso_symbol2, \ + aarch_vdso_symbol3, \ + aarch_vdso_symbol4, \ + aarch_vdso_symbol5, \ + aarch_vdso_symbol6
/* "__kernel_vsyscall", */
diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 58b2768..c717f2d 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -219,6 +219,8 @@ static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, uintptr_t dynsymbol_names, Hash_t *hash, Dyn_t *dyn_symtab) { + ARCH_VDSO_SYMBOLS_LIST + const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS };
From: "fu.lin" fu.lin10@huawei.com
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/namespaces.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/criu/namespaces.c b/criu/namespaces.c index 796f412..9ffcd16 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -1294,10 +1294,10 @@ static int usernsd(int sk) }
unsc_msg_pid_fd(&um, &pid, &fd); - pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags); + pr_debug("uns: daemon calls (%d, %d, %x)\n", pid, fd, flags);
if (fd < 0 && flags & UNS_FDOUT) { - pr_err("uns: bad flags/fd %p %d %x\n", call, fd, flags); + pr_err("uns: bad flags/fd %d %x\n", fd, flags); BUG(); }
From: Jingxian He hejingxian@huawei.com
Add secure compilation options: -fstack-protector -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Fu Lin fulin10@huawei.com --- Makefile | 4 ++++ criu/Makefile | 2 +- criu/pie/Makefile | 1 + criu/pie/Makefile.library | 2 ++ lib/Makefile | 1 + lib/c/Makefile | 2 +- scripts/nmk/scripts/build.mk | 5 +++-- 7 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/Makefile b/Makefile index c33494b..a9d7d94 100644 --- a/Makefile +++ b/Makefile @@ -80,6 +80,10 @@ ifeq ($(ARCH),mips) DEFINES := -DCONFIG_MIPS endif
+# secure compilation options +CFLAGS += -fstack-protector-all -fPIE +LDFLAGS += -pie + # # CFLAGS_PIE: # diff --git a/criu/Makefile b/criu/Makefile index ceb49ce..0fabffc 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -85,7 +85,7 @@ $(obj)/%: pie
$(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ -Wl,-z,relro,-z,now,-z,noexecstack -fPIE -pie
# diff --git a/criu/pie/Makefile b/criu/pie/Makefile index 265dcf8..40b5804 100644 --- a/criu/pie/Makefile +++ b/criu/pie/Makefile @@ -6,6 +6,7 @@ target := parasite restorer
CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) CFLAGS += $(CFLAGS_PIE) +CFLAGS := $(filter-out -fstack-protector -fstack-protector-all,$(CFLAGS)) ccflags-y += -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 ccflags-y += -Wp,-U_FORTIFY_SOURCE -Wp,-D_FORTIFY_SOURCE=0
diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library index da2a2fa..c022d06 100644 --- a/criu/pie/Makefile.library +++ b/criu/pie/Makefile.library @@ -27,3 +27,5 @@ CFLAGS += $(CFLAGS_PIE) ifeq ($(ARCH),mips) CFLAGS += -fno-stack-protector -DCR_NOGLIBC -mno-abicalls -fno-pic endif + +CFLAGS := $(filter-out -fstack-protector -fstack-protector-all,$(CFLAGS)) diff --git a/lib/Makefile b/lib/Makefile index f9b6670..bc1b513 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -14,6 +14,7 @@ lib/c/Makefile: ; lib/c/%: .FORCE $(Q) $(MAKE) $(build)=lib/c $@
+CFLAGS := $(filter-out -fPIE,$(CFLAGS)) cflags-so += $(CFLAGS) -rdynamic -Wl,-soname,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR) ldflags-so += -lprotobuf-c
diff --git a/lib/c/Makefile b/lib/c/Makefile index af01467..d7f6491 100644 --- a/lib/c/Makefile +++ b/lib/c/Makefile @@ -4,5 +4,5 @@ obj-y += ./images/rpc.pb-c.o ccflags-y += -iquote criu/$(ARCH_DIR)/include ccflags-y += -iquote criu/include ccflags-y += -iquote images -ccflags-y += -fPIC -fno-stack-protector +ccflags-y += -fPIC ldflags-y += -r -z noexecstack diff --git a/scripts/nmk/scripts/build.mk b/scripts/nmk/scripts/build.mk index d01d2b7..6f366d7 100644 --- a/scripts/nmk/scripts/build.mk +++ b/scripts/nmk/scripts/build.mk @@ -15,8 +15,9 @@ lib-name := lib-target := hostprogs-y := libso-y := -ld_flags := -ldflags-so := +ld_flags := -Wl,-z,relro,-z,now,-z,noexecstack +ldflags-so := -Wl,-z,relro,-z,now,-z,noexecstack +ldflags-y := -z relro -z now -z noexecstack arflags-y := target := deps-y :=
From: "fu.lin" fu.lin10@huawei.com
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/tty.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/criu/tty.c b/criu/tty.c index dee8d46..b34cfc2 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -2023,6 +2023,11 @@ static int dump_one_tty(int lfd, u32 id, const struct fd_parms *p) pr_info("Dumping tty %d with id %#x\n", lfd, id);
driver = get_tty_driver(p->stat.st_rdev, p->stat.st_dev); + if (driver == NULL) { + pr_err("Can't get tty driver\n"); + return -1; + } + if (driver->fd_get_index) index = driver->fd_get_index(lfd, p); else
Signed-off-by: fu.lin fulin10@huawei.com --- test/zdtm/static/socket-tcp-nfconntrack.desc | 2 +- test/zdtm/static/socket-tcp.c | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/test/zdtm/static/socket-tcp-nfconntrack.desc b/test/zdtm/static/socket-tcp-nfconntrack.desc index add2513..05bdb49 100644 --- a/test/zdtm/static/socket-tcp-nfconntrack.desc +++ b/test/zdtm/static/socket-tcp-nfconntrack.desc @@ -1 +1 @@ -{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} +{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid excl'} diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c index 5158fe3..f73b504 100644 --- a/test/zdtm/static/socket-tcp.c +++ b/test/zdtm/static/socket-tcp.c @@ -57,6 +57,13 @@ int write_data(int fd, const unsigned char *buf, int size) return 0; }
+#ifdef ZDTM_CONNTRACK +static void ipt_flush(void) +{ + system("iptables -w --flush"); +} +#endif + int main(int argc, char **argv) { unsigned char buf[BUF_SIZE]; @@ -72,6 +79,12 @@ int main(int argc, char **argv) pr_perror("unshare"); return 1; } + + if (atexit(ipt_flush) != 0) { + pr_perror("atexit"); + return 1; + } + if (system("ip link set up dev lo")) return 1; if (system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT"))
This case sometimes causes a SIGILL signal on the arm64 platform.
<<ARM Cortex-A Series Programmer's Guide for ARMv8-A>> notes: The ARM architecture does not require the hardware to ensure coherency between instruction caches and memory, even for locations of shared memory.
Therefore, we need to flush the dcache and icache for self-modifying code.
- https://developer.arm.com/documentation/den0024/a/Caches/Point-of-coherency-...
Signed-off-by: fu.lin fulin10@huawei.com --- test/zdtm/static/maps00.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/test/zdtm/static/maps00.c b/test/zdtm/static/maps00.c index f2da9b9..83533f8 100644 --- a/test/zdtm/static/maps00.c +++ b/test/zdtm/static/maps00.c @@ -173,7 +173,8 @@ static int check_map(struct map *map) if (!sigsetjmp(segv_ret, 1)) { if (map->prot & PROT_WRITE) { - memcpy(map->ptr,test_func, getpagesize()); + memcpy(map->ptr,test_func, ONE_MAP_SIZE); + __builtin___clear_cache(map->ptr, map->ptr+ONE_MAP_SIZE); } else { if (!(map->flag & MAP_ANONYMOUS)) { uint8_t funlen = (uint8_t *)check_map - (uint8_t *)test_func; @@ -184,14 +185,15 @@ static int check_map(struct map *map) } } } - if (!(map->flag & MAP_ANONYMOUS) || map->prot & PROT_WRITE) + if (!(map->flag & MAP_ANONYMOUS) || (map->prot & PROT_WRITE)) { /* Function body has been copied into the mapping */ ((int (*)(void))map->ptr)(); /* perform exec access */ - else + } else { /* No way to copy function body into mapping, * clear exec bit from effective protection */ prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; + } } else prot &= PROT_WRITE | PROT_READ | !PROT_EXEC;
From: Jingxian He hejingxian@huawei.com
Add a pin memory method for criu to improve memory recovery speed and to avoid saving the user's private data to files.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/config.c | 1 + criu/cr-dump.c | 5 ++ criu/cr-restore.c | 5 ++ criu/crtools.c | 3 +- criu/include/cr_options.h | 1 + criu/include/mem.h | 2 + criu/include/restorer.h | 28 ++++++++ criu/mem.c | 130 +++++++++++++++++++++++++++++++++++++- criu/pie/restorer.c | 25 +++++++- criu/seize.c | 6 ++ 10 files changed, 203 insertions(+), 3 deletions(-)
diff --git a/criu/config.c b/criu/config.c index 5a53256..61b81fa 100644 --- a/criu/config.c +++ b/criu/config.c @@ -542,6 +542,7 @@ int parse_options(int argc, char **argv, bool *usage_error, { "pre-dump-mode", required_argument, 0, 1097}, { "file-validation", required_argument, 0, 1098 }, BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), + BOOL_OPT("pin-memory", &opts.pin_memory), { }, };
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f078c27..8575516 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1778,6 +1778,11 @@ static int cr_dump_finish(int ret) close_service_fd(CR_PROC_FD_OFF); close_image_dir();
+ if (ret == 0 && opts.pin_memory) { + pr_info("start restore_task_special_pages\n"); + restore_task_special_pages(0); + } + if (ret) { pr_err("Dumping FAILED.\n"); } else { diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1374a69..27f3c54 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3869,6 +3869,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns task_args->clone_restore_fn, task_args->thread_args);
+ if (opts.pin_memory) + task_args->pin_memory = true; + else + task_args->pin_memory = false; + /* * An indirect call to task_restore, note it never returns * and restoring core is extremely destructive. diff --git a/criu/crtools.c b/criu/crtools.c index 949dc9f..7bda86d 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -441,8 +441,9 @@ usage: " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 'buildid' (default).\n" -" --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" +" --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" " same cpu quantity.\n" +" --pin-memory Use pin memory method for checkpoint and restore.\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index fda54a4..a4dc5b8 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -176,6 +176,7 @@ struct cr_options { int file_validation_method; /* restore cpu affinity */ int with_cpu_affinity; + int pin_memory; };
extern struct cr_options opts; diff --git a/criu/include/mem.h b/criu/include/mem.h index 251cb1a..3b3fdf8 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -50,4 +50,6 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); bool should_dump_page(VmaEntry *vmae, u64 pme); +int dump_task_special_pages(int pid); +int restore_task_special_pages(int pid); #endif /* __CR_MEM_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index bd6ef6a..9614720 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -225,6 +225,7 @@ struct task_restore_args { int lsm_type; int child_subreaper; bool has_clone3_set_tid; + bool pin_memory; } __aligned(64);
/* @@ -317,4 +318,31 @@ enum { #define __r_sym(name) restorer_sym ## name #define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name))
+#define PIN_MEM_FILE "/dev/pinmem" +#define PIN_MEM_MAGIC 0x59 +#define _SET_PIN_MEM_AREA 1 +#define _CLEAR_PIN_MEM_AREA 2 +#define _REMAP_PIN_MEM_AREA 3 +#define _DUMP_SEPCIAL_PAGES 6 +#define _RETORE_SEPCIAL_PAGES 7 +#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) +#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) +#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) +#define DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) +#define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) + +#define ONCE_PIN_MEM_SIZE_LIMIT 32 * 1024 * 1024 +#define MAX_PIN_MEM_AREA_NUM 16 + +struct pin_mem_area { + unsigned long virt_start; + unsigned long virt_end; +}; + +struct pin_mem_area_set { + unsigned int pid; + unsigned int area_num; + struct pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; +}; + #endif /* __CR_RESTORER_H__ */ diff --git a/criu/mem.c b/criu/mem.c index 167838b..2eabb8d 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -438,6 +438,119 @@ again: return ret; }
+bool should_pin_vmae(VmaEntry *vmae) +{ + /* + * vDSO area must be always dumped because on restore + * we might need to generate a proxy. + */ + if (vma_entry_is(vmae, VMA_AREA_VDSO)) + return false; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. + */ + if (vma_entry_is(vmae, VMA_AREA_VVAR)) + return false; + + if (vma_entry_is(vmae, VMA_AREA_AIORING)) + return false; + if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) + return true; + + return false; +} + +static int pin_one_pmas(int fd, unsigned long start, + unsigned long *pend, struct pstree_item *item) +{ + int ret; + unsigned int index = 0; + unsigned long end; + unsigned long next = start; + struct pin_mem_area_set pmas; + struct pin_mem_area *pma; + + end = *pend; + while (start < end) { + next = (start + ONCE_PIN_MEM_SIZE_LIMIT > end) ? 
end : (start + ONCE_PIN_MEM_SIZE_LIMIT); + pma = &(pmas.mem_area[index]); + pma->virt_start = start; + pma->virt_end = next; + index++; + start += ONCE_PIN_MEM_SIZE_LIMIT; + if (index >= MAX_PIN_MEM_AREA_NUM) + break; + } + *pend = next; + pmas.area_num = index; + pmas.pid = vpid(item); + ret = ioctl(fd, SET_PIN_MEM_AREA, &pmas); + if (ret < 0) + pr_err("pin mem fail, errno: %s\n", strerror(errno)); + return ret; +} +static int pin_vmae(VmaEntry *vmae, struct pstree_item *item) +{ + int fd; + int ret = 0; + unsigned long start, end; + + fd = open(PIN_MEM_FILE, O_RDWR); + if (fd < 0) { + pr_err("open file: %s fail.\n", PIN_MEM_FILE); + return -1; + } + start = vmae->start; + while (start < vmae->end) { + end = vmae->end; + ret = pin_one_pmas(fd, start, &end, item); + if (ret < 0) + break; + start = end; + } + close(fd); + return ret; +} + +int dump_task_special_pages(int pid) +{ + int fd, ret; + + fd = open(PIN_MEM_FILE, O_RDWR, 0); + if (fd < 0) { + pr_warn("error open file: %s\n", PIN_MEM_FILE); + return -1; + } + ret = ioctl(fd, DUMP_SEPCIAL_PAGES, (unsigned long) &pid); + if (ret < 0) { + pr_warn("No need DUMP_SEPCIAL_PAGES for %d\n", pid); + } + close(fd); + return ret; +} + +int restore_task_special_pages(int pid) +{ + int fd, ret; + + fd = open(PIN_MEM_FILE, O_RDWR, 0); + if (fd < 0) { + pr_warn("error open file: %s\n", PIN_MEM_FILE); + return -1; + } + ret = ioctl(fd, RETORE_SEPCIAL_PAGES, (unsigned long) &pid); + if (ret < 0) { + pr_warn("No need RETORE_SEPCIAL_PAGES for %d\n", pid); + } + close(fd); + return ret; +} + + static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasite_dump_pages_args *args, struct vm_area_list *vma_area_list, @@ -513,7 +626,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, if (possible_pid_reuse == -1) goto out_xfer; } - + if (opts.pin_memory) { + /* pin memory before dump pages */ + list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (should_pin_vmae(vma_area->e)) { + ret = 
pin_vmae(vma_area->e, item); + if (ret) { + exit_code = -1; + goto out_xfer; + } + } + } + }
/* * Step 1 -- generate the pagemap @@ -524,6 +648,10 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, parent_predump_mode = mdc->parent_ie->pre_dump_mode;
list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (opts.pin_memory && should_pin_vmae(vma_area->e)) { + continue; + } + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index c63f96b..1565e3c 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1414,6 +1414,24 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) return 0; }
+int remap_vmas(int pid) +{ + int fd, ret = 0; + + fd = sys_open(PIN_MEM_FILE, O_RDWR, 0); + if (fd == -1) { + pr_err("open file: %s fail.\n", PIN_MEM_FILE); + return -1;; + } + + ret = sys_ioctl(fd, REMAP_PIN_MEM_AREA, (unsigned long) &pid); + if (ret < 0) + pr_err("remap pin mem fail for pid: %d\n", pid); + sys_close(fd); + return ret; +} + + /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -1585,7 +1603,12 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } } - + if (args->pin_memory) { + if (remap_vmas(my_pid) < 0) { + pr_err("Remap vmas fail\n"); + goto core_restore_end; + } + } /* * Now read the contents (if any) */ diff --git a/criu/seize.c b/criu/seize.c index f973806..a661097 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -23,6 +23,7 @@ #include "string.h" #include "xmalloc.h" #include "util.h" +#include "mem.h"
#define NR_ATTEMPTS 5
@@ -655,6 +656,11 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) if (item->pid->state == TASK_DEAD) return;
+ if (opts.pin_memory) { + for (i = 0; i < item->nr_threads; i++) + dump_task_special_pages(item->threads[i].real); + } + /* * The st is the state we want to switch tasks into, * the item->state is the state task was in when we seized one.
From: Jingxian He hejingxian@huawei.com
The default pid recovery method cannot always recover the task pid. We add a new pid recovery method that sets the fork_pid of the parent task struct, and the kernel will allocate the pid according to fork_pid. The new pid recovery method also prevents other tasks from using the pids of the tasks being dumped.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/config.c | 1 + criu/cr-restore.c | 25 ++++++++++++++++++++++++- criu/crtools.c | 1 + criu/include/cr_options.h | 1 + criu/include/restorer.h | 3 +++ criu/pie/restorer.c | 25 ++++++++++++++++++++++++- 6 files changed, 54 insertions(+), 2 deletions(-)
diff --git a/criu/config.c b/criu/config.c index 61b81fa..a5bcf10 100644 --- a/criu/config.c +++ b/criu/config.c @@ -543,6 +543,7 @@ int parse_options(int argc, char **argv, bool *usage_error, { "file-validation", required_argument, 0, 1098 }, BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), BOOL_OPT("pin-memory", &opts.pin_memory), + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), { }, };
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 27f3c54..e050b88 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1365,6 +1365,23 @@ static int set_next_pid(void *arg) return 0; }
+static int write_fork_pid(int pid) +{ + int fd, ret; + + fd = open(PIN_MEM_FILE, O_RDWR); + if (fd < 0) { + pr_warn("error open file: %s\n", PIN_MEM_FILE); + return -1; + } + ret = ioctl(fd, SET_FORK_PID, &pid); + if (ret < 0) { + pr_warn("write fork pid fail, errno: %s\n", strerror(errno)); + } + close(fd); + return ret; +} + static inline int fork_with_pid(struct pstree_item *item) { unsigned long clone_flags; @@ -1462,7 +1479,7 @@ static inline int fork_with_pid(struct pstree_item *item) if (!(clone_flags & CLONE_NEWPID)) { lock_last_pid();
- if (!kdat.has_clone3_set_tid) { + if (!kdat.has_clone3_set_tid && !opts.use_fork_pid) { if (pid_ns && pid_ns->ext_key) { /* * Restoring into another namespace requires a helper @@ -1495,6 +1512,11 @@ static inline int fork_with_pid(struct pstree_item *item) ~(CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)), SIGCHLD, pid); } else { + if (opts.use_fork_pid) { + ret = write_fork_pid(pid); + if (ret < 0) + goto err_unlock; + } /* * Some kernel modules, such as network packet generator * run kernel thread upon net-namespace creation taking @@ -3873,6 +3895,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns task_args->pin_memory = true; else task_args->pin_memory = false; + task_args->use_fork_pid = opts.use_fork_pid ? true : false;
/* * An indirect call to task_restore, note it never returns diff --git a/criu/crtools.c b/criu/crtools.c index 7bda86d..9b3ef33 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -444,6 +444,7 @@ usage: " --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" " same cpu quantity.\n" " --pin-memory Use pin memory method for checkpoint and restore.\n" +" --use-fork-pid Allow to restore task pid by setting fork pid of task struct.\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index a4dc5b8..7fad678 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -177,6 +177,7 @@ struct cr_options { /* restore cpu affinity */ int with_cpu_affinity; int pin_memory; + int use_fork_pid; };
extern struct cr_options opts; diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 9614720..8fd47e2 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -226,6 +226,7 @@ struct task_restore_args { int child_subreaper; bool has_clone3_set_tid; bool pin_memory; + bool use_fork_pid; } __aligned(64);
/* @@ -325,11 +326,13 @@ enum { #define _REMAP_PIN_MEM_AREA 3 #define _DUMP_SEPCIAL_PAGES 6 #define _RETORE_SEPCIAL_PAGES 7 +#define _SET_FORK_PID 8 #define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) #define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) #define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) #define DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) #define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) +#define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int)
#define ONCE_PIN_MEM_SIZE_LIMIT 32 * 1024 * 1024 #define MAX_PIN_MEM_AREA_NUM 16 diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 1565e3c..4ab8a45 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1431,6 +1431,22 @@ int remap_vmas(int pid) return ret; }
+int write_fork_pid(int pid) +{ + int fd, ret; + + fd = sys_open(PIN_MEM_FILE, O_RDWR, 0); + if (fd < 0) { + pr_warn("error open file: %s\n", PIN_MEM_FILE); + return -1; + } + ret = sys_ioctl(fd, SET_FORK_PID, (unsigned long) &pid); + if (ret < 0) { + pr_warn("write fork pid fail fail: %d\n", pid); + } + sys_close(fd); + return ret; +}
/* * The main routine to restore task via sigreturn. @@ -1834,7 +1850,7 @@ long __export_restore_task(struct task_restore_args *args) long parent_tid; int i, fd = -1;
- if (!args->has_clone3_set_tid) { + if (!args->has_clone3_set_tid && !args->use_fork_pid) { /* One level pid ns hierarhy */ fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0); if (fd < 0) { @@ -1866,6 +1882,13 @@ long __export_restore_task(struct task_restore_args *args) c_args.parent_tid = ptr_to_u64(&parent_tid); pr_debug("Using clone3 to restore the process\n"); RUN_CLONE3_RESTORE_FN(ret, c_args, sizeof(c_args), &thread_args[i], args->clone_restore_fn); + } else if (args->use_fork_pid) { + if (write_fork_pid(thread_args[i].pid) < 0) { + pr_err("Clone fail with fork pid\n"); + mutex_unlock(&task_entries_local->last_pid_mutex); + goto core_restore_end; + } + RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn); } else { last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s); sys_lseek(fd, 0, SEEK_SET);
Signed-off-by: fu.lin fulin10@huawei.com --- test/zdtm.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/test/zdtm.py b/test/zdtm.py index dff64d4..bd44ad1 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- # vim: noet ts=8 sw=8 sts=8 from __future__ import absolute_import, division, print_function, unicode_literals
@@ -2102,7 +2103,8 @@ class Launcher:
if self.__fail: print_sep("FAIL", "#") - sys.exit(1) + + return self.__fail
def all_tests(opts): @@ -2356,10 +2358,11 @@ def run_tests(opts): else: launcher.skip(t, "no flavors") finally: - launcher.finish() + fail = launcher.finish() if opts['join_ns']: subprocess.Popen(["ip", "netns", "delete", "zdtm_netns"]).wait() - + if fail: + sys.exit(1)
sti_fmt = "%-40s%-10s%s"
From: Jingxian He hejingxian@huawei.com
When criu checkpoints/restores a task, it only restores the context itself, not the memory address at which the context is stored.
This patch handles the problem caused by a CVE bugfix; details: - https://nvd.nist.gov/vuln/detail/CVE-2016-4565 - https://openfabrics.org/images/2018workshop/presentations/113_MRuhl_Journeyt...
Brief: Refresh the security context address of the file. The infiniband code uses `write()` as a bi-directional `ioctl()`, and the `struct cred` address is checked during the `write()` process. However, criu uses some syscalls, such as `capset()` and `setgroups()`, to regenerate the new cred, and the file cred is fixed by `fcntl(F_SETOWN)`, so the address of the new cred is different from the one in the file. This patch fixes the `struct cred` address checking problem caused by the CVE fix in the infiniband drivers.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: luolongjun luolongjun@huawei.com Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/config.c | 1 + criu/cr-restore.c | 35 +++++++++++++++++++++++++++++++++++ criu/crtools.c | 2 ++ criu/include/cr_options.h | 1 + criu/include/fcntl.h | 4 ++++ criu/include/prctl.h | 4 ++++ criu/include/restorer.h | 3 +++ criu/pie/restorer.c | 38 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 88 insertions(+)
diff --git a/criu/config.c b/criu/config.c index e1de191..4d2b709 100644 --- a/criu/config.c +++ b/criu/config.c @@ -545,6 +545,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("pin-memory", &opts.pin_memory), BOOL_OPT("use-fork-pid", &opts.use_fork_pid), BOOL_OPT("with-notifier", &opts.with_notifier_kup), + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), { }, };
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1e2ed9a..05de2ef 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -709,6 +709,28 @@ static int __collect_child_pids(struct pstree_item *p, int state, unsigned int * return 0; }
+static int collect_child_fds(int state, unsigned int *n, struct pstree_item *me) +{ + struct list_head *list = &rsti(me)->fds; + struct fdinfo_list_entry *fle, *tmp; + + *n = 0; + list_for_each_entry_safe(fle, tmp, list, ps_list) { + if (fle->fe->type == state) { + int *child; + + child = rst_mem_alloc(sizeof(*child), RM_PRIVATE); + if (!child) + return -1; + + (*n)++; + *child = fle->fe->fd; + } + } + + return 0; +} + static int collect_child_pids(int state, unsigned int *n) { struct pstree_item *pi; @@ -733,6 +755,12 @@ static int collect_child_pids(int state, unsigned int *n) return __collect_child_pids(current, state, n); }
+static int collect_chr_fds(struct pstree_item *me, struct task_restore_args *ta) +{ + ta->setcred_pids = (int *)rst_mem_align_cpos(RM_PRIVATE); + return collect_child_fds(FD_TYPES__CHR, &ta->setcred_pids_n, me); +} + static int collect_helper_pids(struct task_restore_args *ta) { ta->helpers = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); @@ -938,6 +966,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (collect_zombie_pids(ta) < 0) return -1;
+ if (opts.with_fd_cred && collect_chr_fds(current, ta) < 0) + return -1; + if (collect_inotify_fds(ta) < 0) return -1;
@@ -3723,6 +3754,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns RST_MEM_FIXUP_PPTR(task_args->helpers); RST_MEM_FIXUP_PPTR(task_args->zombies); RST_MEM_FIXUP_PPTR(task_args->vma_ios); + if (opts.with_fd_cred) + RST_MEM_FIXUP_PPTR(task_args->setcred_pids); + else + task_args->setcred_pids_n = UINT_MAX; RST_MEM_FIXUP_PPTR(task_args->inotify_fds);
task_args->compatible_mode = core_is_compat(core); diff --git a/criu/crtools.c b/criu/crtools.c index d53be3d..942e683 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -448,6 +448,8 @@ usage: " --with-notifier Allow to checkout/restore kup notifier chain. This\n" " feature needs the kernel's assistance.\n" " Only for the host with these feature.\n" +" --with-fd-cred Allow to make the restored process has the same cred\n" +" as checkout assisted by kernel.\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 1acb5ef..5b0ff24 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -179,6 +179,7 @@ struct cr_options { int pin_memory; int use_fork_pid; int with_notifier_kup; + int with_fd_cred; };
extern struct cr_options opts; diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h index ea9d48c..0936337 100644 --- a/criu/include/fcntl.h +++ b/criu/include/fcntl.h @@ -19,6 +19,10 @@ struct f_owner_ex { #define F_GETOWNER_UIDS 17 #endif
+#ifndef F_SETCRED +#define F_SETCRED 18 +#endif + /* * These things are required to compile on CentOS-6 */ diff --git a/criu/include/prctl.h b/criu/include/prctl.h index 8e7fef3..ecbc69a 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -82,4 +82,8 @@ struct prctl_mm_map { # define PR_GET_THP_DISABLE 42 #endif
+#ifndef PR_DEFAULT_CRED +# define PR_DEFAULT_CRED 54 +#endif + #endif /* __CR_PRCTL_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 7152b34..4afff1b 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -180,6 +180,9 @@ struct task_restore_args { pid_t *zombies; unsigned int zombies_n;
+ int *setcred_pids; + unsigned int setcred_pids_n; + int *inotify_fds; /* fds to cleanup inotify events at CR_STATE_RESTORE_SIGCHLD stage */ unsigned int inotify_fds_n;
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index a6245e4..2173c5e 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -78,6 +78,7 @@ static struct task_entries *task_entries_local; static futex_t thread_inprogress; static futex_t thread_start; +static futex_t cred_set; static pid_t *helpers; static int n_helpers; static pid_t *zombies; @@ -345,6 +346,41 @@ static int restore_creds(struct thread_creds_args *args, int procfd, return 0; }
+static int update_cred_ref(struct task_restore_args *ta) +{ + int i; + int ret; + int pid = sys_getpid(); + long int tid = sys_gettid(); + + if (ta->setcred_pids_n == UINT_MAX) { + pr_info("no need to keep the same cred \n"); + return 0; + } + + if (pid == tid) { + /* let main thread finish cred update first */ + ret = sys_prctl(PR_DEFAULT_CRED, 0, 0, 0, 0); + pr_info("main cred restore \n"); + futex_set_and_wake(&cred_set, 1); + } else { + futex_wait_until(&cred_set, 1); + pr_info("other cred restore \n"); + ret = sys_prctl(PR_DEFAULT_CRED, 0, 0, 0, 0); + } + + if (ret) + return ret; + + pr_info("%ld (%d) is going to update current cred \n", tid, pid); + + for (i = 0; i < ta->setcred_pids_n; i++) { + sys_fcntl(ta->setcred_pids[i], F_SETCRED, 0); + } + + return 0; +} + /* * This should be done after creds restore, as * some creds changes might drop the value back @@ -708,6 +744,7 @@ long __export_restore_thread(struct thread_restore_args *args)
ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type); + ret = ret || update_cred_ref(args->ta); ret = ret || restore_dumpable_flag(&args->ta->mm); ret = ret || restore_pdeath_sig(args); if (ret) @@ -2099,6 +2136,7 @@ long __export_restore_task(struct task_restore_args *args) */ ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type); + ret = ret || update_cred_ref(args); ret = ret || restore_dumpable_flag(&args->mm); ret = ret || restore_pdeath_sig(args->t); ret = ret || restore_child_subreaper(args->child_subreaper);
From: Jingxian He hejingxian@huawei.com
Add notifier calling method for checkpoint and restore during kernel module upgrading.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Xiaoguang Li lixiaoguang2@huawei.com Signed-off-by: He Jingxian hejingxian@huawei.com Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/config.c | 1 + criu/cr-dump.c | 33 ++++++++++ criu/cr-restore.c | 22 ++++++- criu/crtools.c | 3 + criu/include/cr_options.h | 1 + criu/include/restorer.h | 1 + criu/include/util.h | 42 ++++++++++++ criu/pie/restorer.c | 135 ++++++++++++++++++++++++++++++++++---- criu/pie/util.c | 91 +++++++++++++++++++++++++ include/common/lock.h | 4 ++ 10 files changed, 319 insertions(+), 14 deletions(-)
diff --git a/criu/config.c b/criu/config.c index a5bcf10..e1de191 100644 --- a/criu/config.c +++ b/criu/config.c @@ -544,6 +544,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), BOOL_OPT("pin-memory", &opts.pin_memory), BOOL_OPT("use-fork-pid", &opts.use_fork_pid), + BOOL_OPT("with-notifier", &opts.with_notifier_kup), { }, };
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 8575516..96c0cd3 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1699,6 +1699,8 @@ static int cr_lazy_mem_dump(void) return ret; }
+static enum notifier_state notifier_state = NOTHING_COMPLETE; + static int cr_dump_finish(int ret) { int post_dump_ret = 0; @@ -1783,6 +1785,20 @@ static int cr_dump_finish(int ret) restore_task_special_pages(0); }
+ if (ret != 0 && opts.with_notifier_kup) { + pr_info("call notifier rollback\n"); + switch (notifier_state) { + case PRE_FREEZE_COMPLETE: + notifier_kup(PRE_FREEZE, ROLLBACK, true); + break; + case FREEZE_TO_KILL_COMPLETE: + notifier_kup(FREEZE_TO_KILL, ROLLBACK, true); + break; + default: + break; + } + } + if (ret) { pr_err("Dumping FAILED.\n"); } else { @@ -1816,6 +1832,14 @@ int cr_dump_tasks(pid_t pid) goto err; root_item->pid->real = pid;
+ if (notifier_kup(PRE_FREEZE, PREPARE, opts.with_notifier_kup)) { + /* disable rollback function because we has already rollbacked. */ + opts.with_notifier_kup = false; + pr_err("call notifier: %d err\n", PRE_FREEZE); + goto err; + } else + notifier_state = PRE_FREEZE_COMPLETE; + pre_dump_ret = run_scripts(ACT_PRE_DUMP); if (pre_dump_ret != 0) { pr_err("Pre dump script failed with %d!\n", pre_dump_ret); @@ -1971,6 +1995,15 @@ int cr_dump_tasks(pid_t pid) ret = write_img_inventory(&he); if (ret) goto err; + + ret = notifier_kup(FREEZE_TO_KILL, PREPARE, opts.with_notifier_kup); + if (ret) { + opts.with_notifier_kup = false; + pr_err("call notifier:%d err\n", FREEZE_TO_KILL); + goto err; + } else + notifier_state = FREEZE_TO_KILL_COMPLETE; + err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index e050b88..1e2ed9a 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1977,8 +1977,10 @@ static int restore_task_with_children(void *_arg) return 0;
err: - if (current->parent == NULL) + if (current->parent == NULL) { + do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); futex_abort_and_wake(&task_entries->nr_in_progress); + } exit(1); }
@@ -2421,8 +2423,10 @@ skip_ns_bouncing: */ attach_to_tasks(root_seized);
- if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) + if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) { + pr_err("Can't switch to CR_STATE_RESTORE_CREDS stage\n"); goto out_kill_network_unlocked; + }
timing_stop(TIME_RESTORE);
@@ -2599,6 +2603,15 @@ int cr_restore_tasks(void) goto err;
ret = restore_root_task(root_item); + if (ret) + goto err; + + ret = notifier_kup(POST_RUN, PREPARE, opts.with_notifier_kup); + if (ret < 0) { + opts.with_notifier_kup = false; + pr_err("calling POST_RUN notifier list return err"); + } + err: cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); return ret; @@ -3861,6 +3874,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns */ task_args->lsm_type = kdat.lsm;
+ task_args->with_notifier_kup = opts.with_notifier_kup; + /* * Make root and cwd restore _that_ late not to break any * attempts to open files by paths above (e.g. /proc). @@ -3907,6 +3922,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns err: free_mappings(&self_vmas); err_nv: + if (current->parent == NULL && opts.with_notifier_kup) + do_notifier_rollback(true, POST_UPDATE_KERNEL_COMPLETE); + /* Just to be sure */ exit(1); return -1; diff --git a/criu/crtools.c b/criu/crtools.c index 9b3ef33..d53be3d 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -445,6 +445,9 @@ usage: " same cpu quantity.\n" " --pin-memory Use pin memory method for checkpoint and restore.\n" " --use-fork-pid Allow to restore task pid by setting fork pid of task struct.\n" +" --with-notifier Allow to checkout/restore kup notifier chain. This\n" +" feature needs the kernel's assistance.\n" +" Only for the host with these feature.\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 7fad678..1acb5ef 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -178,6 +178,7 @@ struct cr_options { int with_cpu_affinity; int pin_memory; int use_fork_pid; + int with_notifier_kup; };
extern struct cr_options opts; diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 8fd47e2..7152b34 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -227,6 +227,7 @@ struct task_restore_args { bool has_clone3_set_tid; bool pin_memory; bool use_fork_pid; + bool with_notifier_kup; } __aligned(64);
/* diff --git a/criu/include/util.h b/criu/include/util.h index c2baf27..d226d2c 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -13,6 +13,8 @@ #include <sys/sysmacros.h> #include <dirent.h> #include <poll.h> +#include <sys/stat.h> +#include <fcntl.h>
#include "int.h" #include "common/compiler.h" @@ -380,4 +382,44 @@ static inline void print_stack_trace(pid_t pid) {}
extern int mount_detached_fs(const char *fsname);
+#define NOTIFY_PROC_PATH "/sys/kernel/modrestore/nvwa_notifier" + +#if __has_include("linux/modrestore.h") +#define CONFIG_EULEROS_MODRESTORE_NOTIFY +# include <linux/modrestore.h> +#else +enum KUP_HOOK_POINT { + PRE_FREEZE, + FREEZE_TO_KILL, + PRE_UPDATE_KERNEL, + POST_UPDATE_KERNEL, + UNFREEZE_TO_RUN, + POST_RUN, + + KUP_HOOK_MAX, +}; + +enum nvwa_cmd { + PREPARE = 0, + ROLLBACK, + + NVWA_CMD_MAX, +}; +#endif + +enum notifier_state { + NOTHING_COMPLETE, + PRE_FREEZE_COMPLETE, + FREEZE_TO_KILL_COMPLETE, + PRE_UPDATE_KERNEL_COMPLETE, + POST_UPDATE_KERNEL_COMPLETE, + UNFREEZE_TO_RUN_COMPLETE, + POST_RUN_COMPLETE, + + NOTIFIER_ROLLBACK_DONE = 0xfc17173b, /* has done rollback */ +}; + +int notifier_kup(enum KUP_HOOK_POINT, enum nvwa_cmd, bool); +void do_notifier_rollback(bool, enum notifier_state); + #endif /* __CR_UTIL_H__ */ diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 4ab8a45..a6245e4 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -77,6 +77,7 @@
static struct task_entries *task_entries_local; static futex_t thread_inprogress; +static futex_t thread_start; static pid_t *helpers; static int n_helpers; static pid_t *zombies; @@ -119,10 +120,28 @@ void parasite_cleanup(void) extern void cr_restore_rt (void) asm ("__cr_restore_rt") __attribute__ ((visibility ("hidden")));
+static int args_with_notifier_kup; +static enum notifier_state notifier_state = POST_UPDATE_KERNEL_COMPLETE; +static futex_t notifier_done; + static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) { char *r; int i; + rt_sigaction_t act; + + if (signal == SIGSEGV || signal == SIGBUS || signal == SIGILL) { + /* Make sure we exit with the right signal at the end. So for instance + * the core will be dumped if enabled. */ + pr_info("recv signal: %d\n", signal); + do_notifier_rollback(args_with_notifier_kup, notifier_state); + ksigemptyset (&act.rt_sa_mask); + act.rt_sa_flags = SA_SIGINFO | SA_RESTART; + act.rt_sa_handler = (rt_sighandler_t)SIG_DFL; + sys_sigaction(signal, &act, NULL, sizeof(k_rtsigset_t)); + sys_kill(sys_getpid(),signal); + return; + }
/* We can ignore helpers that die, we expect them to after * CR_STATE_RESTORE is finished. */ @@ -149,10 +168,14 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status);
+ pr_info("%s: trace do_notifier_rollback\n", __func__); + do_notifier_rollback(args_with_notifier_kup, notifier_state); futex_abort_and_wake(&task_entries_local->nr_in_progress); /* sa_restorer may be unmaped, so we can't go back to userspace*/ sys_kill(sys_getpid(), SIGSTOP); sys_exit_group(1); + + /* for notifier, do nothing when receiving SIGCHLD signal */ }
static int lsm_set_label(char *label, char *type, int procfd) @@ -604,6 +627,27 @@ static void noinline rst_sigreturn(unsigned long new_sp, ARCH_RT_SIGRETURN(new_sp, sigframe); }
+/* Notice: only one task, so it isn't necessary to consider concurrent. */ +static int do_notifier(bool *notify) +{ + int retval = 0; + + if (!*notify) + return 0; + + pr_info("unfreeze_to_run restore notifier\n"); + retval = notifier_kup(UNFREEZE_TO_RUN, PREPARE, true); + if (retval) { + *notify = false; + notifier_state = NOTIFIER_ROLLBACK_DONE; + pr_err("call notifier: %d err\n", UNFREEZE_TO_RUN); + } + + notifier_state = UNFREEZE_TO_RUN_COMPLETE; + + return retval; +} + /* * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. @@ -642,12 +686,18 @@ long __export_restore_thread(struct thread_restore_args *args)
pr_info("%ld: Restored\n", sys_gettid());
- restore_finish_stage(task_entries_local, CR_STATE_RESTORE); + if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE) & FUTEX_ABORT_FLAG)) { + pr_err("%s: abort by CR_STATE_RESTORE\n", __func__); + goto core_restore_end; + }
if (restore_signals(args->siginfo, args->siginfo_n, false)) goto core_restore_end;
- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); + if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD) & FUTEX_ABORT_FLAG)) { + pr_err("%s: abort by CR_STATE_RESTORE_SIGCHLD\n", __func__); + goto core_restore_end; + }
/* * Make sure it's before creds, since it's privileged @@ -663,16 +713,29 @@ long __export_restore_thread(struct thread_restore_args *args) if (ret) BUG();
- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); + if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS) & FUTEX_ABORT_FLAG)) { + pr_err("%s: abort by CR_STATE_RESTORE_CREDS\n", __func__); + goto core_restore_end; + }
futex_dec_and_wake(&thread_inprogress); + futex_wait_while(&thread_start, 0); + if (!!(futex_get(&thread_start) & FUTEX_ABORT_FLAG)) { + pr_err("%s: abort by thread_start\n", __func__); + goto wait_notifier; + }
new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe); rst_sigreturn(new_sp, rt_sigframe);
core_restore_end: - pr_err("Restorer abnormal termination for %ld\n", sys_getpid()); - futex_abort_and_wake(&task_entries_local->nr_in_progress); + futex_abort_and_wake(&thread_start); + futex_abort_and_wake(&task_entries_local->start); + +wait_notifier: + pr_err("%s: Restorer abnormal termination for %ld\n", __func__, sys_getpid()); + futex_wait_while(&notifier_done, 0); + sys_exit_group(1); return -1; } @@ -1470,6 +1533,10 @@ long __export_restore_task(struct task_restore_args *args) rt_sigaction_t act; bool has_vdso_proxy;
+ futex_set(&thread_inprogress, 1); + futex_set(&thread_start, 0); + futex_set(&notifier_done, 0); + bootstrap_start = args->bootstrap_start; bootstrap_len = args->bootstrap_len;
@@ -1486,6 +1553,7 @@ long __export_restore_task(struct task_restore_args *args) #ifdef ARCH_HAS_LONG_PAGES __page_size = args->page_size; #endif + args_with_notifier_kup = args->with_notifier_kup;
ksigfillset(&act.rt_sa_mask); act.rt_sa_handler = sigchld_handler; @@ -1496,9 +1564,29 @@ long __export_restore_task(struct task_restore_args *args) pr_err("Failed to set SIGCHLD %ld\n", ret); goto core_restore_end; } + ret = sys_sigaction(SIGSEGV, &act, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Failed to set SIGCHLD %ld\n", ret); + goto core_restore_end; + } + + ret = sys_sigaction(SIGBUS, &act, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Failed to set SIGCHLD %ld\n", ret); + goto core_restore_end; + } + + ret = sys_sigaction(SIGILL, &act, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Failed to set SIGCHLD %ld\n", ret); + goto core_restore_end; + }
ksigemptyset(&to_block); ksigaddset(&to_block, SIGCHLD); + ksigaddset(&to_block, SIGSEGV); + ksigaddset(&to_block, SIGBUS); + ksigaddset(&to_block, SIGILL); ret = sys_sigprocmask(SIG_UNBLOCK, &to_block, NULL, sizeof(k_rtsigset_t)); if (ret) { pr_err("Failed to unblock SIGCHLD %ld\n", ret); @@ -1912,7 +2000,8 @@ long __export_restore_task(struct task_restore_args *args) pr_err("Unable to create a thread: %ld\n", ret); mutex_unlock(&task_entries_local->last_pid_mutex); goto core_restore_end; - } + } else + futex_inc(&thread_inprogress); }
mutex_unlock(&task_entries_local->last_pid_mutex); @@ -1936,7 +2025,14 @@ long __export_restore_task(struct task_restore_args *args)
pr_info("%ld: Restored\n", sys_getpid());
- restore_finish_stage(task_entries_local, CR_STATE_RESTORE); + if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE) & FUTEX_ABORT_FLAG)) { + pr_err("%s: abort by CR_STATE_RESTORE\n", __func__); + goto core_restore_end; + } + + ret = do_notifier(&args->with_notifier_kup); + if (ret) + goto core_restore_end;
if (wait_helpers(args) < 0) goto core_restore_end; @@ -1984,7 +2080,8 @@ long __export_restore_task(struct task_restore_args *args) if (ret) goto core_restore_end;
- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); + if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD) & FUTEX_ABORT_FLAG)) + goto core_restore_end;
rst_tcp_socks_all(args);
@@ -2006,15 +2103,20 @@ long __export_restore_task(struct task_restore_args *args) ret = ret || restore_pdeath_sig(args->t); ret = ret || restore_child_subreaper(args->child_subreaper);
- futex_set_and_wake(&thread_inprogress, args->nr_threads); - - restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); + if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS) & FUTEX_ABORT_FLAG)) + goto core_restore_end;
if (ret) BUG();
/* Wait until children stop to use args->task_entries */ futex_wait_while_gt(&thread_inprogress, 1); + if (!!(futex_get(&thread_start) & FUTEX_ABORT_FLAG)) { + pr_err("%s: terminate by main thread futex_start\n", __func__); + goto handle_notifier; + } + + futex_set_and_wake(&thread_start, 1);
sys_close(args->proc_fd); std_log_set_fd(-1); @@ -2052,8 +2154,17 @@ long __export_restore_task(struct task_restore_args *args) rst_sigreturn(new_sp, rt_sigframe);
core_restore_end: - futex_abort_and_wake(&task_entries_local->nr_in_progress); + futex_abort_and_wake(&thread_start); + futex_abort_and_wake(&task_entries_local->start); + +handle_notifier: + do_notifier_rollback(args->with_notifier_kup, notifier_state); + + futex_abort_and_wake(&task_entries_local->nr_in_progress); /* notifier the criu main process */ pr_err("Restorer fail %ld\n", sys_getpid()); + + futex_set_and_wake(&notifier_done, 1); /* wake all other threads to exit */ + sys_exit_group(1); return -1; } diff --git a/criu/pie/util.c b/criu/pie/util.c index 4945483..752e5d0 100644 --- a/criu/pie/util.c +++ b/criu/pie/util.c @@ -11,6 +11,7 @@ #include "fcntl.h" #include "log.h" #include "util-pie.h" +#include "util.h"
#ifdef CR_NOGLIBC # include <compel/plugins/std/syscall.h> @@ -52,3 +53,93 @@ err_close: __sys(close)(fd); return -1; } + +#define KUP_BUF_SIZE 256 + +static int int_to_string(unsigned number, char *buf, size_t total) { + unsigned remainder, quotient, i, len; + + quotient = number; + len = 0; + do { + quotient /= 10; + len += 1; + } while (quotient > 0); + + if (len > total - 1) + return -1; + + quotient = number; + i = 1; + do { + remainder = quotient % 10; + quotient = quotient / 10; + buf[len-i] = '0' + remainder; + i++; + } while (quotient > 0); + buf[len] = '\0'; + + return len == 0 ? -1 : len; +} + +int notifier_kup(enum KUP_HOOK_POINT action, enum nvwa_cmd cmd, bool enable) +{ + int fd, count = 0, retval = 0; + char buf[KUP_BUF_SIZE] = {0}; + + if (!enable) + return 0; + + fd = __sys(open)(NOTIFY_PROC_PATH, O_WRONLY, 0); + if (fd == -EACCES) { + /* there is no priviledge to open file, ignore this condition. */ + pr_info("%s: open %s failed, retval: %d (-EACCES)\n", + __func__, NOTIFY_PROC_PATH, -EACCES); + return 0; + } else if (fd < 0) { + __pr_perror("%s: Can't open %s: %d\n", __func__, NOTIFY_PROC_PATH, fd); + return fd; + } + + retval = int_to_string(action, buf, sizeof(buf)-count); + if (retval <= 0) { + __pr_perror("%s: int_to_string error\n", __func__); + goto err_close; + } + + buf[retval] = ':'; + count = retval + 1; + + retval = int_to_string(cmd, buf+count, sizeof(buf)-count); + if (retval <= 0) { + __pr_perror("%s: int_to_string error\n", __func__); + goto err_close; + } + + count += retval; + retval = __sys(write)(fd, buf, count); + if (retval < 0) + __pr_perror("%s: Can't write to %s\n", __func__, NOTIFY_PROC_PATH); + +err_close: + __sys(close)(fd); + + return retval < 0 ? 
-1 : 0; +} + +void do_notifier_rollback(bool rollback, enum notifier_state status) +{ + if (!rollback) + return; + + switch (status) { + case POST_UPDATE_KERNEL_COMPLETE: + notifier_kup(POST_UPDATE_KERNEL, ROLLBACK, true); + break; + case UNFREEZE_TO_RUN_COMPLETE: + notifier_kup(UNFREEZE_TO_RUN, ROLLBACK, true); + break; + default: + break; + } +} diff --git a/include/common/lock.h b/include/common/lock.h index 4782b63..3db17ae 100644 --- a/include/common/lock.h +++ b/include/common/lock.h @@ -106,6 +106,10 @@ static inline void futex_inc_and_wake(futex_t *f) LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); }
+static inline uint32_t futex_inc_return(futex_t *f) { + return atomic_inc_return(&f->raw); +} + /* Plain increment futex @f value */ static inline void futex_inc(futex_t *f) { atomic_inc(&f->raw); }
From: Jingxian He hejingxian@huawei.com
Add block device dump and restore method for kernel module upgrading.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Xiaoguang Li lixiaoguang2@huawei.com --- criu/files.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+)
diff --git a/criu/files.c b/criu/files.c index 0912d1a..6f580af 100644 --- a/criu/files.c +++ b/criu/files.c @@ -449,6 +449,30 @@ static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor) return ops; }
+static int dump_blkdev(struct fd_parms *p, int lfd, FdinfoEntry *e) +{ + struct fd_link *link_old = p->link; + int maj = major(p->stat.st_rdev); + const struct fdtype_ops *ops; + int err; + + switch (maj) { + case SCSI_DISK0_MAJOR: + ops = &regfile_dump_ops; + break; + default: { + char more[32] = "block_dev"; + + err = dump_unsupp_fd(p, lfd, "blk", more, e); + p->link = link_old; + return err; + } + } + err = do_dump_gen_file(p, lfd, ops, e); + p->link = link_old; + return err; +} + static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) { struct fd_link *link_old = p->link; @@ -516,6 +540,9 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ p.dfds = dfds; /* epoll needs to verify if target fd exist */
+ if (S_ISBLK(p.stat.st_mode)) + return dump_blkdev(&p, lfd, e); + if (S_ISSOCK(p.stat.st_mode)) return dump_socket(&p, lfd, e);
From: lingsheng lingsheng@huawei.com
--- lib/py/images/images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/py/images/images.py b/lib/py/images/images.py index 9c8e144..c330b97 100644 --- a/lib/py/images/images.py +++ b/lib/py/images/images.py @@ -171,7 +171,7 @@ class entry_handler:
while True: buf = f.read(4) - if buf == '': + if len(buf) == 0: break size, = struct.unpack('i', buf) f.seek(size, 1)
From: Jingxian He hejingxian@huawei.com
Add support for anon inode fd dump and restore during module upgrade.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Xiaoguang Li lixiaoguang2@huawei.com Signed-off-by: Jingxian He hejingxian@huawei.com
Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/cr-restore.c | 3 +++ criu/files-reg.c | 3 ++- criu/include/image.h | 1 + criu/include/mem.h | 1 + criu/include/restorer.h | 6 ++++++ criu/mem.c | 24 +++++++++++++++++++++++- criu/pie/restorer.c | 32 ++++++++++++++++++++++++++++++++ criu/proc_parse.c | 36 ++++++++++++++++++++++++++++++------ images/vma.proto | 1 + 9 files changed, 99 insertions(+), 8 deletions(-)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 05de2ef..7ceb8fe 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1001,6 +1001,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (prepare_vmas(current, ta)) return -1;
+ if (prepare_vma_names(current, ta)) + return -1; /* * Sockets have to be restored in their network namespaces, * so a task namespace has to be restored after sockets. @@ -3744,6 +3746,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns #endif
RST_MEM_FIXUP_PPTR(task_args->vmas); + RST_MEM_FIXUP_PPTR(task_args->vma_names); RST_MEM_FIXUP_PPTR(task_args->rings); RST_MEM_FIXUP_PPTR(task_args->tcp_socks); RST_MEM_FIXUP_PPTR(task_args->timerfd); diff --git a/criu/files-reg.c b/criu/files-reg.c index aed1e73..4724994 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2215,7 +2215,7 @@ int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *ar
/* unnamed temporary files are restored as ghost files */ flags &= ~O_TMPFILE; - + pr_info("openat path is: %s\n", rfi->path); fd = openat(ns_root_fd, rfi->path, flags); if (fd < 0) { pr_perror("Can't open file %s on restore", rfi->path); @@ -2387,6 +2387,7 @@ int collect_filemap(struct vma_area *vma) if (!fd) return -1;
+ pr_info("find fd for %lx, shmid: %lx\n", vma->e->start, vma->e->shmid); vma->vmfd = fd; vma->vm_open = open_filemap; return 0; diff --git a/criu/include/image.h b/criu/include/image.h index 62c8d7b..939db37 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -84,6 +84,7 @@ #define VMA_AREA_VVAR (1 << 12) #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_ANON_INODE (1 << 15)
#define VMA_CLOSE (1 << 28) #define VMA_NO_PROT_WRITE (1 << 29) diff --git a/criu/include/mem.h b/criu/include/mem.h index 3b3fdf8..b329c9e 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -47,6 +47,7 @@ extern int parasite_dump_pages_seized(struct pstree_item *item, struct task_restore_args; int open_vmas(struct pstree_item *t); int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); +int prepare_vma_names(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); bool should_dump_page(VmaEntry *vmae, u64 pme); diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 4afff1b..f6b45d6 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -127,6 +127,10 @@ struct restore_vma_io {
#define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec))
+struct vma_names { + char name[PATH_MAX]; +}; + struct task_restore_args { struct thread_restore_args *t; /* thread group leader */
@@ -150,6 +154,8 @@ struct task_restore_args { VmaEntry *vmas; unsigned int vmas_n;
+ struct vma_names *vma_names; + int vma_ios_fd; struct restore_vma_io *vma_ios; unsigned int vma_ios_n; diff --git a/criu/mem.c b/criu/mem.c index 2eabb8d..dd64f10 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -652,6 +652,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, continue; }
+ if (vma_entry_is(vma_area->e, VMA_AREA_ANON_INODE)) + continue; + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); @@ -845,7 +848,6 @@ int prepare_mm_pid(struct pstree_item *i) }
pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end); - if (vma_area_is(vma, VMA_ANON_SHARED)) ret = collect_shmem(pid, vma); else if (vma_area_is(vma, VMA_FILE_PRIVATE) || @@ -1500,6 +1502,9 @@ int open_vmas(struct pstree_item *t) filemap_ctx_init(false);
list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_ANON_INODE)) + continue; + if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) continue;
@@ -1585,3 +1590,20 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
return prepare_vma_ios(t, ta); } + +int prepare_vma_names(struct pstree_item *t, struct task_restore_args *ta) +{ + struct vma_area *vma; + struct vm_area_list *vmas = &rsti(t)->vmas; + ta->vma_names = (struct vma_names *)rst_mem_align_cpos(RM_PRIVATE); + + list_for_each_entry(vma, &vmas->h, list) { + struct vma_names *vma_names; + vma_names = rst_mem_alloc(sizeof(*vma_names), RM_PRIVATE); + if (!vma_names) + return -1; + + memcpy(vma_names->name, vma->e->name, strlen(vma->e->name) + 1); + } + return 0; +} diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 2173c5e..0bd220a 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -66,6 +66,7 @@ #define FALLOC_FL_PUNCH_HOLE 0x02 #endif
+#define ANON_PROC_PATH "/sys/kernel/modrestore/anon_state_restore"
#define sys_prctl_safe(opcode, val1, val2, val3) \ ({ \ @@ -798,6 +799,25 @@ unsigned long arch_shmat(int shmid, void *shmaddr, } #endif
+static int restore_anon_mapping(VmaEntry *vma_entry, struct vma_names *vma_name) +{ + int fd; + + fd = sys_open(ANON_PROC_PATH, O_WRONLY, 0); + if (fd < 0) { + pr_info("anon sys fs open fail:%s\n", ANON_PROC_PATH); + return fd; + } + pr_info("restore anon mapping: %s\n", vma_name->name); + + if (sys_write(fd, vma_name->name, 4096) < 0) { + sys_close(fd); + return -1; + } + sys_close(fd); + return 0; +} + static unsigned long restore_mapping(VmaEntry *vma_entry) { int prot = vma_entry->prot; @@ -1569,6 +1589,7 @@ long __export_restore_task(struct task_restore_args *args) pid_t my_pid = sys_getpid(); rt_sigaction_t act; bool has_vdso_proxy; + struct vma_names *vma_name;
futex_set(&thread_inprogress, 1); futex_set(&thread_start, 0); @@ -1729,6 +1750,14 @@ long __export_restore_task(struct task_restore_args *args) */ for (i = 0; i < args->vmas_n; i++) { vma_entry = args->vmas + i; + vma_name = args->vma_names + i; + + if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) { + pr_info("anon vma name:%s\n", vma_name->name); + if (restore_anon_mapping(vma_entry, vma_name) < 0) + goto core_restore_end; + continue; + }
if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) && !vma_entry_is(vma_entry, VMA_AREA_AIORING)) @@ -1853,6 +1882,9 @@ long __export_restore_task(struct task_restore_args *args) if (!vma_entry->has_madv || !vma_entry->madv) continue;
+ if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) + continue; + for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { if (vma_entry->madv & (1ul << m)) { ret = sys_madvise(vma_entry->start, diff --git a/criu/proc_parse.c b/criu/proc_parse.c index ba60832..23db7f3 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -77,6 +77,7 @@ static char *buf = __buf.buf; */
#define AIO_FNAME "/[aio]" +#define ANON_FNAME "anon_inode"
/* check the @line starts with "%lx-%lx" format */ static bool __is_vma_range_fmt(char *line) @@ -174,8 +175,19 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP */ - if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR)) + /* There are many types of io/pf vm_map, not only vvar, but also + * anon_inode, and char device. + * For anon_inode and char device, we use anon_notifier to restore + * status. Therefore, we disable the broken code here. + */ + /* + if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && + !vma_area_is(vma_area, VMA_AREA_ANON_INODE)) + { + pr_info("set current status tp VMA_UNSUPP\n"); vma_area->e->status |= VMA_UNSUPP; + } + */
if (vma_area->e->madv) vma_area->e->has_madv = true; @@ -435,7 +447,6 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd,
if (fstatat(dirfd(mfd), path, &buf, 0)) return -1; - if (S_ISSOCK(buf.st_mode)) { pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start); vma->vm_socket_id = buf.st_ino; @@ -450,6 +461,21 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, return 0; }
+ if (!strncmp(fname, ANON_FNAME, sizeof(ANON_FNAME) - 1)) { + /*anon_inode*/ + close_safe(vm_file_fd); + vma->e->status = VMA_AREA_ANON_INODE; + vma->e->name = xmalloc(PATH_MAX); + if (!vma->e->name) { + pr_err("alloc vma name of anon-inode fail.\n"); + return -1; + } + snprintf(vma->e->name, PATH_MAX - 1, "%"PRIx64"-%"PRIx64 " %s", vma->e->start, vma->e->end, fname); + vma->e->name[PATH_MAX - 1] = 0; + pr_info("set vma_area status to: %d, name:%s\n", vma->e->status, vma->e->name); + return 0; + } + pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); return -1; } @@ -548,7 +574,7 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, if (vma_get_mapfile(file_path, vma_area, map_files_dir, vfi, prev_vfi, vm_file_fd)) goto err_bogus_mapfile; - + pr_info("handle_vam, vma status is: %d\n", vma_area->e->status); if (vma_area->e->status != 0) return 0;
@@ -584,6 +610,7 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, vma_area->e->shmid = prev->e->shmid; vma_area->vmst = prev->vmst; vma_area->mnt_id = prev->mnt_id; + vma_area->e->name = prev->e->name;
if (!(vma_area->e->status & VMA_AREA_SYSVIPC)) { vma_area->e->status &= ~(VMA_FILE_PRIVATE | VMA_FILE_SHARED); @@ -753,7 +780,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, if (IS_ERR(str)) goto err; eof = (str == NULL); - if (!eof && !__is_vma_range_fmt(str)) { if (!strncmp(str, "Nonlinear", 9)) { BUG_ON(!vma_area); @@ -772,7 +798,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, } else continue; } - if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) goto err; @@ -819,7 +844,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd)) goto err; - if (vma_entry_is(vma_area->e, VMA_FILE_PRIVATE) || vma_entry_is(vma_area->e, VMA_FILE_SHARED)) { if (dump_filemap && dump_filemap(vma_area, vm_file_fd)) diff --git a/images/vma.proto b/images/vma.proto index 7085f42..f1ae4fb 100644 --- a/images/vma.proto +++ b/images/vma.proto @@ -22,4 +22,5 @@ message vma_entry {
/* file status flags */ optional uint32 fdflags = 10 [(criu).hex = true]; + required string name = 11; }
From: Jingxian He hejingxian@huawei.com
Add support for char device dump and restore during module upgrade.
`/sys/kernel/repairing_device` provides the whitelist of char devices that use the `IOCTL_CMD_{NEEDREPAIR, REPAIR}` commands, in addition to the internal device list. Device modules can use `mures_{add, del}_devname()` to add entries to, or delete entries from, the char device whitelist dynamically.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Xiaoguang Li lixiaoguang2@huawei.com Signed-off-by: Jingxian He hejingxian@huawei.com Signed-off-by: fu.lin fulin10@huawei.com --- criu/Makefile.crtools | 1 + criu/config.c | 1 + criu/cr-dump.c | 3 + criu/cr-restore.c | 4 +- criu/crtools.c | 1 + criu/devname.c | 130 ++++++++++++++++++++++++++++ criu/files-reg.c | 34 +++++++- criu/files.c | 159 ++++++++++++++++++++++++++++++++++- criu/include/cr_options.h | 1 + criu/include/files-reg.h | 9 ++ criu/include/files.h | 6 ++ criu/include/image-desc.h | 1 + criu/include/image.h | 1 + criu/include/protobuf-desc.h | 1 + criu/include/util.h | 3 + criu/mem.c | 6 +- criu/proc_parse.c | 16 +++- images/Makefile | 1 + images/chr.proto | 12 +++ images/fdinfo.proto | 3 + 20 files changed, 382 insertions(+), 11 deletions(-) create mode 100644 criu/devname.c create mode 100644 images/chr.proto
diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index dcffb4f..a9008f0 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -89,6 +89,7 @@ obj-y += servicefd.o obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o +obj-y += devname.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/config.c b/criu/config.c index 4d2b709..0ccd2b5 100644 --- a/criu/config.c +++ b/criu/config.c @@ -546,6 +546,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("use-fork-pid", &opts.use_fork_pid), BOOL_OPT("with-notifier", &opts.with_notifier_kup), BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), { }, };
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 96c0cd3..9ba27a2 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1827,6 +1827,9 @@ int cr_dump_tasks(pid_t pid) */ rlimit_unlimit_nofile();
+ if (opts.dump_char_dev && parse_devname() < 0) + goto err; + root_item = alloc_pstree_item(); if (!root_item) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 7ceb8fe..7c198ce 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -349,11 +349,11 @@ static int root_prepare_shared(void) if (pi->pid->state == TASK_HELPER) continue;
- ret = prepare_mm_pid(pi); + ret = prepare_fd_pid(pi); if (ret < 0) break;
- ret = prepare_fd_pid(pi); + ret = prepare_mm_pid(pi); if (ret < 0) break;
diff --git a/criu/crtools.c b/criu/crtools.c index 942e683..26010b5 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -450,6 +450,7 @@ usage: " Only for the host with these feature.\n" " --with-fd-cred Allow to make the restored process has the same cred\n" " as checkout assisted by kernel.\n" +" --dump-char-dev Dump char dev files as normal file with repair cmd\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/devname.c b/criu/devname.c new file mode 100644 index 0000000..5f6fbed --- /dev/null +++ b/criu/devname.c @@ -0,0 +1,130 @@ +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "log.h" +#include "common/xmalloc.h" + +#define REPAIRING_DEVICE_FILE "/sys/kernel/repairing_device" +#define ASCII_SIZE 128 + +static void *root_bucket[ASCII_SIZE]; + +static int insert_devname_internal(void *bucket[], const char *name) +{ + void *new = NULL; + int idx = *name; + + if (bucket[idx] != NULL) + return insert_devname_internal(bucket[idx], name+1); + else if (idx == '\0') { + new = xmalloc(sizeof(void *)); + if (!new) { + pr_perror("alloc devname failed\n"); + return -1; + } + bucket[idx] = new; + return 0; + } else { + new = xmalloc(sizeof(void *) * ASCII_SIZE); + if (!new) { + pr_perror("alloc devname failed\n"); + return -1; + } + memset(new, 0, sizeof(void *) * ASCII_SIZE); + bucket[idx] = new; + return insert_devname_internal(bucket[idx], name+1); + } +} + +int insert_devname(const char *devname) +{ + if (devname == NULL || *devname == '\0') // ignore + return 0; + + pr_debug("insert device '%s'\n", devname); + return insert_devname_internal(root_bucket, devname); +} + +int parse_devname(void) +{ + int retval = -1; + char *line = NULL; + size_t len = 0; + ssize_t nread = 0; + FILE *fp = NULL; + + fp = fopen(REPAIRING_DEVICE_FILE, "r"); + if (fp == NULL) { + pr_info("Unable to open %s, downgrade to use internal whitelist\n", + REPAIRING_DEVICE_FILE); + 
return 0; + } + + while ((nread = getline(&line, &len, fp)) != -1) { + if (nread <= 1) // ignore empty string + continue; + + line[nread-1] = '\0'; // drop '\n' + retval = insert_devname(line); + if (retval != 0) + goto out; + } + retval = 0; + +out: + free(line); + fclose(fp); + return retval; +} + +static const char *steal_devname(const char *name, ssize_t len) +{ + ssize_t off = len; + + for (off -= 1; off > 0; off--) { + if (name[off] == '/') + break; + } + + return name + off + 1; +} + +static bool find_devname_internal(void *bucket[], const char *name) +{ + int idx = *name; + + if (*name == '\0' && bucket[idx] != NULL) + return true; + else if (bucket[idx] == NULL) + return false; + else { + return find_devname_internal(bucket[idx], name+1); + } +} + +bool find_devname(const char *name) +{ + const char *devname; + size_t len = 0; + bool found = false; + + if (name == NULL) + return false; + else if ((len = strlen(name)) == 0) + return false; + + devname = steal_devname(name, len); + found = find_devname_internal(root_bucket, devname); + + pr_debug("device '%s' (original name '%s') %s found in %s\n", + devname, name, found ? "is" : "isn't", REPAIRING_DEVICE_FILE); + + /* Compatible with the old version, there are still `strstr` branch in the following */ + found |= (strstr(name, "uverbs") != NULL + || strstr(name, "rdma_cm") != NULL + || strstr(name, "umad") != NULL); + + return found; +} diff --git a/criu/files-reg.c b/criu/files-reg.c index 4724994..ba78c67 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1701,8 +1701,8 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) rfe.has_mnt_id = true; }
- pr_info("Dumping path for %d fd via self %d [%s]\n", - p->fd, lfd, &link->name[1]); + pr_info("Dumping path for %d fd via self %d [%s], id: %d\n", + p->fd, lfd, &link->name[1], id);
/* * The regular path we can handle should start with slash. @@ -2366,6 +2366,34 @@ static int open_filemap(int pid, struct vma_area *vma) return 0; }
+int collect_chr_map(struct pstree_item *me, struct vma_area *vma) /* bind a VMA_AREA_CHR vma to the collected CHR fd whose path equals vma->e->name */ +{ + struct list_head *list = &rsti(me)->fds; + struct fdinfo_list_entry *fle, *tmp; + struct chrfile_info *ci; + bool exist_fd = false; /* must start false: it is read below even when the loop finds no match */ + + list_for_each_entry_safe(fle, tmp, list, ps_list) { + struct file_desc *d = fle->desc; + + if (d->ops->type != FD_TYPES__CHR) + continue; + + ci = container_of(d, struct chrfile_info, d); + if (!strcmp(vma->e->name, ci->path)) { + vma->vmfd = d; + vma->e->fd = fle->fe->fd; + exist_fd = true; + break; + } + } + + if (!exist_fd) /* no CHR descriptor of this task matches the mapping */ + return -EEXIST; + + return 0; +} + int collect_filemap(struct vma_area *vma) { struct file_desc *fd; @@ -2453,7 +2481,7 @@ static int collect_one_regfile(void *o, ProtobufCMessage *base, struct cr_img *i rfi->remap = NULL; rfi->size_mode_checked = false;
- pr_info("Collected [%s] ID %#x\n", rfi->path, rfi->rfe->id); + pr_info("Collected regfile [%s] ID %#x\n", rfi->path, rfi->rfe->id); return file_desc_add(&rfi->d, rfi->rfe->id, ®_desc_ops); }
diff --git a/criu/files.c b/criu/files.c index 6f580af..34aa8be 100644 --- a/criu/files.c +++ b/criu/files.c @@ -331,10 +331,32 @@ int do_dump_gen_file(struct fd_parms *p, int lfd, e->fd = p->fd; e->flags = p->fd_flags;
+ pr_info("fdinfoEntry fd: %d\n", e->fd); ret = fd_id_generate(p->pid, e, p); if (ret == 1) /* new ID generated */ ret = ops->dump(lfd, e->id, p); - else + else if (ops->type == FD_TYPES__CHR) { + /* + * Sometimes the app_data subprocess may inherit the fd from + * app_data. Those fds may result the unconditional oops during + * the restoration of app_data. Therefore, prevent the dump in + * those condition. + */ + struct fd_link _link, *link; + + if (!p->link) { + if (fill_fdlink(lfd, p, &_link)) + return -1; + link = &_link; + } else + link = p->link; + + if (find_devname(link->name)) { + pr_err("char dev '%s' fd %d is owned by multi-processes\n", + link->name, e->fd); + ret = -1; + } + } else /* Remove locks generated by the fd before going to the next */ discard_dup_locks_tail(p->pid, e->fd);
@@ -473,6 +495,58 @@ static int dump_blkdev(struct fd_parms *p, int lfd, FdinfoEntry *e) return err; }
+static int dump_chr_file(int lfd, u32 id, const struct fd_parms *p) +{ + int ret; + struct fd_link _link, *link; + struct cr_img *img; + FileEntry fe = FILE_ENTRY__INIT; + ChrfileEntry cfe = CHRFILE_ENTRY__INIT; + + if (!p->link) { + if (fill_fdlink(lfd, p, &_link)) + return -1; + link = &_link; + } else + link = p->link; + + pr_info("Dumping chr-file fd %d with lfd %d with id %d, name: %s\n", p->fd, lfd, id, link->name); + + if (strstr(link->name, "(deleted)") != NULL) { + pr_err("char device '%s' is deleted\n", link->name); + return -ENXIO; + } + + cfe.repair = false; + if (find_devname(link->name)) { + ret = ioctl(lfd, IOCTL_CMD_NEEDREPAIR, 0); + if (ret <= 0) { + pr_err("ioctl cmd needrepair failed, errno: %d, %s\n", ret, strerror(errno)); + return -1; + } else { + pr_info("char device needrepair cmd return: %d\n", ret); + cfe.index = ret; + cfe.repair = true; + } + } + + cfe.id = id; + cfe.name = &link->name[1]; + cfe.flags = p->flags; + fe.type = FD_TYPES__CHR; + fe.id = cfe.id; + fe.chr = &cfe; + + img = img_from_set(glob_imgset, CR_FD_FILES); + ret = pb_write_one(img, &fe, PB_FILE); + return ret; +} + +const struct fdtype_ops chr_dump_ops = { + .type = FD_TYPES__CHR, + .dump = dump_chr_file, +}; + static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) { struct fd_link *link_old = p->link; @@ -500,6 +574,10 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) ops = &tty_dump_ops; break; } + if (opts.dump_char_dev) { + ops = &chr_dump_ops; + break; + }
sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev)); err = dump_unsupp_fd(p, lfd, "chr", more, e); @@ -513,6 +591,12 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) return err; }
+/* Checks if file descriptor @lfd is infinibandevent */ +int is_infiniband_link(char *link) +{ + return is_anon_link_type(link, "[infinibandevent]"); +} + static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, struct parasite_ctl *ctl, FdinfoEntry *e, struct parasite_drain_fd *dfds) @@ -567,6 +651,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_infiniband_link(link)) + return 1; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -673,9 +759,15 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item,
ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds); - if (ret) + if (ret < 0) break; + /* infiniband link file */ + if (ret > 0) { + ret = 0; + continue; + }
+ pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); ret = pb_write_one(img, &e, PB_FDINFO); if (ret) break; @@ -933,6 +1025,7 @@ int prepare_fd_pid(struct pstree_item *item) if (!img) return -1;
+ pr_info("prepare_fd_pid\n"); while (1) { FdinfoEntry *e;
@@ -1140,6 +1233,7 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) if (reopen_fd_as(fle->fe->fd, new_fd)) return -1;
+ pr_info("*******flags: %d",fle->fe->flags); if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { pr_perror("Unable to set file descriptor flags"); return -1; @@ -1690,6 +1784,64 @@ out: return ret; }
+static int chrfile_open(struct file_desc *d, int *new_fd) +{ + int fd, mntns_root; + int ret = 0; + struct chrfile_info *ci; + + ci = container_of(d, struct chrfile_info, d); + + mntns_root = open_pid_proc(getpid()); + fd = openat(mntns_root, ci->path, ci->cfe->flags | O_REPAIR); + if (fd < 0){ + pr_err("open chr file failed\n"); + return -1; + } + + if (ci->cfe->repair) { + ret = ioctl(fd, IOCTL_CMD_REPAIR , ci->cfe->index); + pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); + if (ret) + goto err; + } + + *new_fd = fd; + return ret; +err: + close(fd); + return ret; +} + +static struct file_desc_ops chrfile_desc_ops = { + .type = FD_TYPES__CHR, + .open = chrfile_open, +}; + +static int collect_one_chrfile(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct chrfile_info *ci = o; + static char dot[] = "."; + + ci->cfe = pb_msg(base, ChrfileEntry); + if (ci->cfe->name[1] == '\0') + ci->path = dot; + else + ci->path = ci->cfe->name; + + pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); + file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); + + return 0; +} + +struct collect_image_info chrfile_cinfo = { + .fd_type = CR_FD_CHRFILE, + .pb_type = PB_CHRFILE, + .priv_size = sizeof(struct chrfile_info), + .collect = collect_one_chrfile, +}; + static int collect_one_file_entry(FileEntry *fe, u_int32_t id, ProtobufCMessage *base, struct collect_image_info *cinfo) { @@ -1770,6 +1922,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); break; #endif + case FD_TYPES__CHR: + ret = collect_one_file_entry(fe, fe->chr->id, &fe->chr->base, &chrfile_cinfo); + break; }
return ret; diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 5b0ff24..5ca177a 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -180,6 +180,7 @@ struct cr_options { int use_fork_pid; int with_notifier_kup; int with_fd_cred; + int dump_char_dev; };
extern struct cr_options opts; diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h index 016d76a..458fe89 100644 --- a/criu/include/files-reg.h +++ b/criu/include/files-reg.h @@ -4,6 +4,7 @@ #include "files.h"
#include "images/regfile.pb-c.h" +#include "images/chr.pb-c.h" #include "images/ghost-file.pb-c.h"
struct cr_imgset; @@ -26,12 +27,19 @@ struct reg_file_info { char *path; };
+struct chrfile_info { + struct file_desc d; + ChrfileEntry *cfe; + char *path; +}; + extern int open_reg_by_id(u32 id); extern int open_reg_fd(struct file_desc *); extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, struct reg_file_info *, void *), void *arg);
extern const struct fdtype_ops regfile_dump_ops; +extern const struct fdtype_ops chr_dump_ops; extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg); extern int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p);
@@ -40,6 +48,7 @@ extern struct file_remap *lookup_ghost_remap(u32 dev, u32 ino); extern struct file_desc *try_collect_special_file(u32 id, int optional); #define collect_special_file(id) try_collect_special_file(id, 0) extern int collect_filemap(struct vma_area *); +extern int collect_chr_map(struct pstree_item *me, struct vma_area *); extern void filemap_ctx_init(bool auto_close); extern void filemap_ctx_fini(void);
diff --git a/criu/include/files.h b/criu/include/files.h index 2c1e1e7..b12d079 100644 --- a/criu/include/files.h +++ b/criu/include/files.h @@ -15,6 +15,12 @@ #include "images/fown.pb-c.h" #include "images/vma.pb-c.h"
+#ifndef IOCTL_CMD_NEEDREPAIR +#define IOCTL_CMD_NEEDREPAIR 0x00100000UL +#define IOCTL_CMD_REPAIR 0x00200000UL +#define O_REPAIR 040000000 +#endif + struct parasite_drain_fd; struct pstree_item; struct file_desc; diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index f69cc58..22676ae 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -114,6 +114,7 @@ enum { CR_FD_MEMFD_FILE,
CR_FD_AUTOFS, + CR_FD_CHRFILE,
CR_FD_MAX }; diff --git a/criu/include/image.h b/criu/include/image.h index 939db37..70f17a5 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -85,6 +85,7 @@ #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_ANON_INODE (1 << 15) +#define VMA_AREA_CHR (1 << 16)
#define VMA_CLOSE (1 << 28) #define VMA_NO_PROT_WRITE (1 << 29) diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 35fa1a9..e7df57e 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -69,6 +69,7 @@ enum { PB_PIDNS, PB_BPFMAP_FILE, PB_BPFMAP_DATA, + PB_CHRFILE,
/* PB_AUTOGEN_STOP */
diff --git a/criu/include/util.h b/criu/include/util.h index d226d2c..cf9a8f4 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -422,4 +422,7 @@ enum notifier_state { int notifier_kup(enum KUP_HOOK_POINT, enum nvwa_cmd, bool); void do_notifier_rollback(bool, enum notifier_state);
+int parse_devname(void); +bool find_devname(const char *name); + #endif /* __CR_UTIL_H__ */ diff --git a/criu/mem.c b/criu/mem.c index dd64f10..d56f69e 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -848,7 +848,9 @@ int prepare_mm_pid(struct pstree_item *i) }
pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end); - if (vma_area_is(vma, VMA_ANON_SHARED)) + if (vma_area_is(vma, VMA_AREA_CHR)) + ret = collect_chr_map(i, vma); + else if (vma_area_is(vma, VMA_ANON_SHARED)) ret = collect_shmem(pid, vma); else if (vma_area_is(vma, VMA_FILE_PRIVATE) || vma_area_is(vma, VMA_FILE_SHARED)) @@ -1502,7 +1504,7 @@ int open_vmas(struct pstree_item *t) filemap_ctx_init(false);
list_for_each_entry(vma, &vmas->h, list) { - if (vma_area_is(vma, VMA_AREA_ANON_INODE)) + if (vma_area_is(vma, VMA_AREA_ANON_INODE) || vma_area_is(vma, VMA_AREA_CHR)) continue;
if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 23db7f3..4f5bbaa 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -622,11 +622,23 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, } else if (*vm_file_fd >= 0) { struct stat *st_buf = vma_area->vmst;
+ pr_info("file mode is: %x, st_ino: %ld\n", st_buf->st_mode, st_buf->st_ino); if (S_ISREG(st_buf->st_mode)) /* regular file mapping -- supported */; - else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) + else if (S_ISCHR(st_buf->st_mode)) { /* devzero mapping -- also makes sense */; - else { + if (opts.dump_char_dev && (strstr(file_path, "uverbs") != NULL)) { + int len = strlen(file_path) + 1; + vma_area->e->status |= VMA_AREA_CHR; + vma_area->e->name = xmalloc(len); + if (!vma_area->e->name) { + pr_err("alloc vma area name fail\n"); + goto err; + } + strncpy(vma_area->e->name, file_path, len); + pr_info("uverbs name content is: %s\n", vma_area->e->name); + } + } else { pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start); goto err; } diff --git a/images/Makefile b/images/Makefile index 34bc367..efd6fcb 100644 --- a/images/Makefile +++ b/images/Makefile @@ -70,6 +70,7 @@ proto-obj-y += timens.o proto-obj-y += img-streamer.o proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o +proto-obj-y += chr.o
CFLAGS += -iquote $(obj)/
diff --git a/images/chr.proto b/images/chr.proto new file mode 100644 index 0000000..67929db --- /dev/null +++ b/images/chr.proto @@ -0,0 +1,12 @@ +syntax = "proto2"; + +import "opts.proto"; + +message chrfile_entry { + required uint32 id = 1; + required uint32 flags = 2 [(criu).flags = "rfile.flags"]; + required uint32 index = 3; + required string name = 4; + required bool repair = 5; +}; + diff --git a/images/fdinfo.proto b/images/fdinfo.proto index f5e1895..8561da4 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -18,6 +18,7 @@ import "pipe.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; +import "chr.proto";
enum fd_types { UND = 0; @@ -40,6 +41,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + CHR = 21;
/* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -76,4 +78,5 @@ message file_entry { optional tty_file_entry tty = 19; optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; + optional chrfile_entry chr = 23; }
From: Jingxian He hejingxian@huawei.com
Fix connect error of invalid param during module upgrade.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Xiaoguang Li lixiaoguang2@huawei.com Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/include/sockets.h | 1 + criu/sk-inet.c | 13 +++++++++++-- criu/sockets.c | 5 ++++- 3 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/criu/include/sockets.h b/criu/include/sockets.h index e971f3e..74c5ae4 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -27,6 +27,7 @@ struct socket_desc { extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); extern int dump_socket_opts(int sk, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); +extern int restore_bound_opts(int sk, SkOptsEntry *soe); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); diff --git a/criu/sk-inet.c b/criu/sk-inet.c index d90c53b..7a05de2 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -102,19 +102,24 @@ static void show_one_inet(const char *act, const struct inet_sk_desc *sk) static void show_one_inet_img(const char *act, const InetSkEntry *e) { char src_addr[INET_ADDR_LEN] = "<unknown>"; + char dst_addr[INET_ADDR_LEN] = "<unknown>";
if (inet_ntop(e->family, (void *)e->src_addr, src_addr, INET_ADDR_LEN) == NULL) { pr_perror("Failed to translate address"); } + if (inet_ntop(e->family, (void *)e->dst_addr, dst_addr, + INET_ADDR_LEN) == NULL) { + pr_perror("Failed to translate address"); + }
pr_debug("\t%s: family %-10s type %-14s proto %-16s port %d " - "state %-16s src_addr %s\n", act, + "state %-16s src_addr %s dst_addr %s\n", act, ___socket_family_name(e->family), ___socket_type_name(e->type), ___socket_proto_name(e->proto), e->src_port, ___tcp_state_name(e->state), - src_addr); + src_addr, dst_addr); }
static int can_dump_ipproto(unsigned int ino, int proto, int type) @@ -876,6 +881,10 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) if (restore_opt(sk, SOL_SOCKET, SO_REUSEPORT, &yes)) goto err;
+ if(restore_bound_opts(sk, ie->opts) < 0){ + goto err; + } + if (tcp_connection(ie)) { if (!opts.tcp_established_ok && !opts.tcp_close) { pr_err("Connected TCP socket in image\n"); diff --git a/criu/sockets.c b/criu/sockets.c index a73967e..609bfb1 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -605,7 +605,6 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) tv.tv_usec = soe->so_rcv_tmo_usec; ret |= restore_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv);
- ret |= restore_bound_dev(sk, soe); ret |= restore_socket_filter(sk, soe);
/* The restore of SO_REUSEADDR depends on type of socket */ @@ -613,6 +612,10 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) return ret; }
+int restore_bound_opts(int sk, SkOptsEntry *soe){ + return restore_bound_dev(sk, soe); +} + int do_dump_opt(int sk, int level, int name, void *val, int len) { socklen_t aux = len;
From: Jingxian He hejingxian@huawei.com
Add task exit notify mask method for criu during kernel module upgrade.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/config.c | 1 + criu/cr-restore.c | 8 ++++++++ criu/crtools.c | 1 + criu/include/cr_options.h | 1 + criu/include/util.h | 5 +++++ criu/seize.c | 33 ++++++++++++++++++++++++++++++++- 6 files changed, 48 insertions(+), 1 deletion(-)
diff --git a/criu/config.c b/criu/config.c index 0ccd2b5..9c4d8ce 100644 --- a/criu/config.c +++ b/criu/config.c @@ -547,6 +547,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("with-notifier", &opts.with_notifier_kup), BOOL_OPT("with-fd-cred", &opts.with_fd_cred), BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), { }, };
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 7c198ce..ecebdfe 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1583,6 +1583,14 @@ static inline int fork_with_pid(struct pstree_item *item) item->pid->real, vpid(item)); }
+ if (opts.mask_exit_notify) { + int mask_pid = ret; + pr_info("start unmask for %d\n", mask_pid); + ret = mask_task_exit_notify(mask_pid, false); + if (ret) + pr_err("unmask exit notify fail for: %d\n", mask_pid); + } + err_unlock: if (!(clone_flags & CLONE_NEWPID)) unlock_last_pid(); diff --git a/criu/crtools.c b/criu/crtools.c index 26010b5..8694ed0 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -451,6 +451,7 @@ usage: " --with-fd-cred Allow to make the restored process has the same cred\n" " as checkout assisted by kernel.\n" " --dump-char-dev Dump char dev files as normal file with repair cmd\n" +" --mask-exit-notify Mask task exit notify during dump and restore\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 5ca177a..5b3ff86 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -181,6 +181,7 @@ struct cr_options { int with_notifier_kup; int with_fd_cred; int dump_char_dev; + int mask_exit_notify; };
extern struct cr_options opts; diff --git a/criu/include/util.h b/criu/include/util.h index cf9a8f4..3a4b8f9 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -425,4 +425,9 @@ void do_notifier_rollback(bool, enum notifier_state); int parse_devname(void); bool find_devname(const char *name);
+#define PID_BUF_SIZE 32 +#define MASK_EXIT_NOTIFY_DIR "/sys/kernel/mask_exit_notify" +#define UNMASK_EXIT_NOTIFY_DIR "/sys/kernel/unmask_exit_notify" +int mask_task_exit_notify(int pid, bool mask); + #endif /* __CR_UTIL_H__ */ diff --git a/criu/seize.c b/criu/seize.c index a661097..e4f674b 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -649,9 +649,35 @@ free: return ret < 0 ? ret : nr_inprogress; }
+int mask_task_exit_notify(int pid, bool mask) +{ + int fd, retval; + char buf[PID_BUF_SIZE] = {0}; + + if (pid <= 0) + return -1; + + snprintf(buf, PID_BUF_SIZE - 1, "%d", pid); + if (mask) + fd = open(MASK_EXIT_NOTIFY_DIR, O_WRONLY, 0); + else + fd = open(UNMASK_EXIT_NOTIFY_DIR, O_WRONLY, 0); + if (fd < 0) { + pr_err("open mask exit notify file fail\n"); + return fd; + } + + retval = write(fd, buf, PID_BUF_SIZE); + if (retval < 0) + pr_err("Write mask exit pid: %s fail\n", buf); + close(fd); + + return retval < 0 ? -1 : 0; +} + static void unseize_task_and_threads(const struct pstree_item *item, int st) { - int i; + int i, ret;
if (item->pid->state == TASK_DEAD) return; @@ -660,6 +686,11 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) for (i = 0; i < item->nr_threads; i++) dump_task_special_pages(item->threads[i].real); } + if (opts.mask_exit_notify && (st == TASK_DEAD)) { + ret = mask_task_exit_notify(item->threads[0].real, true); + if (ret) + pr_err("mask exit notify for %d fail.\n", item->threads[0].real); + }
/* * The st is the state we want to switch tasks into,
From: Jingxian He hejingxian@huawei.com
Fix eventpollfd problem of improper usage in appdata.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/eventpoll.c | 16 +++++++++++----- criu/proc_parse.c | 2 ++ images/eventpoll.proto | 3 +++ 3 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/criu/eventpoll.c b/criu/eventpoll.c index 9818f24..6097e42 100644 --- a/criu/eventpoll.c +++ b/criu/eventpoll.c @@ -67,8 +67,8 @@ int is_eventpoll_link(char *link)
static void pr_info_eventpoll_tfd(char *action, uint32_t id, EventpollTfdEntry *e) { - pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64"\n", - action, id, e->tfd, e->events, e->data); + pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64" ignore %d\n", + action, id, e->tfd, e->events, e->data, e->ignore); }
static void pr_info_eventpoll(char *action, EventpollFileEntry *e) @@ -146,9 +146,9 @@ int flush_eventpoll_dinfo_queue(void) }; struct kid_elem *t = kid_lookup_epoll_tfd(&fd_tree, &ke, &slot); if (!t) { - pr_debug("kid_lookup_epoll: no match pid %d efd %d tfd %d toff %u\n", - dinfo->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); - goto err; + tfde->ignore = 1; + pr_info("Drop tfd entry, efd=%d, tfd=%d\n", slot.efd, slot.tfd); + continue; }
pr_debug("kid_lookup_epoll: rbsearch match pid %d efd %d tfd %d toff %u -> %d\n", @@ -161,6 +161,7 @@ int flush_eventpoll_dinfo_queue(void) goto err; }
+ pr_info("Change tfd: %d -> %d @ efd=%d\n", tfde->tfd, t->idx, slot.efd); tfde->tfd = t->idx; }
@@ -416,6 +417,11 @@ static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe) { struct epoll_event event;
+ if (tdefe->ignore) { + pr_info_eventpoll_tfd("Ignore ", id, tdefe); + return 0; + } + pr_info_eventpoll_tfd("Restore ", id, tdefe);
event.events = tdefe->events; diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 4f5bbaa..32d84b3 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1921,10 +1921,12 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) e->has_dev = false; e->has_inode = false; e->has_pos = false; + e->has_ignore = false; } else if (ret == 6) { e->has_dev = true; e->has_inode = true; e->has_pos = true; + e->has_ignore = true; } else if (ret < 6) { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; diff --git a/images/eventpoll.proto b/images/eventpoll.proto index 4a8d1b8..20c9a15 100644 --- a/images/eventpoll.proto +++ b/images/eventpoll.proto @@ -12,6 +12,9 @@ message eventpoll_tfd_entry { optional uint32 dev = 5; optional uint64 inode = 6; optional uint64 pos = 7; + + /* entry validation */ + optional uint32 ignore = 8; }
message eventpoll_file_entry {
From: Xiaoguang Li lixiaoguang2@huawei.com
Background: SELinux has three states: disabled, permissive, and enforcing. If SELinux is not disabled, it configures its rules from `/etc/selinux/targeted`. However, when the rules in `/etc/selinux/targeted` do not exist, the security label of processes read from `/proc/<pid>/attr/current` is `kernel` instead of `unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023`. This causes criu dumping to fail.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: lixiaoguang2 lixiaoguang2@huawei.com Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/lsm.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/criu/lsm.c b/criu/lsm.c index 7cc3604..6713ca7 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -78,12 +78,22 @@ static int selinux_get_label(pid_t pid, char **output) if (!*output) goto err;
+ pos = (char*)ctx; + /* + * If the SElinux context is not configured, the label maybe look like + * this: + * "kernel" + */ + if (!strstr(pos, ":")) { + ret = 0; + goto err; + } + /* * Make sure it is a valid SELinux label. It should look like this: * * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 */ - pos = (char*)ctx; for (i = 0; i < 3; i++) { pos = strstr(pos, ":"); if (!pos) {
From: Luo Longjun luolongjun@huawei.com
When dumping a unix stream socket with external connections, we tell the kernel to turn repair mode on for this sock. The kernel then keeps this sock until it is restored. During this process, the other socket which communicates with the sock in repair mode will get EAGAIN or be blocked.
Signed-off-by: Luo Longjun luolongjun@huawei.com
fix unix socket dump and restore err
Fix name-less unix socket dump and restore problem.
Signed-off-by: Jingxian He hejingxian@huawei.com
unix socket:ignore repair error from kernel
leave error for applications to deal with.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Luo Longjun luolongjun@huawei.com
- enable this feature by checking the cmdline parameter `unix_stream_restore_enable` - don't set repair mode for non-external sockets
Signed-off-by: fu.lin fulin10@huawei.com --- criu/cr-dump.c | 1 + criu/include/kerndat.h | 1 + criu/include/sockets.h | 1 + criu/kerndat.c | 32 ++++++++++ criu/sk-unix.c | 137 ++++++++++++++++++++++++++++++++++++++--- images/sk-unix.proto | 1 + 6 files changed, 164 insertions(+), 9 deletions(-)
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 9ba27a2..2bbcef3 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1715,6 +1715,7 @@ static int cr_dump_finish(int ret)
cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); + unix_stream_unlock(ret);
if (!ret) { /* diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index ad5f7d3..665051d 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -68,6 +68,7 @@ struct kerndat_s { bool has_fsopen; bool has_clone3_set_tid; bool has_timens; + bool has_unix_sk_repair; };
extern struct kerndat_s kdat; diff --git a/criu/include/sockets.h b/criu/include/sockets.h index 74c5ae4..c9cf427 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -43,6 +43,7 @@ extern int add_fake_unix_queuers(void); extern int fix_external_unix_sockets(void); extern int prepare_scms(void); extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids); +extern void unix_stream_unlock(int ret);
extern struct collect_image_info netlink_sk_cinfo;
diff --git a/criu/kerndat.c b/criu/kerndat.c index b2c47c5..c87f551 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1052,6 +1052,36 @@ static bool kerndat_has_clone3_set_tid(void) return 0; }
+#define UNIX_STREAM_RESTORE_ENABLE_FILE "/sys/module/kernel/parameters/unix_stream_restore_enable" + +static void kerndat_has_unix_sk_repair(void) +{ + FILE *fp; + char ch = 'N'; + + if (access(UNIX_STREAM_RESTORE_ENABLE_FILE, F_OK) < 0) { + pr_debug("C/R external unix stream socket is not support\n"); + return; + } + + fp = fopen(UNIX_STREAM_RESTORE_ENABLE_FILE, "r"); + if (fp == NULL) { + pr_err("failed to open '%s': %s\n", + UNIX_STREAM_RESTORE_ENABLE_FILE, strerror(errno)); + return; + } + + fscanf(fp, "%c", &ch); + if (ch == 'Y') { + pr_debug("enable C/R external unix stream socket support\n"); + kdat.has_unix_sk_repair = true; + } + + fclose(fp); + + return; +} + int kerndat_init(void) { int ret; @@ -1186,6 +1216,8 @@ int kerndat_init(void) ret = -1; }
+ kerndat_has_unix_sk_repair(); + kerndat_lsm(); kerndat_mmap_min_addr(); kerndat_files_stat(); diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 00d09cc..d4c15ce 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -72,6 +72,7 @@ struct unix_sk_desc { char *name; unsigned int nr_icons; unsigned int *icons; + int repair_ino;
unsigned int vfs_dev; unsigned int vfs_ino; @@ -89,9 +90,18 @@ struct unix_sk_desc { struct list_head peer_list; struct list_head peer_node;
+ struct list_head repair_list; + struct list_head repair_node; + struct unix_stream_extern_socket_desc *ext_node; + UnixSkEntry *ue; };
+struct unix_stream_extern_socket_desc { + struct list_head list; + int fd; +}; + /* * The mutex_ghost is accessed from different tasks, * so make sure it is in shared memory. @@ -99,6 +109,7 @@ struct unix_sk_desc { static mutex_t *mutex_ghost;
static LIST_HEAD(unix_sockets); +static LIST_HEAD(unix_stream_external_sockets); static LIST_HEAD(unix_ghost_addr);
static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, @@ -117,6 +128,26 @@ struct unix_sk_listen_icon {
static struct unix_sk_listen_icon *unix_listen_icons[SK_HASH_SIZE];
+static int unix_stream_repair_on(int fd) +{ + int ret, aux = 1; + ret = setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, &aux, sizeof(aux)); + if (ret < 0) + pr_err("Can't turn repair mod for unix stream on. \n"); + + return ret; +} + +static int unix_stream_repair_off(int fd) +{ + int ret, aux = 0; + ret = setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, &aux, sizeof(aux)); + if (ret < 0) + pr_err("Can't turn repair mod for unix stream off. \n"); + + return ret; +} + static struct unix_sk_listen_icon *lookup_unix_listen_icons(unsigned int peer_ino) { struct unix_sk_listen_icon *ic; @@ -338,6 +369,8 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) FilePermsEntry *perms; FownEntry *fown; void *m; + unsigned int len; + int ret;
m = xmalloc(sizeof(UnixSkEntry) + sizeof(SkOptsEntry) + @@ -382,6 +415,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) ue->fown = fown; ue->opts = skopts; ue->uflags = 0; + ue->repair_ino = 0;
if (unix_resolve_name(lfd, id, sk, ue, p)) goto err; @@ -431,6 +465,35 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) goto err; }
+ /* don't handle non-external unix socket, criu will restore it. */ + if (kdat.has_unix_sk_repair && !sk->sd.already_dumped + && peer->name && ue->type == SOCK_STREAM) { + struct unix_stream_extern_socket_desc *d; + + d = xzalloc(sizeof(*d)); + if (!d) + goto err; + + /* Attention: used for upgrade in the same machine + * May in conflict with original usage + */ + pr_info("set %d(fd %d) unix stream repair on \n", sk->sd.ino, lfd); + ret = unix_stream_repair_on(lfd); + if (ret < 0) + goto err; + + d->fd = dup(lfd); + pr_info("add %d into unix_stream_external_sockets\n", sk->sd.ino); + list_add_tail(&d->list, &unix_stream_external_sockets); + list_add(&sk->repair_node, &peer->repair_list); + sk->ext_node = d; + + len = sizeof(ue->repair_ino); + ret = getsockopt(lfd, SOL_TCP, TCP_REPAIR_OPTIONS, &ue->repair_ino, &len); + if (ret < 0) + goto err; + } + /* * Peer should have us as peer or have a name by which * we can access one. @@ -535,6 +598,26 @@ dump:
sk->sd.already_dumped = 1;
+ while (!list_empty(&sk->repair_list)) { + struct unix_sk_desc *psk; + struct unix_stream_extern_socket_desc *d; + + psk = list_first_entry(&sk->repair_list, struct unix_sk_desc, repair_node); + list_del_init(&psk->repair_node); + + pr_info("delete ino %d into unix_stream_external_sockets\n", psk->sd.ino); + + d = psk->ext_node; + list_del_init(&d->list); + psk->ext_node = NULL; + /* ino start from 1, using 0 to tag the non-repairing socket is safe. */ + psk->ue->repair_ino = 0; + + unix_stream_repair_off(d->fd); + close_safe(&d->fd); + xfree(d); + } + while (!list_empty(&sk->peer_list)) { struct unix_sk_desc *psk; psk = list_first_entry(&sk->peer_list, struct unix_sk_desc, peer_node); @@ -697,6 +780,8 @@ static int unix_collect_one(const struct unix_diag_msg *m,
INIT_LIST_HEAD(&d->peer_list); INIT_LIST_HEAD(&d->peer_node); + INIT_LIST_HEAD(&d->repair_list); + INIT_LIST_HEAD(&d->repair_node); d->fd = -1;
if (tb[UNIX_DIAG_SHUTDOWN]) @@ -810,16 +895,18 @@ static int __dump_external_socket(struct unix_sk_desc *sk, return -1; }
- if (peer->type != SOCK_DGRAM) { - show_one_unix("Ext stream not supported", peer); - pr_err("Can't dump half of stream unix connection.\n"); + if (peer->type != SOCK_DGRAM && + peer->type != SOCK_STREAM) { + show_one_unix("Ext unix type not supported", peer); + pr_err("Can't dump this kind of unix connection.\n"); return -1; }
- if (!peer->name) { + /* part 1: prevent NULL pointer oops */ + if (!peer->name && !sk->name) { show_one_unix("Ext dgram w/o name", peer); + show_one_unix("Ext dgram w/o name", sk); pr_err("Can't dump name-less external socket.\n"); - pr_err("%d\n", sk->fd); return -1; }
@@ -866,7 +953,7 @@ int fix_external_unix_sockets(void)
fd_id_generate_special(NULL, &e.id); e.ino = sk->sd.ino; - e.type = SOCK_DGRAM; + e.type = sk->type; e.state = TCP_LISTEN; e.name.data = (void *)sk->name; e.name.len = (size_t)sk->namelen; @@ -893,6 +980,19 @@ err: return -1; }
+void unix_stream_unlock(int ret) +{ + struct unix_stream_extern_socket_desc *d; + pr_debug("Unlocking unix stream sockets\n"); + list_for_each_entry(d, &unix_stream_external_sockets, list) { + if (ret) { + pr_debug("unlock fd %d \n", d->fd); + unix_stream_repair_off(d->fd); + } + close_safe(&d->fd); + } +} + struct unix_sk_info { UnixSkEntry *ue; struct list_head list; @@ -1278,6 +1378,7 @@ static int post_open_standalone(struct file_desc *d, int fd) struct unix_sk_info *peer; struct sockaddr_un addr; int cwd_fd = -1, root_fd = -1, ns_fd = -1; + int ret, value;
ui = container_of(d, struct unix_sk_info, d); BUG_ON((ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE)) || @@ -1335,7 +1436,23 @@ static int post_open_standalone(struct file_desc *d, int fd) * while we're connecting in sake of ghost sockets. */ mutex_lock(mutex_ghost); - if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { + + /* we handle unix stream with external connections here */ + if (kdat.has_unix_sk_repair && peer->name + && ui->ue->type == SOCK_STREAM && ui->ue->repair_ino != 0) { + value = ui->ue->repair_ino; + ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &value, sizeof(value)); + if (ret < 0) { + /* permit the unix sk resume successfully when the peer has been + * closed, just warn here */ + pr_warn("Can't repair %d socket\n", value); + } + + ret = unix_stream_repair_off(fd); + if (ret < 0) { + goto err_revert_and_exit; + } + } else if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { pr_perror("Can't connect %d socket", ui->ue->ino); goto err_revert_and_exit; } @@ -2037,8 +2154,10 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue) }
ui->name = (void *)ue->name.data; - } else - ui->name = NULL; + } else { + /* part 2: prevent NULL pointer oops */ + ui->name = ""; + } ui->name_dir = (void *)ue->name_dir;
ui->flags = 0; diff --git a/images/sk-unix.proto b/images/sk-unix.proto index 2a3a7cc..610080a 100644 --- a/images/sk-unix.proto +++ b/images/sk-unix.proto @@ -52,4 +52,5 @@ message unix_sk_entry { optional uint32 ns_id = 16; optional sint32 mnt_id = 17 [default = -1]; /* Please, don't use field with number 18. */ + required sint32 repair_ino = 19; }
From: Liu Chao liuchao173@huawei.com
When sigev_notify_thread_id is not set, get_pid will return a NULL pointer and do_timer_create will return -EINVAL in the kernel, so criu will fail to create the posix timer:
(09.806760) pie: 41301: Error (criu/pie/restorer.c:1998): Can't restore posix timers -22 (09.806824) pie: 41301: Error (criu/pie/restorer.c:2133): Restorer fail 41301 (09.891880) Error (criu/cr-restore.c:2596): Restoring FAILED.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Liu Chao liuchao173@huawei.com --- criu/cr-restore.c | 1 + criu/include/posix-timer.h | 1 + criu/parasite-syscall.c | 1 + criu/pie/restorer.c | 1 + criu/proc_parse.c | 1 + images/timer.proto | 1 + 6 files changed, 6 insertions(+)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index ecebdfe..2ed61d0 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2848,6 +2848,7 @@ static inline int decode_posix_timer(PosixTimerEntry *pte, pt->spt.si_signo = pte->si_signo; pt->spt.it_sigev_notify = pte->it_sigev_notify; pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); + pt->spt.sigev_notify_thread_id = pte->sigev_notify_thread_id; pt->overrun = pte->overrun;
return 0; diff --git a/criu/include/posix-timer.h b/criu/include/posix-timer.h index fa99d86..11b7618 100644 --- a/criu/include/posix-timer.h +++ b/criu/include/posix-timer.h @@ -8,6 +8,7 @@ struct str_posix_timer { int clock_id; int si_signo; int it_sigev_notify; + int sigev_notify_thread_id; void * sival_ptr; };
diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index c7074c7..8d9e01b 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -316,6 +316,7 @@ static void encode_posix_timer(struct posix_timer *v, pte->si_signo = vp->spt.si_signo; pte->it_sigev_notify = vp->spt.it_sigev_notify; pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); + pte->sigev_notify_thread_id = vp->spt.sigev_notify_thread_id;
pte->overrun = v->overrun;
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0bd220a..5e06abb 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1224,6 +1224,7 @@ static int create_posix_timers(struct task_restore_args *args) sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; sev.sigev_signo = args->posix_timers[i].spt.si_signo; sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; + sev._sigev_un._tid = args->posix_timers[i].spt.sigev_notify_thread_id;
while (1) { ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 32d84b3..c8a18cf 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -2380,6 +2380,7 @@ int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args)
if ( tidpid[0] == 't') { timer->spt.it_sigev_notify = SIGEV_THREAD_ID; + timer->spt.sigev_notify_thread_id = pid_t; } else { switch (sigpid[0]) { case 's' : diff --git a/images/timer.proto b/images/timer.proto index a254a6f..41db460 100644 --- a/images/timer.proto +++ b/images/timer.proto @@ -19,6 +19,7 @@ message posix_timer_entry { required uint64 insec = 8; required uint64 vsec = 9; required uint64 vnsec = 10; + required int32 sigev_notify_thread_id = 11; }
message task_timers_entry {
From: "fu.lin" fu.lin10@huawei.com
In the original criu design, a SysVIPC memory segment which belongs to the host ipcns shouldn't be dumped, because criu requires the whole ipcns to be dumped. While restoring the ipcns, the new shared memory is created and filled with the original page data.
This patch makes it possible to restore shared memory in the host ipcns. Idea: the SysVIPC memory won't disappear after the task exits. The basic information can be obtained from `/proc/sysvipc/shm` as long as the system doesn't reboot. Compared with restoring the whole ipcns, the steps of creating the shared memory and filling in the page data are skipped.
Reference: - https://www.criu.org/What_cannot_be_checkpointed
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/cr-dump.c | 9 ++++----- criu/cr-restore.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-)
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 2bbcef3..e76fe5a 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -444,12 +444,11 @@ static int dump_filemap(struct vma_area *vma_area, int fd)
static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma) { - if (root_ns_mask & CLONE_NEWIPC) - return 0; + if (!(root_ns_mask & CLONE_NEWIPC)) + pr_info("Task %d with SysVIPC shmem map @%"PRIx64" lives in host IPC ns\n", + pid, vma->start);
- pr_err("Task %d with SysVIPC shmem map @%"PRIx64" doesn't live in IPC ns\n", - pid, vma->start); - return -1; + return 0; }
static int get_task_auxv(pid_t pid, MmEntry *mm) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 2ed61d0..ed82524 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1840,6 +1840,49 @@ static int create_children_and_session(void) return 0; }
+static int prepare_rootns_sysv_shm(unsigned long clone_flags) +{ + int retval = 0; + char *line = NULL; + size_t len = 0; + FILE *fp; + key_t key; + int shmid; + mode_t mode; + size_t size; + + /* This is completed by `prepare_namespace()` */ + if (!!(clone_flags & CLONE_NEWIPC)) + return 0; + + pr_info("Restoring SYSV shm in host namespace\n"); + + fp = fopen("/proc/sysvipc/shm", "r"); + if (fp == NULL) { + pr_err("Can't open '/proc/sysvipc/shm', errno(%d): %s\n", errno, strerror(errno)); + return -1; + } + +#if BITS_PER_LONG <= 32 +# define SIZE_SPEC "%10lu" +#else +# define SIZE_SPEC "%21lu" +#endif + + while (getline(&line, &len, fp) != -1) { + if (sscanf(line, "%10d %10d %4o" SIZE_SPEC, &key, &shmid, &mode, &size) != 4) + continue; + + retval = collect_sysv_shmem(shmid, size); + if (retval != 0) + goto out; + } + +out: + fclose(fp); + return retval; +} + static int restore_task_with_children(void *_arg) { struct cr_clone_arg *ca = _arg; @@ -1947,6 +1990,9 @@ static int restore_task_with_children(void *_arg) if (prepare_namespace(current, ca->clone_flags)) goto err;
+ if (prepare_rootns_sysv_shm(ca->clone_flags)) + goto err; + if (restore_finish_ns_stage(CR_STATE_PREPARE_NAMESPACES, CR_STATE_FORKING) < 0) goto err;
From: Xiaoguang Li lixiaoguang2@huawei.com
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 --- criu/sk-netlink.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index 3b86a7d..6d8ab2d 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -68,6 +68,17 @@ int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) return sk_collect_one(m->ndiag_ino, PF_NETLINK, &sd->sd, ns); }
+static int netlink_repair_on(int fd) +{ + int ret, aux = 1; + + ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); + if (ret < 0) + pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); + + return ret; +} + static bool can_dump_netlink_sk(int lfd) { int ret; @@ -90,6 +101,10 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) if (IS_ERR(sk)) goto err;
+ if (netlink_repair_on(lfd) < 0) { + goto err; + } + ne.id = id; ne.ino = p->stat.st_ino;
From: Jingxian He hejingxian@huawei.com
Add O_REPAIR flag when opening vma fd.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/files-reg.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/criu/files-reg.c b/criu/files-reg.c index e6ae042..6747a3a 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2334,6 +2334,7 @@ void filemap_ctx_fini(void) } }
+#define O_REPAIR 040000000 static int open_filemap(int pid, struct vma_area *vma) { u32 flags; @@ -2346,13 +2347,15 @@ static int open_filemap(int pid, struct vma_area *vma) */
BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags); - flags = vma->e->fdflags; + flags = vma->e->fdflags | O_REPAIR;
if (ctx.flags != flags || ctx.desc != vma->vmfd) { if (vma->e->status & VMA_AREA_MEMFD) ret = memfd_open(vma->vmfd, &flags); - else + else { + ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); + } if (ret < 0) return ret;
From: Jingxian He hejingxian@huawei.com
When the file mode and size are larger than the dumped values, allow the restoring process to succeed.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/config.c | 1 + criu/crtools.c | 1 + criu/files-reg.c | 8 +++++--- criu/include/cr_options.h | 1 + 4 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/criu/config.c b/criu/config.c index 9c4d8ce..006753a 100644 --- a/criu/config.c +++ b/criu/config.c @@ -548,6 +548,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("with-fd-cred", &opts.with_fd_cred), BOOL_OPT("dump-char-dev", &opts.dump_char_dev), BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + BOOL_OPT("weak-file-check", &opts.weak_file_check), { }, };
diff --git a/criu/crtools.c b/criu/crtools.c index 8694ed0..239464a 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -452,6 +452,7 @@ usage: " as checkout assisted by kernel.\n" " --dump-char-dev Dump char dev files as normal file with repair cmd\n" " --mask-exit-notify Mask task exit notify during dump and restore\n" +" --weak-file-check Allow file size and mod larger than dumping value\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/files-reg.c b/criu/files-reg.c index ba78c67..e6ae042 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2061,7 +2061,8 @@ static bool validate_file(const int fd, const struct stat *fd_status, { int result = 1;
- if (rfi->rfe->has_size && (fd_status->st_size != rfi->rfe->size)) { + if (rfi->rfe->has_size && ((!opts.weak_file_check && fd_status->st_size != rfi->rfe->size) || + (fd_status->st_size < rfi->rfe->size))) { pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n", rfi->path, fd_status->st_size, rfi->rfe->size); return false; @@ -2176,8 +2177,9 @@ ext: if (!validate_file(tmp, &st, rfi)) return -1;
- if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { - pr_err("File %s has bad mode 0%o (expect 0%o)\n", + if (rfi->rfe->has_mode && ((!opts.weak_file_check && st.st_mode != rfi->rfe->mode) || + (st.st_mode < rfi->rfe->mode))) { + pr_err("%d File %s has bad mode 0%o (expect 0%o)\n", opts.weak_file_check, rfi->path, (int)st.st_mode, rfi->rfe->mode); return -1; diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 5b3ff86..fc7818c 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -182,6 +182,7 @@ struct cr_options { int with_fd_cred; int dump_char_dev; int mask_exit_notify; + int weak_file_check; };
extern struct cr_options opts;
From: Liu Chao liuchao173@huawei.com
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/cr-restore.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index ed82524..4fd29a5 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -111,6 +111,9 @@ #endif
struct pstree_item *current; +#define NETWORK_COLLECTED 0x1 +#define NETWORK_UNLOCK 0x2 +static int network_status = 0;
static int restore_task_with_children(void *); static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); @@ -247,6 +250,7 @@ static int crtools_prepare_shared(void) /* Connections are unlocked from criu */ if (!files_collected() && collect_image(&inet_sk_cinfo)) return -1; + network_status |= NETWORK_COLLECTED;
if (collect_binfmt_misc()) return -1; @@ -2496,6 +2500,7 @@ skip_ns_bouncing:
/* Unlock network before disabling repair mode on sockets */ network_unlock(); + network_status |= NETWORK_UNLOCK;
/* * Stop getting sigchld, after we resume the tasks they @@ -2701,6 +2706,15 @@ int cr_restore_tasks(void)
err: cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); + if (ret < 0) { + if ((network_status & NETWORK_COLLECTED) == 0) { + if (!files_collected() && collect_image(&inet_sk_cinfo)) + pr_err("collect inet sk cinfo fail"); + } + if ((network_status & NETWORK_UNLOCK) == 0) + network_unlock(); + } + return ret; }
From: Sang Yan sangyan@huawei.com
Add a new option "--file-locks-repair" to enable repair mode while dumping file locks. Repair mode keeps the locks held while the process is killed during the dump operation; the locks are then resumed from repair mode when the process resumes.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Sang Yan sangyan@huawei.com --- criu/config.c | 1 + criu/cr-dump.c | 8 ++++++ criu/crtools.c | 1 + criu/file-lock.c | 10 +++++++ criu/include/cr_options.h | 1 + criu/include/fcntl.h | 7 +++++ criu/include/parasite-syscall.h | 2 ++ criu/include/parasite.h | 10 +++++++ criu/parasite-syscall.c | 33 ++++++++++++++++++++++ criu/pie/parasite.c | 50 +++++++++++++++++++++++++++++++++ 10 files changed, 123 insertions(+)
diff --git a/criu/config.c b/criu/config.c index 006753a..7b6da0d 100644 --- a/criu/config.c +++ b/criu/config.c @@ -549,6 +549,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("dump-char-dev", &opts.dump_char_dev), BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), BOOL_OPT("weak-file-check", &opts.weak_file_check), + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), { }, };
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index e76fe5a..db8e01c 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1398,6 +1398,14 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; }
+ if (opts.file_locks_repair) { + ret = parasite_dump_file_locks(parasite_ctl, pid); + if (ret) { + pr_err("Can't parasite dump file locks (pid: %d)\n", pid); + goto err_cure; + } + } + ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset, &misc); if (ret) { pr_err("Dump core (pid: %d) failed with %d\n", pid, ret); diff --git a/criu/crtools.c b/criu/crtools.c index 239464a..72d8514 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -453,6 +453,7 @@ usage: " --dump-char-dev Dump char dev files as normal file with repair cmd\n" " --mask-exit-notify Mask task exit notify during dump and restore\n" " --weak-file-check Allow file size and mod larger than dumping value\n" +" --file-locks-repair Use repair mode to dump and restore file locks\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/file-lock.c b/criu/file-lock.c index 8be7589..44ecc92 100644 --- a/criu/file-lock.c +++ b/criu/file-lock.c @@ -428,6 +428,8 @@ void discard_dup_locks_tail(pid_t pid, int fd) list_for_each_entry_safe_reverse(fl, p, &file_lock_list, list) { if (fl->owners_fd != fd || pid != fl->fl_holder) break; + if (fl->fl_kind == FL_POSIX) + continue;
list_del(&fl->list); xfree(fl); @@ -618,8 +620,12 @@ static int restore_file_lock(FileLockEntry *fle) cmd = fle->type; } else if (fle->type == F_RDLCK) { cmd = LOCK_SH; + if (opts.file_locks_repair) + cmd = LOCK_REPAIR; } else if (fle->type == F_WRLCK) { cmd = LOCK_EX; + if (opts.file_locks_repair) + cmd = LOCK_REPAIR; } else if (fle->type == F_UNLCK) { cmd = LOCK_UN; } else { @@ -645,6 +651,10 @@ static int restore_file_lock(FileLockEntry *fle) flk.l_pid = fle->pid; flk.l_type = fle->type;
+ if (opts.file_locks_repair) + if (fle->type == F_RDLCK || fle->type == F_WRLCK) + flk.l_type = F_REPAIR; + pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, " "start: %8"PRIx64", len: %8"PRIx64"\n", fle->flag, fle->type, fle->pid, fle->fd, diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index fc7818c..607b528 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -183,6 +183,7 @@ struct cr_options { int dump_char_dev; int mask_exit_notify; int weak_file_check; + int file_locks_repair; };
extern struct cr_options opts; diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h index 0936337..65f8b36 100644 --- a/criu/include/fcntl.h +++ b/criu/include/fcntl.h @@ -23,6 +23,13 @@ struct f_owner_ex { #define F_SETCRED 18 #endif
+#ifndef F_NEED_REPAIR +#define F_NEED_REPAIR 16 +#define F_REPAIR 32 +#define LOCK_NEED_REPAIR 256 /* REPAIRING lock */ +#define LOCK_REPAIR 512 /* REPAIR lock */ +#endif + /* * These things are required to compile on CentOS-6 */ diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h index c86a724..14e1f31 100644 --- a/criu/include/parasite-syscall.h +++ b/criu/include/parasite-syscall.h @@ -54,4 +54,6 @@ extern int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_c
extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type);
+extern int parasite_dump_file_locks(struct parasite_ctl *ctl, int pid); + #endif /* __CR_PARASITE_SYSCALL_H__ */ diff --git a/criu/include/parasite.h b/criu/include/parasite.h index d957094..1c702f0 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -35,6 +35,7 @@ enum { PARASITE_CMD_CHECK_VDSO_MARK, PARASITE_CMD_CHECK_AIOS, PARASITE_CMD_DUMP_CGROUP, + PARASITE_CMD_DUMP_FILELOCKS,
PARASITE_CMD_MAX, }; @@ -236,6 +237,15 @@ struct parasite_dump_cgroup_args { char contents[1 << 12]; };
+struct parasite_dump_filelocks_args { + short kind; + short type; + long start; + long len; + int pid; + int fd; +}; + #endif /* !__ASSEMBLY__ */
#endif /* __CR_PARASITE_H__ */ diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 8d9e01b..8fdb475 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -32,6 +32,7 @@ #include <compel/plugins/std/syscall-codes.h> #include "signal.h" #include "sigframe.h" +#include "file-lock.h"
#include <string.h> #include <stdlib.h> @@ -591,3 +592,35 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item,
return ctl; } + +int parasite_dump_file_locks(struct parasite_ctl *ctl, int pid) +{ + struct parasite_dump_filelocks_args *args; + struct file_lock *fl; + int ret; + + args = compel_parasite_args(ctl, struct parasite_dump_filelocks_args); + + list_for_each_entry(fl, &file_lock_list, list) { + if (fl->real_owner != pid) + continue; + + args->pid = fl->real_owner; + args->fd = fl->owners_fd; + args->kind = fl->fl_kind; + args->type = fl->fl_ltype; + args->start = fl->start; + if (!strncmp(fl->end, "EOF", 3)) + args->len = 0; + else + args->len = (atoll(fl->end) + 1) - fl->start; + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_FILELOCKS, ctl); + if (ret < 0) { + pr_err("Parasite dump file lock failed! (pid: %d)\n", pid); + return ret; + } + } + + return 0; +} diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index d839783..635c3f8 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -7,6 +7,8 @@ #include <stdarg.h> #include <sys/ioctl.h> #include <sys/uio.h> +#include <unistd.h> +#include <fcntl.h>
#include "common/config.h" #include "int.h" @@ -20,6 +22,7 @@ #include "criu-log.h" #include "tty.h" #include "aio.h" +#include "file-lock.h"
#include "asm/parasite.h" #include "restorer.h" @@ -677,6 +680,50 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) return 0; }
+static int set_filelocks_needrepair(struct parasite_dump_filelocks_args *args) +{ + int ret; + + if (args->kind == FL_FLOCK) { + if (args->type == F_RDLCK || args->type == F_WRLCK) { + int cmd = LOCK_NEED_REPAIR; + + pr_info("Need Repair flock kind: %d, type: %d, cmd: %d, pid: %d, fd: %d\n", + args->kind, args->type, cmd, args->pid, args->fd); + + ret = sys_flock(args->fd, cmd); + if (ret < 0) { + pr_err("Can not set NEED_REPAIR flock!\n"); + return ret; + } + } + } else if (args->kind == FL_POSIX) { + if (args->type == F_RDLCK || args->type == F_WRLCK) { + struct flock flk; + memset(&flk, 0, sizeof(flk)); + + flk.l_whence = SEEK_SET; + flk.l_start = args->start; + flk.l_len = args->len; + flk.l_pid = args->pid; + flk.l_type = F_NEED_REPAIR; + + pr_info("Need Repair posix lock kind: %d, type: %d, cmd: %d, pid: %d, fd: %d, " + "start: %8"PRIx64", len: %8"PRIx64"\n", + args->kind, args->type, flk.l_type, args->pid, args->fd, + args->start, args->len); + + ret = sys_fcntl(args->fd, F_SETLKW, (long)&flk); + if (ret < 0) { + pr_err("Can not set NEED_REPAIR posix lock!\n"); + return ret; + } + } + } + + return 0; +} + void parasite_cleanup(void) { if (mprotect_args) { @@ -729,6 +776,9 @@ int parasite_daemon_cmd(int cmd, void *args) case PARASITE_CMD_DUMP_CGROUP: ret = parasite_dump_cgroup(args); break; + case PARASITE_CMD_DUMP_FILELOCKS: + ret = set_filelocks_needrepair(args); + break; default: pr_err("Unknown command in parasite daemon thread leader: %d\n", cmd); ret = -1;
From: Sang Yan sangyan@huawei.com
When a socket file is shared with another process, it is not freed during the dump. We can repair the socket file by installing it at the old fd number.
Add new options "--share-dst-ports" and "--share-src-ports" so that the user can tell criu which socket ports are shared.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/config.c | 8 ++ criu/crtools.c | 3 + criu/files.c | 18 ++++- criu/include/cr_options.h | 2 + criu/include/files.h | 4 + criu/include/net.h | 1 + criu/include/sk-inet.h | 3 + criu/sk-inet.c | 151 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 189 insertions(+), 1 deletion(-)
diff --git a/criu/config.c b/criu/config.c index 7b6da0d..cb647f7 100644 --- a/criu/config.c +++ b/criu/config.c @@ -541,6 +541,8 @@ int parse_options(int argc, char **argv, bool *usage_error, { "cgroup-yard", required_argument, 0, 1096 }, { "pre-dump-mode", required_argument, 0, 1097}, { "file-validation", required_argument, 0, 1098 }, + { "share-dst-ports", required_argument, 0, 1099 }, + { "share-src-ports", required_argument, 0, 1100 }, BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), BOOL_OPT("pin-memory", &opts.pin_memory), BOOL_OPT("use-fork-pid", &opts.use_fork_pid), @@ -879,6 +881,12 @@ int parse_options(int argc, char **argv, bool *usage_error, if (parse_file_validation_method(&opts, optarg)) return 2; break; + case 1099: + SET_CHAR_OPTS(share_dst_ports, optarg); + break; + case 1100: + SET_CHAR_OPTS(share_src_ports, optarg); + break; case 'V': pr_msg("Version: %s\n", CRIU_VERSION); if (strcmp(CRIU_GITID, "0")) diff --git a/criu/crtools.c b/criu/crtools.c index 72d8514..cf5fd0d 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -107,6 +107,9 @@ int main(int argc, char *argv[], char *envp[]) goto usage; }
+ if (parse_share_ports()) + goto usage; + log_set_loglevel(opts.log_level);
if (optind < argc && !strcmp(argv[optind], "swrk")) { diff --git a/criu/files.c b/criu/files.c index 34aa8be..0ebf26e 100644 --- a/criu/files.c +++ b/criu/files.c @@ -719,6 +719,8 @@ int dump_my_file(int lfd, u32 *id, int *type) return 0; }
+int dst_pid; + int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds) { @@ -743,7 +745,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id); if (!img) goto err; - + dst_pid = item->pid->real; ret = 0; /* Don't fail if nr_fds == 0 */ for (off = 0; ret == 0 && off < dfds->nr_fds; off += nr_fds) { if (nr_fds + off > dfds->nr_fds) @@ -1262,6 +1264,20 @@ static int open_fd(struct fdinfo_list_entry *fle) goto out; }
+ if (d->ops->type == FD_TYPES__INETSK) { + if (check_need_repair(d)) { + ret = repair_share_socket(d->id); + if (!ret) { + new_fd = get_share_socket(); + pr_info("get share socket:%d\n", new_fd); + if (new_fd <= 0 || setup_and_serve_out(fle, new_fd) < 0) + return -1; + fle->stage = FLE_RESTORED; + return 0; + } + } + } + /* * Open method returns the following values: * 0 -- restore is successfully finished; diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 607b528..8aa5d5a 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -184,6 +184,8 @@ struct cr_options { int mask_exit_notify; int weak_file_check; int file_locks_repair; + char *share_dst_ports; + char *share_src_ports; };
extern struct cr_options opts; diff --git a/criu/include/files.h b/criu/include/files.h index b12d079..85ca617 100644 --- a/criu/include/files.h +++ b/criu/include/files.h @@ -210,4 +210,8 @@ extern int open_transport_socket(void); extern int set_fds_event(pid_t virt); extern void wait_fds_event(void);
+extern int repair_share_socket(int id); +extern int check_need_repair(struct file_desc *d); +extern int get_share_socket(void); + #endif /* __CR_FILES_H__ */ diff --git a/criu/include/net.h b/criu/include/net.h index 0a556f3..795d5e8 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -16,6 +16,7 @@ extern int dump_net_ns(struct ns_id *ns); extern int prepare_net_namespaces(void); extern void fini_net_namespaces(void); extern int netns_keep_nsfd(void); +extern int parse_share_ports(void);
struct pstree_item; extern int restore_task_net_ns(struct pstree_item *current); diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index dec67ca..2e28444 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -102,4 +102,7 @@ union libsoccr_addr; int restore_sockaddr(union libsoccr_addr *sa, int family, u32 pb_port, u32 *pb_addr, u32 ifindex);
+#define MAX_SHARE_PORT_NUM 64 +extern int dst_pid; + #endif /* __CR_SK_INET_H__ */ diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 7a05de2..d29f03b 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -449,6 +449,152 @@ static bool needs_scope_id(uint32_t *src_addr) return false; }
+#define ADD_SHARE_SOCKET_PATH "/sys/kernel/add_share_socket" +#define REPAIR_SHARE_SOCKET_PATH "/sys/kernel/repair_share_socket" +#define SHARE_SOCKET_PATH "/sys/kernel/share_socket" + +int add_share_socket(u32 id, int fd, int pid, int port) +{ + int retval; + char buf[256] = {0}; + + retval = snprintf(buf, 256, "%u,%d,%d,%d", id, fd, pid, port); + if (retval <= 0) + return -EFAULT; + + fd = open(ADD_SHARE_SOCKET_PATH, O_WRONLY, 0); + if (fd < 0) { + pr_err("open file:%s fail\n", ADD_SHARE_SOCKET_PATH); + return fd; + } + + retval = write(fd, buf, strlen(buf)); + close(fd); + return retval < 0 ? -1 : 0; +} + + +int repair_share_socket(int id) +{ + int retval, fd; + char buf[256] = {0}; + + retval = snprintf(buf, 256, "%u", id); + if (retval <= 0) + return -EFAULT; + + fd = open(REPAIR_SHARE_SOCKET_PATH, O_WRONLY, 0); + if (fd < 0) { + pr_err("open file:%s fail\n", REPAIR_SHARE_SOCKET_PATH); + return fd; + } + retval = write(fd, buf, strlen(buf)); + + close(fd); + return retval < 0 ? -1 : 0; +} + +int get_share_socket(void) +{ + int fd; + ssize_t count; + int retval = -1; + char buf[32] = {0}; + + fd = open(SHARE_SOCKET_PATH, O_RDONLY, 0); + if (fd < 0) { + pr_err("open file:%s fail\n", SHARE_SOCKET_PATH); + return fd; + } + + count = read(fd, buf, sizeof(buf)); + if (count > 0) + retval = atoi(buf); + + close(fd); + return retval; +} + +int g_share_dst_ports[MAX_SHARE_PORT_NUM]; +int g_share_dst_port_num; +int g_share_src_ports[MAX_SHARE_PORT_NUM]; +int g_share_src_port_num; + +int parse_share_ports(void) +{ + char *save, *p; + + if (opts.share_dst_ports) { + p = strtok_r(opts.share_dst_ports, ",", &save); + while (p != NULL) { + if (g_share_dst_port_num >= MAX_SHARE_PORT_NUM) + return -1; + g_share_dst_ports[g_share_dst_port_num] = atoi(p); + if (!g_share_dst_ports[g_share_dst_port_num]) + return -1; + g_share_dst_port_num++; + p = strtok_r(NULL, ",", &save); + } + } + + if (opts.share_src_ports) { + p = strtok_r(opts.share_src_ports, ",", &save); + while (p != 
NULL) { + if (g_share_src_port_num >= MAX_SHARE_PORT_NUM) + return -1; + g_share_src_ports[g_share_src_port_num] = atoi(p); + if (!g_share_src_ports[g_share_src_port_num]) + return -1; + g_share_src_port_num++; + p = strtok_r(NULL, ",", &save); + } + } + return 0; +} + +int check_share_dst_port(int dst_port) +{ + int i; + int ret = 0; + + for (i = 0; i < g_share_dst_port_num; i++) { + if (dst_port == g_share_dst_ports[i]) { + ret = 1; + break; + } + } + return ret; +} + +int check_share_src_port(int src_port) +{ + int i; + int ret = 0; + + for (i = 0; i < g_share_src_port_num; i++) { + if (src_port == g_share_src_ports[i]) { + ret = 1; + break; + } + } + + return ret; +} + +int check_need_repair(struct file_desc *d) +{ + struct inet_sk_info *ii; + InetSkEntry *ie; + + ii = container_of(d, struct inet_sk_info, d); + ie = ii->ie; + if (check_share_dst_port(ie->dst_port) || + check_share_src_port(ie->src_port)) + return 1; + else + return 0; +} + static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int family) { struct inet_sk_desc *sk; @@ -507,6 +653,11 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
BUG_ON(sk->sd.already_dumped);
+ if (check_share_dst_port(sk->dst_port) || check_share_src_port(sk->src_port)) { + pr_info("Start add share prot:%d src %d\n", sk->dst_port, sk->src_port); + add_share_socket(id, lfd, dst_pid, sk->src_port); + } + ie.id = id; ie.ino = sk->sd.ino; if (sk->sd.sk_ns) {
From: Sang Yan sangyan@huawei.com
Clean up pin mem and netlink repair resources when the dump fails.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/cr-dump.c | 26 ++++++++++++++++++++++++++ criu/include/net.h | 1 + criu/sk-netlink.c | 40 +++++++++++++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 5 deletions(-)
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index db8e01c..600bc4c 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -83,6 +83,7 @@ #include "memfd.h" #include "timens.h" #include "img-streamer.h" +#include "restorer.h"
/* * Architectures can overwrite this function to restore register sets that @@ -1706,6 +1707,23 @@ static int cr_lazy_mem_dump(void) return ret; }
+int clear_pin_mem(int pid) +{ + int fd, ret; + + fd = open(PIN_MEM_FILE, O_RDWR, 0); + if (fd < 0) { + pr_warn("error open file: %s\n", PIN_MEM_FILE); + return -1; + } + ret = ioctl(fd, CLEAR_PIN_MEM_AREA, (unsigned long) &pid); + if (ret < 0) { + pr_warn("clear pin mem fail: %d\n", pid); + } + close(fd); + return ret; +} + static enum notifier_state notifier_state = NOTHING_COMPLETE;
static int cr_dump_finish(int ret) @@ -1791,6 +1809,14 @@ static int cr_dump_finish(int ret) if (ret == 0 && opts.pin_memory) { pr_info("start restore_task_special_pages\n"); restore_task_special_pages(0); + } else if (ret != 0 && opts.pin_memory) { + pr_info("clear pin mem info\n"); + clear_pin_mem(0); + } + + if (ret != 0 && opts.with_notifier_kup) { + pr_info("repair off netlink fd\n"); + netlink_repair_off(); }
if (ret != 0 && opts.with_notifier_kup) { diff --git a/criu/include/net.h b/criu/include/net.h index 795d5e8..bda0ff3 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -54,5 +54,6 @@ extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); extern struct ns_id *get_root_netns(void); extern int read_net_ns_img(void); +extern int netlink_repair_off(void);
#endif /* __CR_NET_H__ */ diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index 6d8ab2d..a6c56ff 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -68,15 +68,45 @@ int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) return sk_collect_one(m->ndiag_ino, PF_NETLINK, &sd->sd, ns); }
+struct netlink_repair_fd { + int netlink_fd; + struct list_head nlist; +}; + +static LIST_HEAD(netlink_repair_fds); + static int netlink_repair_on(int fd) { - int ret, aux = 1; + int ret, aux = 1; + struct netlink_repair_fd *nrf;
- ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); - if (ret < 0) - pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); + ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); + if (ret < 0) { + pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); + return ret; + } + nrf = malloc(sizeof(*nrf)); + if (!nrf) + return -ENOMEM; + nrf->netlink_fd = dup(fd); + list_add_tail(&nrf->nlist, &netlink_repair_fds); + return ret; +}
- return ret; +int netlink_repair_off(void) +{ + int aux = 0, ret; + struct netlink_repair_fd *nrf, *n; + + list_for_each_entry_safe(nrf, n, &netlink_repair_fds, nlist) { + ret = setsockopt(nrf->netlink_fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); + if (ret < 0) + pr_err("Failed to turn off repair mode on netlink\n"); + close(nrf->netlink_fd); + list_del(&nrf->nlist); + free(nrf); + } + return 0; }
static bool can_dump_netlink_sk(int lfd)
From: Zhuling zhuling8@huawei.com
Fix the file dump failure that occurs when the file's seek op is null.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/files-reg.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/criu/files-reg.c b/criu/files-reg.c index 6747a3a..6bbcbee 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2247,9 +2247,12 @@ static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) if (!(rfi->rfe->flags & O_PATH)) { if (rfi->rfe->pos != -1ULL && lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { - pr_perror("Can't restore file pos"); - close(fd); - return -1; + pr_info("No ability to restore file pos"); + if (errno != ESPIPE) { + pr_perror("Can't restore file pos"); + close(fd); + return -1; + } } }
From: Liu Chao liuchao173@huawei.com
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 --- criu/config.c | 8 +++- criu/cr-dump.c | 3 ++ criu/crtools.c | 1 + criu/include/cr_options.h | 1 + criu/include/sk-inet.h | 4 ++ criu/include/util.h | 2 + criu/net.c | 6 ++- criu/sk-tcp.c | 85 +++++++++++++++++++++++++++++++++++++++ 8 files changed, 108 insertions(+), 2 deletions(-)
diff --git a/criu/config.c b/criu/config.c index cb647f7..352b3d7 100644 --- a/criu/config.c +++ b/criu/config.c @@ -461,7 +461,7 @@ int parse_options(int argc, char **argv, bool *usage_error, {OPT_NAME, no_argument, SAVE_TO, true},\ {"no-" OPT_NAME, no_argument, SAVE_TO, false}
- static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:"; + static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:P:"; static struct option long_opts[] = { { "tree", required_argument, 0, 't' }, { "leave-stopped", no_argument, 0, 's' }, @@ -552,6 +552,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), BOOL_OPT("weak-file-check", &opts.weak_file_check), BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + {"reserve-ports", required_argument, 0, 'P' }, { }, };
@@ -895,6 +896,11 @@ int parse_options(int argc, char **argv, bool *usage_error, case 'h': *usage_error = false; return 2; + case 'P': + opts.reserve_ports = atoi(optarg); + if (opts.reserve_ports < 0) + goto bad_arg; + break; default: return 2; } diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 600bc4c..97dc8c9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1959,6 +1959,9 @@ int cr_dump_tasks(pid_t pid) goto err; }
+ if (opts.reserve_ports > 0) + set_reserved_ports(); + if (parent_ie) { inventory_entry__free_unpacked(parent_ie, NULL); parent_ie = NULL; diff --git a/criu/crtools.c b/criu/crtools.c index cf5fd0d..42cece0 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -457,6 +457,7 @@ usage: " --mask-exit-notify Mask task exit notify during dump and restore\n" " --weak-file-check Allow file size and mod larger than dumping value\n" " --file-locks-repair Use repair mode to dump and restore file locks\n" +" --reserve-ports Reserve src ports in kernel\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 8aa5d5a..c54d99b 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -186,6 +186,7 @@ struct cr_options { int file_locks_repair; char *share_dst_ports; char *share_src_ports; + int reserve_ports; };
extern struct cr_options opts; diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 2e28444..4181fbe 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -83,6 +83,10 @@ extern void tcp_locked_conn_add(struct inet_sk_info *); extern void rst_unlock_tcp_connections(void); extern void cpt_unlock_tcp_connections(void);
+extern void read_reserved_ports(char *path); +extern void write_reserved_ports(char *path); +extern void set_reserved_ports(void); + extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); extern int restore_one_tcp(int sk, struct inet_sk_info *si);
diff --git a/criu/include/util.h b/criu/include/util.h index 3a4b8f9..d1510fc 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -430,4 +430,6 @@ bool find_devname(const char *name); #define UNMASK_EXIT_NOTIFY_DIR "/sys/kernel/unmask_exit_notify" int mask_task_exit_notify(int pid, bool mask);
+#define RESERVED_PORTS_PATH "/proc/sys/net/ipv4/ip_local_reserved_ports" + #endif /* __CR_UTIL_H__ */ diff --git a/criu/net.c b/criu/net.c index 4f1f7d4..19329cf 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2897,7 +2897,6 @@ static int network_unlock_internal(void) if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1;
- ret |= iptables_restore(false, conf, sizeof(conf) - 1); if (kdat.ipv6) ret |= iptables_restore(true, conf, sizeof(conf) - 1); @@ -2926,6 +2925,11 @@ void network_unlock(void) { pr_info("Unlock network\n");
+ if (opts.reserve_ports) { + read_reserved_ports("ip_local_reserved_ports"); + write_reserved_ports(RESERVED_PORTS_PATH); + } + cpt_unlock_tcp_connections(); rst_unlock_tcp_connections();
diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 0409e22..67846c3 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -23,6 +23,7 @@ #include "kerndat.h" #include "restorer.h" #include "rst-malloc.h" +#include "xmalloc.h"
#include "protobuf.h" #include "images/tcp-stream.pb-c.h" @@ -33,6 +34,9 @@ static LIST_HEAD(cpt_tcp_repair_sockets); static LIST_HEAD(rst_tcp_repair_sockets);
+static char* reserved_ports; +static int reserved_ports_num; + static int tcp_repair_established(int fd, struct inet_sk_desc *sk) { int ret; @@ -475,3 +479,84 @@ void rst_unlock_tcp_connections(void) list_for_each_entry(ii, &rst_tcp_repair_sockets, rlist) nf_unlock_connection_info(ii); } + +void read_reserved_ports(char *path) +{ + FILE *file = NULL; + char *ch = NULL; + size_t size = 0; + + if (reserved_ports) { + free(reserved_ports); + reserved_ports = NULL; + } + + file = fopen(path, "r"); + if (!file) { + pr_err("Cannot fopen %s\n", path); + return; + } + + if (getline(&reserved_ports, &size, file) <= 0) + pr_err("Cannot getline from %s\n", path); + fclose(file); + + if (!reserved_ports) + return; + + ch = strstr(reserved_ports, "\n"); + if (ch) + *ch = '\0'; +} + +void write_reserved_ports(char *path) +{ + int fd = -1; + char buf[PATH_MAX]; + + fd = open(path, O_RDWR | O_CREAT, 0640); + if (fd < 0) { + pr_err("Cannot open %s ret %d cwd: %s\n", path, fd, buf); + return; + } + + cr_system(-1, fd, -1, "/usr/bin/echo", + (char *[]) { "echo", reserved_ports, NULL}, 0); + close(fd); +} + +static int add_reserved_ports(struct inet_sk_desc *sk) +{ + if (reserved_ports_num >= opts.reserve_ports) + return -1; + + if (strlen(reserved_ports) == 0) + snprintf(reserved_ports, 6, "%u", sk->src_port); + else + snprintf(reserved_ports + strlen(reserved_ports), 7, ",%u", sk->src_port); + reserved_ports_num++; + + return 0; +} + +void set_reserved_ports(void) +{ + struct inet_sk_desc *sk = NULL; + size_t size = 0; + + read_reserved_ports(RESERVED_PORTS_PATH); + + write_reserved_ports("ip_local_reserved_ports"); + + size = strlen(reserved_ports) + 6 * opts.reserve_ports + 1; + if (xrealloc_safe(&reserved_ports, size)) + exit(1); + + list_for_each_entry(sk, &cpt_tcp_repair_sockets, rlist) + add_reserved_ports(sk); + + write_reserved_ports(RESERVED_PORTS_PATH); + + free(reserved_ports); + reserved_ports = NULL; +}
From: Zhuling zhuling8@huawei.com
Fix the socket dump failure that occurs when user space has no access to get the socket filter.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/sockets.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/criu/sockets.c b/criu/sockets.c index 609bfb1..0c0b0e0 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -372,7 +372,9 @@ static int dump_socket_filter(int sk, SkOptsEntry *soe)
ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len); if (ret) { - pr_perror("Can't get socket filter len"); + pr_warn("Can't get socket filter len"); + if (errno == EACCES) + return 0; return ret; }
From: "fu.lin" fu.lin10@huawei.com
Some PCI devices create binary sysfs files that can be mapped with the `mmap()` syscall; the 6th parameter, `offset`, is always 0 when such files are mapped. The value of `offset` is assigned to `vma->vm_pgoff` in the kernel. However, it is automatically changed to the PCI address by the mmap callback function `pci_mmap_resource_range()`, so the offset shown in `/proc/<pid>/maps` is non-zero. This causes criu restore to fail.
There are many such files, so just retry the mmap with offset 0 when the first attempt fails with EINVAL.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: He Jingxian hejingxian@huawei.com Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/include/image.h | 1 + criu/pie/restorer.c | 16 +++++++++++++--- criu/proc_parse.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 3 deletions(-)
diff --git a/criu/include/image.h b/criu/include/image.h index 70f17a5..c929fd0 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -86,6 +86,7 @@ #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_ANON_INODE (1 << 15) #define VMA_AREA_CHR (1 << 16) +#define VMA_AREA_DEV_SHARE (1 << 17)
#define VMA_CLOSE (1 << 28) #define VMA_NO_PROT_WRITE (1 << 29) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 5e06abb..d87236d 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -871,9 +871,9 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) * that mechanism as it causes the process to be charged for memory * immediately upon mmap, not later upon preadv(). */ - pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n", + pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d %lx)\n", vma_entry->start, vma_entry->end, - prot, flags, (int)vma_entry->fd); + prot, flags, (int)vma_entry->fd, vma_entry->pgoff); /* * Should map memory here. Note we map them as * writable since we're going to restore page @@ -885,6 +885,15 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) vma_entry->fd, vma_entry->pgoff);
+ if (addr == -EINVAL) { + pr_info("need try mmap with offset 0\n"); + addr = sys_mmap(decode_pointer(vma_entry->start), + vma_entry_len(vma_entry), + prot, flags, + vma_entry->fd, + 0); + } + if ((vma_entry->fd != -1) && (vma_entry->status & VMA_CLOSE)) sys_close(vma_entry->fd); @@ -1883,7 +1892,8 @@ long __export_restore_task(struct task_restore_args *args) if (!vma_entry->has_madv || !vma_entry->madv) continue;
- if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) + if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE) || + vma_entry_is(vma_entry, VMA_AREA_DEV_SHARE)) continue;
for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { diff --git a/criu/proc_parse.c b/criu/proc_parse.c index c8a18cf..2c7b926 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -565,6 +565,35 @@ static inline int handle_vvar_vma(struct vma_area *vma) return 0; }
+static bool is_sysfs_resource(const char *path) +{ + char *sub = NULL; + const char *prefix = "resource"; + const char *suffix = "_wc"; + + if (strstr(path, "devices/") == NULL) + return false; + + sub = rindex(path, '/'); + if (sub == NULL) + return false; + + sub += 1; + if (strncmp(sub, prefix, strlen(prefix)) != 0) + return false; + + sub += strlen(prefix); + while (*sub != '\0' && (*sub >= '0' && *sub <= '9')) + sub += 1; + + if (*sub == '\0') + return true; + if (!strcmp(sub, suffix)) + return true; + else + return false; +} + static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_path, DIR *map_files_dir, struct vma_file_info *vfi, @@ -589,6 +618,9 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; + } else if (is_sysfs_resource(file_path)) { + pr_info("find sys device module share memory\n"); + vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_DEV_SHARE; } else { vma_area->e->status = VMA_AREA_REGULAR; }
From: Jingxian He hejingxian@huawei.com
Add a file reuse method for recovering the file state of deleted files.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/files-reg.c | 75 +++++++++++++++++++++++++++++++++++++--- criu/files.c | 24 ++++++++++--- criu/include/files-reg.h | 9 +++++ 3 files changed, 99 insertions(+), 9 deletions(-)
diff --git a/criu/files-reg.c b/criu/files-reg.c index 6bbcbee..46e9eab 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1184,6 +1184,70 @@ int strip_deleted(struct fd_link *link) return 0; }
+int add_reuse_file(u32 id, int fd, int pid) +{ + int retval; + char buf[256] = {0}; + + retval = snprintf(buf, 256, "%u,%d,%d", id, fd, pid); + if (retval <= 0) + return -EFAULT; + + fd = open(ADD_REUSE_FILE_PATH, O_WRONLY, 0); + if (fd < 0) { + pr_err("open file:%s fail\n", ADD_REUSE_FILE_PATH); + return fd; + } + + retval = write(fd, buf, strlen(buf)); + close(fd); + return retval < 0 ? -1 : 0; +} + + +int repair_reuse_file(int id) +{ + int retval, fd; + char buf[256] = {0}; + + retval = snprintf(buf, 256, "%u", id); + if (retval <= 0) + return -EFAULT; + + fd = open(REPAIR_REUSE_FILE_PATH, O_WRONLY, 0); + if (fd < 0) { + pr_err("open file:%s fail\n", REPAIR_REUSE_FILE_PATH); + return fd; + } + retval = write(fd, buf, strlen(buf)); + + close(fd); + return retval < 0 ? -1 : 0; +} + +int get_reuse_file(void) +{ + int fd; + ssize_t count; + int retval = -1; + char buf[32] = {0}; + + fd = open(REUSE_FILE_PATH, O_RDONLY , 0); + if (fd < 0) { + pr_err("open file:%s fail\n", REUSE_FILE_PATH); + return fd; + } + + count = read(fd, buf, sizeof(buf)); + if (count > 0) + retval = atoi(buf); + + close(fd); + return retval; +} + +extern int dst_pid; +extern int need_reuse_flag; static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, int lfd, u32 id, struct ns_id *nsid) { @@ -1301,9 +1365,12 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, * name. */
- if (errno == ENOENT) - return dump_linked_remap(rpath + 1, plen - 1, - ost, lfd, id, nsid); + if (errno == ENOENT) { + pr_info("start add no exist file:%s\n", rpath + 1); + add_reuse_file(id, lfd, dst_pid); + need_reuse_flag = O_REUSE; + return 0; + }
pr_perror("Can't stat path"); return -1; @@ -1724,7 +1791,7 @@ ext: rfe.mode = p->stat.st_mode;
if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags) && - !store_validation_data(&rfe, p, lfd)) + (need_reuse_flag != O_REUSE) && !store_validation_data(&rfe, p, lfd)) return -1;
fe.type = FD_TYPES__REG; diff --git a/criu/files.c b/criu/files.c index 0ebf26e..16d34fd 100644 --- a/criu/files.c +++ b/criu/files.c @@ -720,7 +720,7 @@ int dump_my_file(int lfd, u32 *id, int *type) }
int dst_pid; - +int need_reuse_flag; int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds) { @@ -758,7 +758,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item,
for (i = 0; i < nr_fds; i++) { FdinfoEntry e = FDINFO_ENTRY__INIT; - + need_reuse_flag = 0; ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds); if (ret < 0) @@ -768,7 +768,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, ret = 0; continue; } - + e.flags |= need_reuse_flag; pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); ret = pb_write_one(img, &e, PB_FDINFO); if (ret) @@ -964,8 +964,8 @@ int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool fake) { struct file_desc *fdesc;
- pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n", - pid, e->fd, e->id); + pr_info("Collect fdinfo pid=%d fd=%d id=%#x flags:%x\n", + pid, e->fd, e->id, e->flags);
fdesc = find_file_desc(e); if (fdesc == NULL) { @@ -1255,6 +1255,7 @@ static int open_fd(struct fdinfo_list_entry *fle) struct fdinfo_list_entry *flem; int new_fd = -1, ret;
+ pr_info("open file flags:%x\n", fle->fe->flags); flem = file_master(d); if (fle != flem) { BUG_ON (fle->stage != FLE_INITIALIZED); @@ -1276,6 +1277,19 @@ static int open_fd(struct fdinfo_list_entry *fle) return 0; } } + } else if (fle->fe->flags & O_REUSE) { + pr_info("find reuse file:%d\n", d->id); + ret = repair_reuse_file(d->id); + if (!ret) { + new_fd = get_reuse_file(); + pr_info("get reuse file:%d\n", new_fd); + if (new_fd <= 0 || setup_and_serve_out(fle, new_fd) < 0) { + pr_err("setup reuse file fail\n"); + return -1; + } + fle->stage = FLE_RESTORED; + return 0; + } }
/* diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h index 458fe89..4ec0e14 100644 --- a/criu/include/files-reg.h +++ b/criu/include/files-reg.h @@ -64,4 +64,13 @@ extern int strip_deleted(struct fd_link *link);
extern int dead_pid_conflict(void);
+#define ADD_REUSE_FILE_PATH "/sys/kernel/add_reuse_file" +#define REPAIR_REUSE_FILE_PATH "/sys/kernel/repair_reuse_file" +#define REUSE_FILE_PATH "/sys/kernel/reuse_file" +#define O_REUSE 0100000000 + +extern int add_reuse_file(u32 id, int fd, int pid); +extern int repair_reuse_file(int id); +extern int get_reuse_file(void); + #endif /* __CR_FILES_REG_H__ */
From: "fu.lin" fu.lin10@huawei.com
libmnl provides communication between user space and kernel space for netfilter netlink. Wrap it in a small abstraction here for later use.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/Makefile.crtools | 1 + criu/Makefile.packages | 6 ++ criu/include/nftables.h | 28 +++++++ criu/mnl.c | 165 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 200 insertions(+) create mode 100644 criu/include/nftables.h create mode 100644 criu/mnl.c
diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index a9008f0..ff6b597 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -90,6 +90,7 @@ obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o obj-y += devname.o +obj-y += mnl.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 13c346f..c1d87a5 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -7,6 +7,8 @@ REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel REQ-RPM-PKG-NAMES += $(PYTHON)-future +REQ-RPM-PKG-NAMES += libmnl-devel +REQ-RPM-PKG-NAMES += libnftnl-devel
REQ-RPM-PKG-TEST-NAMES += libaio-devel
@@ -18,6 +20,8 @@ REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf REQ-DEB-PKG-NAMES += $(PYTHON)-future REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev +REQ-DEB-PKG-NAMES += libmnl-dev +REQ-DEB-PKG-NAMES += libnftnl-dev
REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev @@ -31,6 +35,8 @@ REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml endif
export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet +export LIBS += $(shell pkg-config --libs libmnl) +export LIBS += $(shell pkg-config --libs libnftnl)
check-packages-failed: $(warning Can not find some of the required libraries) diff --git a/criu/include/nftables.h b/criu/include/nftables.h new file mode 100644 index 0000000..0bdab31 --- /dev/null +++ b/criu/include/nftables.h @@ -0,0 +1,28 @@ +#ifndef __CR_NFTABLES_H__ +#define __CR_NFTABLES_H__ + +#include <libmnl/libmnl.h> + +struct mnl_params { + struct mnl_socket *nl; + char *buf; + struct mnl_nlmsg_batch *batch; + uint32_t seq; +}; + +typedef struct nlmsghdr * (*buf_func_t)(struct mnl_params *mnl_params, void *args); +typedef int (*batch_func_t)(struct mnl_params *mnl_params, void *args); +typedef int (*mnl_func_t)(struct mnl_params *mnl, batch_func_t cb, void *args); + +struct mnl_cb_params { + pid_t tree_id; + bool create; + bool ipv6; +}; + +int mnl_sendmsg(batch_func_t batch_cb, void *args); +int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2); +int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, void *args, int *result); +int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, void *args, int *result); + +#endif /* __CR_NFTABLES_H__ */ diff --git a/criu/mnl.c b/criu/mnl.c new file mode 100644 index 0000000..3a03202 --- /dev/null +++ b/criu/mnl.c @@ -0,0 +1,165 @@ +#include <string.h> +#include <time.h> +#include <errno.h> + +#include <libnftnl/common.h> + +#include "nftables.h" +#include "log.h" + +int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2) +{ + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct mnl_params mnl = { + .seq = time(NULL), + }; + int retval = -1; + + mnl.nl = mnl_socket_open(NETLINK_NETFILTER); + if (mnl.nl == NULL) { + pr_err("mnl_socket_open failed with %d: %s\n", errno, strerror(errno)); + return -1; + } + + if (mnl_socket_bind(mnl.nl, 0, MNL_SOCKET_AUTOPID) < 0) { + pr_err("mnl_socket_bind wailed with %d: %s\n", errno, strerror(errno)); + goto err_mnl; + } + + mnl.buf = buf; + mnl.batch = mnl_nlmsg_batch_start(buf, sizeof(buf)); + if (mnl.batch == NULL) + goto err_mnl; + + if 
(mnl_cb(&mnl, arg1, arg2) < 0) + goto err_batch; + + retval = 0; + +err_batch: + mnl_nlmsg_batch_stop(mnl.batch); +err_mnl: + mnl_socket_close(mnl.nl); + + return retval; +} + +static int mnl_sendmsg_internal(struct mnl_params *mnl, batch_func_t cb, void *args) +{ + int retval = -1; + + nftnl_batch_begin(mnl_nlmsg_batch_current(mnl->batch), mnl->seq++); + mnl_nlmsg_batch_next(mnl->batch); + + if (cb(mnl, args) < 0) + goto err_batch; + + nftnl_batch_end(mnl_nlmsg_batch_current(mnl->batch), mnl->seq++); + mnl_nlmsg_batch_next(mnl->batch); + + if (mnl_socket_sendto(mnl->nl, mnl_nlmsg_batch_head(mnl->batch), + mnl_nlmsg_batch_size(mnl->batch)) < 0) { + pr_err("%s: mnl_socket_sendto failed with %d: %s\n", + __func__, errno, strerror(errno)); + goto err_batch; + } + + retval = 0; + +err_batch: + return retval; +} + +int mnl_sendmsg(batch_func_t batch_cb, void *args) +{ + return mnl_common(mnl_sendmsg_internal, batch_cb, args); +} + +int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, + void *args, int *result) +{ + struct mnl_socket *nl = mnl_params->nl; + struct mnl_nlmsg_batch *batch = mnl_params->batch; + uint32_t *seq = &mnl_params->seq; + char buf[MNL_SOCKET_BUFFER_SIZE]; + int retval; + + mnl_nlmsg_batch_reset(batch); + nftnl_batch_begin(mnl_nlmsg_batch_current(batch), (*seq)++); + mnl_nlmsg_batch_next(batch); + + if (cb(mnl_params, args) < 0) + return -1; + + nftnl_batch_end(mnl_nlmsg_batch_current(batch), (*seq)++); + mnl_nlmsg_batch_next(batch); + + if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), + mnl_nlmsg_batch_size(batch)) < 0) { + pr_err("%s: mnl_socket_sendto failed with %d: %s\n", + __func__, errno, strerror(errno)); + return -1; + } + + /* don't care the netlink retval, and nlmsg hdr flags has no `NLM_F_ACK` */ + if (result == NULL) + return 0; + + retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); + while (retval > 0) { + retval = mnl_cb_run(buf, retval, 0, mnl_socket_get_portid(nl), NULL, NULL); + if (retval <= 0) + 
break; + retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); + } + + if (retval < 0) { + pr_err("%s: mnl batch socket recv errno with %d: %s\n", + __func__, errno, strerror(errno)); + *result = errno; + return -1; + } + + *result = 0; + return 0; +} + +int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, + void *args, int *result) +{ + struct mnl_socket *nl = mnl_params->nl; + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + int retval = 0; + + if ((nlh = cb(mnl_params, args)) == NULL) + return -1; + + if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { + pr_err("%s: mnl_socket_sendto failed with %d: %s\n", + __func__, errno, strerror(errno)); + return -1; + } + + /* don't care the netlink retval, and nlmsg hdr flags has no `NLM_F_ACK` */ + if (result == NULL) + return 0; + + retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); + while (retval > 0) { + retval = mnl_cb_run(buf, retval, 0, mnl_socket_get_portid(nl), NULL, NULL); + if (retval <= 0) + break; + retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); + } + + if (retval < 0) { + pr_info("%s: mnl buf socket recv errno with %d: %s\n", + __func__, errno, strerror(errno)); + *result = errno; + return -1; + } + + *result = 0; + return 0; +}
From: Jingxian He hejingxian@huawei.com
Turn repair mode off on the shared sockets after reusing them, so that the shared socket state is recovered.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/files.c | 34 ++++++++++++++++++++++++++++++++-- criu/sk-inet.c | 6 ++++-- criu/sk-netlink.c | 5 +++-- 3 files changed, 39 insertions(+), 6 deletions(-)
diff --git a/criu/files.c b/criu/files.c index 16d34fd..2dd4d9c 100644 --- a/criu/files.c +++ b/criu/files.c @@ -54,7 +54,7 @@ #include "util.h" #include "images/fs.pb-c.h" #include "images/ext-file.pb-c.h" - +#include "sk-inet.h" #include "plugin.h"
#define FDESC_HASH_SIZE 64 @@ -1235,7 +1235,7 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) if (reopen_fd_as(fle->fe->fd, new_fd)) return -1;
- pr_info("*******flags: %d",fle->fe->flags); + pr_info("*******flags: %d\n",fle->fe->flags); if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { pr_perror("Unable to set file descriptor flags"); return -1; @@ -1249,6 +1249,30 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) return 0; }
+#define MAX_SHARE_SOCKETS_NUM 25000 +int repair_share_sockets[MAX_SHARE_SOCKETS_NUM]; +int repair_share_num; + +int add_repair_share_socket(int fd) +{ + if (repair_share_num >= MAX_SHARE_SOCKETS_NUM) + return -1; + repair_share_sockets[repair_share_num] = fd; + repair_share_num++; + return 0; +} + +void repair_off_share_sockets(void) +{ + int i; + + for (i = 0; i < repair_share_num; i++) { + tcp_repair_off(repair_share_sockets[i]); + pr_info("repair off socket:%d\n", repair_share_sockets[i]); + } + repair_share_num = 0; +} + static int open_fd(struct fdinfo_list_entry *fle) { struct file_desc *d = fle->desc; @@ -1267,6 +1291,7 @@ static int open_fd(struct fdinfo_list_entry *fle)
if (d->ops->type == FD_TYPES__INETSK) { if (check_need_repair(d)) { + pr_info("start repair for:%d\n", d->id); ret = repair_share_socket(d->id); if (!ret) { new_fd = get_share_socket(); @@ -1274,6 +1299,10 @@ static int open_fd(struct fdinfo_list_entry *fle) if (new_fd <= 0 || setup_and_serve_out(fle, new_fd) < 0) return -1; fle->stage = FLE_RESTORED; + if (add_repair_share_socket(fle->fe->fd)) { + pr_perror("add repair share socket fail\n"); + return -1; + } return 0; } } @@ -1388,6 +1417,7 @@ static int open_fdinfos(struct pstree_item *me) wait_fds_event(); } while (again || progress);
+ repair_off_share_sockets(); BUG_ON(!list_empty(list)); /* * Fake fles may be used for restore other diff --git a/criu/sk-inet.c b/criu/sk-inet.c index d29f03b..768c6ed 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -654,8 +654,10 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa BUG_ON(sk->sd.already_dumped);
if (check_share_dst_port(sk->dst_port) || check_share_src_port(sk->src_port)) { - pr_info("Start add share prot:%d src %d\n", sk->dst_port, sk->src_port); - add_share_socket(id, lfd, dst_pid, sk->src_port); + pr_info("Start add share port:%d-%d, dst_pid:%d id:%d\n", sk->dst_port, sk->src_port, dst_pid, id); + ret = add_share_socket(id, lfd, dst_pid, sk->src_port); + if (ret) + pr_warn("add share socket ret:%d\n", ret); }
ie.id = id; diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index a6c56ff..70d245a 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -115,9 +115,10 @@ static bool can_dump_netlink_sk(int lfd)
ret = fd_has_data(lfd); if (ret == 1) - pr_err("The socket has data to read\n"); + pr_warn("The socket has data to read\n");
- return ret == 0; + /* ignore netlink socket data */ + return true; }
static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p)
From: "fu.lin" fu.lin10@huawei.com
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/include/nftables.h | 2 + criu/nftables.c | 112 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+)
diff --git a/criu/include/nftables.h b/criu/include/nftables.h index 3b51a3d..e462919 100644 --- a/criu/include/nftables.h +++ b/criu/include/nftables.h @@ -162,5 +162,7 @@ struct nf_conn_params {
struct inet_sk_desc; int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id); +int nft_lock(void); +int nft_unlock(void);
#endif /* __CR_NFTABLES_H__ */ diff --git a/criu/nftables.c b/criu/nftables.c index 57774e6..817f157 100644 --- a/criu/nftables.c +++ b/criu/nftables.c @@ -821,3 +821,115 @@ int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id)
return mnl_sendmsg(nf_connection_switch_raw, ¶m); } + +static int nft_ns_rule_internal(uint8_t family, struct mnl_params *mnl_params, + struct nft_rule_params *params, bool create) +{ + struct nftnl_rule *rule; + + rule = setup_rule(family, TABLE_NAME, params, create, true); + if (rule == NULL) + return -1; + + if (create) { + construct_rule_batch(mnl_params->batch, NFT_MSG_NEWRULE, family, + NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, + mnl_params->seq++, rule); + } else { + construct_rule_batch(mnl_params->batch, NFT_MSG_DELRULE, family, + 0, mnl_params->seq++, rule); + } + + return 0; +} + +static int nft_ns_rule_raw(struct mnl_params *mnl_params, struct mnl_cb_params *args, + struct nft_rule_params *params) +{ + params->chain_name = INPUT_CHAIN_NAME; + if (nft_ns_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { + pr_err("%s: create nft input rule failed!\n", __func__); + return -1; + } + + params->chain_name = OUTPUT_CHAIN_NAME; + if (nft_ns_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { + pr_err("%s: create nft output rule failed!\n", __func__); + return -1; + } + + return 0; +} + +static int nft_ns_rule(struct mnl_params *mnl_params, void *args) +{ + struct nft_rule_params params = { 0 }; + + params.mark = 0; + params.mark_op = NFT_CMP_EQ; + params.stmt = NF_DROP; + if (nft_ns_rule_raw(mnl_params, args, ¶ms) < 0) + return -1; + + params.mark = SOCCR_MARK; + + params.stmt = NF_ACCEPT; + if (nft_ns_rule_raw(mnl_params, args, ¶ms) < 0) + return -1; + + return 0; +} + +static int nft_ns_rule_common(struct mnl_params *mnl_params, bool create) +{ + struct mnl_cb_params params = { + .create = create, + }; + int result = 0; + + if (create && + (mnl_batch_send_and_recv(mnl_params, nft_ns_rule, ¶ms, &result) < 0 + && (result != 0 && result != EEXIST))) { + pr_err("%s: crete ns rule failed!\n", __func__); + return -1; + } else if (!create && + (mnl_batch_send_and_recv(mnl_params, nft_ns_rule, ¶ms, NULL) < 0)) { + pr_err("%s: delete ns 
rule failed!\n", __func__); + return -1; + } + + return 0; +} + +static int network_lock_internal(struct mnl_params *params, + batch_func_t _, void *args) +{ + if (nft_table_prepare(params) < 0) + return -1; + + if (nft_chain_prepare(params) < 0) + return -1; + + if (nft_ns_rule_common(params, true) < 0) + return -1; + + return 0; +} + +int nft_lock(void) +{ + return mnl_common(network_lock_internal, NULL, NULL); +} + +static int network_unlock_internal(struct mnl_params *params, batch_func_t _, + void *args) +{ + if (nft_ns_rule_common(params, false) < 0) + return -1; + return 0; +} + +int nft_unlock(void) +{ + return mnl_common(network_unlock_internal, NULL, NULL); +}
From: Jingxian He hejingxian@huawei.com
The sigaction handlers registered in the restorer overwrite the original sigaction handlers of the application being restored. We need to remove them or restore the originals before resuming the application.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/pie/restorer.c | 20 -------------------- 1 file changed, 20 deletions(-)
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index d87236d..603cbee 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1632,29 +1632,9 @@ long __export_restore_task(struct task_restore_args *args) pr_err("Failed to set SIGCHLD %ld\n", ret); goto core_restore_end; } - ret = sys_sigaction(SIGSEGV, &act, NULL, sizeof(k_rtsigset_t)); - if (ret) { - pr_err("Failed to set SIGCHLD %ld\n", ret); - goto core_restore_end; - } - - ret = sys_sigaction(SIGBUS, &act, NULL, sizeof(k_rtsigset_t)); - if (ret) { - pr_err("Failed to set SIGCHLD %ld\n", ret); - goto core_restore_end; - } - - ret = sys_sigaction(SIGILL, &act, NULL, sizeof(k_rtsigset_t)); - if (ret) { - pr_err("Failed to set SIGCHLD %ld\n", ret); - goto core_restore_end; - }
ksigemptyset(&to_block); ksigaddset(&to_block, SIGCHLD); - ksigaddset(&to_block, SIGSEGV); - ksigaddset(&to_block, SIGBUS); - ksigaddset(&to_block, SIGILL); ret = sys_sigprocmask(SIG_UNBLOCK, &to_block, NULL, sizeof(k_rtsigset_t)); if (ret) { pr_err("Failed to unblock SIGCHLD %ld\n", ret);
From: "fu.lin" fu.lin10@huawei.com
usage: criu --use-nft
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/config.c | 1 + criu/cr-dump.c | 4 ++-- criu/cr-restore.c | 4 ++-- criu/crtools.c | 1 + criu/include/cr_options.h | 1 + criu/include/net.h | 6 ++++-- criu/include/netfilter.h | 7 +++++-- criu/include/sk-inet.h | 2 +- criu/kerndat.c | 3 ++- criu/net.c | 37 +++++++++++++++++++++++++++---------- criu/netfilter.c | 14 +++++++++++--- criu/sk-tcp.c | 14 +++++++------- 12 files changed, 64 insertions(+), 30 deletions(-)
diff --git a/criu/config.c b/criu/config.c index 352b3d7..cdafe17 100644 --- a/criu/config.c +++ b/criu/config.c @@ -553,6 +553,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("weak-file-check", &opts.weak_file_check), BOOL_OPT("file-locks-repair", &opts.file_locks_repair), {"reserve-ports", required_argument, 0, 'P' }, + BOOL_OPT("use-nft", &opts.use_nft), { }, };
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 97dc8c9..2a1864c 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1782,7 +1782,7 @@ static int cr_dump_finish(int ret) * start rollback procedure and cleanup everything. */ if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) { - network_unlock(); + network_unlock(opts.tree_id); delete_link_remaps(); clean_cr_time_mounts(); } @@ -1935,7 +1935,7 @@ int cr_dump_tasks(pid_t pid) if (collect_pstree_ids()) goto err;
- if (network_lock()) + if (network_lock(opts.tree_id)) goto err;
if (collect_file_locks()) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 4fd29a5..7ec84c8 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2499,7 +2499,7 @@ skip_ns_bouncing: goto out_kill;
/* Unlock network before disabling repair mode on sockets */ - network_unlock(); + network_unlock(vpid(init)); network_status |= NETWORK_UNLOCK;
/* @@ -2712,7 +2712,7 @@ err: pr_err("collect inet sk cinfo fail"); } if ((network_status & NETWORK_UNLOCK) == 0) - network_unlock(); + network_unlock(vpid(root_item)); }
return ret; diff --git a/criu/crtools.c b/criu/crtools.c index 42cece0..4954f0c 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -458,6 +458,7 @@ usage: " --weak-file-check Allow file size and mod larger than dumping value\n" " --file-locks-repair Use repair mode to dump and restore file locks\n" " --reserve-ports Reserve src ports in kernel\n" +" --use-nft Use nft API instead of iptables cmd in network locking" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index c54d99b..236d1c7 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -187,6 +187,7 @@ struct cr_options { char *share_dst_ports; char *share_src_ports; int reserve_ports; + int use_nft; };
extern struct cr_options opts; diff --git a/criu/include/net.h b/criu/include/net.h index bda0ff3..4e704cc 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -30,8 +30,10 @@ struct veth_pair {
extern int collect_net_namespaces(bool for_dump);
-extern int network_lock(void); -extern void network_unlock(void); +extern int network_prepare(pid_t tree_id); +extern void network_unprepare(pid_t tree_id); +extern int network_lock(pid_t tree_id); +extern void network_unlock(pid_t tree_id); extern int network_lock_internal(void);
extern struct ns_desc net_ns_desc; diff --git a/criu/include/netfilter.h b/criu/include/netfilter.h index 35ef262..c92762c 100644 --- a/criu/include/netfilter.h +++ b/criu/include/netfilter.h @@ -1,9 +1,12 @@ #ifndef __CR_NETFILTER_H__ #define __CR_NETFILTER_H__
+#include <sys/types.h> +#include <stdbool.h> + struct inet_sk_desc; -extern int nf_lock_connection(struct inet_sk_desc *); -extern int nf_unlock_connection(struct inet_sk_desc *); +extern int nf_lock_connection(struct inet_sk_desc *, pid_t, bool); +extern int nf_unlock_connection(struct inet_sk_desc *, bool);
struct inet_sk_info; extern int nf_unlock_connection_info(struct inet_sk_info *); diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 4181fbe..88e0881 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -81,7 +81,7 @@ static inline void tcp_repair_off(int fd)
extern void tcp_locked_conn_add(struct inet_sk_info *); extern void rst_unlock_tcp_connections(void); -extern void cpt_unlock_tcp_connections(void); +extern void cpt_unlock_tcp_connections(bool);
extern void read_reserved_ports(char *path); extern void write_reserved_ports(char *path); diff --git a/criu/kerndat.c b/criu/kerndat.c index c87f551..cf9187a 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1095,7 +1095,8 @@ int kerndat_init(void) memset(&kdat, 0, sizeof(kdat));
preload_socket_modules(); - preload_netfilter_modules(); + if (!opts.use_nft) + preload_netfilter_modules();
if (check_pagemap()) { pr_err("check_pagemap failed when initializing kerndat.\n"); diff --git a/criu/net.c b/criu/net.c index 19329cf..30b1491 100644 --- a/criu/net.c +++ b/criu/net.c @@ -45,6 +45,7 @@ #include "util.h" #include "external.h" #include "fdstore.h" +#include "nftables.h"
#include "protobuf.h" #include "images/netdev.pb-c.h" @@ -2868,9 +2869,13 @@ int network_lock_internal(void) return -1;
- ret |= iptables_restore(false, conf, sizeof(conf) - 1); - if (kdat.ipv6) - ret |= iptables_restore(true, conf, sizeof(conf) - 1); + if (opts.use_nft) + ret = nft_lock(); + else { + ret |= iptables_restore(false, conf, sizeof(conf) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, conf, sizeof(conf) - 1); + }
if (ret) pr_err("Locking network failed: iptables-restore returned %d. " @@ -2897,9 +2902,13 @@ static int network_unlock_internal(void) if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1;
- ret |= iptables_restore(false, conf, sizeof(conf) - 1); - if (kdat.ipv6) - ret |= iptables_restore(true, conf, sizeof(conf) - 1); + if (opts.use_nft) + ret = nft_unlock(); + else { + ret |= iptables_restore(false, conf, sizeof(conf) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, conf, sizeof(conf) - 1); + }
if (restore_ns(nsret, &net_ns_desc)) ret = -1; @@ -2907,10 +2916,13 @@ static int network_unlock_internal(void) return ret; }
-int network_lock(void) +int network_lock(pid_t tree_id) { pr_info("Lock network\n");
+ if (opts.use_nft && opts.tcp_established_ok && network_prepare(tree_id) < 0) + return -1; + /* Each connection will be locked on dump */ if (!(root_ns_mask & CLONE_NEWNET)) return 0; @@ -2921,7 +2933,7 @@ int network_lock(void) return network_lock_internal(); }
-void network_unlock(void) +void network_unlock(pid_t tree_id) { pr_info("Unlock network\n");
@@ -2930,8 +2942,13 @@ void network_unlock(void) write_reserved_ports(RESERVED_PORTS_PATH); }
- cpt_unlock_tcp_connections(); - rst_unlock_tcp_connections(); + if (opts.use_nft && opts.tcp_established_ok) + network_unprepare(tree_id); + + cpt_unlock_tcp_connections(opts.use_nft); + + if (!opts.use_nft) + rst_unlock_tcp_connections();
if (root_ns_mask & CLONE_NEWNET) { run_scripts(ACT_NET_UNLOCK); diff --git a/criu/netfilter.c b/criu/netfilter.c index 368651c..b2ec7ed 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -15,6 +15,8 @@ #include "sk-inet.h" #include "kerndat.h"
+#include "nftables.h" + static char buf[512];
/* @@ -129,13 +131,19 @@ static int nf_connection_switch(struct inet_sk_desc *sk, bool lock) return ret; }
-int nf_lock_connection(struct inet_sk_desc *sk) +int nf_lock_connection(struct inet_sk_desc *sk, pid_t tree_id, bool use_nft) { - return nf_connection_switch(sk, true); + if (use_nft) + return nft_connection_switch(sk, true, tree_id); + else + return nf_connection_switch(sk, true); }
-int nf_unlock_connection(struct inet_sk_desc *sk) +int nf_unlock_connection(struct inet_sk_desc *sk, bool use_nft) { + if (use_nft) + return 0; + return nf_connection_switch(sk, false); }
diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 67846c3..a9a9047 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -55,7 +55,7 @@ static int tcp_repair_established(int fd, struct inet_sk_desc *sk) }
if (!(root_ns_mask & CLONE_NEWNET)) { - ret = nf_lock_connection(sk); + ret = nf_lock_connection(sk, opts.tree_id, opts.use_nft); if (ret < 0) goto err2; } @@ -70,21 +70,21 @@ static int tcp_repair_established(int fd, struct inet_sk_desc *sk)
err3: if (!(root_ns_mask & CLONE_NEWNET)) - nf_unlock_connection(sk); + nf_unlock_connection(sk, opts.use_nft); err2: close(sk->rfd); err1: return -1; }
-static void tcp_unlock_one(struct inet_sk_desc *sk) +static void tcp_unlock_one(struct inet_sk_desc *sk, bool use_nft) { int ret;
list_del(&sk->rlist);
- if (!(root_ns_mask & CLONE_NEWNET)) { - ret = nf_unlock_connection(sk); + if (!(root_ns_mask & CLONE_NEWNET) && !use_nft) { + ret = nf_unlock_connection(sk, false); if (ret < 0) pr_perror("Failed to unlock TCP connection"); } @@ -101,12 +101,12 @@ static void tcp_unlock_one(struct inet_sk_desc *sk) close(sk->rfd); }
-void cpt_unlock_tcp_connections(void) +void cpt_unlock_tcp_connections(bool use_nft) { struct inet_sk_desc *sk, *n;
list_for_each_entry_safe(sk, n, &cpt_tcp_repair_sockets, rlist) - tcp_unlock_one(sk); + tcp_unlock_one(sk, use_nft); }
static int dump_tcp_conn_state(struct inet_sk_desc *sk)
From: root root@localhost.localdomain
Add the 'clear-pin-memory' command for clearing pin memory data, and the 'init-pagemap-read' command for initializing the buffer used for reading page map info.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/crtools.c | 30 ++++++++++++++++++++++++++++++ criu/include/restorer.h | 4 ++++ 2 files changed, 34 insertions(+)
diff --git a/criu/crtools.c b/criu/crtools.c index 4954f0c..bac2992 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -46,6 +46,7 @@
#include "setproctitle.h" #include "sysctl.h" +#include "restorer.h"
void flush_early_log_to_stderr(void) __attribute__((destructor));
@@ -68,6 +69,25 @@ static int image_dir_mode(char *argv[], int optind) return -1; }
+int init_pagemap_read(int para) +{ + int fd, ret; + + fd = open(PIN_MEM_FILE, O_RDWR, 0); + if (fd < 0) { + pr_warn("error open file: %s\n", PIN_MEM_FILE); + return -1; + } + + ret = ioctl(fd, INIT_PAGEMAP_READ, (unsigned long) ¶); + if (ret < 0) { + pr_warn("Init pagemap read fail, errno: %s\n", strerror(errno)); + } + + close(fd); + return ret; +} + int main(int argc, char *argv[], char *envp[]) { int ret = -1; @@ -173,6 +193,14 @@ int main(int argc, char *argv[], char *envp[]) goto usage; }
+ if (!strcmp(argv[optind], "clear-pin-memory")) { + return clear_pin_mem(0); + } + + if (!strcmp(argv[optind], "init-pagemap-read")) { + return init_pagemap_read(0); + } + /* We must not open imgs dir, if service is called */ if (strcmp(argv[optind], "service")) { ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); @@ -324,6 +352,8 @@ usage: " dedup remove duplicates in memory dump\n" " cpuinfo dump writes cpu information into image file\n" " cpuinfo check validates cpu information read from image file\n" +" clear-pin-memory clear pin memory manage data\n" +" init-pagemap-read init data buffer for reading page map info\n" );
if (usage_error) { diff --git a/criu/include/restorer.h b/criu/include/restorer.h index f6b45d6..affc155 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -334,12 +334,14 @@ enum { #define _SET_PIN_MEM_AREA 1 #define _CLEAR_PIN_MEM_AREA 2 #define _REMAP_PIN_MEM_AREA 3 +#define _INIT_PAGEMAP_READ 5 #define _DUMP_SEPCIAL_PAGES 6 #define _RETORE_SEPCIAL_PAGES 7 #define _SET_FORK_PID 8 #define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) #define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) #define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) +#define INIT_PAGEMAP_READ _IOW(PIN_MEM_MAGIC, _INIT_PAGEMAP_READ, int) #define DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) #define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) #define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int) @@ -358,4 +360,6 @@ struct pin_mem_area_set { struct pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; };
+int clear_pin_mem(int pid); + #endif /* __CR_RESTORER_H__ */
There are two kinds of vmas: anonymous vmas and file-based vmas. For an anonymous vma, criu just maps the area and fills in its content; for a file-based vma, criu preprocesses it, for example by setting the `open_vm()` callback function.
The `/dev/hisi_sec2*` char device is different from normal char devices. The `open`, `mmap`, and `close` syscalls each have a special meaning. - `open`: allocate physical resource of the device - `mmap`: create instance - `close`: release physical resource A vma represents an instance in this device. One fd may be associated with a group of instances: one mmio (vma size is 2 pages, pgoff is 0) and one dus (vma size is 37 pages, pgoff is 0x2000). The dus vma is split into two vmas by `mprotect(addr, 0x5000, PROT_READ)`: one of size 0x20000 and one of size 0x5000.
This patch makes restoring /dev/hisi_sec* possible. Idea: It's impossible for criu to know the relationship between a vma and the mapped file fd. Therefore, just count the total number of fds while collecting /dev/hisi_sec* files, then tag each fd with the function (mmio or dus) it is used for during vma restoration, and assign an unused fd to the specific vma. During the `mmap()` process, the dus vma is split by `mprotect`.
Note: - criu uses ino to index the fd. - the driver for this physical device is hisi_sec2.ko, which is located in `drivers/crypto/hisilicon/sec2/` of the linux kernel. - the device name has the prefix "hisi_sec2", which is taken from `drivers/crypto/hisilicon/sec2/sec_main.c`.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fulin10@huawei.com --- criu/files-reg.c | 113 ++++++++++++++++++++++++++++++++++ criu/files.c | 17 ++++-- criu/include/files-reg.h | 8 +++ criu/include/util.h | 8 +++ criu/include/vma.h | 12 ++++ criu/pie/restorer.c | 129 ++++++++++++++++++++++++++++++++++++++- criu/proc_parse.c | 19 +++--- 7 files changed, 292 insertions(+), 14 deletions(-)
diff --git a/criu/files-reg.c b/criu/files-reg.c index 46e9eab..01e0895 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2441,6 +2441,109 @@ static int open_filemap(int pid, struct vma_area *vma) return 0; }
+#define MAX_HISI_SEC_SIZE 3 /* one physical device expose three char dev */ +static struct hlist_head hisi_sec_fds_hash[MAX_HISI_SEC_SIZE]; + +static int collect_hisi_sec_fds(struct list_head *list) +{ + struct fdinfo_list_entry *fle, *tmp; + struct chrfile_info *ci; + struct file_desc *d; + struct hisi_sec_desc *desc; + int idx; + int nr = 0; + + for (idx = 0; idx < MAX_HISI_SEC_SIZE; idx++) + INIT_HLIST_HEAD(&hisi_sec_fds_hash[idx]); + + list_for_each_entry_safe(fle, tmp, list, ps_list) { + d = fle->desc; + + if (d->ops->type != FD_TYPES__CHR) + continue; + + ci = container_of(d, struct chrfile_info, d); + + if (strstr(ci->path, HISI_SEC_DEV) != NULL) { + desc = shmalloc(sizeof(*desc)); + if (desc == NULL) + return -ENOMEM; + + desc->name = ci->path; + desc->fd = fle->fe->fd; + desc->mmio = desc->dus = 0; + + idx = (ci->path[strlen(ci->path)-1] - '0') % MAX_HISI_SEC_SIZE; + hlist_add_head(&desc->hash, &hisi_sec_fds_hash[idx]); + + nr += 1; + } + } + + return nr; +} + +static long delivery_hisi_sec_fd(struct list_head *fds, struct vma_area *vma) +{ + extern unsigned hisi_sec_fds_n; /* defined in criu/files.c */ + static bool initialized = false; + struct hisi_sec_desc *desc; + int fd = -1, idx; + + if (!initialized) { + int nr; + + pr_info("find %d fds for hisi_sec char device\n", hisi_sec_fds_n); + + nr = collect_hisi_sec_fds(fds); + if (nr != hisi_sec_fds_n) { + pr_err("Collected fds(%d) aren't equal opened(%d)\n", + nr, hisi_sec_fds_n); + return -1; + } + + initialized = true; + } else if (vma->e->pgoff != HISI_SEC_MMIO && vma->e->pgoff != HISI_SEC_DUS) { + /* It's impossible value for fd, just as a tag to show it's a + * vma by `mprotect` syscall. 
+ */ + return LONG_MAX; + } + + idx = (vma->e->name[strlen(vma->e->name)-1] - '0') % MAX_HISI_SEC_SIZE; + hlist_for_each_entry(desc, &hisi_sec_fds_hash[idx], hash) { + if (strcmp(desc->name, vma->e->name) != 0) + continue; + + if (vma->e->pgoff == HISI_SEC_MMIO && !desc->mmio) { + fd = desc->fd; + desc->mmio = true; + break; + } else if (vma->e->pgoff == HISI_SEC_DUS && !desc->dus) { + fd = desc->fd; + desc->dus = true; + break; + } + } + + return fd; +} + +static int handle_hisi_vma(struct list_head *fds, struct vma_area *vma) +{ + long fd = delivery_hisi_sec_fd(fds, vma); + + if (fd < 0) { + pr_err("find fd for char dev vma pgoff %lx named %s failed.\n", + vma->e->pgoff, vma->e->name); + return -1; + } + + vma->e->fd = fd; + + return 0; +} + int collect_chr_map(struct pstree_item *me, struct vma_area *vma) { struct list_head *list = &rsti(me)->fds; @@ -2448,6 +2551,13 @@ int collect_chr_map(struct pstree_item *me, struct vma_area *vma) struct chrfile_info *ci; bool exist_fd;
+ if (strstr(vma->e->name, HISI_SEC_DEV) != NULL) { + if (handle_hisi_vma(list, vma) != 0) { + return -1; + } else + goto out; + } + list_for_each_entry_safe(fle, tmp, list, ps_list) { struct file_desc *d = fle->desc;
@@ -2466,6 +2576,9 @@ int collect_chr_map(struct pstree_item *me, struct vma_area *vma) if (!exist_fd) return -EEXIST;
+out: + pr_info(" `- find fd %ld for dev %s at this vma\n", vma->e->fd, vma->e->name); + return 0; }
diff --git a/criu/files.c b/criu/files.c index 2dd4d9c..84d6563 100644 --- a/criu/files.c +++ b/criu/files.c @@ -64,6 +64,8 @@ static LIST_HEAD(fake_master_head);
static u32 max_file_desc_id = 0;
+unsigned hisi_sec_fds_n; + static void init_fdesc_hash(void) { int i; @@ -1847,11 +1849,14 @@ out: static int chrfile_open(struct file_desc *d, int *new_fd) { int fd, mntns_root; - int ret = 0; + int ret = -1; struct chrfile_info *ci;
ci = container_of(d, struct chrfile_info, d);
+ pr_info("charfile: Opening %s (repair %d index %d)\n", + ci->path, ci->cfe->repair, ci->cfe->index); + mntns_root = open_pid_proc(getpid()); fd = openat(mntns_root, ci->path, ci->cfe->flags | O_REPAIR); if (fd < 0){ @@ -1867,6 +1872,8 @@ static int chrfile_open(struct file_desc *d, int *new_fd) }
*new_fd = fd; + ret = 0; + return ret; err: close(fd); @@ -1889,10 +1896,12 @@ static int collect_one_chrfile(void *o, ProtobufCMessage *base, struct cr_img *i else ci->path = ci->cfe->name;
- pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); - file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); + /* collect `/dev/hisi_sec2*` fds */ + if (strstr(ci->path, HISI_SEC_DEV) != NULL) + hisi_sec_fds_n += 1;
- return 0; + pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); + return file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); }
struct collect_image_info chrfile_cinfo = { diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h index 4ec0e14..6c15a19 100644 --- a/criu/include/files-reg.h +++ b/criu/include/files-reg.h @@ -33,6 +33,14 @@ struct chrfile_info { char *path; };
+struct hisi_sec_desc { + struct hlist_node hash; + char *name; + bool mmio; + bool dus; + int fd; +}; + extern int open_reg_by_id(u32 id); extern int open_reg_fd(struct file_desc *); extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, diff --git a/criu/include/util.h b/criu/include/util.h index d1510fc..c176981 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -432,4 +432,12 @@ int mask_task_exit_notify(int pid, bool mask);
#define RESERVED_PORTS_PATH "/proc/sys/net/ipv4/ip_local_reserved_ports"
+#define HISI_SEC_DEV "hisi_sec2" /* `/dev/hisi_sec2*` char device */ + +/* here is the selection of offset in `mmap`, they're from drivers */ +enum hisi_sec_dev { + HISI_SEC_MMIO = 0x0, + HISI_SEC_DUS = 0x2000, +}; + #endif /* __CR_UTIL_H__ */ diff --git a/criu/include/vma.h b/criu/include/vma.h index 5e3f352..f649a95 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -133,4 +133,16 @@ static inline bool vma_entry_can_be_lazy(VmaEntry *e) !(vma_entry_is(e, VMA_AREA_VSYSCALL))); }
+struct vma_attr { + int prot; + int flags; +}; + +enum ALIEN_MAP_METHOD { + PGOFF_IS_ZERO, + MAP_THEN_PROTECT, + + MAX_ALIEN_MAP_METHOD, +}; + #endif /* __CR_VMA_H__ */ diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 603cbee..949384e 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -901,6 +901,129 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) return addr; }
+static unsigned long restore_map_then_protect_mapping(VmaEntry *curr, + struct vma_attr *curr_attr, + VmaEntry *next, + struct vma_attr *next_attr) +{ + int retval; + unsigned long addr; + + if (next->fd != LONG_MAX + || curr->end != next->start + || (vma_entry_len(curr) + curr->pgoff) != next->pgoff + || curr->prot == next->prot + || curr->flags != next->flags) { + pr_err("They looks not currect:\n"); + pr_err(" `- vma A: (%x %x %d %lx)\n", + curr_attr->prot, curr_attr->flags, + (int)curr->fd, curr->pgoff); + pr_err(" `- vma B: (%x %x %d %lx)\n", + next_attr->prot, next_attr->flags, + (int)next->fd, next->pgoff); + return -1; + } + + pr_info("\tmmap(%x %x %d %lx) in map then protect mapping\n", + curr_attr->prot, curr_attr->flags, + (int)curr->fd, curr->pgoff); + + addr = sys_mmap(decode_pointer(curr->start), + vma_entry_len(curr) + vma_entry_len(next), + curr_attr->prot, curr_attr->flags, curr->fd, curr->pgoff); + if (addr != curr->start) { + pr_err("%s: mmap failed with code %ld\n", __func__, addr); + goto out; + } + + pr_info("\t mprotect(%x)\n", next_attr->prot); + retval = sys_mprotect(decode_pointer(next->start), + vma_entry_len(next), next_attr->prot); + if (retval != 0) { + addr = retval; + pr_err("%s: mprotect failed with code %d\n", __func__, retval); + } + +out: + return addr; +} + +static unsigned long restore_pgoff_is_zero_mapping(VmaEntry *curr, struct vma_attr *attr) +{ + unsigned long addr; + + pr_debug("\tmmap(%x %x %d %lx) in pgoff is zero mapping\n", + attr->prot, attr->flags, (int)curr->fd, curr->pgoff); + + addr = sys_mmap(decode_pointer(curr->start), + vma_entry_len(curr), + attr->prot, attr->flags, + curr->fd, curr->pgoff); + + return addr; +} + +static unsigned long restore_hisi_sec_mapping(struct task_restore_args *args, + int i, int *step) +{ + VmaEntry *curr = args->vmas + i; + VmaEntry *next = args->vmas + i + 1; + struct vma_attr curr_attr = { + .prot = curr->prot, + .flags = curr->flags | MAP_FIXED, + }; + struct vma_attr next_attr = 
{ + .prot = next->prot, + .flags = next->flags | MAP_FIXED, + }; + unsigned long addr; + + switch (curr->pgoff) { + case HISI_SEC_MMIO: + addr = restore_pgoff_is_zero_mapping(curr, &curr_attr); + break; + case HISI_SEC_DUS: + *step = 2; + addr = restore_map_then_protect_mapping(curr, &curr_attr, next, &next_attr); + break; + default: + pr_err("invalid pgoff %lx for vma\n", curr->pgoff); + return -1; + } + return addr; +} + +static bool find(const char *s1, const char *s2) +{ + if (s1 == NULL || s2 == NULL) + return NULL; + + while (*s1 != '\0' && *s2 != '\0') { + if (*s1 == *s2) { + s1 += 1; + s2 += 1; + } else + s1 += 1; + + if (*s2 == '\0') + return true; + } + + return false; +} + +static unsigned long distribute_restore_mapping(struct task_restore_args *args, + int i, int *step) +{ + VmaEntry *vma = args->vmas + i; + struct vma_names *vma_name = args->vma_names + i; + + if (vma_entry_is(vma, VMA_AREA_CHR) && find(vma_name->name, HISI_SEC_DEV)) + return restore_hisi_sec_mapping(args, i, step); + else + return restore_mapping(vma); +} + /* * This restores aio ring header, content, head and in-kernel position * of tail. To set tail, we write to /dev/null and use the fact this @@ -1588,7 +1711,7 @@ int write_fork_pid(int pid) long __export_restore_task(struct task_restore_args *args) { long ret = -1; - int i; + int i, step; VmaEntry *vma_entry; unsigned long va; struct restore_vma_io *rio; @@ -1738,7 +1861,7 @@ long __export_restore_task(struct task_restore_args *args) /* * OK, lets try to map new one. */ - for (i = 0; i < args->vmas_n; i++) { + for (i = 0, step = 1; i < args->vmas_n; i += step, step = 1) { vma_entry = args->vmas + i; vma_name = args->vma_names + i;
@@ -1756,7 +1879,7 @@ long __export_restore_task(struct task_restore_args *args) if (vma_entry_is(vma_entry, VMA_PREMMAPED)) continue;
- va = restore_mapping(vma_entry); + va = distribute_restore_mapping(args, i, &step);
if (va != vma_entry->start) { pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 2c7b926..b3d1c0b 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -659,17 +659,22 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, /* regular file mapping -- supported */; else if (S_ISCHR(st_buf->st_mode)) { /* devzero mapping -- also makes sense */; - if (opts.dump_char_dev && (strstr(file_path, "uverbs") != NULL)) { - int len = strlen(file_path) + 1; - vma_area->e->status |= VMA_AREA_CHR; - vma_area->e->name = xmalloc(len); - if (!vma_area->e->name) { + + if (!opts.dump_char_dev) { + /* do nothing, it's original progoss */ + } else if (strstr(file_path, "uverbs") != NULL + || strstr(file_path, HISI_SEC_DEV) != NULL) { + int len = strlen(file_path) + 1; + + vma_area->e->status |= VMA_AREA_CHR; + vma_area->e->name = xmalloc(len); + if (!vma_area->e->name) { pr_err("alloc vma area name fail\n"); goto err; } strncpy(vma_area->e->name, file_path, len); - pr_info("uverbs name content is: %s\n", vma_area->e->name); - } + pr_info("vma name content is: %s\n", vma_area->e->name); + } } else { pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start); goto err;
We should try our best to ensure that criu succeeds. Unlike unix stream sockets, which are restored in repair mode, unix dgram sockets are restored by re-connecting them. Therefore, this patch does the following things:
- detect the unix dgram socket file when criu dumps a unix dgram socket
- add fault tolerance when connecting a unix dgram socket (focusing on the case where `/dev/log` disappears while rsyslog restarts)
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fulin10@huawei.com --- criu/sk-unix.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-)
diff --git a/criu/sk-unix.c b/criu/sk-unix.c index d4c15ce..b4c24ed 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -11,6 +11,7 @@ #include <stdlib.h> #include <dlfcn.h> #include <libgen.h> +#include <time.h>
#include "libnetlink.h" #include "cr_options.h" @@ -1371,6 +1372,33 @@ err: return -1; }
+/* + * Sometimes, `/dev/log` will disappear because of the restart of rsyslog when + * rotating, criu try to connect `/dev/log` will report error at this time. We + * should try our best to ensure the success of criu restoration. Therefore, + * retry three times here. + */ +static int unix_dgram_reconnect(int fd, struct sockaddr_un *addr, int len) +{ + int retval = 0; + struct timespec tim = { + .tv_sec = 0, + .tv_nsec = 5e+8, + }; + + for (int i = 0; i < 3; i++) { + nanosleep(&tim, NULL); + pr_warn("Can't connect unix socket(%s), %d retry\n", + addr->sun_path, i); + retval = connect(fd, (struct sockaddr *)addr, + sizeof(addr->sun_family) + len); + if (retval == 0) + break; + } + + return retval; +} + static int post_open_standalone(struct file_desc *d, int fd) { int fdstore_fd = -1, procfs_self_dir = -1, len; @@ -1453,8 +1481,11 @@ static int post_open_standalone(struct file_desc *d, int fd) goto err_revert_and_exit; } } else if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { - pr_perror("Can't connect %d socket", ui->ue->ino); - goto err_revert_and_exit; + if (ui->ue->type != SOCK_DGRAM || errno != ENOENT + || unix_dgram_reconnect(fd, &addr, len) != 0) { + pr_perror("Can't connect %d socket", ui->ue->ino); + goto err_revert_and_exit; + } } mutex_unlock(mutex_ghost);
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fulin10@huawei.com --- criu/sk-inet.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 768c6ed..b614cec 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -1187,8 +1187,24 @@ int inet_bind(int sk, struct inet_sk_info *ii) }
if (bind(sk, (struct sockaddr *)&addr, addr_size) == -1) { - pr_perror("Can't bind inet socket (id %d)", ii->ie->id); - return -1; + InetSkEntry *ie = ii->ie; + + /* + * Sometimes the ping-like program restoration may appear + * `bind()` error when it is specified the address. In view + * of the principle that we should try our best to restore the + * process, and ping-like program works abnormal can tolerate, + * just warn here instead of report error. + */ + if (ie->proto == IPPROTO_ICMP || ie->proto == IPPROTO_ICMPV6) { + pr_warn("Can't bind inet socket (id %d) proto %s\n", + ie->id, + ie->proto == IPPROTO_ICMP ? + "IPPROTO_ICMP" : "IPPROTO_ICMPV6"); + } else { + pr_perror("Can't bind inet socket (id %d)", ii->ie->id); + return -1; + } }
if (rst_freebind) {
From: "fu.lin" fu.lin10@huawei.com
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fu.lin10@huawei.com --- criu/Makefile.crtools | 1 + criu/include/nftables.h | 138 +++++++ criu/nftables.c | 823 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 962 insertions(+) create mode 100644 criu/nftables.c
diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index ff6b597..cda5b82 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -91,6 +91,7 @@ obj-y += vdso.o obj-y += timens.o obj-y += devname.o obj-y += mnl.o +obj-y += nftables.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/include/nftables.h b/criu/include/nftables.h index 0bdab31..3b51a3d 100644 --- a/criu/include/nftables.h +++ b/criu/include/nftables.h @@ -3,6 +3,99 @@
#include <libmnl/libmnl.h>
+#include <libnftnl/table.h> +#include <libnftnl/chain.h> +#include <libnftnl/set.h> +#include <libnftnl/rule.h> +#include <libnftnl/expr.h> + +#define construct_buf(buf, type, family, flags, seq, payload, cb_prefix) \ + ({ \ + struct nlmsghdr *_nlh; \ + \ + _nlh = nftnl_##cb_prefix##_nlmsg_build_hdr((buf), \ + (type), (family), (flags), (seq)); \ + nftnl_##cb_prefix##_nlmsg_build_payload(_nlh, (payload)); \ + nftnl_##cb_prefix##_free((payload)); \ + _nlh; \ + }) + +#define construct_table_buf(buf, type, family, flags, seq, payload) \ + construct_buf((buf), (type), (family), (flags), (seq), \ + (payload), table) + +#define construct_chain_buf(buf, type, family, flags, seq, payload) \ + construct_buf((buf), (type), (family), (flags), (seq), \ + (payload), chain) + +#define construct_batch(batch, type, family, flags, seq, payload, cb_prefix) \ + { \ + struct nlmsghdr *_nlh; \ + \ + _nlh = nftnl_##cb_prefix##_nlmsg_build_hdr( \ + mnl_nlmsg_batch_current(batch), \ + (type), (family), (flags), (seq)); \ + nftnl_##cb_prefix##_nlmsg_build_payload(_nlh, (payload)); \ + nftnl_##cb_prefix##_free((payload)); \ + mnl_nlmsg_batch_next((batch)); \ + } + +#define construct_table_batch(batch, type, family, flags, seq, payload) \ + construct_batch((batch), (type), (family), (flags), (seq), \ + (payload), table) + +#define construct_chain_batch(batch, type, family, flags, seq, payload) \ + construct_batch((batch), (type), (family), (flags), (seq), \ + (payload), chain) + +#define construct_set_batch(batch, type, family, flags, seq, payload) \ + construct_batch((batch), (type), (family), (flags), (seq), \ + (payload), set) + +#define construct_rule_batch(batch, type, family, flags, seq, payload) \ + construct_batch((batch), (type), (family), (flags), (seq), \ + (payload), rule) + +#define construct_set_elems_batch(batch, type, family, flags, seq, payload) \ + { \ + struct nlmsghdr *_nlh; \ + \ + _nlh = nftnl_nlmsg_build_hdr( \ + mnl_nlmsg_batch_current(batch), \ + (type), (family), 
(flags), (seq)); \ + nftnl_set_elems_nlmsg_build_payload(_nlh, (payload)); \ + nftnl_set_free((payload)); \ + mnl_nlmsg_batch_next((batch)); \ + } + +#define TABLE_NAME "filter" +#define INPUT_CHAIN_NAME "criu-input" +#define OUTPUT_CHAIN_NAME "criu-output" +#define INPUT_IPV4_SET_NAME "criu-input-ipv4-blacklist-%d" +#define INPUT_IPV6_SET_NAME "criu-input-ipv6-blacklist-%d" +#define OUTPUT_IPV4_SET_NAME "criu-output-ipv4-blacklist-%d" +#define OUTPUT_IPV6_SET_NAME "criu-output-ipv6-blacklist-%d" + +/* set key type, see nftables/include/datatypes.h + * The rule of the datatype calculation: + * Each type occupies 6 bits, type: + * - ipaddr: 7, 4 bytes + * - ip6addr: 8, 16 types + * - inet_service: 13, 2 bytes (pading to 4 bytes) + * + * 0x1cd1cd: 0b 000111 001101 000111 001101 + * 0x20d20d: 0b 001000 001101 001000 001101 + */ +#define INET_SERVICE_LEN 2 +#define IPADDR_LEN 4 +#define IP6ADDR_LEN 16 +#define div_round_up(n, d) (((n) + (d) - 1) / (d)) + +#define IPv4_KEY_TYPE 0x1cd1cd +#define IPv4_KEY_LEN div_round_up(IPADDR_LEN + INET_SERVICE_LEN, 4) * 4 * 2 +#define IPv6_KEY_TYPE 0x20d20d +#define IPv6_KEY_LEN div_round_up(IP6ADDR_LEN + INET_SERVICE_LEN, 4) * 4 * 2 + struct mnl_params { struct mnl_socket *nl; char *buf; @@ -25,4 +118,49 @@ int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2); int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, void *args, int *result); int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, void *args, int *result);
+struct nft_chain_params { + char *name; + uint32_t hooknum; + char *type; + uint32_t prio; + uint32_t policy; +}; + +struct nft_set_params { + char name[128]; + uint32_t id; + uint32_t datatype; + uint32_t key_len; +}; + +struct nft_rule_params { + char *chain_name; + char set_name[128]; + uint32_t mark; + uint16_t mark_op; + uint32_t nfproto; + uint8_t l4proto; + unsigned int stmt; + bool ipv6; +}; + +struct nft_set_elem_params { + char set_name[128]; + char data[40]; + size_t data_len; +}; + +struct nf_conn_params { + uint8_t family; + uint32_t *src_addr; + uint16_t src_port; + uint32_t *dst_addr; + uint16_t dst_port; + bool lock; + pid_t tree_id; +}; + +struct inet_sk_desc; +int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id); + #endif /* __CR_NFTABLES_H__ */ diff --git a/criu/nftables.c b/criu/nftables.c new file mode 100644 index 0000000..57774e6 --- /dev/null +++ b/criu/nftables.c @@ -0,0 +1,823 @@ +#include <libmnl/libmnl.h> +#include <stddef.h> +#include <string.h> +#include <sys/socket.h> +#include <time.h> + +#include <netinet/in.h> +#include <arpa/inet.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nf_tables.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/ipv6.h> + +#include "sk-inet.h" +#include "nftables.h" + +#include "../soccr/soccr.h" + +#include "log.h" + +static struct nftnl_table *setup_table(uint8_t family, const char *table) +{ + struct nftnl_table *t; + + t = nftnl_table_alloc(); + if (t == NULL) + return NULL; + + nftnl_table_set_u32(t, NFTNL_TABLE_FAMILY, family); + if (nftnl_table_set_str(t, NFTNL_TABLE_NAME, table) < 0) + goto err; + + return t; +err: + nftnl_table_free(t); + return NULL; +} + +static struct nftnl_chain *setup_chain(const char *table, + struct nft_chain_params *params, + bool create) +{ + struct nftnl_chain *c; + + c = nftnl_chain_alloc(); + if (c == NULL) + return NULL; + + if (nftnl_chain_set_str(c, NFTNL_CHAIN_TABLE, 
table) < 0) + goto err; + if (nftnl_chain_set_str(c, NFTNL_CHAIN_NAME, params->name) < 0) + goto err; + if (create) { + nftnl_chain_set_u32(c, NFTNL_CHAIN_HOOKNUM, params->hooknum); + if (nftnl_chain_set_str(c, NFTNL_CHAIN_TYPE, params->type) < 0) + goto err; + nftnl_chain_set_u32(c, NFTNL_CHAIN_PRIO, params->prio); + nftnl_chain_set_u32(c, NFTNL_CHAIN_POLICY, params->policy); + } + + return c; +err: + nftnl_chain_free(c); + return NULL; +} + +static struct nftnl_set *setup_set(uint8_t family, const char *table, + struct nft_set_params *params, + bool create) +{ + struct nftnl_set *s; + + s = nftnl_set_alloc(); + if (s == NULL) + return NULL; + + if (nftnl_set_set_str(s, NFTNL_SET_TABLE, table) < 0) + goto err; + if (nftnl_set_set_str(s, NFTNL_SET_NAME, params->name) < 0) + goto err; + if (create) { + nftnl_set_set_u32(s, NFTNL_SET_FAMILY, family); + nftnl_set_set_u32(s, NFTNL_SET_ID, params->id); + + nftnl_set_set_u32(s, NFTNL_SET_KEY_TYPE, params->datatype); + nftnl_set_set_u32(s, NFTNL_SET_KEY_LEN, params->key_len); + } + + return s; +err: + nftnl_set_free(s); + return NULL; +} + +static int add_mark(struct nftnl_rule *r, uint32_t meta_key, enum nft_registers dreg) +{ + struct nftnl_expr *e; + + e = nftnl_expr_alloc("meta"); + if (e == NULL) + return -1; + + nftnl_expr_set_u32(e, NFTNL_EXPR_META_KEY, meta_key); + nftnl_expr_set_u32(e, NFTNL_EXPR_META_DREG, dreg); + + nftnl_rule_add_expr(r, e); + + return 0; +} + +static int add_proto(struct nftnl_rule *r, enum nft_registers dreg) +{ + struct nftnl_expr *e; + + e = nftnl_expr_alloc("meta"); + if (e == NULL) + return -1; + + nftnl_expr_set_u32(e, NFTNL_EXPR_META_KEY, NFT_META_L4PROTO); + nftnl_expr_set_u32(e, NFTNL_EXPR_META_DREG, dreg); + + nftnl_rule_add_expr(r, e); + + return 0; +} + +static int add_payload(struct nftnl_rule *r, uint32_t base, uint32_t dreg, + uint32_t offset, uint32_t len) +{ + struct nftnl_expr *e; + + e = nftnl_expr_alloc("payload"); + if (e == NULL) + return -1; + + nftnl_expr_set_u32(e, 
NFTNL_EXPR_PAYLOAD_BASE, base); + nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_DREG, dreg); + nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_OFFSET, offset); + nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_LEN, len); + + nftnl_rule_add_expr(r, e); + + return 0; +} + +static int add_cmp(struct nftnl_rule *r, enum nft_registers sreg, uint32_t op, + const void *data, uint32_t data_len) +{ + struct nftnl_expr *e; + + e = nftnl_expr_alloc("cmp"); + if (e == NULL) + return -1; + + nftnl_expr_set_u32(e, NFTNL_EXPR_CMP_SREG, sreg); + nftnl_expr_set_u32(e, NFTNL_EXPR_CMP_OP, op); + nftnl_expr_set(e, NFTNL_EXPR_CMP_DATA, data, data_len); + + nftnl_rule_add_expr(r, e); + + return 0; +} + +static int add_lookup(struct nftnl_rule *r, enum nft_registers sreg, + const char *set) +{ + struct nftnl_expr *e; + + e = nftnl_expr_alloc("lookup"); + if (e == NULL) + return -1; + + if (nftnl_expr_set_str(e, NFTNL_EXPR_LOOKUP_SET, set) < 0) + goto err; + nftnl_expr_set_u32(e, NFTNL_EXPR_LOOKUP_SREG, sreg); + + nftnl_rule_add_expr(r, e); + + return 0; +err: + nftnl_expr_free(e); + return -1; +} + +static int add_counter(struct nftnl_rule *r) +{ + struct nftnl_expr *e; + + e = nftnl_expr_alloc("counter"); + if (e == NULL) + return -1; + + nftnl_rule_add_expr(r, e); + + return 0; +} + +static int add_verdict(struct nftnl_rule *r, const char *chain, int verdict) +{ + struct nftnl_expr *e; + + e = nftnl_expr_alloc("immediate"); + if (e == NULL) + return -1; + + nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_DREG, NFT_REG_VERDICT); + nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_VERDICT, verdict); + + nftnl_rule_add_expr(r, e); + + return 0; +} + +static int __setup_rule(struct nftnl_rule *r, struct nft_rule_params *params) +{ + /* meta nfproto == <nfproto> */ + if (add_mark(r, NFT_META_PROTOCOL, NFT_REG32_00) < 0) + return -1; + if (add_cmp(r, NFT_REG32_00, NFT_CMP_EQ, ¶ms->nfproto, sizeof(uint32_t))< 0) + return -1; + + /* meta l4proto == <l4proto> */ + if (add_proto(r, NFT_REG32_00) < 0) + return -1; + if (add_cmp(r, 
NFT_REG32_00, NFT_CMP_EQ, ¶ms->l4proto, sizeof(uint8_t)) < 0) + return -1; + + /* ip saddr . sport . daddr . dport @<set> */ + if (params->ipv6 == false) { + if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_00, + offsetof(struct iphdr, saddr), IPADDR_LEN) < 0) + return -1; + if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_01, + offsetof(struct tcphdr, source), INET_SERVICE_LEN) < 0) + return -1; + if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_02, + offsetof(struct iphdr, daddr), IPADDR_LEN) < 0) + return -1; + if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_03, + offsetof(struct tcphdr, dest), INET_SERVICE_LEN) < 0) + return -1; + + if (add_lookup(r, NFT_REG32_00, params->set_name) < 0) + return -1; + } else { + if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_00, + offsetof(struct ipv6hdr, saddr), IP6ADDR_LEN) < 0) + return -1; + if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_04, + offsetof(struct tcphdr, source), INET_SERVICE_LEN) < 0) + return -1; + if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_05, + offsetof(struct ipv6hdr, daddr), IP6ADDR_LEN) < 0) + return -1; + if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_09, + offsetof(struct tcphdr, dest), INET_SERVICE_LEN) < 0) + return -1; + + if (add_lookup(r, NFT_REG32_00, params->set_name) < 0) + return -1; + } + + /* counter */ + if (add_counter(r) < 0) + return -1; + + return 0; +} + +static struct nftnl_rule *setup_rule(uint8_t family, const char *table, + struct nft_rule_params *params, + bool create, bool ns) +{ + struct nftnl_rule *r = NULL; + + r = nftnl_rule_alloc(); + if (r == NULL) + return NULL; + + if (nftnl_rule_set_str(r, NFTNL_RULE_TABLE, table) < 0) + goto err; + nftnl_rule_set_u32(r, NFTNL_RULE_FAMILY, family); + if (nftnl_rule_set_str(r, NFTNL_RULE_CHAIN, params->chain_name) < 0) + goto err; + + if (params->mark != 0) { + /* meta mark != <mark> */ + if (add_mark(r, NFT_META_MARK, NFT_REG32_00) < 0) + goto err; + if 
(add_cmp(r, NFT_REG32_00, params->mark_op, ¶ms->mark, sizeof(uint32_t)) < 0) + goto err; + } + + if (!ns && __setup_rule(r, params) < 0) + goto err; + + /* drop */ + if (add_verdict(r, params->chain_name, params->stmt) < 0) + goto err; + + return r; + +err: + nftnl_rule_free(r); + return NULL; +} + +static struct nlmsghdr *nft_table_detect(struct mnl_params *mnl_params, void *args) +{ + struct nftnl_table *table; + + table = setup_table(NFPROTO_INET, TABLE_NAME); + if (table == NULL) + return NULL; + + return construct_table_buf(mnl_params->buf, NFT_MSG_GETTABLE, NFPROTO_INET, + NLM_F_ACK, mnl_params->seq++, table); +} + +static int nft_table_create(struct mnl_params *mnl_params, void *args) +{ + struct nftnl_table *table; + + table = setup_table(NFPROTO_INET, TABLE_NAME); + if (table == NULL) + return -1; + + construct_table_batch(mnl_params->batch, NFT_MSG_NEWTABLE, NFPROTO_INET, + NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, + mnl_params->seq++, table); + + return 0; +} + +static int nft_table_prepare(struct mnl_params *mnl_params) +{ + int result = 0; + + if (mnl_buf_send_and_recv(mnl_params, nft_table_detect, NULL, &result) == 0) + return 0; + + pr_debug("%s: detect table result %d\n", __func__, result); + + if (result == ENOENT && + (mnl_batch_send_and_recv(mnl_params, nft_table_create, NULL, &result) < 0 + && (result != 0 && result != EEXIST))) { + pr_err("%s: create nftables table failed!\n", __func__); + return -1; + } else if (result != 0) { + pr_err("%s: detect table result %d\n", __func__, -result); + return -1; + } + + return 0; +} + +static struct nlmsghdr *nft_chain_detect(struct mnl_params *mnl_params, void *args) +{ + struct nftnl_chain *chain; + + chain = setup_chain(TABLE_NAME, args, false); + if (chain == NULL) + return NULL; + + return construct_chain_buf(mnl_params->buf, NFT_MSG_GETCHAIN, NFPROTO_INET, + NLM_F_ACK, mnl_params->seq++, chain); +} + +static int nft_chain_create(struct mnl_params *mnl_params, void *args) +{ + struct nftnl_chain *chain; + + 
chain = setup_chain(TABLE_NAME, args, true); + if (chain == NULL) + return -1; + + construct_chain_batch(mnl_params->batch, NFT_MSG_NEWCHAIN, NFPROTO_INET, + NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, mnl_params->seq++, chain); + + return 0; +} + +static int nft_chain_prepare_internal(struct mnl_params *mnl_params, + struct nft_chain_params *params) +{ + int result = 0; + + if (mnl_buf_send_and_recv(mnl_params, nft_chain_detect, params, &result) == 0) + return 0; + + pr_debug("%s: detect chain result %d\n", __func__, result); + + if (result == ENOENT && + (mnl_batch_send_and_recv(mnl_params, nft_chain_create, params, &result) < 0 + && (result != 0 && result != EEXIST))) { + pr_err("%s: nftables create chain %s failed!\n", + __func__, params->name); + return -1; + } else if (result != 0) { + pr_err("%s: detect chain result %d\n", __func__, -result); + return -1; + } + + return result; +} + +static int nft_chain_prepare(struct mnl_params *mnl_params) +{ + struct nft_chain_params params = { + .type = "filter", + .prio = NF_IP_PRI_FILTER, + .policy = NF_ACCEPT, + }; + + /* prepare ipv4 input chain in filter table */ + params.name = INPUT_CHAIN_NAME; + params.hooknum = NF_INET_LOCAL_IN; + + if (nft_chain_prepare_internal(mnl_params, ¶ms) < 0) + return -1; + + /* prepare ipv4 output chain in filter table */ + params.name = OUTPUT_CHAIN_NAME; + params.hooknum = NF_INET_LOCAL_OUT; + + if (nft_chain_prepare_internal(mnl_params, ¶ms) < 0) + return -1; + + return 0; +} + +static int nft_set_internal(uint8_t family, struct mnl_params *mnl_params, + struct nft_set_params *params, bool create) +{ + struct nftnl_set *set; + + set = setup_set(family, TABLE_NAME, params, create); + if (set == NULL) + return -1; + + if (create) { + construct_set_batch(mnl_params->batch, NFT_MSG_NEWSET, family, + NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, mnl_params->seq++, set); + } else { + construct_set_batch(mnl_params->batch, NFT_MSG_DELSET, family, + 0, mnl_params->seq++, set); + } + + return 0; +} + +static 
int nft_set_raw(struct mnl_params *mnl_params, + struct mnl_cb_params *args, bool input) +{ + const uint32_t set_id_base = input ? 0x12315 : 0x17173; + const uint8_t family = NFPROTO_INET; + struct nft_set_params params = { 0 }; + char *set_name; + int idx = 0; + + if (!args->ipv6) { + params.datatype = IPv4_KEY_TYPE; + params.key_len = IPv4_KEY_LEN; + idx = 4; + } else { + params.datatype = IPv6_KEY_TYPE; + params.key_len = IPv6_KEY_LEN; + idx = 6; + } + + if (args->ipv6 && input) + set_name = INPUT_IPV6_SET_NAME; + else if (args->ipv6 && !input) + set_name = OUTPUT_IPV6_SET_NAME; + else if (!args->ipv6 && input) + set_name = INPUT_IPV4_SET_NAME; + else + set_name = OUTPUT_IPV4_SET_NAME; + + snprintf(params.name, sizeof(params.name)-1, set_name, args->tree_id); + params.id = set_id_base + args->tree_id + idx; + + if (nft_set_internal(family, mnl_params, ¶ms, args->create) < 0) { + pr_err("%s: create nftables %s %s set failed!\n", __func__, + args->ipv6 ? "ipv6" : "ipv4", + input ? "input" : "output"); + return -1; + } + + return 0; +} + +static int nft_set(struct mnl_params *mnl_params, void *args) +{ + struct mnl_cb_params *params = args; + + params->ipv6 = false; + if (nft_set_raw(mnl_params, params, true) < 0) + return -1; + + if (nft_set_raw(mnl_params, params, false) < 0) + return -1; + + params->ipv6 = true; + if (nft_set_raw(mnl_params, params, true) < 0) + return -1; + + if (nft_set_raw(mnl_params, params, false) < 0) + return -1; + + return 0; +} + +static int nft_set_common(struct mnl_params *mnl_params, pid_t tree_id, bool create) +{ + struct mnl_cb_params params = { + .tree_id = tree_id, + .create = create, + }; + int result = 0; + + if (create && + (mnl_batch_send_and_recv(mnl_params, nft_set, ¶ms, &result) < 0 + && (result != 0 && result != EEXIST))) { + pr_err("%s: create set failed!\n", __func__); + return -1; + } else if (!create && + mnl_batch_send_and_recv(mnl_params, nft_set, ¶ms, NULL) < 0) { + pr_err("%s: delete set failed!\n", __func__); + 
return -1; + } + + return 0; +} + +static int nft_rule_internal(uint8_t family, struct mnl_params *mnl_params, + struct nft_rule_params *params, bool create) +{ + struct nftnl_rule *rule; + + rule = setup_rule(family, TABLE_NAME, params, create, false); + if (rule == NULL) + return -1; + + if (create) { + construct_rule_batch(mnl_params->batch, NFT_MSG_NEWRULE, family, + NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, + mnl_params->seq++, rule); + } else { + construct_rule_batch(mnl_params->batch, NFT_MSG_DELRULE, family, + 0, mnl_params->seq++, rule); + } + + return 0; +} + +static int nft_rule_raw(struct mnl_params *mnl_params, struct mnl_cb_params *args, + struct nft_rule_params *params) +{ + char *set_name; + + params->nfproto = params->ipv6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); + + set_name = params->ipv6 ? INPUT_IPV6_SET_NAME : INPUT_IPV4_SET_NAME; + params->chain_name = INPUT_CHAIN_NAME; + snprintf(params->set_name, sizeof(params->set_name)-1, set_name, args->tree_id); + if (nft_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { + pr_err("%s: create nft %s input rule failed!\n", + __func__, params->ipv6 ? "ipv6" : "ipv4"); + return -1; + } + + set_name = params->ipv6 ? OUTPUT_IPV6_SET_NAME : OUTPUT_IPV4_SET_NAME; + params->chain_name = OUTPUT_CHAIN_NAME; + snprintf(params->set_name, sizeof(params->set_name)-1, set_name, args->tree_id); + if (nft_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { + pr_err("%s: create nftables %s output rule failed!\n", + __func__, params->ipv6 ? 
"ipv6" : "ipv4"); + return -1; + } + + return 0; +} + +static int nft_rule(struct mnl_params *mnl_params, void *args) +{ + struct nft_rule_params params = { + .l4proto = IPPROTO_TCP, + .mark = SOCCR_MARK, + .mark_op = NFT_CMP_NEQ, + .stmt = NF_DROP, + }; + + params.ipv6 = false; + if (nft_rule_raw(mnl_params, args, ¶ms) < 0) + return -1; + + params.ipv6 = true; + if (nft_rule_raw(mnl_params, args, ¶ms) < 0) + return -1; + + return 0; +} + +static int nft_rule_common(struct mnl_params *mnl_params, pid_t tree_id, bool create) +{ + struct mnl_cb_params params = { + .tree_id = tree_id, + .create = create, + }; + int result = 0; + + if (create && + (mnl_batch_send_and_recv(mnl_params, nft_rule, ¶ms, &result) < 0 + && (result != 0 && result != EEXIST))) { + pr_err("%s: create rule failed!\n", __func__); + return -1; + } else if (!create && + mnl_batch_send_and_recv(mnl_params, nft_rule, ¶ms, NULL) < 0) { + pr_err("%s: delete rule failed!\n", __func__); + return -1; + } + + return 0; +} + +static int network_prepare_internal(struct mnl_params *params, batch_func_t _, void *args) +{ + pid_t tree_id = *(pid_t *)args; + + if (nft_table_prepare(params) < 0) + return -1; + + if (nft_chain_prepare(params) < 0) + return -1; + + if (nft_set_common(params, tree_id, true) < 0) + return -1; + + if (nft_rule_common(params, tree_id, true) < 0) + return -1; + + return 0; +} + +int network_prepare(pid_t tree_id) +{ + pr_info("Prepare network\n"); + + return mnl_common(network_prepare_internal, NULL, &tree_id); +} + +static int network_unprepare_internal(struct mnl_params *params, + batch_func_t _, void *args) +{ + pid_t tree_id = *(pid_t *)args; + + if (nft_rule_common(params, tree_id, false) < 0) + return -1; + + if (nft_set_common(params, tree_id, false) < 0) + return -1; + + return 0; +} + +void network_unprepare(pid_t tree_id) +{ + pr_info("Unprepare network\n"); + + mnl_common(network_unprepare_internal, NULL, &tree_id); +} + +static int add_set_elem_internal(struct nftnl_set *s, 
void *data, size_t len) +{ + struct nftnl_set_elem *e; + + e = nftnl_set_elem_alloc(); + if (e == NULL) + return -1; + + nftnl_set_elem_set(e, NFTNL_SET_ELEM_KEY, data, len); + + nftnl_set_elem_add(s, e); + + return 0; +} + +static struct nftnl_set *add_set_elem(const char *table, const char *set, + void *data, size_t len) +{ + struct nftnl_set *s; + + s = nftnl_set_alloc(); + if (s == NULL) + return NULL; + + if (nftnl_set_set_str(s, NFTNL_SET_TABLE, table) < 0) + goto err; + if (nftnl_set_set_str(s, NFTNL_SET_NAME, set) < 0) + goto err; + + if (add_set_elem_internal(s, data, len) < 0) + goto err; + + return s; + +err: + nftnl_set_free(s); + return NULL; +} + +static int nft_set_elem(uint8_t family, struct mnl_params *mnl_param, + struct nft_set_elem_params *elem_param, + bool lock) +{ + struct nftnl_set *set; + + set = add_set_elem(TABLE_NAME, elem_param->set_name, + elem_param->data, elem_param->data_len); + if (set == NULL) + return -1; + + if (lock) { + construct_set_elems_batch(mnl_param->batch, NFT_MSG_NEWSETELEM, + family, NLM_F_CREATE|NLM_F_EXCL, + mnl_param->seq++, set); + } else { + construct_set_elems_batch(mnl_param->batch, NFT_MSG_DELSETELEM, + family, 0, mnl_param->seq++, set); + } + + return 0; +} + +static void construct_set_elem_key(void *data, struct nf_conn_params *param, bool output) +{ + size_t offset = 0; + size_t addr_len = param->family == AF_INET ? IPADDR_LEN : IP6ADDR_LEN; + + memcpy(data+offset, output ? param->src_addr : param->dst_addr, addr_len); + offset = addr_len; + *(uint32_t *)(data + offset) = htons(output ? param->src_port : param->dst_port); + offset += sizeof(uint32_t); + memcpy(data+offset, output ? param->dst_addr : param->src_addr, addr_len); + offset += addr_len; + *(uint32_t *)(data + offset) = htons(output ? 
param->dst_port : param->src_port); +} + +static int nf_connection_switch_raw(struct mnl_params *mnl_params, void *args) +{ + struct nf_conn_params *param = args; + char *input_set_name, *output_set_name; + struct nft_set_elem_params elem; + + switch (param->family) { + case AF_INET: + input_set_name = INPUT_IPV4_SET_NAME; + output_set_name = OUTPUT_IPV4_SET_NAME; + elem.data_len = IPv4_KEY_LEN; + break; + case AF_INET6: + input_set_name = INPUT_IPV6_SET_NAME; + output_set_name = OUTPUT_IPV6_SET_NAME; + elem.data_len = IPv6_KEY_LEN; + break; + default: + pr_err("Unknown socket family %d\n", param->family); + return -1; + } + + construct_set_elem_key(elem.data, param, false); + snprintf(elem.set_name, sizeof(elem.set_name)-1, input_set_name, param->tree_id); + if (nft_set_elem(NFPROTO_INET, mnl_params, &elem, param->lock) < 0) + return -1; + + construct_set_elem_key(elem.data, param, true); + snprintf(elem.set_name, sizeof(elem.set_name)-1, output_set_name, param->tree_id); + if (nft_set_elem(NFPROTO_INET, mnl_params, &elem, param->lock) < 0) + return -1; + + return 0; +} + +/* IPv4-Mapped IPv6 Addresses */ +static int ipv6_addr_mapped(uint32_t *addr) +{ + return (addr[2] == htonl(0x0000ffff)); +} + +int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id) +{ + char sip[INET_ADDR_LEN], dip[INET_ADDR_LEN]; + struct nf_conn_params param = { + .family = sk->sd.family, + .src_addr = sk->src_addr, + .src_port = sk->src_port, + .dst_addr = sk->dst_addr, + .dst_port = sk->dst_port, + .lock = lock, + .tree_id = tree_id, + }; + + if (param.family == AF_INET6 && ipv6_addr_mapped(param.dst_addr)) { + param.family = AF_INET; + param.src_addr = ¶m.src_addr[3]; + param.dst_addr = ¶m.dst_addr[3]; + } + + if (!inet_ntop(param.family, (void *)param.src_addr, sip, INET_ADDR_LEN) || + !inet_ntop(param.family, (void *)param.dst_addr, dip, INET_ADDR_LEN)) { + pr_perror("nf: Can't translate ip addr"); + return -1; + } + + pr_info("%s %s:%d - %s:%d connection\n", lock 
? "Locked" : "Unlocked", + sip, (int)param.src_port, dip, (int)param.dst_port); + + return mnl_sendmsg(nf_connection_switch_raw, ¶m); +}
criu uses `ptrace(PTRACE_SYSCALL)` to check whether the tracee has reached the expected state, but it isn't necessary to stop the tracee at every syscall. Therefore, a customized `ptrace(PTRACE_SYSCALL_NR)` request that makes the tracee stop only at a specific syscall can save time (1000 threads consume about 140ms).
ptrace syntax: long ptrace(PTRACE_SYSCALL_NR, pid_t pid, void *addr, void *data);
The `addr` argument is unused in the original `ptrace(PTRACE_SYSCALL)`. Here, `ptrace(PTRACE_SYSCALL_NR)` uses the `addr` parameter to pass the specific syscall number (sysno) that should be traced.
Use `criu check` to generate `/run/criu.kdat` before the first use of criu, or rely on the automatic check performed during `criu {dump, restore}`.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/25
Signed-off-by: fu.lin fulin10@huawei.com --- compel/Makefile | 1 + compel/arch/aarch64/src/lib/infect.c | 2 +- compel/include/uapi/bisect.h | 30 +++++ compel/include/uapi/infect.h | 15 ++- compel/src/lib/bisect.c | 92 +++++++++++++++ compel/src/lib/infect.c | 169 +++++++++++++++++++++++++-- criu/cgroup-props.c | 6 +- criu/cgroup.c | 12 +- criu/cr-dump.c | 10 +- criu/cr-restore.c | 97 ++++++++++++++- criu/eventfd.c | 2 +- criu/eventpoll.c | 4 +- criu/files-reg.c | 4 +- criu/files.c | 16 +-- criu/include/kerndat.h | 1 + criu/kerndat.c | 67 ++++++++++- criu/lsm.c | 4 +- criu/mount.c | 4 +- criu/sk-packet.c | 2 +- criu/sk-unix.c | 2 +- 20 files changed, 487 insertions(+), 53 deletions(-) create mode 100644 compel/include/uapi/bisect.h create mode 100644 compel/src/lib/bisect.c
diff --git a/compel/Makefile b/compel/Makefile index de9318c..eea93a7 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -27,6 +27,7 @@ lib-y += src/lib/infect-rpc.o lib-y += src/lib/infect-util.o lib-y += src/lib/infect.o lib-y += src/lib/ptrace.o +lib-y += src/lib/bisect.o
# handle_elf() has no support of ELF relocations on ARM (yet?) ifneq ($(filter arm aarch64,$(ARCH)),) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 4b59390..c897b52 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -67,7 +67,7 @@ int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, user_fpregs_struct_t fpsimd; int ret;
- pr_info("Dumping GP/FPU registers for %d\n", pid); + pr_debug("Dumping GP/FPU registers for %d\n", pid);
iov.iov_base = regs; iov.iov_len = sizeof(user_regs_struct_t); diff --git a/compel/include/uapi/bisect.h b/compel/include/uapi/bisect.h new file mode 100644 index 0000000..55ebcbd --- /dev/null +++ b/compel/include/uapi/bisect.h @@ -0,0 +1,30 @@ +#ifndef __COMPEL_BISECT_H__ +#define __COMPEL_BISECT_H__ + +#include <sys/types.h> + +enum tf { + TRACE_INTERRUPT, + TRACE_SYSCALL_ENTER, + TRACE_SYSCALL_EXIT, +}; + +struct trace_flag { + pid_t key; + enum tf flag; +}; + +struct bisect_meta { + int size; + int used; + void *data; /* data pointer array */ + void *__data; /* data array */ +}; + +struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key); +struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key); +int tf_create(struct bisect_meta *meta, int len); +void tf_destroy(struct bisect_meta *meta); +void tf_clear(struct bisect_meta *meta); + +#endif /* __COMPEL_BISECT_H__ */ diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 257658a..9b356ef 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -8,6 +8,7 @@ #include <compel/ksigset.h> #include <compel/handle-elf.h> #include <compel/task-state.h> +#include <compel/bisect.h>
#include "common/compiler.h"
@@ -41,7 +42,7 @@ extern int __must_check compel_infect(struct parasite_ctl *ctl, extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *);
-extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); +extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl, bool customize); extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); extern int __must_check compel_cure_local(struct parasite_ctl *ctl); extern int __must_check compel_cure(struct parasite_ctl *ctl); @@ -90,6 +91,14 @@ extern int __must_check compel_stop_pie(pid_t pid, void *addr,
extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr);
+extern int __must_check compel_stop_on_syscall_customize(int tasks, + const int sys_nr, const int exit_sys_nr, struct bisect_meta *meta); + +extern int __must_check compel_stop_pie_customize(pid_t pid, + const int sys_nr, struct trace_flag *tf); + +extern int __must_check compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr); + extern int compel_mode_native(struct parasite_ctl *ctl);
extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl); @@ -173,4 +182,8 @@ extern unsigned long compel_task_size(void); extern uint64_t compel_get_leader_sp(struct parasite_ctl *ctl); extern uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl);
+#ifndef PTRACE_SYSCALL_NR +#define PTRACE_SYSCALL_NR 0xff00 +#endif + #endif diff --git a/compel/src/lib/bisect.c b/compel/src/lib/bisect.c new file mode 100644 index 0000000..807a5a9 --- /dev/null +++ b/compel/src/lib/bisect.c @@ -0,0 +1,92 @@ +#include <stddef.h> + +#include "log.h" +#include "common/xmalloc.h" +#include "bisect.h" + +struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key) +{ + struct trace_flag **tfs = meta->data; + int lo = 0, hi = meta->used, mid; + + if (meta->used <= 0) + return NULL; + + while (lo < hi) { + mid = (int)((lo + hi) / 2); + if (tfs[mid]->key == key) { + return tfs[mid]; + } else if (tfs[mid]->key > key) { + hi = mid; + } else { + lo = mid + 1; + } + } + + return NULL; +} + +/* used in cr-restore */ +struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key) +{ + struct trace_flag **tfs = meta->data; + struct trace_flag *tf = &((struct trace_flag *)meta->__data)[meta->used]; + int i = 0, j = 0; + + if (meta->used == meta->size) + return NULL; + + for (i = 0; i < meta->used; i++) { + if (tfs[i]->key >= key) /* impossible condition: `tfs[i]->key == key` */ + break; + } + + j = meta->used; + meta->used += 1; + + while (j > i) { + tfs[j] = tfs[j-1]; + j -= 1; + } + + tfs[i] = tf; + tf->key = key; + + return tf; +} + +int tf_create(struct bisect_meta *meta, int len) +{ + struct trace_flag *tfs; + struct trace_flag **tfs_ptr; + + tfs = xzalloc(sizeof(*tfs) * len); + if (tfs == NULL) + return -1; + + tfs_ptr = xmalloc(sizeof(*tfs_ptr) * len); + if (tfs_ptr == NULL) + goto err; + + meta->size = len; + meta->used = 0; + meta->__data = tfs; + meta->data = tfs_ptr; + + return 0; +err: + xfree(tfs); + return -1; +} + +void tf_destroy(struct bisect_meta *meta) +{ + xfree(meta->__data); + xfree(meta->data); +} + +void tf_clear(struct bisect_meta *meta) +{ + meta->used = 0; + __builtin_memset(meta->data, 0, sizeof(struct trace_flag **)*meta->size); +} diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 
38846c2..6b1d445 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -442,7 +442,7 @@ static int restore_child_handler(struct parasite_ctl *ctl) }
static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, - user_regs_struct_t *regs, struct thread_ctx *octx) + user_regs_struct_t *regs, struct thread_ctx *octx, void *addr) { k_rtsigset_t block;
@@ -458,7 +458,7 @@ static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, goto err_regs; }
- if (ptrace(cmd, pid, NULL, NULL)) { + if (ptrace(cmd, pid, addr, NULL)) { pr_perror("Can't run parasite at %d", pid); goto err_cont; } @@ -565,7 +565,7 @@ int compel_execute_syscall(struct parasite_ctl *ctl, return -1; }
- err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); + err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig, NULL); if (!err) err = parasite_trap(ctl, pid, regs, &ctl->orig);
@@ -583,7 +583,7 @@ int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t user_regs_struct_t regs = ctl->orig.regs; int ret;
- ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig); + ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig, NULL); if (!ret) ret = parasite_trap(ctl, ctl->rpid, ret_regs ? ret_regs : ®s, &ctl->orig); return ret; @@ -632,7 +632,7 @@ static int parasite_init_daemon(struct parasite_ctl *ctl) goto err;
regs = ctl->orig.regs; - if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig)) + if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig, NULL)) goto err;
futex_wait_while_eq(&args->daemon_connected, 0); @@ -1272,7 +1272,7 @@ static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) addr < ctl->remote_map + ctl->map_length; }
-static int parasite_fini_seized(struct parasite_ctl *ctl) +static int parasite_fini_seized(struct parasite_ctl *ctl, bool customize) { pid_t pid = ctl->rpid; user_regs_struct_t regs; @@ -1317,9 +1317,37 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) if (ret) return -1;
+ /* use customize ptrace */ + if (customize) { + struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; + struct trace_flag *tf_ptr[] = { &tf }; + struct bisect_meta meta = { + .size = 1, + .used = 1, + .__data = &tf, + .data = tf_ptr, + }; + + ret = compel_stop_pie_customize(pid, __NR(rt_sigreturn, 0), &tf); + if (ret < 0) + return ret; + + /* The process is going to execute the required syscall, the + * original syscall should be forgot(set `-1`) in + * `syscall_trace_enter()` handler in kernel when no other + * else operation in tracer. + * + * Note: -1 means NO_SYSCALL which is defined in + * `arch/arm64/include/asm/ptrace.h`. + */ + return compel_stop_on_syscall_customize(1, + __NR(rt_sigreturn, 0), + -1, &meta); + } + /* Go to sigreturn as closer as we can */ ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, - ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret;
@@ -1339,7 +1367,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return 0; }
-int compel_stop_daemon(struct parasite_ctl *ctl) +int compel_stop_daemon(struct parasite_ctl *ctl, bool customize) { if (ctl->daemonized) { /* @@ -1349,7 +1377,7 @@ int compel_stop_daemon(struct parasite_ctl *ctl) if (ctl->tsock < 0) return -1;
- if (parasite_fini_seized(ctl)) { + if (parasite_fini_seized(ctl, customize)) { close_safe(&ctl->tsock); return -1; } @@ -1365,7 +1393,7 @@ int compel_cure_remote(struct parasite_ctl *ctl) long ret; int err;
- if (compel_stop_daemon(ctl)) + if (compel_stop_daemon(ctl, false)) return -1;
if (!ctl->remote_map) @@ -1434,7 +1462,7 @@ int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd)
*ctl->cmd = cmd;
- ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx); + ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx, NULL); if (ret == 0) ret = parasite_trap(ctl, pid, ®s, octx); if (ret == 0) @@ -1457,7 +1485,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) pid_t pid = ctl->rpid; int ret = -1;
- ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig); + ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig, NULL); if (ret) goto err;
@@ -1470,6 +1498,45 @@ err: return ret; }
+int compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr) +{ + user_regs_struct_t regs = ctl->orig.regs; + pid_t pid = ctl->rpid; + int ret = -1; + struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; + struct trace_flag *tf_ptr[] = { &tf }; + struct bisect_meta meta = { + .size = 1, + .used = 1, + .__data = &tf, + .data = tf_ptr, + }; + + /* + * Here it parasite code. Unlike trap code `compel_stop_pie()`, it + * won't let tracee forget the original syscall. In such way, tracer + * just trace the syscall called by tracee. The log likes the following + * if tracee forget syscall: + * + * [ 817.638332] set pid 1877 ptrace sysno 215 + * [ 817.638343] syscall_trace_enter: pid 1877 ptrace_sysno 0 current_sysno 215 + * [ 817.638363] (00.006280) Error (compel/src/lib/infect.c:1582): 1877 (native) is going to execute the syscall 215, required is 215 + * [ 817.638368] set pid 1877 ptrace sysno 0 + * [ 817.638402] syscall_trace_exit: pid 1877 ptrace_sysno 0 current_sysno 215 + */ + ret = parasite_run(pid, PTRACE_SYSCALL_NR, addr, ctl->rstack, ®s, + &ctl->orig, (void *)(long)__NR(munmap, 0)); + if (ret) + goto err; + + ret = compel_stop_on_syscall_customize(1, __NR(munmap, 0), 0, &meta); + + if (restore_thread_ctx(pid, &ctl->orig)) + ret = -1; +err: + return ret; +} + int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) { int ret; @@ -1505,6 +1572,17 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) return 0; }
+int compel_stop_pie_customize(pid_t pid, const int sys_nr, struct trace_flag *tf) +{ + if (ptrace(PTRACE_SYSCALL_NR, pid, sys_nr, NULL)) { + pr_perror("Unable to restart the %d process", pid); + return -1; + } + + tf->flag = TRACE_SYSCALL_ENTER; + return 0; +} + static bool task_is_trapped(int status, pid_t pid) { if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) @@ -1617,6 +1695,73 @@ goon: return 0; }
+int compel_stop_on_syscall_customize(int tasks, const int sys_nr, + const int exit_sys_nr, struct bisect_meta *meta) +{ + struct trace_flag *tf; + user_regs_struct_t regs; + int status, ret; + pid_t pid; + + while (tasks) { + pid = wait4(-1, &status, __WALL, NULL); + if (pid == -1) { + pr_perror("wait4 failed"); + return -1; + } + + tf = tf_bisect(meta, pid); + if (tf == NULL) { + pr_warn("Unexpected task %d, state %d signal %d: %s\n", + pid, WEXITSTATUS(status), + WTERMSIG(status), strsignal(WTERMSIG(status))); + continue; + } + + if (!task_is_trapped(status, pid)) + return -1; + + switch (tf->flag) { + case TRACE_SYSCALL_ENTER: + pr_debug("%d was trapped\n", pid); + pr_debug("`- Expecting exit\n"); + + ret = ptrace_get_regs(pid, ®s); + if (ret) { + pr_perror("ptrace"); + return -1; + } + + if (is_required_syscall(®s, pid, sys_nr, sys_nr)) { + ret = ptrace(PTRACE_SYSCALL_NR, pid, exit_sys_nr, NULL); + if (ret) { + pr_perror("ptrace"); + return -1; + } + tf->flag = TRACE_SYSCALL_EXIT; + } else { + pr_warn("Impossible condition, check the system, try our best to restore...\n"); + ret = ptrace(PTRACE_SYSCALL_NR, pid, sys_nr, NULL); + if (ret) { + pr_perror("ptrace"); + return -1; + } + } + break; + case TRACE_SYSCALL_EXIT: + pr_debug("%d was stopped\n", pid); + tasks--; + break; + + default: + pr_err("pid %d invalid status: %d\n", pid, tf->flag); + return -1; + } + } + + return 0; +} + int compel_mode_native(struct parasite_ctl *ctl) { return user_regs_native(&ctl->orig.regs); diff --git a/criu/cgroup-props.c b/criu/cgroup-props.c index 2f628f4..4f0458d 100644 --- a/criu/cgroup-props.c +++ b/criu/cgroup-props.c @@ -245,7 +245,7 @@ static int cgp_parse_stream(char *stream, size_t len) goto err_parse; }
- pr_info("Parsing controller "%s"\n", p); + pr_debug("Parsing controller "%s"\n", p);
cgp_entry = xzalloc(sizeof(*cgp_entry)); if (cgp_entry) { @@ -287,7 +287,7 @@ static int cgp_parse_stream(char *stream, size_t len) goto err_parse; }
- pr_info("\tStrategy "%s"\n", p); + pr_debug("\tStrategy "%s"\n", p); xfree(p);
if (!eat_symbols(&stream, &len, "\n - ", 4, true)) { @@ -324,7 +324,7 @@ static int cgp_parse_stream(char *stream, size_t len) }
cgp_entry->cgp.props[cgp_entry->cgp.nr_props++] = p; - pr_info("\tProperty "%s"\n", p); + pr_debug("\tProperty "%s"\n", p);
if (!eat_symbol(&stream, &len, ',', true)) { if (stream[0] == ']') { diff --git a/criu/cgroup.c b/criu/cgroup.c index e7e15bc..4088b08 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -417,7 +417,7 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const prop->value = new; }
- pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); + pr_debug("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; } @@ -455,7 +455,7 @@ static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag) if (typeflag == FTW_D) { int mtype;
- pr_info("adding cgroup %s\n", fpath); + pr_debug("adding cgroup %s\n", fpath);
ncd = xmalloc(sizeof(*ncd)); if (!ncd) @@ -737,7 +737,7 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ else pid = getpid();
- pr_info("Dumping cgroups for %d\n", pid); + pr_debug("Dumping cgroups for %d\n", pid); if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) return -1;
@@ -748,17 +748,17 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ if (!item) { BUG_ON(criu_cgset); criu_cgset = cs; - pr_info("Set %d is criu one\n", cs->id); + pr_debug("Set %d is criu one\n", cs->id); } else { if (item == root_item) { BUG_ON(root_cgset); root_cgset = cs; - pr_info("Set %d is root one\n", cs->id); + pr_debug("Set %d is root one\n", cs->id); } else { struct cg_ctl *root, *stray;
BUG_ON(!root_cgset); - pr_info("Set %d is a stray\n", cs->id); + pr_debug("Set %d is a stray\n", cs->id);
/* Copy the cgns prefix from the root cgset for each * controller. This is ok because we know that there is diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 0c212a8..cbd40a9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -158,7 +158,7 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) return -1; }
- pr_info("%d has %d sched policy\n", pid, ret); + pr_debug("%d has %d sched policy\n", pid, ret); tc->has_sched_policy = true; tc->sched_policy = ret;
@@ -186,18 +186,18 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) return -1; }
- pr_info("\tdumping %d nice for %d\n", ret, pid); + pr_debug("\tdumping %d nice for %d\n", ret, pid); tc->has_sched_nice = true; tc->sched_nice = ret;
- pr_info("\tdumping allowed cpus for %d\n", pid); + pr_debug("\tdumping allowed cpus for %d\n", pid); ret = syscall(__NR_sched_getaffinity, pid, sizeof(cpumask), &cpumask); if (ret < 0) { pr_perror("Can't get sched affinity for %d", pid); return -1; } memcpy(tc->allowed_cpus->cpumask, &cpumask, sizeof(cpu_set_t)); - pr_info("\t 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", + pr_debug("\t 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", (unsigned long long)tc->allowed_cpus->cpumask[3], (unsigned long long)tc->allowed_cpus->cpumask[2], (unsigned long long)tc->allowed_cpus->cpumask[1], @@ -1428,7 +1428,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; }
- ret = compel_stop_daemon(parasite_ctl); + ret = compel_stop_daemon(parasite_ctl, kdat.has_customize_ptrace); if (ret) { pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); goto err_cure; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 3049e07..ccb2690 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2171,6 +2171,64 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return 0; }
+static int cache_tasks_customize(bool root_seized, struct bisect_meta *meta) +{ + struct pstree_item *item; + struct trace_flag *tf; + + for_each_pstree_item(item) { + int status, i, ret; + pid_t pid; + + if (!task_alive(item)) + continue; + + if (item->nr_threads == 1) { + item->threads[0].real = item->pid->real; + } else { + if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + pid = item->threads[i].real; + + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Can't interrupt the %d task", pid); + return -1; + } + + tf = tf_insert(meta, pid); + if (tf == NULL) { + pr_err("Can't find trace flag for %d, used %d\n", + pid, meta->used); + return -1; + } + tf->flag = TRACE_INTERRUPT; + } + + for (i = 0; i < item->nr_threads; i++) { + pid = wait4(-1, &status, __WALL, NULL); + + tf = tf_bisect(meta, pid); + if (tf == NULL) { + pr_err("Can't find trace flag for %d, used %d\n", + pid, meta->used); + return -1; + } + + ret = compel_stop_pie_customize(pid, + __NR(rt_sigreturn, 0), + tf); + if (ret < 0) + return -1; + + } + } + + return 0; +} + static int clear_breakpoints(void) { struct pstree_item *item; @@ -2197,6 +2255,7 @@ static void finalize_restore(void) pid_t pid = item->pid->real; struct parasite_ctl *ctl; unsigned long restorer_addr; + int retval;
if (!task_alive(item)) continue; @@ -2207,7 +2266,12 @@ static void finalize_restore(void) continue;
restorer_addr = (unsigned long)rsti(item)->munmap_restorer; - if (compel_unmap(ctl, restorer_addr)) + if (!kdat.has_customize_ptrace) + retval = compel_unmap(ctl, restorer_addr); + else + retval = compel_unmap_customize(ctl, restorer_addr); + + if (retval) pr_err("Failed to unmap restorer from %d\n", pid);
xfree(ctl); @@ -2312,11 +2376,18 @@ static int write_restored_pid(void)
static int restore_root_task(struct pstree_item *init) { + struct bisect_meta tfs_meta; enum trace_flags flag = TRACE_ALL; int ret, fd, mnt_ns_fd = -1; int root_seized = 0; struct pstree_item *item;
+ if (kdat.has_customize_ptrace + && tf_create(&tfs_meta, task_entries->nr_threads) != 0) { + pr_err("Can't alloc memory, tf_create failed\n"); + return -1; + } + ret = run_scripts(ACT_PRE_RESTORE); if (ret != 0) { pr_err("Aborting restore due to pre-restore script ret code %d\n", ret); @@ -2521,7 +2592,12 @@ skip_ns_bouncing:
timing_stop(TIME_RESTORE);
- if (catch_tasks(root_seized, &flag)) { + if (!kdat.has_customize_ptrace) + ret = catch_tasks(root_seized, &flag); + else + ret = cache_tasks_customize(root_seized, &tfs_meta); + + if (ret) { pr_err("Can't catch all tasks\n"); goto out_kill_network_unlocked; } @@ -2531,8 +2607,14 @@ skip_ns_bouncing:
__restore_switch_stage(CR_STATE_COMPLETE);
- ret = compel_stop_on_syscall(task_entries->nr_threads, - __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + if (!kdat.has_customize_ptrace) { + ret = compel_stop_on_syscall(task_entries->nr_threads, + __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + } else { + ret = compel_stop_on_syscall_customize(task_entries->nr_threads, + __NR(rt_sigreturn, 0), + -1, &tfs_meta); + } if (ret) { pr_err("Can't stop all tasks on rt_sigreturn\n"); goto out_kill_network_unlocked; @@ -2575,6 +2657,9 @@ skip_ns_bouncing: if (!opts.restore_detach && !opts.exec_cmd) wait(NULL);
+ if (kdat.has_customize_ptrace) + tf_destroy(&tfs_meta); + return 0;
out_kill_network_unlocked: @@ -2608,6 +2693,10 @@ out: stop_usernsd(); __restore_switch_stage(CR_STATE_FAIL); pr_err("Restoring FAILED.\n"); + + if (kdat.has_customize_ptrace) + tf_destroy(&tfs_meta); + return -1; }
diff --git a/criu/eventfd.c b/criu/eventfd.c index da31ce9..17cbceb 100644 --- a/criu/eventfd.c +++ b/criu/eventfd.c @@ -38,7 +38,7 @@ int is_eventfd_link(char *link)
static void pr_info_eventfd(char *action, EventfdFileEntry *efe) { - pr_info("%s: id %#08x flags %#04x counter %#016"PRIx64"\n", + pr_debug("%s: id %#08x flags %#04x counter %#016"PRIx64"\n", action, efe->id, efe->flags, efe->counter); }
diff --git a/criu/eventpoll.c b/criu/eventpoll.c index 6097e42..d8c8166 100644 --- a/criu/eventpoll.c +++ b/criu/eventpoll.c @@ -67,13 +67,13 @@ int is_eventpoll_link(char *link)
static void pr_info_eventpoll_tfd(char *action, uint32_t id, EventpollTfdEntry *e) { - pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64" ignore %d\n", + pr_debug("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64" ignore %d\n", action, id, e->tfd, e->events, e->data, e->ignore); }
static void pr_info_eventpoll(char *action, EventpollFileEntry *e) { - pr_info("%seventpoll: id %#08x flags %#04x\n", action, e->id, e->flags); + pr_debug("%seventpoll: id %#08x flags %#04x\n", action, e->id, e->flags); }
static int queue_dinfo(FileEntry **fe, EventpollFileEntry **e, toff_t **toff, const struct fd_parms *p) diff --git a/criu/files-reg.c b/criu/files-reg.c index 01e0895..4752085 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1718,7 +1718,7 @@ static bool store_validation_data(RegFileEntry *rfe, return false;
if (!result) - pr_info("Only file size could be stored for validation for file %s\n", + pr_debug("Only file size could be stored for validation for file %s\n", rfe->name); return true; } @@ -1768,7 +1768,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) rfe.has_mnt_id = true; }
- pr_info("Dumping path for %d fd via self %d [%s], id: %d\n", + pr_debug("Dumping path for %d fd via self %d [%s], id: %d\n", p->fd, lfd, &link->name[1], id);
/* diff --git a/criu/files.c b/criu/files.c index 0e5be91..b4382fd 100644 --- a/criu/files.c +++ b/criu/files.c @@ -334,7 +334,7 @@ int do_dump_gen_file(struct fd_parms *p, int lfd, e->fd = p->fd; e->flags = p->fd_flags;
- pr_info("fdinfoEntry fd: %d\n", e->fd); + pr_debug("fdinfoEntry fd: %d\n", e->fd); ret = fd_id_generate(p->pid, e, p); if (ret == 1) /* new ID generated */ ret = ops->dump(lfd, e->id, p); @@ -422,7 +422,7 @@ static int fill_fd_params(struct pid *owner_pid, int fd, int lfd,
fown_entry__init(&p->fown);
- pr_info("%d fdinfo %d: pos: %#16"PRIx64" flags: %16o/%#x\n", + pr_debug("%d fdinfo %d: pos: %#16"PRIx64" flags: %16o/%#x\n", owner_pid->real, fd, p->pos, p->flags, (int)p->fd_flags);
if (p->flags & O_PATH) @@ -513,7 +513,7 @@ static int dump_chr_file(int lfd, u32 id, const struct fd_parms *p) } else link = p->link;
- pr_info("Dumping chr-file fd %d with lfd %d with id %d, name: %s\n", p->fd, lfd, id, link->name); + pr_debug("Dumping chr-file fd %d with lfd %d with id %d, name: %s\n", p->fd, lfd, id, link->name);
if (strstr(link->name, "(deleted)") != NULL) { pr_err("char device '%s' is deleted\n", link->name); @@ -727,9 +727,9 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, int i, ret = -1; int off, nr_fds = min((int) PARASITE_MAX_FDS, dfds->nr_fds);
- pr_info("\n"); - pr_info("Dumping opened files (pid: %d)\n", item->pid->real); - pr_info("----------------------------------------\n"); + pr_debug("\n"); + pr_debug("Dumping opened files (pid: %d)\n", item->pid->real); + pr_debug("----------------------------------------\n");
lfds = xmalloc(nr_fds * sizeof(int)); if (!lfds) @@ -762,7 +762,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, break;
e.flags |= need_reuse_flag; - pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); + pr_debug("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); ret = pb_write_one(img, &e, PB_FDINFO); if (ret) break; @@ -772,7 +772,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, close(lfds[i]); }
- pr_info("----------------------------------------\n"); + pr_debug("----------------------------------------\n"); err: if (img) close_image(img); diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 665051d..76fe342 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -69,6 +69,7 @@ struct kerndat_s { bool has_clone3_set_tid; bool has_timens; bool has_unix_sk_repair; + bool has_customize_ptrace; };
extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index cf9187a..7e26740 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -13,6 +13,9 @@ #include <arpa/inet.h> /* for sockaddr_in and inet_ntoa() */ #include <sys/prctl.h> #include <sys/inotify.h> +#include <sys/ptrace.h> +#include <sys/wait.h> +#include <linux/ptrace.h>
#include "common/config.h" @@ -1082,6 +1085,66 @@ static void kerndat_has_unix_sk_repair(void) return; }
+/*
+ * Probe for the customized PTRACE_SYSCALL_NR request using a throw-away
+ * tracee; sets kdat.has_customize_ptrace on success. The tracee is always
+ * killed and reaped so a later wait4(-1, ...) never picks it up.
+ */
+static void kerndat_has_customize_ptrace(void)
+{
+	pid_t tracee = fork();
+	int status;
+	int retval;
+
+	if (tracee < 0) {
+		pr_perror("Unexpected error from fork");
+		return;
+	}
+
+	if (tracee == 0) {
+		/* Make sure the tracee is killed if its parent dies first. */
+		prctl(PR_SET_PDEATHSIG, SIGKILL);
+		while (true)
+			sleep(1);
+	}
+
+	pr_debug("fork task %d as tracee\n", tracee);
+	retval = ptrace(PTRACE_ATTACH, tracee, 0, 0);
+	if (retval < 0) {
+		pr_perror("Unexpect error from ptrace(PTRACE_ATTACH)");
+		goto reap;
+	}
+
+	retval = wait4(-1, &status, __WALL, NULL);
+	if (retval == -1) {
+		pr_perror("Unexpect error from wait");
+	} else if (retval != tracee || !(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)) {
+		pr_err("Task %d (expect %d) is unexpect, status: %d,"
+		       " stoped: %d signal: %d(%s)\n",
+		       retval, tracee, status,
+		       WIFSTOPPED(status), WSTOPSIG(status),
+		       strsignal(WSTOPSIG(status)));
+	} else if (ptrace(PTRACE_SYSCALL_NR, tracee, 0, 0) == 0) {
+		kdat.has_customize_ptrace = true;
+	} else {
+		pr_perror("Unexpect error from ptrace(PTRACE_SYSCALL_NR)");
+	}
+
+reap:
+	/* Reached on every path, so a failed attach does not leak the child. */
+	if (kill(tracee, SIGKILL) != 0) {
+		pr_perror("kill tracee %d failed", tracee);
+		return;
+	}
+
+	if (waitpid(tracee, &status, 0) == -1)
+		pr_perror("waitpid() failed");
+	else
+		pr_debug("tracee %d exited, status %d, signal %d(%s)\n",
+			 tracee, WEXITSTATUS(status),
+			 WTERMSIG(status), strsignal(WTERMSIG(status)));
+}
+
 int kerndat_init(void)
 {
 	int ret;
@@ -1095,8 +1158,7 @@ int kerndat_init(void)
 	memset(&kdat, 0, sizeof(kdat));
preload_socket_modules(); - if (!opts.use_nft) - preload_netfilter_modules(); + preload_netfilter_modules();
if (check_pagemap()) { pr_err("check_pagemap failed when initializing kerndat.\n"); @@ -1218,6 +1280,7 @@ int kerndat_init(void) }
kerndat_has_unix_sk_repair(); + kerndat_has_customize_ptrace();
kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/lsm.c b/criu/lsm.c index 6713ca7..9d9d38e 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -265,9 +265,9 @@ int collect_lsm_profile(pid_t pid, CredsEntry *ce) }
if (ce->lsm_profile) - pr_info("%d has lsm profile %s\n", pid, ce->lsm_profile); + pr_debug("%d has lsm profile %s\n", pid, ce->lsm_profile); if (ce->lsm_sockcreate) - pr_info("%d has lsm sockcreate label %s\n", pid, ce->lsm_sockcreate); + pr_debug("%d has lsm sockcreate label %s\n", pid, ce->lsm_sockcreate);
return ret; } diff --git a/criu/mount.c b/criu/mount.c index 25ef7f0..124c9c8 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -449,13 +449,13 @@ static void mnt_tree_show(struct mount_info *tree, int off) { struct mount_info *m;
- pr_info("%*s[%s](%d->%d)\n", off, "", + pr_debug("%*s[%s](%d->%d)\n", off, "", tree->mountpoint, tree->mnt_id, tree->parent_mnt_id);
list_for_each_entry(m, &tree->children, siblings) mnt_tree_show(m, off + 1);
- pr_info("%*s<--\n", off, ""); + pr_debug("%*s<--\n", off, ""); }
/* Returns -1 on error, 1 if external mount resolved, 0 otherwise */ diff --git a/criu/sk-packet.c b/criu/sk-packet.c index 0abe840..82b6b2c 100644 --- a/criu/sk-packet.c +++ b/criu/sk-packet.c @@ -262,7 +262,7 @@ int packet_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) m = NLMSG_DATA(hdr); nlmsg_parse(hdr, sizeof(struct packet_diag_msg), tb, PACKET_DIAG_MAX, NULL); - pr_info("Collect packet sock %u %u\n", m->pdiag_ino, (unsigned int)m->pdiag_num); + pr_debug("Collect packet sock %u %u\n", m->pdiag_ino, (unsigned int)m->pdiag_num);
if (!tb[PACKET_DIAG_INFO]) { pr_err("No packet sock info in nlm\n"); diff --git a/criu/sk-unix.c b/criu/sk-unix.c index b4c24ed..3d9af75 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -584,7 +584,7 @@ dump: if (dump_socket_opts(lfd, skopts)) goto err;
- pr_info("Dumping unix socket at %d\n", p->fd); + pr_debug("Dumping unix socket at %d\n", p->fd); show_one_unix("Dumping", sk);
sk->ue = ue;
From: Jingxian He hejingxian@huawei.com
To improve criu dump and restore performance, enable the pin method for exec file mappings.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21
Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/config.c | 4 ++++ criu/crtools.c | 3 ++- criu/include/cr_options.h | 1 + criu/mem.c | 15 ++++++++++++++- 4 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/criu/config.c b/criu/config.c index 5d195cb..91e0fa8 100644 --- a/criu/config.c +++ b/criu/config.c @@ -543,6 +543,7 @@ int parse_options(int argc, char **argv, bool *usage_error, { "file-validation", required_argument, 0, 1098 }, { "share-dst-ports", required_argument, 0, 1099 }, { "share-src-ports", required_argument, 0, 1100 }, + { "exec-pin-start", required_argument, 0, 1101 }, BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), BOOL_OPT("pin-memory", &opts.pin_memory), BOOL_OPT("use-fork-pid", &opts.use_fork_pid), @@ -891,6 +892,9 @@ int parse_options(int argc, char **argv, bool *usage_error, case 1100: SET_CHAR_OPTS(share_src_ports, optarg); break; + case 1101: + opts.exec_pin_start = atoi(optarg); + break; case 'V': pr_msg("Version: %s\n", CRIU_VERSION); if (strcmp(CRIU_GITID, "0")) diff --git a/criu/crtools.c b/criu/crtools.c index 35a479f..3e4b0ae 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -490,7 +490,8 @@ usage: " --reserve-ports Reserve src ports in kernel\n" " --use-nft Use nft API instead of iptables cmd in network locking\n" " --parallel Parallel to accellrate dumping speed\n" -" --async-clear-nft Async to clear nft table set" +" --async-clear-nft Async to clear nft table set\n" +" --exec-pin-start Exec file map's pin start index\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index aa519c8..f7301d8 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -190,6 +190,7 @@ struct cr_options { int use_nft; int parallel; int async_clear_nft; + int exec_pin_start; };
extern struct cr_options opts; diff --git a/criu/mem.c b/criu/mem.c index d56f69e..2afd2da 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -461,6 +461,8 @@ bool should_pin_vmae(VmaEntry *vmae) if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) return true;
+ if (opts.exec_pin_start && vma_entry_is(vmae, VMA_FILE_PRIVATE)) + return true; return false; }
@@ -567,6 +569,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, int possible_pid_reuse = 0; bool has_parent; int parent_predump_mode = -1; + int dump_iov;
pr_info("\n"); pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real); @@ -647,9 +650,19 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, if(mdc->parent_ie) parent_predump_mode = mdc->parent_ie->pre_dump_mode;
+ dump_iov = 0; list_for_each_entry(vma_area, &vma_area_list->h, list) { if (opts.pin_memory && should_pin_vmae(vma_area->e)) { - continue; + if (!opts.exec_pin_start) + continue; + else { + if (vma_entry_is(vma_area->e, VMA_FILE_PRIVATE) && ((vma_area->e->prot & PROT_WRITE) || !(vma_area->e->prot & PROT_EXEC))) { + dump_iov++; + if (dump_iov > opts.exec_pin_start + 1) + continue; + } else + continue; + } }
if (vma_entry_is(vma_area->e, VMA_AREA_ANON_INODE))
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/26
Signed-off-by: fu.lin fulin10@huawei.com --- criu/cr-restore.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1c92372..e70f90c 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2307,6 +2307,16 @@ static int finalize_restore_detach(void) return -1; } if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { + /* + * There is delta between task resume and + * `ptrace(PTRACE_DETACH)`, task maybe exit + * initiative during this time. + */ + if (errno == ESRCH) { + pr_warn("Unable to detach %d, task has dead\n", pid); + continue; + } + pr_perror("Unable to detach %d", pid); return -1; }
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/26
Signed-off-by: fu.lin fulin10@huawei.com --- criu/cr-restore.c | 68 +++++++++++++++++++++++++++++++++++++++++++ criu/include/pstree.h | 1 + criu/pstree.c | 8 +++++ 3 files changed, 77 insertions(+)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index ccb2690..1c92372 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1580,6 +1580,8 @@ static inline int fork_with_pid(struct pstree_item *item) goto err_unlock; }
+ /* disable criu rollback capability. */ + criu_roll = false;
if (item == root_item) { item->pid->real = ret; @@ -2732,6 +2734,71 @@ int prepare_dummy_task_state(struct pstree_item *pi) return 0; }
+/* Child body of criu_rollback(); @_arg points to opts.mask_exit_notify. */
+static int criu_rollback_internal(void *_arg)
+{
+	bool unmask = *(int *)_arg;
+	pid_t pid = getpid();
+
+	if (unmask && mask_task_exit_notify(pid, false) != 0)
+		pr_err("unmask exit notify failed for %d\n", pid);
+
+	do_notifier_rollback(true, POST_UPDATE_KERNEL_COMPLETE);
+	return 0;
+}
+
+/*
+ * Run the rollback notifiers after a failed restore, but only while
+ * criu_roll is still set and opts.with_notifier_kup is enabled.
+ */
+static void criu_rollback(void)
+{
+	unsigned long clone_flags;
+	int retval = 0, status;
+	pid_t pid;
+
+	if (!criu_roll || !opts.with_notifier_kup)
+		return;
+
+	pid = vpid(root_item);
+	clone_flags = rsti(root_item)->clone_flags;
+
+	pr_info("do criu rollback\n");
+
+	/* Some rollback notifiers must be called in a specific task context. */
+	if (opts.use_fork_pid)
+		retval = write_fork_pid(pid);
+	else if (!kdat.has_clone3_set_tid)
+		retval = set_next_pid((void *)&pid);
+
+	if (retval < 0) {
+		pr_err("set next pid %d failed, can't do rollback.\n", pid);
+		return;
+	}
+
+	if (!kdat.has_clone3_set_tid) {
+		retval = clone_noasan(criu_rollback_internal, clone_flags | SIGCHLD,
+				      &opts.mask_exit_notify);
+	} else {
+		retval = clone3_with_pid_noasan(criu_rollback_internal,
+						&opts.mask_exit_notify,
+						clone_flags, SIGCHLD, pid);
+	}
+
+	if (retval < 0) {
+		pr_err("Can't fork for %d to do rollback: %s.\n",
+		       pid, strerror(errno));
+		return;
+	}
+
+	if (retval != pid)
+		pr_err("clone pid %d isn't equal with %d\n", retval, pid);
+
+	/* Wait for the task that was really created, even if its pid differs. */
+	if (waitpid(retval, &status, 0) < 0)
+		pr_warn("Unable to wait %d: %s\n", retval, strerror(errno));
+}
+
 int cr_restore_tasks(void)
 {
 	int ret = -1;
@@ -2801,6 +2868,7 @@ int cr_restore_tasks(void)
 err:
 	cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret);
 	if (ret < 0) {
+		criu_rollback();
 		if ((network_status & NETWORK_COLLECTED) == 0) {
 			if (!files_collected() && collect_image(&inet_sk_cinfo))
 				pr_err("collect inet sk cinfo fail");
diff --git a/criu/include/pstree.h b/criu/include/pstree.h
index be0942a..d59562c 100644
--- a/criu/include/pstree.h
+++ b/criu/include/pstree.h
@@ -45,6 +45,7 @@ enum {
 };
 #define FDS_EVENT (1 << FDS_EVENT_BIT)
+extern bool criu_roll; extern struct pstree_item *current;
struct rst_info; diff --git a/criu/pstree.c b/criu/pstree.c index f0d7622..3d17056 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -20,6 +20,11 @@ #include "images/pstree.pb-c.h" #include "crtools.h"
+/* + * Sometimes, img may be broken, set flag here to enable roll capibility + * before forking restorer. + */ +bool criu_roll; struct pstree_item *root_item; static struct rb_root pid_root_rb;
@@ -619,6 +624,9 @@ static int read_pstree_image(pid_t *pid_max) if (!img) return -1;
+ /* enable rollback capibility when opening img successfully. */ + criu_roll = true; + do { ret = read_one_pstree_item(img, pid_max); } while (ret > 0);
--- criu/cr-restore.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index e70f90c..92f656d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1869,16 +1869,12 @@ static int prepare_rootns_sysv_shm(unsigned long clone_flags) return -1; }
-#if BITS_PER_LONG <= 32 -# define SIZE_SPEC "%10lu" -#else -# define SIZE_SPEC "%21lu" -#endif - while (getline(&line, &len, fp) != -1) { - if (sscanf(line, "%10d %10d %4o" SIZE_SPEC, &key, &shmid, &mode, &size) != 4) + if (sscanf(line, "%d %d %o %lu", &key, &shmid, &mode, &size) != 4) continue;
+ pr_debug("sscanf key: %d shmid: %d mode %o size %lu\n", + key, shmid, mode, size); retval = collect_sysv_shmem(shmid, size); if (retval != 0) goto out;
From: Jingxian He hejingxian@huawei.com
When several processes have to be dumped, a child process may share the same fds with its parent process. During restore, criu chooses the process with the smallest pid as the master process that recovers the fds. However, the parent process is a more suitable choice for the master process.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/cr-restore.c | 5 ++--- criu/files.c | 7 +------ 2 files changed, 3 insertions(+), 9 deletions(-)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 7ec84c8..6114dd4 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2068,10 +2068,9 @@ static int restore_task_with_children(void *_arg) return 0;
err: - if (current->parent == NULL) { - do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); + do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); + if (current->parent == NULL) futex_abort_and_wake(&task_entries->nr_in_progress); - } exit(1); }
diff --git a/criu/files.c b/criu/files.c index 84d6563..10c9661 100644 --- a/criu/files.c +++ b/criu/files.c @@ -925,12 +925,7 @@ static struct fdinfo_list_entry *alloc_fle(int pid, FdinfoEntry *fe)
static void __collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc) { - struct fdinfo_list_entry *le; - - list_for_each_entry_reverse(le, &fdesc->fd_info_head, desc_list) - if (pid_rst_prio_eq(le->pid, new_le->pid)) - break; - list_add(&new_le->desc_list, &le->desc_list); + list_add_tail(&new_le->desc_list, &fdesc->fd_info_head); }
static void collect_desc_fle(struct fdinfo_list_entry *new_le,
In a storage environment, the criu log may not be flushed to disk when the OS crashes; therefore, also output high-level messages to /dev/kmsg.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fulin10@huawei.com --- criu/Makefile.crtools | 1 + criu/include/log.h | 2 ++ criu/kmsg.c | 16 ++++++++++++++++ criu/log.c | 4 ++++ 4 files changed, 23 insertions(+) create mode 100644 criu/kmsg.c
diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index cda5b82..0bea576 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -92,6 +92,7 @@ obj-y += timens.o obj-y += devname.o obj-y += mnl.o obj-y += nftables.o +obj-y += kmsg.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/include/log.h b/criu/include/log.h index 58d0123..9556d15 100644 --- a/criu/include/log.h +++ b/criu/include/log.h @@ -72,6 +72,8 @@ void flush_early_log_buffer(int fd); print_on_level(LOG_DEBUG, \ LOG_PREFIX fmt, ##__VA_ARGS__)
+void write_kmsg(const void *buf, size_t count); + #ifndef CR_NOGLIBC
#define pr_perror(fmt, ...)						\
diff --git a/criu/kmsg.c b/criu/kmsg.c
new file mode 100644
index 0000000..c956dfb
--- /dev/null
+++ b/criu/kmsg.c
@@ -0,0 +1,35 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#define SYSLOG_DEV "/dev/kmsg"
+
+/*
+ * Best-effort copy of a log record to /dev/kmsg.
+ *
+ * @buf:   bytes to write
+ * @count: number of bytes in @buf
+ *
+ * Nothing is reported on failure, and errno is left as the caller had it,
+ * so the function can be called from inside the logging path.
+ */
+void write_kmsg(const void *buf, size_t count)
+{
+	int saved_errno = errno;
+	ssize_t ret;
+	int fd;
+
+	if (!buf || !count)
+		return;
+
+	fd = open(SYSLOG_DEV, O_CLOEXEC | O_WRONLY);
+	if (fd >= 0) {
+		/* Do not lose the record just because a signal arrived. */
+		do {
+			ret = write(fd, buf, count);
+		} while (ret < 0 && errno == EINTR);
+		close(fd);
+	}
+
+	errno = saved_errno;
+}
diff --git a/criu/log.c b/criu/log.c
index 439a899..4254f72 100644
--- a/criu/log.c
+++ b/criu/log.c
@@ -379,6 +379,10 @@ static void vprint_on_level(unsigned int loglevel, const char *format, va_list p
 	size += buf_off;
while (off < size) { + if (loglevel <= LOG_WARN) { + write_kmsg(buffer + off, size - off); + } + ret = write(fd, buffer + off, size - off); if (ret <= 0) break;
Signed-off-by: fu.lin fulin10@huawei.com --- test/zdtm.py | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/test/zdtm.py b/test/zdtm.py index bd44ad1..73de5ac 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -23,6 +23,7 @@ import subprocess import sys import tempfile import time +import pathlib from builtins import (input, int, open, range, str, zip)
import pycriu as crpc @@ -2642,6 +2643,9 @@ rp.add_argument("--pre-dump-mode", help="Use splice or read mode of pre-dumping", choices=['splice', 'read'], default='splice') +rp.add_argument("--kdat", + help="Path to criu.kdat, default '/run/criu.kdat'", + default="/run/criu.kdat")
lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) @@ -2672,6 +2676,10 @@ if opts['debug']:
if opts['action'] == 'run': criu.available() + # remove kdat file before testing + kdat = pathlib.Path(opts['kdat']) + if kdat.exists(): + kdat.unlink() for tst in test_classes.values(): tst.available()
From: Jingxian He hejingxian@huawei.com
Some special char devices cannot work in child processes, so make the dump fail when an fd of such a char device is found in a child process. During the char device repair process, the user may need to recover the fd, so the repair process should run after the char device fd has been reopened as the dumped fd.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/files.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/criu/files.c b/criu/files.c index 10c9661..3bb68c7 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1275,6 +1275,7 @@ static int open_fd(struct fdinfo_list_entry *fle) struct file_desc *d = fle->desc; struct fdinfo_list_entry *flem; int new_fd = -1, ret; + struct chrfile_info *ci;
pr_info("open file flags:%x\n", fle->fe->flags); flem = file_master(d); @@ -1335,6 +1336,17 @@ static int open_fd(struct fdinfo_list_entry *fle) if (ret != -1 && new_fd >= 0) { if (setup_and_serve_out(fle, new_fd) < 0) return -1; + if (d->ops->type == FD_TYPES__CHR) { + ci = container_of(d, struct chrfile_info, d); + if (ci->cfe->repair) { + ret = ioctl(fle->fe->fd, IOCTL_CMD_REPAIR , ci->cfe->index); + pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); + if (ret) { + close(fle->fe->fd); + return -1; + } + } + } } out: if (ret == 0) @@ -1859,19 +1871,9 @@ static int chrfile_open(struct file_desc *d, int *new_fd) return -1; }
- if (ci->cfe->repair) { - ret = ioctl(fd, IOCTL_CMD_REPAIR , ci->cfe->index); - pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); - if (ret) - goto err; - } - *new_fd = fd; ret = 0;
- return ret; -err: - close(fd); return ret; }
--- test/zdtm/Makefile | 2 +- test/zdtm/customization/Makefile | 53 ++++++++ test/zdtm/customization/ipc.c | 202 +++++++++++++++++++++++++++++++ test/zdtm/customization/ipc.desc | 1 + 4 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 test/zdtm/customization/Makefile create mode 100644 test/zdtm/customization/ipc.c create mode 100644 test/zdtm/customization/ipc.desc
diff --git a/test/zdtm/Makefile b/test/zdtm/Makefile index 24a33f2..8f9857b 100644 --- a/test/zdtm/Makefile +++ b/test/zdtm/Makefile @@ -1,4 +1,4 @@ -SUBDIRS := lib static transition +SUBDIRS := lib static transition customization
all: $(SUBDIRS) .PHONY: all $(SUBDIRS) diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile new file mode 100644 index 0000000..563b7b1 --- /dev/null +++ b/test/zdtm/customization/Makefile @@ -0,0 +1,53 @@ +LIBDIR := ../lib +LIB := $(LIBDIR)/libzdtmtst.a +LDLIBS += $(LIB) +CPPFLAGS += -I$(LIBDIR) + +TST = \ + ipc + +SRC = $(TST:%=%.c) +OBJ = $(SRC:%.c=%.o) +DEP = $(SRC:%.c=%.d) +PID = $(TST:%=%.pid) +OUT = $(TST:%=%.out) + +include ../Makefile.inc + +all: $(TST) +install: all +.PHONY: all install + +$(TST:%=%.pid): %.pid: % + $(<D)/$(<F) --pidfile=$@ --outfile=$<.out + +%.out: %.pid % + -kill -TERM `cat $<` + +start: $(PID) + +%.is_running: %.pid + kill -0 `cat $<` + +check_start: $(PID:%.pid=%.is_running) + +stop: + -kill -TERM `awk '{print}' *.pid` + +WAIT_TIME=10 +wait_stop: + -for i in `seq 1 $(WAIT_TIME)`; do \ + kill -0 `awk '{print}' *.pid 2>/dev/null` 2>/dev/null || break; \ + sleep 1; \ + done + +$(TST): | $(LIB) + +%: %.sh + cp $< $@ + chmod +x $@ + +$(LIB): force + $(Q) $(MAKE) -C $(LIBDIR) + +.PHONY: force start check_start stop wait_stop diff --git a/test/zdtm/customization/ipc.c b/test/zdtm/customization/ipc.c new file mode 100644 index 0000000..2b3c2b1 --- /dev/null +++ b/test/zdtm/customization/ipc.c @@ -0,0 +1,202 @@ +#include <sched.h> + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/sem.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <signal.h> +#include <errno.h> + +#include "zdtmtst.h" + +const char *test_doc="Tests ipc sems and shmems migrate fine"; +const char *test_author="Pavel Emelianov xemul@parallels.com"; + +static struct sembuf unlock = { + .sem_op = 1, + .sem_num = 0, + .sem_flg = 0, +}; + +static struct sembuf lock = { + .sem_op = -1, + .sem_num = 0, + .sem_flg = 0, +}; + +#define DEF_MEM_SIZE (40960) +unsigned int shmem_size = DEF_MEM_SIZE; +TEST_OPTION(shmem_size, uint, "Size of shared memory 
segment", 0); + +#define INIT_CRC (~0) + +#define POISON 0xac +static inline void poison_area(int *mem) +{ + memset(mem, POISON, shmem_size); +} + +static int child(key_t key) +{ + int sem, shm, ret, res = 0; + uint8_t *mem; + uint32_t crc; + + sem = semget(key, 1, 0777); + if (sem == -1) + return -1; + shm = shmget(key, shmem_size, 0777); + if (shm == -1) + return -2; + mem = shmat(shm, NULL, 0); + if (mem == (uint8_t *)-1) + return -3; + + while (test_go()) { + ret = semop(sem, &lock, 1); + if (ret) { + if (errno == EINTR) + continue; + fail("Error in semop lock"); + res = errno; + break; + } + crc = INIT_CRC; + datagen(mem, shmem_size, &crc); + while ((ret = semop(sem, &unlock, 1)) && (errno == EINTR)); + if (ret) { + fail("Error in semop unlock"); + res = errno; + break; + } + } + shmdt(mem); + return res; +} + +int main(int argc, char **argv) +{ + key_t key; + int sem, shm, pid1, pid2; + int fail_count = 0; + uint8_t *mem; + uint32_t crc; + int ret; + + test_init(argc, argv); + + /* using the large number to fill string length */ + key = ftok(argv[0], 1822155650); + if (key == -1) { + pr_perror("Can't make key"); + goto out; + } + + sem = semget(key, 1, 0777 | IPC_CREAT | IPC_EXCL); + if (sem == -1) { + pr_perror("Can't get sem"); + goto out; + } + + if (semctl(sem, 0, SETVAL, 1) == -1) { + pr_perror("Can't init sem"); + fail_count++; + goto out_sem; + } + + shm = shmget(key, shmem_size, 0777 | IPC_CREAT | IPC_EXCL); + if (shm == -1) { + pr_perror("Can't get shm"); + fail_count++; + goto out_sem; + } + + mem = shmat(shm, NULL, 0); + if (mem == (void *)-1) { + pr_perror("Can't attach shm"); + fail_count++; + goto out_shm; + } + + poison_area((int *)mem); + + pid1 = test_fork(); + if (pid1 == -1) { + pr_perror("Can't fork 1st time"); + goto out_shdt; + } else if (pid1 == 0) + exit(child(key)); + + pid2 = test_fork(); + if (pid2 == -1) { + pr_perror("Can't fork 2nd time"); + fail_count++; + goto out_child; + } else if (pid2 == 0) + exit(child(key)); + + 
test_daemon(); + while (test_go()) { + ret = semop(sem, &lock, 1); + if (ret) { + if (errno == EINTR) + continue; + fail_count++; + fail("Error in semop lock"); + break; + } + if (mem[0] != POISON) { + crc = INIT_CRC; + if (datachk(mem, shmem_size, &crc)) { + fail_count++; + fail("Semaphore protection is broken or " + "shmem pages are messed"); + semop(sem, &unlock, 1); + break; + } + poison_area((int *)mem); + } + while ((ret = semop(sem, &unlock, 1)) && (errno == EINTR)); + if (ret) { + fail_count++; + fail("Error in semop unlock"); + break; + } + } + test_waitsig(); + + kill(pid2, SIGTERM); + waitpid(pid2, &ret, 0); + if (!WIFEXITED(ret)) { + fail_count++; + pr_perror("Child 2 was killed"); + } else if (WEXITSTATUS(ret)) { + fail_count++; + pr_perror("Child 2 couldn't inititalise"); + } +out_child: + kill(pid1, SIGTERM); + waitpid(pid1, &ret, 0); + if (!WIFEXITED(ret)) { + fail_count++; + pr_perror("Child 1 was killed"); + } else if (WEXITSTATUS(ret)) { + fail_count++; + pr_perror("Child 1 couldn't inititalise"); + } +out_shdt: + shmdt(mem); +out_shm: + shmctl(shm, IPC_RMID, NULL); +out_sem: + semctl(sem, 1, IPC_RMID); + if (fail_count == 0) + pass(); +out: + return 0; +} diff --git a/test/zdtm/customization/ipc.desc b/test/zdtm/customization/ipc.desc new file mode 100644 index 0000000..63df42a --- /dev/null +++ b/test/zdtm/customization/ipc.desc @@ -0,0 +1 @@ +{'flavor': 'h'}
From: Jingxian He hejingxian@huawei.com
To reduce the time taken by network unlock, run the nft table clearing operation after resuming.
The nft table has two parts: rules and sets. criu deletes the nft rules to unlock the network during restoration. Deleting a set takes hundreds of milliseconds when it holds many elements, so delaying the set deletion helps to save restoration time.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21
Signed-off-by: Jingxian He hejingxian@huawei.com Signed-off-by: fu.lin fulin10@huawei.com --- criu/config.c | 1 + criu/cr-dump.c | 7 ++++-- criu/cr-restore.c | 11 ++++++++++ criu/crtools.c | 1 + criu/include/cr_options.h | 1 + criu/include/net.h | 3 ++- criu/include/taskqueue.h | 12 ++++++++++- criu/net.c | 2 +- criu/nftables.c | 37 ++++++++++++++++++++++++-------- criu/taskqueue.c | 45 +++++++++++++++++++++++++++++++++++++++ 10 files changed, 106 insertions(+), 14 deletions(-)
diff --git a/criu/config.c b/criu/config.c index e09445a..5d195cb 100644 --- a/criu/config.c +++ b/criu/config.c @@ -555,6 +555,7 @@ int parse_options(int argc, char **argv, bool *usage_error, {"reserve-ports", required_argument, 0, 'P' }, BOOL_OPT("use-nft", &opts.use_nft), BOOL_OPT("parallel", &opts.parallel), + BOOL_OPT("async-clear-nft", &opts.async_clear_nft), { }, };
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 6caee06..0c212a8 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1801,7 +1801,10 @@ static int cr_dump_finish(int ret) network_unlock(opts.tree_id); delete_link_remaps(); clean_cr_time_mounts(); - + if (opts.async_clear_nft) + parallel_nft_clean((long)opts.tree_id); + else + network_delete_set(opts.tree_id); }
if (!ret && opts.lazy_pages) @@ -1831,7 +1834,7 @@ static int cr_dump_finish(int ret) clear_pin_mem(0); }
- if (ret != 0 && opts.with_notifier_kup) { + if (ret) { pr_info("repair off netlink fd\n"); netlink_repair_off(); } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 6114dd4..3049e07 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2566,6 +2566,12 @@ skip_ns_bouncing: if (ret != 0) pr_err("Post-resume script ret code %d\n", ret);
+ pr_info("start delete_set\n"); + if (opts.async_clear_nft) + parallel_nft_clean((long)vpid(init)); + else + network_delete_set(vpid(init)); + if (!opts.restore_detach && !opts.exec_cmd) wait(NULL);
@@ -2712,6 +2718,11 @@ err: } if ((network_status & NETWORK_UNLOCK) == 0) network_unlock(vpid(root_item)); + + if (opts.use_nft && opts.async_clear_nft) + parallel_nft_clean(vpid(root_item)); + else if (opts.use_nft) + network_delete_set(vpid(root_item)); }
return ret; diff --git a/criu/crtools.c b/criu/crtools.c index 18945e7..35a479f 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -490,6 +490,7 @@ usage: " --reserve-ports Reserve src ports in kernel\n" " --use-nft Use nft API instead of iptables cmd in network locking\n" " --parallel Parallel to accellrate dumping speed\n" +" --async-clear-nft Async to clear nft table set" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index cc8d1ae..aa519c8 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -189,6 +189,7 @@ struct cr_options { int reserve_ports; int use_nft; int parallel; + int async_clear_nft; };
extern struct cr_options opts; diff --git a/criu/include/net.h b/criu/include/net.h index 4e704cc..9daea8d 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -31,7 +31,8 @@ struct veth_pair { extern int collect_net_namespaces(bool for_dump);
extern int network_prepare(pid_t tree_id); -extern void network_unprepare(pid_t tree_id); +extern void network_delete_rule(pid_t tree_id); +extern void network_delete_set(pid_t tree_id); extern int network_lock(pid_t tree_id); extern void network_unlock(pid_t tree_id); extern int network_lock_internal(void); diff --git a/criu/include/taskqueue.h b/criu/include/taskqueue.h index 16f9e3d..906c784 100644 --- a/criu/include/taskqueue.h +++ b/criu/include/taskqueue.h @@ -6,7 +6,6 @@ #include <semaphore.h>
#include "vma.h" -#include "pstree.h"
#include "common/list.h"
@@ -47,4 +46,15 @@ struct mappings_info { int start_collect_mappings_thread(void); int end_collect_mappings_thread(struct pstree_item *item);
+#define STACK_SIZE (1024 *1024) +typedef void (*daemon_t)(void *); +int parallel_task(daemon_t fn, void *_arg); + +struct daemon { + daemon_t fn; + void *arg; +}; + +void parallel_nft_clean(long tree_id); + #endif /* __CR_TASKQUEUE_H__ */ diff --git a/criu/net.c b/criu/net.c index 30b1491..2bd6f77 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2943,7 +2943,7 @@ void network_unlock(pid_t tree_id) }
if (opts.use_nft && opts.tcp_established_ok) - network_unprepare(tree_id); + network_delete_rule(tree_id);
cpt_unlock_tcp_connections(opts.use_nft);
diff --git a/criu/nftables.c b/criu/nftables.c index 817f157..500d485 100644 --- a/criu/nftables.c +++ b/criu/nftables.c @@ -16,6 +16,7 @@
#include "sk-inet.h" #include "nftables.h" +#include "taskqueue.h"
#include "../soccr/soccr.h"
@@ -653,25 +654,43 @@ int network_prepare(pid_t tree_id) return mnl_common(network_prepare_internal, NULL, &tree_id); }
-static int network_unprepare_internal(struct mnl_params *params, +static int network_delete_rule_internal(struct mnl_params *params, batch_func_t _, void *args) { pid_t tree_id = *(pid_t *)args;
- if (nft_rule_common(params, tree_id, false) < 0) - return -1; + return nft_rule_common(params, tree_id, false); +}
- if (nft_set_common(params, tree_id, false) < 0) - return -1; +/* here split the deletion of rule and set to accelete the restoration process */ +void network_delete_rule(pid_t tree_id) +{ + mnl_common(network_delete_rule_internal, NULL, &tree_id); +}
- return 0; +static int network_delete_set_internal(struct mnl_params *params, + batch_func_t _, void *args) +{ + pid_t tree_id = *(pid_t *)args; + + return nft_set_common(params, tree_id, false); }
-void network_unprepare(pid_t tree_id) +void network_delete_set(pid_t tree_id) { - pr_info("Unprepare network\n"); + pr_info("clear nft set\n"); + + mnl_common(network_delete_set_internal, NULL, &tree_id); +}
- mnl_common(network_unprepare_internal, NULL, &tree_id); +void parallel_nft_clean_internal(void *arg) +{ + network_delete_set((long)arg); +} + +void parallel_nft_clean(long tree_id) +{ + parallel_task(parallel_nft_clean_internal, (void *)tree_id); }
static int add_set_elem_internal(struct nftnl_set *s, void *data, size_t len) diff --git a/criu/taskqueue.c b/criu/taskqueue.c index 1196a5e..7d500e9 100644 --- a/criu/taskqueue.c +++ b/criu/taskqueue.c @@ -122,3 +122,48 @@ int end_collect_mappings_thread(struct pstree_item *item) */ return retval; } + +static int daemonize(void *arg) +{ + struct daemon *d = arg; + + if (daemon(0, 0) < 0) + pr_perror("daemonize failed"); + + d->fn(d->arg); + + return 0; +} + +int parallel_task(daemon_t fn, void *_arg) +{ + struct daemon arg = { + .fn = fn, + .arg = _arg, + }; + char *stack; + char *stack_top; + pid_t pid; + + stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (stack == MAP_FAILED) { + pr_perror("mmap failed"); + return -1; + } + + stack_top = stack + STACK_SIZE; + + /* ignore SIGCHLD signal */ + pid = clone(daemonize, stack_top, 0, &arg); + if (pid > 0) + return 0; /* parent */ + else if (pid < 0) { + pr_perror("clone failed"); + return -1; + } + + /* unreachable */ + __builtin_unreachable(); + return 0; +}
Phenomenon: Operating a uverbs device generates an anonymous fd named `anon_inode:[infinibandevent]`. When the `anon_inode:[infinibandevent]` fd is the last opened fd and, at the same time, a unix socket fd created by a syscall such as `socketpair()` exists, the restoration of `anon_inode:[infinibandevent]` fails intermittently.
The log looks like the following:
``` (00.254523) 63959: open file flags:1 (00.254526) 63959: unix: Opening standalone (stage 0 id 0x1ff ino 1019605 peer 0) (00.254571) 63959: *******flags: 0 (00.254575) 63959: Create fd for 1408 # the fake fd (00.254578) 63959: *******flags: 1 (00.254580) 63959: Create fd for 445 # the restoration fd ```
Reason: During the restoration of a unix socket, `socketpair()` generates two fds: one is used for the current restoration, and the other is a so-called fake fd whose fd number is obtained from `find_unused_fd()`. When the `anon_inode:[infinibandevent]` fd is the last one, the original implementation of criu does not dump any fd information for it, so criu assumes that the fd number belonging to `anon_inode:[infinibandevent]` is unused and may hand that number out to the fake fd. This causes the restoration of `anon_inode:[infinibandevent]` to fail.
This patch fixes the above problem. Core idea: dump the fd information of `anon_inode:[infinibandevent]` so that criu is aware that its fd number is in use.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fulin10@huawei.com --- criu/Makefile.crtools | 1 + criu/char.c | 68 ++++++++++++++++++++++++++++++++++++ criu/files.c | 23 ++++++------ criu/include/char.h | 17 +++++++++ criu/include/image-desc.h | 1 + criu/include/protobuf-desc.h | 1 + images/chr.proto | 3 ++ images/fdinfo.proto | 2 ++ 8 files changed, 103 insertions(+), 13 deletions(-) create mode 100644 criu/char.c create mode 100644 criu/include/char.h
diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 0bea576..70d1b73 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -93,6 +93,7 @@ obj-y += devname.o obj-y += mnl.o obj-y += nftables.o obj-y += kmsg.o +obj-y += char.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/char.c b/criu/char.c new file mode 100644 index 0000000..153145f --- /dev/null +++ b/criu/char.c @@ -0,0 +1,68 @@ +#include "imgset.h" +#include "char.h" +#include "log.h" + +#include "protobuf.h" + +static void pr_info_infiniband(char *action, InfinibandEntry *infiniband) +{ + pr_info("%sinfiniband: id %#08x\n", action, infiniband->id); +} + +/* Checks if file descriptor @lfd is infinibandevent */ +int is_infiniband_link(char *link) +{ + return is_anon_link_type(link, "[infinibandevent]"); +} + +static int dump_one_infiniband(int lfd, u32 id, const struct fd_parms *p) +{ + FileEntry fe = FILE_ENTRY__INIT; + InfinibandEntry infiniband = INFINIBAND_ENTRY__INIT; + + infiniband.id = id; + + fe.type = FD_TYPES__INFINIBAND; + fe.id = infiniband.id; + fe.infiniband = &infiniband; + + pr_info_infiniband("Dumping ", &infiniband); + + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops infiniband_dump_ops = { + .type = FD_TYPES__INFINIBAND, + .dump = dump_one_infiniband, +}; + +static int infiniband_open(struct file_desc *d, int *new_fd) { + /* + * `*new_fd == -1` at this time, it means this open operation shouldn't + * be served out, which is why this function does nothing here. 
+ */ + return 0; +}; + +static struct file_desc_ops infiniband_desc_ops = { + .type = FD_TYPES__INFINIBAND, + .open = infiniband_open, +}; + +static int collect_one_infiniband(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct infiniband_file_info *info = o; + + info->infiniband = pb_msg(base, InfinibandEntry); + pr_info_infiniband("Collected ", info->infiniband); + + /* add the fd to `file_desc_hash` list to prevent from NULL pointer */ + return file_desc_add(&info->d, info->infiniband->id, &infiniband_desc_ops); +} + +struct collect_image_info infiniband_cinfo = { + .fd_type = CR_FD_INFINIBAND, + .pb_type = PB_INFINIBAND, + .priv_size = sizeof(struct infiniband_file_info), + .collect = collect_one_infiniband, +}; diff --git a/criu/files.c b/criu/files.c index 3bb68c7..0e5be91 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "char.h"
#include "protobuf.h" #include "util.h" @@ -593,12 +594,6 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) return err; }
-/* Checks if file descriptor @lfd is infinibandevent */ -int is_infiniband_link(char *link) -{ - return is_anon_link_type(link, "[infinibandevent]"); -} - static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, struct parasite_ctl *ctl, FdinfoEntry *e, struct parasite_drain_fd *dfds) @@ -654,7 +649,7 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; else if (is_infiniband_link(link)) - return 1; + ops = &infiniband_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -765,11 +760,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, lfds[i], opts + i, ctl, &e, dfds); if (ret < 0) break; - /* infiniband link file */ - if (ret > 0) { - ret = 0; - continue; - } + e.flags |= need_reuse_flag; pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); ret = pb_write_one(img, &e, PB_FDINFO); @@ -1864,8 +1855,11 @@ static int chrfile_open(struct file_desc *d, int *new_fd) pr_info("charfile: Opening %s (repair %d index %d)\n", ci->path, ci->cfe->repair, ci->cfe->index);
+ if (ci->cfe->repair) + ci->cfe->flags |= O_REPAIR; + mntns_root = open_pid_proc(getpid()); - fd = openat(mntns_root, ci->path, ci->cfe->flags | O_REPAIR); + fd = openat(mntns_root, ci->path, ci->cfe->flags); if (fd < 0){ pr_err("open chr file failed\n"); return -1; @@ -1991,6 +1985,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__CHR: ret = collect_one_file_entry(fe, fe->chr->id, &fe->chr->base, &chrfile_cinfo); break; + case FD_TYPES__INFINIBAND: + ret = collect_one_file_entry(fe, fe->infiniband->id, &fe->infiniband->base, &infiniband_cinfo); + break; }
return ret; diff --git a/criu/include/char.h b/criu/include/char.h new file mode 100644 index 0000000..c63b8f1 --- /dev/null +++ b/criu/include/char.h @@ -0,0 +1,17 @@ +#ifndef __CR_CHAR_H__ +#define __CR_CHAR_H__ + +#include "files.h" +#include "images/chr.pb-c.h" + +struct infiniband_file_info { + InfinibandEntry *infiniband; + struct file_desc d; +}; + +extern const struct fdtype_ops infiniband_dump_ops; +extern struct collect_image_info infiniband_cinfo; + +int is_infiniband_link(char *link); + +#endif /* __CR_CHAR_H__ */ diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 22676ae..4231716 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -115,6 +115,7 @@ enum {
CR_FD_AUTOFS, CR_FD_CHRFILE, + CR_FD_INFINIBAND,
CR_FD_MAX }; diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index e7df57e..023bbfc 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,7 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_CHRFILE, + PB_INFINIBAND,
/* PB_AUTOGEN_STOP */
diff --git a/images/chr.proto b/images/chr.proto index 67929db..ed65005 100644 --- a/images/chr.proto +++ b/images/chr.proto @@ -10,3 +10,6 @@ message chrfile_entry { required bool repair = 5; };
+message infiniband_entry { + required uint32 id = 1; +}; diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 8561da4..2fa34f8 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -42,6 +42,7 @@ enum fd_types { MEMFD = 18; BPFMAP = 19; CHR = 21; + INFINIBAND = 22;
/* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -79,4 +80,5 @@ message file_entry { optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; optional chrfile_entry chr = 23; + optional infiniband_entry infiniband = 25; }
From: Jingxian He hejingxian@huawei.com
In order to improve criu dump performance, run the vma collection in parallel with the other collecting operations.
In order to avoid the concurrency problem caused by `find_unused_fd`, only the main root task is collected in parallel.
Usage: criu --parallel
Note: Ensure criu can use multiple cores; otherwise performance will deteriorate.
Conflict:NA Reference:https://gitee.com/src-openeuler/criu/pulls/21 Signed-off-by: fu.lin fulin10@huawei.com Signed-off-by: hewenliang hewenliang4@huawei.com Signed-off-by: Jingxian He hejingxian@huawei.com --- criu/Makefile.crtools | 1 + criu/Makefile.packages | 1 + criu/config.c | 1 + criu/cr-dump.c | 55 ++++++++++++----- criu/crtools.c | 3 +- criu/include/cr_options.h | 1 + criu/include/pstree.h | 3 + criu/include/taskqueue.h | 50 +++++++++++++++ criu/namespaces.c | 9 ++- criu/proc_parse.c | 6 ++ criu/taskqueue.c | 124 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 237 insertions(+), 17 deletions(-) create mode 100644 criu/include/taskqueue.h create mode 100644 criu/taskqueue.c
diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 70d1b73..6deb855 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -94,6 +94,7 @@ obj-y += mnl.o obj-y += nftables.o obj-y += kmsg.o obj-y += char.o +obj-y += taskqueue.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/Makefile.packages b/criu/Makefile.packages index c1d87a5..875df02 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -37,6 +37,7 @@ endif export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet export LIBS += $(shell pkg-config --libs libmnl) export LIBS += $(shell pkg-config --libs libnftnl) +export LIBS += -lpthread
check-packages-failed: $(warning Can not find some of the required libraries) diff --git a/criu/config.c b/criu/config.c index cdafe17..e09445a 100644 --- a/criu/config.c +++ b/criu/config.c @@ -554,6 +554,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("file-locks-repair", &opts.file_locks_repair), {"reserve-ports", required_argument, 0, 'P' }, BOOL_OPT("use-nft", &opts.use_nft), + BOOL_OPT("parallel", &opts.parallel), { }, };
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 2a1864c..6caee06 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -18,6 +18,7 @@
#include <sched.h> #include <sys/resource.h> +#include <sys/sysinfo.h>
#include "types.h" #include "protobuf.h" @@ -85,6 +86,8 @@ #include "img-streamer.h" #include "restorer.h"
+#include "taskqueue.h" + /* * Architectures can overwrite this function to restore register sets that * are not covered by ptrace_set/get_regs(). @@ -404,7 +407,7 @@ static int dump_pid_misc(pid_t pid, TaskCoreEntry *tc) return 0; }
-static int dump_filemap(struct vma_area *vma_area, int fd) +int dump_filemap(struct vma_area *vma_area, int fd) { struct fd_parms p = FD_PARMS_INIT; VmaEntry *vma = vma_area->e; @@ -1233,7 +1236,7 @@ err_cure: static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) { pid_t pid = item->pid->real; - struct vm_area_list vmas; + struct vm_area_list *vmas = NULL; struct parasite_ctl *parasite_ctl; int ret, exit_code = -1; struct parasite_dump_misc misc; @@ -1242,8 +1245,6 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) struct proc_posix_timers_stat proc_args; struct mem_dump_ctl mdc;
- vm_area_list_init(&vmas); - pr_info("========================================\n"); pr_info("Dumping task (pid: %d)\n", pid); pr_info("========================================\n"); @@ -1254,12 +1255,23 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) */ return 0;
+ if (!opts.parallel || root_item->pid->real != item->pid->real ) { + vmas = xmalloc(sizeof(struct vm_area_list)); + if (vmas == NULL) { + pr_err("xmalloc no memory\n"); + return -1; + } + vm_area_list_init(vmas); + } else + vmas = item->maps_info.vmas; + pr_info("Obtaining task stat ... \n"); ret = parse_pid_stat(pid, &pps_buf); if (ret < 0) goto err;
- ret = collect_mappings(pid, &vmas, dump_filemap); + ret = (opts.parallel && root_item->pid->real == item->pid->real) ? + 0 : collect_mappings(pid, vmas, dump_filemap); if (ret) { pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret); goto err; @@ -1293,7 +1305,10 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err; }
- parasite_ctl = parasite_infect_seized(pid, item, &vmas); + if (opts.parallel && end_collect_mappings_thread(item)) + goto err; + + parasite_ctl = parasite_infect_seized(pid, item, vmas); if (!parasite_ctl) { pr_err("Can't infect (pid: %d) with parasite\n", pid); goto err; @@ -1317,13 +1332,13 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure_imgset; }
- ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); + ret = parasite_fixup_vdso(parasite_ctl, pid, vmas); if (ret) { pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); goto err_cure_imgset; }
- ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */ + ret = parasite_collect_aios(parasite_ctl, vmas); /* FIXME -- merge with above */ if (ret) { pr_err("Failed to check aio rings (pid: %d)\n", pid); goto err_cure_imgset; @@ -1377,7 +1392,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) mdc.stat = &pps_buf; mdc.parent_ie = parent_ie;
- ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); + ret = parasite_dump_pages_seized(item, vmas, &mdc, parasite_ctl); if (ret) goto err_cure;
@@ -1438,7 +1453,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err; }
- ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset); + ret = dump_task_mm(pid, &pps_buf, &misc, vmas, cr_imgset); if (ret) { pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret); goto err; @@ -1454,7 +1469,8 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) exit_code = 0; err: close_pid_proc(); - free_mappings(&vmas); + free_mappings(vmas); + free(vmas); xfree(dfds); return exit_code;
@@ -1785,6 +1801,7 @@ static int cr_dump_finish(int ret) network_unlock(opts.tree_id); delete_link_remaps(); clean_cr_time_mounts(); + }
if (!ret && opts.lazy_pages) @@ -1839,6 +1856,13 @@ static int cr_dump_finish(int ret) write_stats(DUMP_STATS); pr_info("Dumping finished successfully\n"); } + + /* + * Don't care threads' status and ignore unfree resources, use + * `exit_group()` to ensure exit all threads. + */ + syscall(SYS_exit_group, post_dump_ret ? : (ret != 0)); + return post_dump_ret ? : (ret != 0); }
@@ -1864,6 +1888,9 @@ int cr_dump_tasks(pid_t pid) if (opts.dump_char_dev && parse_devname() < 0) goto err;
+ if (opts.parallel && init_parallel_env() != 0) + goto err; + root_item = alloc_pstree_item(); if (!root_item) goto err; @@ -1941,13 +1968,13 @@ int cr_dump_tasks(pid_t pid) if (collect_file_locks()) goto err;
- if (collect_namespaces(true) < 0) - goto err; - glob_imgset = cr_glob_imgset_open(O_DUMP); if (!glob_imgset) goto err;
+ if (collect_namespaces(true) < 0) + goto err; + if (seccomp_collect_dump_filters() < 0) goto err;
diff --git a/criu/crtools.c b/criu/crtools.c index bac2992..18945e7 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -488,7 +488,8 @@ usage: " --weak-file-check Allow file size and mod larger than dumping value\n" " --file-locks-repair Use repair mode to dump and restore file locks\n" " --reserve-ports Reserve src ports in kernel\n" -" --use-nft Use nft API instead of iptables cmd in network locking" +" --use-nft Use nft API instead of iptables cmd in network locking\n" +" --parallel Parallel to accellrate dumping speed\n" "\n" "Check options:\n" " Without options, "criu check" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 236d1c7..cc8d1ae 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -188,6 +188,7 @@ struct cr_options { char *share_src_ports; int reserve_ports; int use_nft; + int parallel; };
extern struct cr_options opts; diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 61ab0ce..be0942a 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -1,6 +1,8 @@ #ifndef __CR_PSTREE_H__ #define __CR_PSTREE_H__
+#include "taskqueue.h" + #include "common/list.h" #include "common/lock.h" #include "pid.h" @@ -30,6 +32,7 @@ struct pstree_item { futex_t task_st; unsigned long task_st_le_bits; }; + struct mappings_info maps_info; };
static inline pid_t vpid(const struct pstree_item *i) diff --git a/criu/include/taskqueue.h b/criu/include/taskqueue.h new file mode 100644 index 0000000..16f9e3d --- /dev/null +++ b/criu/include/taskqueue.h @@ -0,0 +1,50 @@ +#ifndef __CR_TASKQUEUE_H__ +#define __CR_TASKQUEUE_H__ + +#include <stdbool.h> +#include <pthread.h> +#include <semaphore.h> + +#include "vma.h" +#include "pstree.h" + +#include "common/list.h" + +#define TASKQUEUE_HASH_SIZE 8 + +struct taskqueue { + pthread_t task; + void *(*routine)(void *); + void *arg; + int result; +}; +#define queue_task queue.task +#define queue_routine queue.routine +#define queue_arg queue.arg +#define queue_result queue.result + +int init_parallel_env(void); + +static inline int taskqueue_create(struct taskqueue *queue) +{ + return pthread_create(&queue->task, NULL, queue->routine, queue->arg); +} + +static inline int taskqueue_join(struct taskqueue *queue) +{ + return pthread_join(queue->task, NULL); +} + +/* parallel collect smaps */ +struct mappings_info { + struct hlist_node hash; + pid_t pid; + struct vm_area_list *vmas; + dump_filemap_t dump_file; + struct taskqueue queue; +}; + +int start_collect_mappings_thread(void); +int end_collect_mappings_thread(struct pstree_item *item); + +#endif /* __CR_TASKQUEUE_H__ */ diff --git a/criu/namespaces.c b/criu/namespaces.c index 9ffcd16..e71817f 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -27,6 +27,7 @@ #include "net.h" #include "cgroup.h" #include "fdstore.h" +#include "taskqueue.h"
#include "protobuf.h" #include "util.h" @@ -1570,11 +1571,15 @@ int collect_namespaces(bool for_dump) { int ret;
- ret = collect_user_namespaces(for_dump); + ret = collect_mnt_namespaces(for_dump); if (ret < 0) return ret;
- ret = collect_mnt_namespaces(for_dump); + /* need mnt info provided by `mntinfo` */ + if (opts.parallel && start_collect_mappings_thread()) + return -1; + + ret = collect_user_namespaces(for_dump); if (ret < 0) return ret;
diff --git a/criu/proc_parse.c b/criu/proc_parse.c index b3d1c0b..4a6a598 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -64,6 +64,12 @@
#define BUF_SIZE 4096 /* Good enough value - can be changed */
+/* cancel log to optimize performance because of the lock contention of print */ +#undef pr_info +#undef pr_debug +#define pr_info(fmt, ...) +#define pr_debug(fmt, ...) + struct buffer { char buf[BUF_SIZE]; char end; /* '\0' */ diff --git a/criu/taskqueue.c b/criu/taskqueue.c new file mode 100644 index 0000000..1196a5e --- /dev/null +++ b/criu/taskqueue.c @@ -0,0 +1,124 @@ +/* + * Target: + * parallel dump process + */ + +#include <string.h> +#include <errno.h> +#include <pthread.h> +#include <sys/sysinfo.h> + +#include "pstree.h" +#include "log.h" +#include "taskqueue.h" + +/* + * Sometimes, only one cpu can be used which is bad for parallel routine. + * Therefore, set cpu affinity for criu routine. + */ +static int set_cpuaffinity(void) +{ + cpu_set_t *set; + int num_cpus = get_nprocs_conf(); + size_t cpusetsize = CPU_ALLOC_SIZE(num_cpus); + int retval; + + set = CPU_ALLOC(num_cpus); + memset(set, 0xff, cpusetsize); + + retval = sched_setaffinity(getpid(), cpusetsize, set); + if (retval != 0) + pr_err("sched_setaffinity failed: %s\n", strerror(errno)); + + CPU_FREE(set); + + return retval; +} + +int init_parallel_env(void) +{ + return set_cpuaffinity(); +} + +static void *collect_mappings_routine(void *_arg) +{ + struct mappings_info *info = _arg; + + info->queue_result = collect_mappings(info->pid, info->vmas, info->dump_file); + return NULL; +} + +int dump_filemap(struct vma_area *vma_area, int fd); /* defined in criu/cr-dump.c */ + +int start_collect_mappings_thread(void) +{ + struct pstree_item *pi; + struct mappings_info *info; + + for_each_pstree_item(pi) { + /* disable parallel collect for non-root item because of the + * concurrence. 
+ */ + if (pi->pid->real != root_item->pid->real) + continue; + + info = &pi->maps_info; + + info->vmas = xmalloc(sizeof(struct vm_area_list)); + if (info->vmas == NULL) { + pr_err("xzalloc vmas no memory\n"); + return -1; + } + vm_area_list_init(info->vmas); + + info->pid = pi->pid->real; + info->dump_file = dump_filemap; + info->queue_routine = collect_mappings_routine; + info->queue_arg = info; + + pr_info("Start thread to collect %d mappings\n", info->pid); + + if (taskqueue_create(&info->queue) < 0) { + pr_err("parallel_collect_mappings failed: %s\n", strerror(errno)); + free(info->vmas); + /* + * Don't care other threads status, use `exit_group()` + * to ensure all threads exit. + */ + return -1; + } + } + + return 0; +} + +int end_collect_mappings_thread(struct pstree_item *item) +{ + struct mappings_info *info = &item->maps_info; + int retval; + + /* disable parallel collect for non-root item because of the + * concurrence. + */ + if (root_item->pid->real != item->pid->real) + return 0; + + retval = taskqueue_join(&info->queue); + if (retval != 0 || info->queue_result != 0) { + pr_err("taskqueue_join failed, retval %d(errno %d: %s)," + " queue_result: %d\n", + retval, + retval == 0 ? 0 : errno, + retval == 0 ? "nil" : strerror(errno), + info->queue_result); + retval = -1; + } + + pr_info("End thread to collect %d mappings\n", info->pid); + + /* + * Don't care other threads status, use `exit_group()` to ensure all + * threads exit. + */ + return retval; +}
--- test/zdtm/lib/fs.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index e82011e..f026f22 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -4,6 +4,7 @@ #include <errno.h> #include <unistd.h> #include <limits.h> +#include <sys/stat.h>
#include "zdtmtst.h" #include "fs.h" @@ -106,11 +107,15 @@ int get_cwd_check_perm(char **result) }
if (access(cwd, X_OK)) { - pr_err("access check for bit X for current dir path '%s' " - "failed for uid:%d,gid:%d, error: %d(%s). " + struct stat sb; + + stat(cwd, &sb); + pr_err("access check for bit X for current dir path '%s'(uid:%d,gid:%d,mode:%o) " + "failed for uid:%d,gid:%d,euid:%d, error: %d(%s). " "Bit 'x' should be set in all path components of " "this directory\n", - cwd, getuid(), getgid(), errno, strerror(errno) + cwd, sb.st_uid, sb.st_gid, sb.st_mode, getuid(), getgid(), geteuid(), + errno, strerror(errno) ); return -1; }
- char dev `ioctl({IOCTL_CMD_NEEDREPAIR, IOCTL_CMD_REPAIR})` checkpoint/restore test - anonymous inode checkpoint/restore test --- test/zdtm/customization/Makefile | 3 +- test/zdtm/customization/chardev00.c | 65 +++++++++++ test/zdtm/customization/chardev00.desc | 1 + test/zdtm/mod/Makefile | 5 +- test/zdtm/mod/anon_inode.c | 148 +++++++++++++++++++++++++ 5 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 test/zdtm/customization/chardev00.c create mode 100644 test/zdtm/customization/chardev00.desc create mode 100644 test/zdtm/mod/anon_inode.c
diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile index 93922c7..7d08db3 100644 --- a/test/zdtm/customization/Makefile +++ b/test/zdtm/customization/Makefile @@ -11,7 +11,8 @@ TST_NOFILE = \ maps05 \ maps007 \ maps008 \ - notifier00 + notifier00 \ + chardev00
TST_FILE = \ maps00 \ diff --git a/test/zdtm/customization/chardev00.c b/test/zdtm/customization/chardev00.c new file mode 100644 index 0000000..c708699 --- /dev/null +++ b/test/zdtm/customization/chardev00.c @@ -0,0 +1,65 @@ +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include "zdtmtst.h" + +#define CHARDEV_PATH "/dev/anon_test" + +const char *test_doc="Tests char dev and anonmous inode map checkpoint/restore"; + +static int check_maps(unsigned long addr) +{ + FILE *fp = fopen("/proc/self/maps", "r"); + char *line = NULL; + size_t n = 0; + unsigned long start = 0; + + if (fp == NULL) { + pr_perror("open self maps failed"); + return -1; + } + + while (getline(&line, &n, fp) != -1) { + test_msg("%s", line); + sscanf(line, "%lx-", &start); + if (start == addr) + return 0; + } + + return -1; +} + +int main(int argc, char *argv[]) +{ + int fd, retval = 0; + unsigned long addr; + + test_init(argc, argv); + + fd = open(CHARDEV_PATH, O_RDWR); + if (fd < 0) { + pr_perror("open '%s' failed", CHARDEV_PATH); + return -1; + } + + retval = ioctl(fd, 0, &addr); + if (retval < 0) { + pr_perror("create anonymous map failed"); + retval = -1; + goto out; + } + test_msg("create anonymous vma start 0x%lx\n", addr); + + test_daemon(); + test_waitsig(); + + retval = check_maps(addr); + if (retval == 0) + pass(); + else + fail("anonymous inode map don't restore"); +out: + return retval; +} diff --git a/test/zdtm/customization/chardev00.desc b/test/zdtm/customization/chardev00.desc new file mode 100644 index 0000000..9c51ba8 --- /dev/null +++ b/test/zdtm/customization/chardev00.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--dump-char-dev', 'flavor': 'h', 'flags': 'suid excl', 'sysfs': '/sys/kernel/modrestore/anon_state_restore /sys/kernel/repairing_device', 'mod': 'anon_inode.ko'} diff --git a/test/zdtm/mod/Makefile b/test/zdtm/mod/Makefile index 10c9c9a..0bc89f7 100644 --- a/test/zdtm/mod/Makefile +++ 
b/test/zdtm/mod/Makefile @@ -2,7 +2,7 @@ # `ARCH` var is used in both criu and kernel, but they have the different value # for the same architecture(e.g. arm64). Therefore, this Makefile can't be # included in the criu Makefile. -obj-m += notifier.o +obj-m += notifier.o anon_inode.o
# specific the kernel devel path # example (use `/home/me/kernel` as `KDIR`): @@ -26,3 +26,6 @@ clean:
notifier.ko: $(MAKE) -C $(KDIR) M=$(MOD) notifier.ko + +anon_inode.ko: + $(MAKE) -C $(KDIR) M=$(MOD) anon_inode.ko diff --git a/test/zdtm/mod/anon_inode.c b/test/zdtm/mod/anon_inode.c new file mode 100644 index 0000000..d9c7d2a --- /dev/null +++ b/test/zdtm/mod/anon_inode.c @@ -0,0 +1,148 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/modrestore.h> + +static int anon_mmap(struct file *file, struct vm_area_struct *vma) +{ + pr_info("call %s\n", __func__); + return 0; +} + +static const struct file_operations none_fops = { + .owner = THIS_MODULE, + .mmap = anon_mmap, +}; + +static unsigned long create_mmap(void) +{ + struct file *filp; + unsigned long start; + + pr_info("call %s\n", __func__); + filp = anon_inode_getfile("test", &none_fops, NULL, O_RDWR); + if (IS_ERR(filp)) { + pr_warn("anon_inode_getfile('test') failed: %d\n", (int)PTR_ERR(filp)); + return PTR_ERR(filp); + } + + start = vm_mmap(filp, 0, 1<<20, PROT_READ | PROT_WRITE, MAP_SHARED, 0); + if (IS_ERR_VALUE(start)) { + pr_warn("vm_mmap failed with: %d\n", (int)PTR_ERR((void *)start)); + } + + fput(filp); + + return start; +} + +static int anon_inode_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct vma_anon_entry *vma_entry = data; + struct file *filp; + unsigned long start; + + filp = anon_inode_getfile("test", &none_fops, NULL, O_RDWR); + if (IS_ERR(filp)) { + pr_warn("anon_inode_getfile('test') failed: %d\n", (int)PTR_ERR(filp)); + return 0; + } + + start = vm_mmap(filp, vma_entry->start, vma_entry -> end-vma_entry->start, + PROT_READ | PROT_WRITE, MAP_SHARED, 0); + if (start != vma_entry->start) + pr_warn("vm_mmap() failed: %#lx\n", start); + + fput(filp); + return 0; +} + +static long anon_ioctl(struct file *file, unsigned int cmd, 
unsigned long argp) +{ + unsigned long start; + + switch (cmd) { + case 0: + start = create_mmap(); + if (IS_ERR_VALUE(start)) + return -EINVAL; + if (put_user(start, (unsigned long __user *)argp)) + return -EFAULT; + break; + case IOCTL_CMD_NEEDREPAIR: + pr_info("call IOCTL_CMD_NEEDREPAIR"); + /* do nothing, just a request slot */ + return 17173; + case IOCTL_CMD_REPAIR: + pr_info("call IOCTL_CMD_REPAIR"); + /* do nothing, just a request slot */ + break; + default: + pr_warn("wrong cmd\n"); + return -EINVAL; + } + + return 0; +} + +static const struct file_operations anon_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = anon_ioctl, + .compat_ioctl = anon_ioctl, +}; + +static struct miscdevice anon_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "anon_test", + .fops = &anon_fops, +}; + +static struct notifier_block anon_inode_nb = { + .notifier_call = anon_inode_notifier, +}; + +static int __init anon_init(void) +{ + int retval; + + retval = mures_add_devname(anon_dev.name); + if (retval != 0) + goto out; + + retval = register_anon_notifier(&anon_inode_nb); + if (retval != 0) + goto del_devname; + + retval = misc_register(&anon_dev); + if (retval != 0) + goto del_notifier; + + return 0; + +del_notifier: + unregister_anon_notifier(&anon_inode_nb); +del_devname: + mures_del_devname(anon_dev.name); +out: + return retval; +} + +static void __exit anon_exit(void) +{ + mures_del_devname(anon_dev.name); + unregister_anon_notifier(&anon_inode_nb); + misc_deregister(&anon_dev); + return; +} + +module_init(anon_init); +module_exit(anon_exit); +MODULE_LICENSE("GPL");
--- test/jenkins/criu-lib.sh | 2 +- test/jenkins/criu-test.sh | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 test/jenkins/criu-test.sh
diff --git a/test/jenkins/criu-lib.sh b/test/jenkins/criu-lib.sh index 72d41b5..89dc936 100644 --- a/test/jenkins/criu-lib.sh +++ b/test/jenkins/criu-lib.sh @@ -15,7 +15,7 @@ function prep()
ulimit -c unlimited && export CFLAGS=-g - git clean -dfx && +# git clean -dfx && make -j 4 && make -j 4 -C test/zdtm/ && make -C test zdtm_ct && diff --git a/test/jenkins/criu-test.sh b/test/jenkins/criu-test.sh new file mode 100644 index 0000000..3035f21 --- /dev/null +++ b/test/jenkins/criu-test.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -e +source `dirname $0`/criu-lib.sh +prep + +rm -rf /var/run/criu.kdat + +make zdtm + +if [ -z $(grep 58467 /etc/group) ]; then + groupadd -g 58467 zdtm +fi +if [ -z $(grep 58467 /etc/passwd) ]; then + useradd -u 18943 -g 58467 zdtm +fi + +#./test/zdtm.py run --all --keep-going --report report -f h --ignore-taint --parallel 1 --load-pinmem-dev || fail + +#./test/zdtm.py run -t zdtm/static/del_standalone_un --keep-going -f h --ignore-taint --parallel 1 --load-pinmem-dev --keep-img always + +./test/zdtm.py run -t zdtm/customization/chardev00 -t zdtm/customization/notifier00 --keep-going -f h --ignore-taint --parallel 1 --load-pinmem-dev --keep-img always + +#./test/zdtm.py run -t zdtm/static/socket-tcp-nfconntrack --join-ns --keep-going --ignore-taint --parallel 1 --load-pinmem-dev --keep-img always + +./test/zdtm.py run -t zdtm/customization/tcp00 --keep-going -f h --ignore-taint --parallel 1 --load-pinmem-dev --keep-img always
--- test/zdtm/customization/Makefile | 4 +- .../customization/infiniband_with_unix_sk.c | 55 ++++++++ .../infiniband_with_unix_sk.desc | 1 + test/zdtm/mod/Makefile | 5 +- test/zdtm/mod/infiniband_kern.c | 121 ++++++++++++++++++ 5 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 test/zdtm/customization/infiniband_with_unix_sk.c create mode 100644 test/zdtm/customization/infiniband_with_unix_sk.desc create mode 100644 test/zdtm/mod/infiniband_kern.c
diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile index 7d08db3..728646b 100644 --- a/test/zdtm/customization/Makefile +++ b/test/zdtm/customization/Makefile @@ -12,7 +12,8 @@ TST_NOFILE = \ maps007 \ maps008 \ notifier00 \ - chardev00 + chardev00 \ + infiniband_with_unix_sk
TST_FILE = \ maps00 \ @@ -61,6 +62,7 @@ wait_stop: $(TST): | $(LIB)
maps02: get_smaps_bits.o +infiniband_with_unix_sk: LDFLAGS += -lpthread
%: %.sh cp $< $@ diff --git a/test/zdtm/customization/infiniband_with_unix_sk.c b/test/zdtm/customization/infiniband_with_unix_sk.c new file mode 100644 index 0000000..4a9e108 --- /dev/null +++ b/test/zdtm/customization/infiniband_with_unix_sk.c @@ -0,0 +1,55 @@ +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <stdio.h> +#include <stdbool.h> +#include <fcntl.h> +#include <unistd.h> +#include <pthread.h> +#include "zdtmtst.h" + +#define DEV_PATH "/dev/infiniband_test" + +const char *test_doc = "test infiniband fd checkpoint/restore, and the conflict condition with the half-closing anonymous unix socket"; + +static int fd; +static int sv[2]; + +static void *wait(void *arg) { + while (true) { + test_msg("sleep...\n"); + sleep(1); + } + + return NULL; +} + +int main(int argc, char *argv[]) { + pthread_t thread; + + test_init(argc, argv); + + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0) { + pr_perror("socketpair"); + return -1; + } + printf("sv[0]: %d sv[1]: %d\n", sv[0], sv[1]); + + if ((fd = open(DEV_PATH, O_RDWR)) < 0) { + pr_perror("open"); + return -1; + } + if (close(sv[1]) < 0) { + pr_perror("close"); + return -1; + } + + pthread_create(&thread, NULL, wait, NULL); + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/test/zdtm/customization/infiniband_with_unix_sk.desc b/test/zdtm/customization/infiniband_with_unix_sk.desc new file mode 100644 index 0000000..43a93e6 --- /dev/null +++ b/test/zdtm/customization/infiniband_with_unix_sk.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--dump-char-dev', 'flavor': 'h', 'flags': 'suid excl', 'sysfs': '/sys/kernel/repairing_device', 'mod': 'infiniband_kern.ko'} diff --git a/test/zdtm/mod/Makefile b/test/zdtm/mod/Makefile index 0bc89f7..58f9a27 100644 --- a/test/zdtm/mod/Makefile +++ b/test/zdtm/mod/Makefile @@ -2,7 +2,7 @@ # `ARCH` var is used in both criu and kernel, but they have the different value # for the same architecture(e.g. arm64). 
Therefore, this Makefile can't be # included in the criu Makefile. -obj-m += notifier.o anon_inode.o +obj-m += notifier.o anon_inode.o infiniband_kern.o
# specific the kernel devel path # example (use `/home/me/kernel` as `KDIR`): @@ -29,3 +29,6 @@ notifier.ko:
anon_inode.ko: $(MAKE) -C $(KDIR) M=$(MOD) anon_inode.ko + +infiniband_kern.ko: + $(MAKE) -C $(KDIR) M=$(MOD) infiniband_kern.ko diff --git a/test/zdtm/mod/infiniband_kern.c b/test/zdtm/mod/infiniband_kern.c new file mode 100644 index 0000000..a61df3a --- /dev/null +++ b/test/zdtm/mod/infiniband_kern.c @@ -0,0 +1,121 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/modrestore.h> +#include <linux/uaccess.h> + +static const struct file_operations none_fops = { + .owner = THIS_MODULE, +}; + +static const struct file_operations anonfd_fops = { + .owner = THIS_MODULE, +}; + +static int infiniband_open(struct inode *inode, struct file *filp) +{ + long fd; + + if (!!(filp->f_flags & O_REPAIR)) { + pr_info("reuse\n"); + return 0; + } + + fd = anon_inode_getfd("[infinibandevent]", &anonfd_fops, NULL, 0); + if (fd < 0) + return fd; + else + filp->private_data = (void *)fd; + + return 0; +} + +static int infiniband_repair(struct file *filp, int from) +{ + struct file *fp; + long fd; + int retval = 0; + + fp = anon_inode_getfile("[infinibandevent]", &anonfd_fops, NULL, 0); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + fd = mures_f_dupfd(from, fp, 0); + if (fd != from) { + pr_err("different fd, old: %d, dup: %ld\n", from, fd); + retval = -EEXIST; + } + fput(fp); + filp->private_data = (long *)fd; + + return retval; +} + +static long infiniband_ioctl(struct file *filp, unsigned int cmd, unsigned long argp) +{ + long retval = 0; + + switch (cmd) { + case IOCTL_CMD_NEEDREPAIR: + retval = (long )filp->private_data; + break; + case IOCTL_CMD_REPAIR: + retval = infiniband_repair(filp, argp); + break; + default: + pr_warn("wrong cmd\n"); + return -EINVAL; + } + return retval; +} + +static const struct file_operations infiniband_fops = { + .owner = THIS_MODULE, + 
.open = infiniband_open, + .unlocked_ioctl = infiniband_ioctl, + .compat_ioctl = infiniband_ioctl, +}; + +static struct miscdevice infiniband_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "infiniband_test", + .fops = &infiniband_fops, +}; + +static int __init infiniband_init(void) +{ + int retval; + + retval = mures_add_devname(infiniband_dev.name); + if (retval != 0) + goto out; + + retval = misc_register(&infiniband_dev); + if (retval != 0) + goto del_devname; + + return 0; + +del_devname: + mures_del_devname(infiniband_dev.name); +out: + return retval; +} + +static void __exit infiniband_exit(void) +{ + mures_del_devname(infiniband_dev.name); + misc_deregister(&infiniband_dev); + return; +} + +module_init(infiniband_init); +module_exit(infiniband_exit); +MODULE_LICENSE("GPL");
--- test/zdtm/customization/Makefile | 3 +- test/zdtm/customization/tcp00.c | 101 +++++++++++++++++++++++++++++ test/zdtm/customization/tcp00.desc | 1 + 3 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 test/zdtm/customization/tcp00.c create mode 100644 test/zdtm/customization/tcp00.desc
diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile index 728646b..1111908 100644 --- a/test/zdtm/customization/Makefile +++ b/test/zdtm/customization/Makefile @@ -13,7 +13,8 @@ TST_NOFILE = \ maps008 \ notifier00 \ chardev00 \ - infiniband_with_unix_sk + infiniband_with_unix_sk \ + tcp00
TST_FILE = \ maps00 \ diff --git a/test/zdtm/customization/tcp00.c b/test/zdtm/customization/tcp00.c new file mode 100644 index 0000000..d1ead82 --- /dev/null +++ b/test/zdtm/customization/tcp00.c @@ -0,0 +1,101 @@ +#include <stdio.h> +#include <stdbool.h> +#include <unistd.h> +#include <string.h> +#include <arpa/inet.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include "zdtmtst.h" + +#define PORT 17173 + +const char *test_doc = "Test TCP SO_REUSEADDR checkpoint/restore using `share_{src,dst}_ports`"; + +static int sock_bind_and_listen(void) +{ + int serv_sk; + int optval = 1; + struct sockaddr_in serv = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = htons(PORT), + }; + + serv_sk = socket(AF_INET, SOCK_STREAM, 0); + if (serv_sk < 0) { + pr_perror("server socket failed"); + exit(1); + } + + if (setsockopt(serv_sk, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)) < 0) { + pr_perror("setsockopt"); + exit(1); + } + + if (bind(serv_sk, (struct sockaddr *)&serv, sizeof(serv)) < 0) { + pr_perror("bind"); + exit(1); + } + + if (listen(serv_sk, 5) != 0) { + pr_perror("listen"); + exit(1); + } + + return serv_sk; +} + +static void client_connect(void) +{ + int sk; + struct sockaddr_in sockaddr = { + .sin_family = AF_INET, + }; + + sk = socket(AF_INET, SOCK_STREAM, 0); + if (sk < 0) { + pr_perror("client socket failed"); + exit(1); + } + + sockaddr.sin_addr.s_addr = inet_addr("127.0.0.1"); + sockaddr.sin_port = htons(PORT); + + if (connect(sk, (struct sockaddr *)&sockaddr, sizeof(sockaddr)) < 0) { + pr_perror("connect failed"); + exit(1); + } + + close(sk); +} + +int main(int argc, char *argv[]) +{ + int serv_sk; + int optval = 0; + socklen_t len = sizeof(optval); + + test_init(argc, argv); + + serv_sk = sock_bind_and_listen(); + + test_msg("listen 0.0.0.0: %d\n", PORT); + /* create CLOSE-WAIT status socket */ + client_connect(); + + test_daemon(); + test_waitsig(); + + if (getsockopt(serv_sk, SOL_SOCKET, 
SO_REUSEADDR, &optval, &len) != 0) { + pr_perror("getsockopt failed"); + return -1; + } + + if (optval != 1) { + pr_err("SO_REUSEADDR flag is %d, should 1", optval); + } else + pass(); + + return 0; +} \ No newline at end of file diff --git a/test/zdtm/customization/tcp00.desc b/test/zdtm/customization/tcp00.desc new file mode 100644 index 0000000..bc3b4a8 --- /dev/null +++ b/test/zdtm/customization/tcp00.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--use-fork-pid --share-src-ports=17173 --share-dst-ports=17173 --skip-in-flight', 'flavor': 'h', 'sysfs': '/sys/kernel/repair_share_socket'}
- fix the pin memory failure caused by the missing `init-pagemap-read` step
- fix the taint error message
Signed-off-by: fu.lin fulin10@huawei.com --- test/zdtm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/test/zdtm.py b/test/zdtm.py index 2a29400..ad9ad3c 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2006,6 +2006,9 @@ class Launcher: elif not self.check_module(mod): subprocess.check_call(["modprobe", "pin_memory"])
+ cmd = [opts["criu_bin"], "init-pagemap-read"] + subprocess.check_call(cmd, shell=False) + def build_and_load_mod(self, target, kdir): if platform.machine() != "aarch64" or not target: return @@ -2039,9 +2042,10 @@ class Launcher: with open("/proc/sys/kernel/tainted") as taintfd: taint = taintfd.read() # 0x1000 means the out of tree module has been loaded - if self.__taint != taint and (int(self.__taint) | 0x1000) != int(taint): + # 0x2000 means the unsigned module was loaded; either bit alone is tolerated + if self.__taint != taint and (int(self.__taint) | 0x3000) != (int(taint) | 0x3000): raise Exception("The kernel is tainted: %r (%r)" % - (taint, self.__taint)) + (taint, str(int(self.__taint) | 0x3000)))
if test_flag(desc, 'excl'): self.wait_all()
Signed-off-by: fu.lin fulin10@huawei.com --- test/modules/Makefile | 21 ++++++ test/modules/idr.c | 79 +++++++++++++++++++++ test/modules/jump_table.c | 107 ++++++++++++++++++++++++++++ test/modules/var_kern.c | 72 +++++++++++++++++++ test/modules/var_user.py | 40 +++++++++++ test/modules/workqueue_kern.c | 130 ++++++++++++++++++++++++++++++++++ 6 files changed, 449 insertions(+) create mode 100644 test/modules/Makefile create mode 100644 test/modules/idr.c create mode 100644 test/modules/jump_table.c create mode 100644 test/modules/var_kern.c create mode 100644 test/modules/var_user.py create mode 100644 test/modules/workqueue_kern.c
diff --git a/test/modules/Makefile b/test/modules/Makefile
new file mode 100644
index 0000000..9458aa7
--- /dev/null
+++ b/test/modules/Makefile
@@ -0,0 +1,21 @@
+obj-m := var_kern.o workqueue_kern.o jump_table.o idr.o
+
+KDIR := /lib/modules/`uname -r`/build
+
+all:
+	$(MAKE) -C $(KDIR) M=$(PWD) modules
+
+clean:
+	$(MAKE) -C $(KDIR) M=$(PWD) clean
+
+var_kern.ko:
+	$(MAKE) -C $(KDIR) M=$(PWD) var_kern.ko
+
+workqueue_kern.ko:
+	$(MAKE) -C $(KDIR) M=$(PWD) workqueue_kern.ko
+
+jump_table.ko:
+	$(MAKE) -C $(KDIR) M=$(PWD) jump_table.ko
+
+idr.ko:
+	$(MAKE) -C $(KDIR) M=$(PWD) idr.ko
diff --git a/test/modules/idr.c b/test/modules/idr.c
new file mode 100644
index 0000000..67f248e
--- /dev/null
+++ b/test/modules/idr.c
@@ -0,0 +1,79 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/idr.h>
+#include <linux/modrestore.h>
+
+DEFINE_IDR(idr_head);
+const int placeholder = 0;
+static int idr_uid = 0;
+
+static int idr_test_show_internal(int id, void *p, void *data)
+{
+	pr_info("id: %d p %pK\n", id, p);
+	sprintf(data+strlen(data), "%d\n", id);
+	return 0;
+}
+
+static ssize_t idr_test_show(struct kobject *kobj,
+		struct kobj_attribute *attr,
+		char *buf)
+{
+	idr_for_each(&idr_head, idr_test_show_internal, buf);
+	return strlen(buf);
+}
+
+static ssize_t idr_test_store(struct kobject *kobj,
+		struct kobj_attribute *attr,
+		const char *buf, size_t count)
+{
+	const unsigned long max = 65536;
+	unsigned id = 0;
+	int retval;
+
+	if (sscanf(buf, "%u", &id) != 1) {
+		pr_err("sscanf empty\n");
+		return -EINVAL;
+	}
+
+	retval = idr_alloc_u32(&idr_head, (void *)&placeholder, &id, max, GFP_KERNEL);
+	pr_info("alloc idr id %u, errno %d\n", id, retval);
+	return retval < 0 ? retval : count;
+}
+
+static struct kobj_attribute idr_test = __ATTR_RW(idr_test);
+
+static int __init mod_init(void)
+{
+	return sysfs_create_file(kernel_kobj, &idr_test.attr);
+}
+
+static void __exit mod_exit(void)
+{
+	sysfs_remove_file(kernel_kobj, &idr_test.attr);
+	idr_destroy(&idr_head);
+	return;
+}
+
+static int __init mod_resume(void)
+{
+	int retval = mures_restore_idr(idr_uid, &idr_head);
+
+	if (retval == 0)
+		retval = sysfs_create_file(kernel_kobj, &idr_test.attr);
+	return retval;
+}
+
+static int __exit mod_suspend(void)
+{
+	sysfs_remove_file(kernel_kobj, &idr_test.attr);
+	return mures_save_idr(idr_uid, &idr_head);
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+module_resume(mod_resume);
+module_suspend(mod_suspend);
+
+MODULE_LICENSE("GPL");
\ No newline at end of file
diff --git a/test/modules/jump_table.c b/test/modules/jump_table.c
new file mode 100644
index 0000000..8648c2a
--- /dev/null
+++ b/test/modules/jump_table.c
@@ -0,0 +1,107 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/hashtable.h>
+#include <linux/sysfs.h>
+#include <linux/modrestore.h>
+
+struct func_node {
+	struct hlist_node hash;
+	unsigned long key;
+	unsigned long value;
+};
+
+static int status __attribute__((section(".resume_0")));
+
+/*
+ * `mures_vcall()` can't be used in irq context because of its implementation.
+ * Therefore its results are cached in a hash table at init/resume time.
+ */
+DEFINE_HASHTABLE(__ro_after_init cache, 2);
+
+static int foo(void)
+{
+	status += 1;
+	return status;
+}
+
+static void *find_func(unsigned long addr);
+
+static ssize_t jp_test_show(struct kobject *kobj,
+		struct kobj_attribute *attr,
+		char *buf)
+{
+	int (*func)(void) = find_func((unsigned long)foo);
+	ssize_t count = 0;
+
+	if (func == NULL) {
+		count = sprintf(buf, "Not Found\n");
+	} else {
+		count = sprintf(buf, "%d", func());
+	}
+
+	return count;
+}
+
+static struct kobj_attribute jp_test = __ATTR_RO(jp_test);
+
+struct func_node nodes[] __ro_after_init = {
+	{ .key = (unsigned long)foo, },
+};
+
+static void *find_func(unsigned long addr)
+{
+	struct func_node *obj;
+	int i;
+
+	pr_info("finding addr: %lx\n", addr);
+	hash_for_each(cache, i, obj, hash) {
+		pr_info("found key: %lx, val: %lx\n", obj->key, obj->value);
+		if (obj->key == addr)
+			return (void *)obj->value;
+	}
+
+	return NULL;
+}
+
+static void __init build_cache(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nodes); i++) {
+		nodes[i].value = mures_vcall(nodes[i].key);
+		hash_add(cache, &nodes[i].hash, nodes[i].key);
+	}
+}
+
+static int __init mod_init(void)
+{
+	build_cache();
+	return sysfs_create_file(kernel_kobj, &jp_test.attr);
+}
+
+static void __exit mod_exit(void)
+{
+	sysfs_remove_file(kernel_kobj, &jp_test.attr);
+	return;
+}
+
+static int __init mod_resume(void)
+{
+	build_cache();
+	return sysfs_create_file(kernel_kobj, &jp_test.attr);
+}
+
+static int __exit mod_suspend(void)
+{
+	sysfs_remove_file(kernel_kobj, &jp_test.attr);
+	return 0;
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+module_resume(mod_resume);
+module_suspend(mod_suspend);
+
+MODULE_LICENSE("GPL");
\ No newline at end of file
diff --git a/test/modules/var_kern.c b/test/modules/var_kern.c
new file mode 100644
index 0000000..4321e3b
--- /dev/null
+++ b/test/modules/var_kern.c
@@ -0,0 +1,72 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sysfs.h>
+
+/* test variable persistence */
+
+static int mod_int __attribute__((section(".resume_0")));
+static char *mod_str1 __attribute__((section(".resume_1"))) = "init";
+static char *mod_str2 __attribute__((section(".resume_2"))) = "upgrade";
+static char *mod_str __attribute__((section(".resume_3")));
+
+static ssize_t var_test_show(struct kobject *kobj,
+		struct kobj_attribute *attr,
+		char *buf)
+{
+	ssize_t count = 0;
+
+	count += sprintf(buf, "%d", mod_int);
+	count += sprintf(buf+count, " %s", mod_str);
+
+	return count;
+}
+
+static struct kobj_attribute sysfs_var = __ATTR_RO(var_test);
+
+static __init int mod1_resume(void)
+{
+	mod_int += 1;
+	mod_str = mod_str2;
+
+	pr_info("This is %s, index %d\n", __func__, mod_int);
+
+	return sysfs_create_file(kernel_kobj, &sysfs_var.attr);
+}
+
+static __exit int mod1_suspend(void)
+{
+	mod_int += 1;
+
+	pr_info("This is %s, index %d\n", __func__, mod_int);
+	sysfs_remove_file(kernel_kobj, &sysfs_var.attr);
+
+	return 0;
+}
+
+static __init int mod1_init(void)
+{
+	mod_int = 0;
+	mod_str = mod_str1;
+
+	pr_info("This is %s, index %d\n", __func__, mod_int);
+
+	return sysfs_create_file(kernel_kobj, &sysfs_var.attr);
+}
+
+static __exit void mod1_exit(void)
+{
+	mod_int += 1;
+
+	pr_info("This is %s, index %d\n", __func__, mod_int);
+	sysfs_remove_file(kernel_kobj, &sysfs_var.attr);
+
+	return;
+}
+
+module_resume(mod1_resume);
+module_suspend(mod1_suspend);
+module_init(mod1_init);
+module_exit(mod1_exit);
+MODULE_LICENSE("GPL");
diff --git a/test/modules/var_user.py b/test/modules/var_user.py
new file mode 100644
index 0000000..98c5193
--- /dev/null
+++ b/test/modules/var_user.py
@@ -0,0 +1,40 @@
+import unittest
+import subprocess
+
+
+class TestVarMethods(unittest.TestCase):
+    mod_name = "var_kern"
+
+    def unload_mod(self):
+        with open("/proc/modules") as f:
+            for line in f.readlines():
+                words = line.split()
+                if words[0] == self.mod_name:
+                    subprocess.check_call(["rmmod", self.mod_name])
+                    break
+
+    def setUp(self):
+        subprocess.check_call(["make", "var_kern.ko"])
+        self.unload_mod()
+
+    def tearDown(self):
+        mod = f"{self.mod_name}.ko"
+        self.unload_mod()
+
+    def test_var(self):
+        mod = f"{self.mod_name}.ko"
+        subprocess.check_call(["insmod", mod])
+        with open("/sys/kernel/var_test") as f:
+            line = f.readline()
+            self.assertEqual(line, "0 init")
+        subprocess.check_call(["rmmod", "-r", mod])
+        subprocess.check_call(["rmmod", mod])
+        subprocess.check_call(["insmod", "-r", mod])
+        with open("/sys/kernel/var_test") as f:
+            line = f.readline()
+            self.assertEqual(line, "2 upgrade")
+        subprocess.check_call(["rmmod", mod])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/modules/workqueue_kern.c b/test/modules/workqueue_kern.c
new file mode 100644
index 0000000..cecfb8c
--- /dev/null
+++ b/test/modules/workqueue_kern.c
@@ -0,0 +1,130 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/delay.h>
+#include <linux/modrestore.h>
+
+struct mod_status {
+	struct workqueue_struct *wq;
+};
+
+static struct workqueue_struct *wq;
+static int wq_status __attribute__((section(".resume_0")));
+
+static void worker_func(struct work_struct *work)
+{
+	wq_status += 1;
+	pr_info("worker run...\n");
+	mdelay(100);
+	pr_info("worker end.\n");
+	kfree(to_delayed_work(work));
+}
+
+static ssize_t wq_test_show(struct kobject *kobj,
+		struct kobj_attribute *attr,
+		char *buf)
+{
+	flush_workqueue(wq);
+	return sprintf(buf, "%pK %d", wq, wq_status);
+}
+
+static struct kobj_attribute wq_test = __ATTR_RO(wq_test);
+
+static int __init mod_init(void)
+{
+	int retval;
+
+	retval = sysfs_create_file(kernel_kobj, &wq_test.attr);
+	if (retval != 0) {
+		pr_err("sysfs_create_file failed.\n");
+		return retval;
+	}
+
+	wq = alloc_workqueue("workqueue_kern_test", WQ_UNBOUND, 0);
+	if (wq == NULL) {
+		pr_err("unable to allocate workqueue\n");
+		sysfs_remove_file(kernel_kobj, &wq_test.attr);
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	retval = 0;
+out:
+	return retval;
+}
+
+static void __exit mod_exit(void)
+{
+	destroy_workqueue(wq);
+	sysfs_remove_file(kernel_kobj, &wq_test.attr);
+}
+
+static int __init mod_resume(void)
+{
+	struct mod_status *data;
+	int retval;
+
+	data = get_module_state_space(KBUILD_MODNAME, NULL);
+	if (!data) {
+		pr_info("get_module_state_space failure\n");
+		return -ENOMEM;
+	}
+	wq = data->wq;
+
+	retval = sysfs_create_file(kernel_kobj, &wq_test.attr);
+	if (retval != 0) {
+		pr_err("sysfs_create_file failed.\n");
+		return retval;
+	}
+
+	return resume_workqueue(wq);
+}
+
+static int __exit queue_worker(void)
+{
+	struct delayed_work *worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+
+	if (worker == NULL) {
+		pr_err("alloc worker space failed\n");
+		return -ENOMEM;
+	}
+
+	INIT_DELAYED_WORK(worker, worker_func);
+	queue_delayed_work(wq, worker, 100);
+	return 0;
+}
+
+static int __exit mod_suspend(void)
+{
+	struct mod_status *data;
+	int retval;
+
+	data = alloc_module_state_space(KBUILD_MODNAME, sizeof(*data));
+	if (data == NULL) {
+		pr_err("alloc_module_state_space failed\n");
+		return -ENOMEM;
+	}
+
+	data->wq = wq;
+	if (queue_worker() != 0)
+		return -ENOMEM;
+
+	retval = suspend_workqueue(wq);
+	if (retval != 0) {
+		pr_err("suspend workqueue failed\n");
+		return retval;
+	}
+
+	sysfs_remove_file(kernel_kobj, &wq_test.attr);
+	return 0;
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+module_resume(mod_resume);
+module_suspend(mod_suspend);
+
+MODULE_LICENSE("GPL");
\ No newline at end of file
Theory:
* Sources of the exported-symbol CRCs:
  - /boot/symvers-$(uname -r).gz for the kernel Image and in-tree modules;
    the IMA mechanism can ensure that this file is trustworthy and has not
    been tampered with.
  - the ELF `.symtab` section for out-of-tree modules: exported symbols
    carry a `__crc_` prefix and have `st_shndx == SHN_ABS`.
* Compare the CRC values between the known exports and the module.
Design details:
- collect the exported symbols:
  * in-tree symbols from `/boot/symvers-<release>.gz`
  * out-of-tree module symbols from the module itself
- for each module, compare the external symbols stored in its `__versions`
  section
Usage:
    python3 -m upgchk.kabichk \
        [[-r <kernel release>],...] \
        [[-m <modname>],...] \
        -c <modname>

Example:
    python3 -m upgchk.kabichk -c /lib/modules/$(uname -r)/kernel/fs/mbcache.ko
    python3 -m upgchk.kabichk -m notify.ko -c osp_proc.ko
Note: The pyelftools library cannot be imported, so a wrapper around elfutils is used in its place.
Signed-off-by: fu.lin fulin10@huawei.com --- upgchk/Makefile | 23 ++++ upgchk/lib/modsym.c | 268 ++++++++++++++++++++++++++++++++++++++ upgchk/lib/modsym.h | 39 ++++++ upgchk/setup.py | 20 +++ upgchk/upgchk/__init__.py | 11 ++ upgchk/upgchk/kabichk.py | 163 +++++++++++++++++++++++ 6 files changed, 524 insertions(+) create mode 100644 upgchk/Makefile create mode 100644 upgchk/lib/modsym.c create mode 100644 upgchk/lib/modsym.h create mode 100644 upgchk/setup.py create mode 100644 upgchk/upgchk/__init__.py create mode 100644 upgchk/upgchk/kabichk.py
diff --git a/upgchk/Makefile b/upgchk/Makefile new file mode 100644 index 0000000..df6b60e --- /dev/null +++ b/upgchk/Makefile @@ -0,0 +1,23 @@ +.PHONY: build install clean + +PYTHON=/usr/bin/python3 +TEST= +PARAMETERS= + +build: + ${PYTHON} setup.py build + +dist: + ${PYTHON} setup.py sdist + +install: + ${PYTHON} setup.py install + +clean: + ${PYTHON} setup.py clean + rm -rf \ + build \ + dist \ + upgchk/__pycache__ \ + upgchk/*.so \ + upgchk.egg-info diff --git a/upgchk/lib/modsym.c b/upgchk/lib/modsym.c new file mode 100644 index 0000000..eb75f68 --- /dev/null +++ b/upgchk/lib/modsym.c @@ -0,0 +1,268 @@ +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <gelf.h> + +#include "modsym.h" + +static Elf_Data *get_elf_sec_data(Elf *elf, const char *sec_name) +{ + Elf_Scn *scn = NULL; + size_t strndx; + GElf_Shdr mem; + GElf_Shdr *shdr; + const char *name; + + /* To get the section names. */ + if (elf_getshdrstrndx(elf, &strndx) != 0) + return NULL; + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + shdr = gelf_getshdr(scn, &mem); + name = elf_strptr (elf, strndx, shdr->sh_name); + + if (strcmp(name, sec_name) == 0) + return elf_getdata(scn, NULL); + } + + return NULL; +} + +static void modvers_dealloc(PyObject *obj) +{ + ModVersState *mvgstate = (ModVersState *)obj; + + elf_end(mvgstate->elf); + return; +} + +static PyObject *modvers_iternext(PyObject *obj) +{ + ModVersState *mvgstate = (ModVersState *)obj; + struct modversion_info *info = mvgstate->d->d_buf; + PyObject *elem = NULL; + + if (mvgstate->seq_index >= 0) { + size_t i = mvgstate->enum_index; + /* seq_index < 0 means that the generator is exhausted. + * Returning NULL in this case is enough. The next() builtin + * will raise the StopIteration error for us. 
+ */ + elem = Py_BuildValue("(sk)", info[i].name, info[i].crc); + mvgstate->seq_index -= 1; + mvgstate->enum_index += 1; + } else { + /* The reference to the sequence is cleared in the first + * generator call after its exhaustion (after the call that + * returned the last element). + * Py_CLEAR will be harmless for subsequent calls since it's + * idempotent on NULL. + */ + mvgstate->seq_index = -1; + } + + return elem; +} + +static PyObject *modvers_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) +{ + ModVersState *mvgstate = NULL; + PyObject *file; + int fd; + Py_ssize_t len; + + if (!PyArg_ParseTuple(args, "O", &file)) + return NULL; + + fd = PyObject_AsFileDescriptor(file); + if (fd < 0) + return NULL; + + mvgstate = (ModVersState *)type->tp_alloc(type, 0); + if (mvgstate == NULL) + return NULL; + + elf_version(EV_CURRENT); + mvgstate->elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (mvgstate->elf == NULL) { + PyErr_Format(PyExc_TypeError, "File not usable: %s\n", elf_errmsg(-1)); + goto free; + } + + mvgstate->d = get_elf_sec_data(mvgstate->elf, VERS_SEC_NAME); + if (mvgstate->d == NULL) { + PyErr_Format(PyExc_TypeError, "Can't find ELF section `%s`\n", VERS_SEC_NAME); + goto elf_end; + } + + len = mvgstate->d->d_size / sizeof(struct modversion_info); + mvgstate->seq_index = len - 1; + mvgstate->enum_index = 0; + + return (PyObject *)mvgstate; + +elf_end: + elf_end(mvgstate->elf); +free: + type->tp_free(mvgstate); + return NULL; +} + +PyTypeObject PyModVersGen_Type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "modvers", + .tp_basicsize = sizeof(PyModVersGen_Type), + .tp_itemsize = 0, + .tp_dealloc = modvers_dealloc, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_iter = PyObject_SelfIter, + .tp_iternext = modvers_iternext, + .tp_alloc = PyType_GenericAlloc, + .tp_new = modvers_new, +}; + +static void modcrcs_dealloc(PyObject *obj) +{ + ModCRCsState *mcgstate = (ModCRCsState *)obj; + + elf_end(mcgstate->elf); + return; +} + +static PyObject 
*modcrcs_iternext(PyObject *obj) +{ + ModCRCsState *mcgstate = (ModCRCsState *)obj; + const char *strtab = mcgstate->strtab->d_buf; + GElf_Sym *sym = mcgstate->symtab->d_buf; + PyObject *elem = NULL; + + while (mcgstate->seq_index >= 0) { + size_t i = mcgstate->enum_index; + const char *name = strtab + sym[i].st_name; + + mcgstate->seq_index -= 1; + mcgstate->enum_index += 1; + + /* + * If the symbol has '__crc_' prefix and absolute value, + * it's export symbol, and has CRC. + */ + if (strncmp(name, CRC_SYM_PREFIX, strlen(CRC_SYM_PREFIX)) == 0 + && sym[i].st_shndx == SHN_ABS) { + elem = Py_BuildValue("(sk)", + name+strlen(CRC_SYM_PREFIX), + sym[i].st_value); + break; + } + } + + return elem; +} + +static PyObject *modcrcs_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) +{ + ModCRCsState *mcgstate = NULL; + PyObject *file; + Elf_Data *d; + int fd; + Py_ssize_t len; + + if (!PyArg_ParseTuple(args, "O", &file)) + return NULL; + + fd = PyObject_AsFileDescriptor(file); + if (fd < 0) + return NULL; + + mcgstate = (ModCRCsState *)type->tp_alloc(type, 0); + if (mcgstate == NULL) + return NULL; + + elf_version(EV_CURRENT); + mcgstate->elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (mcgstate->elf == NULL) { + PyErr_Format(PyExc_TypeError, "File not usable: %s\n", elf_errmsg(-1)); + goto free; + } + + mcgstate->strtab = get_elf_sec_data(mcgstate->elf, STRT_SEC_NAME); + if (mcgstate->strtab == NULL) { + PyErr_Format(PyExc_TypeError, "Can't find ELF section `%s`\n", STRT_SEC_NAME); + goto elf_end; + } + + mcgstate->symtab = get_elf_sec_data(mcgstate->elf, SYMT_SEC_NAME); + if (mcgstate->symtab == NULL) { + PyErr_Format(PyExc_TypeError, "Can't find ELF section `%s`\n", SYMT_SEC_NAME); + goto elf_end; + } + + len = mcgstate->symtab->d_size / sizeof(GElf_Sym); + mcgstate->seq_index = len - 1; + mcgstate->enum_index = 0; + + return (PyObject *)mcgstate; + +elf_end: + elf_end(mcgstate->elf); +free: + type->tp_free(mcgstate); + return NULL; +} + +PyTypeObject 
PyModCRCsGen_Type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "modcrcs", + .tp_basicsize = sizeof(PyModCRCsGen_Type), + .tp_itemsize = 0, + .tp_dealloc = modcrcs_dealloc, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_iter = PyObject_SelfIter, + .tp_iternext = modcrcs_iternext, + .tp_alloc = PyType_GenericAlloc, + .tp_new = modcrcs_new, +}; + +/* Module structure */ +/* Module structure */ +static struct PyModuleDef modvers_module = { + PyModuleDef_HEAD_INIT, + .m_name = "modsym", + .m_doc = "iter `" VERS_SEC_NAME "` section items", + .m_size = -1, +}; + +/* Module initialization function */ +PyMODINIT_FUNC PyInit_modsym(void) +{ + PyObject *m = PyModule_Create(&modvers_module); + if (m == NULL) + return NULL; + + if (PyType_Ready(&PyModVersGen_Type) < 0) + return NULL; + + Py_INCREF(&PyModVersGen_Type); + if (PyModule_AddObject(m, PyModVersGen_Type.tp_name, + (PyObject *)&PyModVersGen_Type) < 0) + goto free_vers; + + if (PyType_Ready(&PyModCRCsGen_Type) < 0) + goto free_vers; + + Py_INCREF(&PyModCRCsGen_Type); + if (PyModule_AddObject(m, PyModCRCsGen_Type.tp_name, + (PyObject *)&PyModCRCsGen_Type) < 0) + goto free_crcs; + + return m; +free_crcs: + Py_DECREF(&PyModCRCsGen_Type); +free_vers: + Py_DECREF(&PyModVersGen_Type); + Py_DECREF(m); + return NULL; +} diff --git a/upgchk/lib/modsym.h b/upgchk/lib/modsym.h new file mode 100644 index 0000000..b8069c3 --- /dev/null +++ b/upgchk/lib/modsym.h @@ -0,0 +1,39 @@ +#ifndef __PYTHON_MODSYM_H__ +#define __PYTHON_MODSYM_H__ + +#include <libelf.h> + +typedef struct { + PyObject_HEAD + Py_ssize_t seq_index; + Py_ssize_t enum_index; + Elf *elf; + Elf_Data *d; +} ModVersState; + +#define VERS_SEC_NAME "__versions" + +/* --- the following is copied from linux src --- */ +#define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) +#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN + +struct modversion_info { + unsigned long crc; + char name[MODULE_NAME_LEN]; +}; +/* --- end --- */ + +typedef struct { + PyObject_HEAD + Py_ssize_t 
seq_index; + Py_ssize_t enum_index; + Elf *elf; + Elf_Data *strtab; + Elf_Data *symtab; +} ModCRCsState; + +#define STRT_SEC_NAME ".strtab" +#define SYMT_SEC_NAME ".symtab" +#define CRC_SYM_PREFIX "__crc_" + +#endif /* __PYTHON_MODSYM_H__ */ diff --git a/upgchk/setup.py b/upgchk/setup.py new file mode 100644 index 0000000..6758c95 --- /dev/null +++ b/upgchk/setup.py @@ -0,0 +1,20 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +from setuptools import setup, Extension + +if __name__ == "__main__": + + setup(name="upgchk", + version="0.1", + description="Check the kernel upgrading environment", + + packages=["upgchk"], + ext_modules=[ + Extension("modsym", + sources=["lib/modsym.c"], + libraries=["elf"]) + ], + + python_requires='>=3.6', + ) diff --git a/upgchk/upgchk/__init__.py b/upgchk/upgchk/__init__.py new file mode 100644 index 0000000..c831e1d --- /dev/null +++ b/upgchk/upgchk/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +""" +.. module:: upgchk + :synopsis: Check the kernel upgrading environment +""" + +__title = "upgchk" +__description = "Check the upgrade environment" +__license__ = "GPL-2.0-or-later or LGPL-2.1-only" +__version__ = "0.1" diff --git a/upgchk/upgchk/kabichk.py b/upgchk/upgchk/kabichk.py new file mode 100644 index 0000000..cccacf3 --- /dev/null +++ b/upgchk/upgchk/kabichk.py @@ -0,0 +1,163 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +''' +Theory: +- compare CRC value between the known and the module +- The export symbols CRC source: + * `/boot/symvers-<release>.gz` for in tree modules and Image + - the ima mechanism could ensure the file credibility and non-tamper + * The `.symtab` section for out of tree modules + - name format: `__crc_<symbol name>` + - it's absolute value, means: `sym->st_shndx == SHN_ABS` + +Design Details: +- collect export symbols from + * collect in tree symbols from `/boot/symvers-<release>.gz` + * collect out of tree module symbols from the module self +- compare external symbols stored in 
`__versions` section for each module + +`__versions` section data format: + + # define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) + # define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN + + struct modversion_info { + unsigned long crc; + char name[MODULE_NAME_LEN]; + }; + +Usage: + python3 -m upgchk.kabichk \ + [[-r <kernel release>],...] \ + [[-m <modname>],...] \ + -c <modname> +Example: + python3 -m upgchk.kabichk -c /lib/modules/$(uname -r)/kernel/fs/mbcache.ko + python3 -m upgchk.kabichk -m notify.ko -c osp_proc.ko +''' + +import argparse +import gzip +import pathlib +import platform +from typing import Tuple + +import modsym + +__all__ = ["KABI"] + +ELF_SELFMAG = 4 +ELF_ELFMAG = b"\177ELF" + + +class KABI: + def __init__(self, version: str): + """ + read all symbols of the specific kernel + """ + self._symbols = dict() + filename = f"symvers-{version}.gz" + filepath = pathlib.Path("/boot/").joinpath(filename) + + with gzip.open(filepath, "rt") as f: + for line in f.readlines(): + # (crc, sym, loc, type) + (_crc, sym, loc, _) = line.split() + crc = int(_crc, 16) # convert hex crc to integer + self._insert(sym, (crc, sym, loc)) + + def _insert(self, key: str, val: Tuple[int, str, str]): + inst = self._symbols.get(key) + if inst is None: + self._symbols[key] = val + elif inst != val: + raise KeyError( + f"{key} already exits value {self._symbols[key]}, can't insert new value {val}") + + def _get(self, key: str) -> Tuple[int, str, str]: + return self._symbols.get(key) + + def _parse_mod_vers(self, filepath: pathlib.Path) -> Tuple[int, str]: + with open(filepath, "rb") as f: + magic = f.read(ELF_SELFMAG) + if magic != ELF_ELFMAG: + raise TypeError(f"{filepath} isn't an ELF file") + + for sym, crc in modsym.modvers(f): + yield (sym, crc) + + def check_mod_syms(self, filepath: pathlib.Path) -> Tuple[bool, str]: + if not filepath.exists(): + raise FileNotFoundError(f"{filepath} isn't found") + + for sym, crc in self._parse_mod_vers(filepath): + val = self._get(sym) + 
if val is None: + msg = f"symbol {sym} isn't known" + return (False, msg) + elif val[0] != crc: + msg = f"symbol {sym} CRC should be {hex(crc)}, but {hex(val[0])}" + return (False, msg) + + return (True, "") + + def _parse_mod_crcs(self, filepath: pathlib.Path) -> Tuple[int, str]: + with open(filepath, "rb") as f: + magic = f.read(ELF_SELFMAG) + if magic != ELF_ELFMAG: + raise TypeError(f"{filepath} isn't an ELF file") + + for inst in modsym.modcrcs(f): + yield inst + + def add_mod_crcs(self, filepath: pathlib.Path): + if not filepath.exists(): + raise FileNotFoundError(f"{filepath} isn't found") + + modname = filepath.name[:-3] + for (sym, crc) in self._parse_mod_crcs(filepath): + self._insert(sym, (crc, sym, modname)) + + +def parse_argument() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("-r", "--release", action="store", + required=False, default=platform.release(), + help="specific the kernel release version") + parser.add_argument("-m", "--module", action="append", + required=False, default=[], + help="specific the out of tree modules") + parser.add_argument("-c", "--check", action="append", + required=True, + help="specific the checked module, e.g. -c a.ko -c b.ko") + options = parser.parse_args() + return (options.release, options.module, options.check) + + +def main(): + release, modules, checks = parse_argument() + kabi = KABI(release) + + for mod in modules: + filepath = pathlib.Path(mod) + kabi.add_mod_crcs(filepath) + + print("-------------- start check --------------") + passed = 0 + failed = 0 + for mod in checks: + filepath = pathlib.Path(mod) + modname = filepath.name + result, msg = kabi.check_mod_syms(filepath) + if not result: + print(f"module {modname} fail: {msg}") + failed += 1 + else: + print(f"module {modname} pass") + passed += 1 + print(f"-------------- {passed} pass, {failed} failed --------------") + + +if __name__ == '__main__': + main()
The message `Output file *.out appears to exist, aborting` is confusing. One common cause is a permission-denied error because the test desc lacks the suid flag: zdtm.py sets `ZDTM_UID` and `ZDTM_GID`, and the function `test_init()` (in `zdtm/lib/test.c`) switches the tester itself to that uid and gid when the suid flag is not set.
This patch prints the errno description when accessing *.out fails.
Signed-off-by: fu.lin fulin10@huawei.com --- test/zdtm/lib/test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index e031357..dc34d99 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -74,8 +74,8 @@ static void test_fini(void) static void setup_outfile(void) { if (!access(outfile, F_OK) || errno != ENOENT) { - fprintf(stderr, "Output file %s appears to exist, aborting\n", - outfile); + fprintf(stderr, "Output file %s appears to exist, aborting: %s\n", + outfile, strerror(errno)); exit(1); }
--- test/zdtm.py | 66 ++++++++-- test/zdtm/customization/Makefile | 3 +- test/zdtm/customization/notifier00.c | 68 ++++++++++ test/zdtm/customization/notifier00.desc | 1 + test/zdtm/mod/.gitignore | 163 ++++++++++++++++++++++++ test/zdtm/mod/Makefile | 28 ++++ test/zdtm/mod/notifier.c | 145 +++++++++++++++++++++ 7 files changed, 464 insertions(+), 10 deletions(-) create mode 100644 test/zdtm/customization/notifier00.c create mode 100644 test/zdtm/customization/notifier00.desc create mode 100644 test/zdtm/mod/.gitignore create mode 100644 test/zdtm/mod/Makefile create mode 100644 test/zdtm/mod/notifier.c
diff --git a/test/zdtm.py b/test/zdtm.py index 111f9f1..2a29400 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -25,6 +25,7 @@ import tempfile import time import pathlib from builtins import (input, int, open, range, str, zip) +import platform
import pycriu as crpc
@@ -1460,6 +1461,13 @@ class criu: return True return False
+ @staticmethod + def check_sysfs(pathes): + for path in pathes.split(): + if not pathlib.Path(path).exists(): + return True + return False + @staticmethod def available(): if not os.access(opts['criu_bin'], os.X_OK): @@ -1983,17 +1991,45 @@ class Launcher: testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print(testline, file=self.__file_report)
+ def check_module(self, mod): + found = False + with open("/proc/modules") as f: + for line in f.readlines(): + if "pin_memory" == line.split()[0]: + found = True + return found + def modprobe_pin_memory(self, load): + mod = "pin_memory" if not load: return - else: - found = False - with open("/proc/modules") as f: - for line in f.readlines(): - if "pin_memory" == line.split()[0]: - found = True - if not found: - subprocess.check_call(["modprobe", "pin_memory"]) + elif not self.check_module(mod): + subprocess.check_call(["modprobe", "pin_memory"]) + + def build_and_load_mod(self, target, kdir): + if platform.machine() != "aarch64" or not target: + return + + if not os.access("zdtm/mod", os.R_OK): + print("should be executed in the test subdir") + sys.exit(0) + + dirpath = f"MOD={os.getcwd()}/zdtm/mod" + build_mod = ["make", "-C", "zdtm/mod", dirpath, target] + if kdir: + build_mod.append(f"KDIR={kdir}") + subprocess.check_call(build_mod) + + # ensure the module has been unloaded + if self.check_module(target.rstrip(".ko")): + subprocess.run(["rmmod", target], check=False) + + modpath = f"zdtm/mod/{target}" + subprocess.check_call(["insmod", modpath]) + + def unload_mod(self, mod): + if mod: + subprocess.check_call(["rmmod", mod])
def run_test(self, name, desc, flavor):
@@ -2034,6 +2070,8 @@ class Launcher: # `--use-fork-pid`, so don't care `--pin-memory` option self.modprobe_pin_memory(no_pid_ns)
+ self.build_and_load_mod(desc.get("mod", ""), opts["kdir"]) + sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], env=dict(os.environ, CR_CT_TEST_INFO=arg, ZDTM_NO_PID_NS=zdtm_no_pid_ns), @@ -2048,9 +2086,11 @@ class Launcher: }
# pin memory function don't support concurrency - if test_flag(desc, 'excl') or test_value(desc, "opts", "--pin-memory"): + if test_flag(desc, 'excl') or test_value(desc, "opts", "--pin-memory") or desc.get("mod", ""): self.wait()
+ self.unload_mod(desc.get("mod", "")) + def __wait_one(self, flags): pid = -1 status = -1 @@ -2390,6 +2430,11 @@ def run_tests(opts): t, f"cmdline '{cmdline}' isn't support, or don't set") continue
+ sysfs = tdesc.get('sysfs', '') + if sysfs and criu.check_sysfs(sysfs): + launcher.skip(t, f"sysfs file {sysfs} don't exist") + continue + test_flavs = tdesc.get('flavor', 'h ns uns').split() opts_flavs = (opts['flavor'] or 'h,ns,uns').split(',') if opts_flavs != ['best']: @@ -2412,6 +2457,7 @@ def run_tests(opts): launcher.run_test(t, tdesc, run_flavs) else: launcher.skip(t, "no flavors") + finally: fail = launcher.finish() if opts['join_ns']: @@ -2701,6 +2747,8 @@ rp.add_argument("--pre-dump-mode", rp.add_argument("--kdat", help="Path to criu.kdat, default '/run/criu.kdat'", default="/run/criu.kdat") +rp.add_argument( + "--kdir", help="specific kernel devel path, the default value is `/lib/modules/$(uname -r)/build`")
lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile index 82348f2..93922c7 100644 --- a/test/zdtm/customization/Makefile +++ b/test/zdtm/customization/Makefile @@ -10,7 +10,8 @@ TST_NOFILE = \ maps04 \ maps05 \ maps007 \ - maps008 + maps008 \ + notifier00
TST_FILE = \ maps00 \ diff --git a/test/zdtm/customization/notifier00.c b/test/zdtm/customization/notifier00.c new file mode 100644 index 0000000..5fc3d54 --- /dev/null +++ b/test/zdtm/customization/notifier00.c @@ -0,0 +1,68 @@ +#include <stdio.h> +/* Historical reasons: in order to compatible with R10 */ +#define CONFIG_EULEROS_MODRESTORE_NOTIFY +#include <linux/modrestore.h> + +#include "zdtmtst.h" + +const char *test_doc = "Tests the basic function of the notifiers"; +static char *nvwa_notifiers[] = { + "PRE_FREEZE", + "FREEZE_TO_KILL", + "PRE_UPDATE_KERNEL", + "POST_UPDATE_KERNEL", + "UNFREEZE_TO_RUN", + "POST_RUN" +}; + +_Static_assert(sizeof(nvwa_notifiers)/sizeof(nvwa_notifiers[0]) == KUP_HOOK_MAX, "nvwa_notifiers number is wrong!"); + +int main(int argc, char *argv[]) +{ + int orig_values[KUP_HOOK_MAX] = {0}; + bool failure = false; + FILE *fp; + + test_init(argc, argv); + + fp = fopen("/sys/kernel/criu_notifier", "r"); + if (fp == NULL) { + pr_perror("fopen"); + return 1; + } + + for (int i = 0; i < KUP_HOOK_MAX; i++) + fscanf(fp, "%d ", orig_values+i); + + test_daemon(); + test_waitsig(); + + if (fseek(fp, 0, SEEK_SET) != 0) { + pr_perror("fseek"); + return 2; + } + + for (int i = 0; i < KUP_HOOK_MAX; i++) { + int val = 0; + int should = orig_values[i]+1; + + fscanf(fp, "%d ", &val); + + /* those are not called in criu */ + if (i == PRE_UPDATE_KERNEL || i == POST_UPDATE_KERNEL) + continue; + + if (val != should) { + pr_err("%s notifier is abnormal, it should be %d, but %d.\n", + nvwa_notifiers[i], should, val); + failure = true; + } + } + + if (failure) + fail("notifier is abnormal."); + else + pass(); + + return 0; +} diff --git a/test/zdtm/customization/notifier00.desc b/test/zdtm/customization/notifier00.desc new file mode 100644 index 0000000..1c6b512 --- /dev/null +++ b/test/zdtm/customization/notifier00.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--with-notifier', 'flavor': 'h', 'flags': 'suid', 'sysfs': '/sys/kernel/modrestore/nvwa_notifier', 
'mod': 'notifier.ko'} diff --git a/test/zdtm/mod/.gitignore b/test/zdtm/mod/.gitignore new file mode 100644 index 0000000..7afd412 --- /dev/null +++ b/test/zdtm/mod/.gitignore @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# NOTE! Don't add files that are generated in specific +# subdirectories here. Add them in the ".gitignore" file +# in that subdirectory instead. +# +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. +# +# Normal rules (sorted alphabetically) +# +.* +*.a +*.asn1.[ch] +*.bin +*.bz2 +*.c.[012]*.* +*.dt.yaml +*.dtb +*.dtbo +*.dtb.S +*.dwo +*.elf +*.gcno +*.gz +*.i +*.ko +*.lex.c +*.ll +*.lst +*.lz4 +*.lzma +*.lzo +*.mod +*.mod.c +*.o +*.o.* +*.patch +*.s +*.so +*.so.dbg +*.su +*.symtypes +*.symversions +*.tab.[ch] +*.tar +*.xz +*.zst +Module.symvers +modules.order + +# +# Top-level generic files +# +/linux +/modules-only.symvers +/vmlinux +/vmlinux.32 +/vmlinux.map +/vmlinux.symvers +/vmlinux-gdb.py +/vmlinuz +/System.map +/Module.markers +/modules.builtin +/modules.builtin.modinfo +/modules.nsdeps + +# +# RPM spec file (make rpm-pkg) +# +/*.spec + +# +# Debian directory (make deb-pkg) +# +/debian/ + +# +# Snap directory (make snap-pkg) +# +/snap/ + +# +# tar directory (make tar*-pkg) +# +/tar-install/ + +# +# We don't want to ignore the following even if they are dot-files +# +!.clang-format +!.cocciconfig +!.get_maintainer.ignore +!.gitattributes +!.gitignore +!.mailmap + +# +# Generated include files +# +/include/config/ +/include/generated/ +/include/ksym/ +/arch/*/include/generated/ + +# stgit generated dirs +patches-* + +# quilt's files +patches +series + +# ctags files +tags +TAGS + +# cscope files +cscope.* +ncscope.* + +# gnu global files +GPATH +GRTAGS +GSYMS +GTAGS + +# id-utils files +ID + +*.orig +*~ +#*# + +# +# Leavings from module signing +# +extra_certificates +signing_key.pem +signing_key.priv 
+signing_key.x509 +x509.genkey + +# Kconfig presets +/all.config +/alldef.config +/allmod.config +/allno.config +/allrandom.config +/allyes.config + +# Kconfig savedefconfig output +/defconfig + +# Kdevelop4 +*.kdev4 + +# Clang's compilation database file +/compile_commands.json + +# Documentation toolchain +sphinx_*/ diff --git a/test/zdtm/mod/Makefile b/test/zdtm/mod/Makefile new file mode 100644 index 0000000..10c9c9a --- /dev/null +++ b/test/zdtm/mod/Makefile @@ -0,0 +1,28 @@ +# notice: +# `ARCH` var is used in both criu and kernel, but they have the different value +# for the same architecture(e.g. arm64). Therefore, this Makefile can't be +# included in the criu Makefile. +obj-m += notifier.o + +# specific the kernel devel path +# example (use `/home/me/kernel` as `KDIR`): +# $ export KDIR="/home/me/kernel" +ifeq ($(KDIR),) + KDIR := /lib/modules/$(shell uname -r)/build +endif + +# specific the mod src path +ifeq ($(MOD),) + MOD := $(PWD) +endif + +all: + $(MAKE) -C $(KDIR) M=$(MOD) modules + +clean: + $(MAKE) -C $(KDIR) M=$(MOD) clean + +.PHONY: all clean + +notifier.ko: + $(MAKE) -C $(KDIR) M=$(MOD) notifier.ko diff --git a/test/zdtm/mod/notifier.c b/test/zdtm/mod/notifier.c new file mode 100644 index 0000000..70a5b33 --- /dev/null +++ b/test/zdtm/mod/notifier.c @@ -0,0 +1,145 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/modrestore.h> + +static int values[KUP_HOOK_MAX]; +static char *nvwa_actions[] = { + "PREPARE", + "ROLLBACK", +}; +static char *nvwa_notifiers[] = { + "PRE_FREEZE", + "FREEZE_TO_KILL", + "PRE_UPDATE_KERNEL", + "POST_UPDATE_KERNEL", + "UNFREEZE_TO_RUN", + "POST_RUN" +}; + +static int nvwa_notifier_func(struct notifier_block *nb, unsigned long val, void *data) +{ + struct nvwa_action *action = data; + + switch (action->cmd) { + case PREPARE: + values[val] += 1; + break; + case ROLLBACK: + values[val] -= 1; + break; + default: + pr_err("invalid cmd: %d", action->cmd); + 
return NOTIFY_BAD; + } + + pr_info("nvwa notifier action %s", nvwa_actions[action->cmd]); + + return NOTIFY_DONE; +} + +#define DEFINE_NVWA_NB(name) \ + static struct notifier_block nvwa_##name##_nb = { \ + .notifier_call = nvwa_notifier_func, \ + } + +DEFINE_NVWA_NB(pre_freeze); +DEFINE_NVWA_NB(freeze_to_kill); +DEFINE_NVWA_NB(pre_update_kernel); +DEFINE_NVWA_NB(post_update_kernel); +DEFINE_NVWA_NB(unfreeze_to_run); +DEFINE_NVWA_NB(post_run); + +static struct notifier_block *nvwa_nbs[] = { + &nvwa_pre_freeze_nb, + &nvwa_freeze_to_kill_nb, + &nvwa_pre_update_kernel_nb, + &nvwa_post_update_kernel_nb, + &nvwa_unfreeze_to_run_nb, + &nvwa_post_run_nb, +}; + +static int register_nvwa_notifiers(void) +{ + int i; + + BUILD_BUG_ON_MSG(ARRAY_SIZE(nvwa_nbs) != KUP_HOOK_MAX, + "wrong nvwa notifier block size!"); + + for (i = 0; i < ARRAY_SIZE(nvwa_nbs); i++) { + if (register_nvwa_notifier(i, nvwa_nbs[i]) != 0) { + pr_err("register nvwa %s notifier failed!", nvwa_notifiers[i]); + goto error; + } + } + + return 0; + +error: + + for (i -= 1; i >= 0; i -= 1) + unregister_nvwa_notifier(i, nvwa_nbs[i]); + + return -1; +} + +static void unregister_nvwa_notifiers(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nvwa_nbs); i++) + unregister_nvwa_notifier(i, nvwa_nbs[i]); +} + +static ssize_t criu_notifier_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(values); i++) + values[i] = 0; + + return count; +} + +static ssize_t criu_notifier_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + int i; + ssize_t count = 0; + + for (i = 0; i < ARRAY_SIZE(values); i++) + count += sprintf(buf+count, "%d ", values[i]); + + buf[count-1] = '\n'; + + return count; +} + +static struct kobj_attribute notifier_file = __ATTR_RW(criu_notifier); + +static int __init notifier_init(void) +{ + if (register_nvwa_notifiers() != 0) + return -1; + + if (sysfs_create_file(kernel_kobj, ¬ifier_file.attr) != 
0) { + unregister_nvwa_notifiers(); + return -1; + } + + return 0; +} + +static void __exit notifier_exit(void) +{ + sysfs_remove_file(kernel_kobj, ¬ifier_file.attr); + unregister_nvwa_notifiers(); +} + +module_init(notifier_init); +module_exit(notifier_exit); +MODULE_LICENSE("GPL");
Signed-off-by: fu.lin fulin10@huawei.com --- test/zdtm.py | 68 ++- test/zdtm/customization/Makefile | 23 +- test/zdtm/customization/get_smaps_bits.c | 127 +++++ test/zdtm/customization/get_smaps_bits.h | 6 + test/zdtm/customization/ipc.desc | 2 +- test/zdtm/customization/maps00.c | 271 +++++++++++ test/zdtm/customization/maps00.desc | 1 + test/zdtm/customization/maps007.c | 178 +++++++ test/zdtm/customization/maps007.desc | 1 + test/zdtm/customization/maps008.c | 514 ++++++++++++++++++++ test/zdtm/customization/maps008.desc | 1 + test/zdtm/customization/maps01.c | 183 +++++++ test/zdtm/customization/maps01.desc | 1 + test/zdtm/customization/maps02.c | 111 +++++ test/zdtm/customization/maps02.desc | 1 + test/zdtm/customization/maps04.c | 57 +++ test/zdtm/customization/maps04.desc | 1 + test/zdtm/customization/maps05.c | 91 ++++ test/zdtm/customization/maps05.desc | 1 + test/zdtm/customization/maps06.c | 70 +++ test/zdtm/customization/maps06.desc | 1 + test/zdtm/customization/maps_file_prot.c | 53 ++ test/zdtm/customization/maps_file_prot.desc | 1 + test/zdtm_ct.c | 13 +- 24 files changed, 1765 insertions(+), 11 deletions(-) create mode 100644 test/zdtm/customization/get_smaps_bits.c create mode 100644 test/zdtm/customization/get_smaps_bits.h create mode 100644 test/zdtm/customization/maps00.c create mode 100644 test/zdtm/customization/maps00.desc create mode 100644 test/zdtm/customization/maps007.c create mode 100644 test/zdtm/customization/maps007.desc create mode 100644 test/zdtm/customization/maps008.c create mode 100644 test/zdtm/customization/maps008.desc create mode 100644 test/zdtm/customization/maps01.c create mode 100644 test/zdtm/customization/maps01.desc create mode 100644 test/zdtm/customization/maps02.c create mode 100644 test/zdtm/customization/maps02.desc create mode 100644 test/zdtm/customization/maps04.c create mode 100644 test/zdtm/customization/maps04.desc create mode 100644 test/zdtm/customization/maps05.c create mode 100644 
test/zdtm/customization/maps05.desc create mode 100644 test/zdtm/customization/maps06.c create mode 100644 test/zdtm/customization/maps06.desc create mode 100644 test/zdtm/customization/maps_file_prot.c create mode 100644 test/zdtm/customization/maps_file_prot.desc
diff --git a/test/zdtm.py b/test/zdtm.py index 73de5ac..111f9f1 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -366,6 +366,9 @@ def test_flag(tdesc, flag): return flag in tdesc.get('flags', '').split()
+def test_value(tdesc, opt, val): + return val in tdesc.get(opt, '').split() + # # Exception thrown when something inside the test goes wrong, # e.g. test doesn't start, criu returns with non zero code or @@ -947,7 +950,8 @@ class criu_rpc: inhfd.fd = int(fd[3:-1]) inhfd.key = key else: - raise test_fail_exc('RPC for %s(%s) required' % (arg, args.pop(0))) + raise test_fail_exc('RPC for %s(%s) required' % + (arg, args.pop(0)))
@staticmethod def run(action, @@ -1438,6 +1442,24 @@ class criu: "check", ["--no-default-config", "-v0", "--feature", feature], opts['criu_bin']) == 0
+ @staticmethod + def check_cmdline(cmdline): + with open("/proc/cmdline") as f: + bootparams = f.readline().strip().split() + + for arg in cmdline.split(): + words = [word.strip("'" ") for word in arg.split('=')] + matched = False + for param in bootparams: + prefix = param.startswith(words[0]) + if (len(words) == 1 and prefix) \ + or (len(words) == 2 and prefix and param[len(words[0])+1:] == words[1]): + matched = True + break + if not matched: + return True + return False + @staticmethod def available(): if not os.access(opts['criu_bin'], os.X_OK): @@ -1509,6 +1531,11 @@ def cr(cr_api, test, opts):
iters = iter_parm(opts['iters'], 1) for i in iters[0]: + if "--pin-memory" in test.getdopts(): + print("Clear pin memory space") + cmd = [opts["criu_bin"], "clear-pin-memory"] + subprocess.run(cmd, shell=False, check=True) + pres = iter_parm(opts['pre'], 0) for p in pres[0]: if opts['snaps']: @@ -1956,6 +1983,18 @@ class Launcher: testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print(testline, file=self.__file_report)
+ def modprobe_pin_memory(self, load): + if not load: + return + else: + found = False + with open("/proc/modules") as f: + for line in f.readlines(): + if "pin_memory" == line.split()[0]: + found = True + if not found: + subprocess.check_call(["modprobe", "pin_memory"]) + def run_test(self, name, desc, flavor):
if len(self.__subs) >= self.__max: @@ -1963,7 +2002,8 @@ class Launcher:
with open("/proc/sys/kernel/tainted") as taintfd: taint = taintfd.read() - if self.__taint != taint: + # 0x1000 means the out of tree module has been loaded + if self.__taint != taint and (int(self.__taint) | 0x1000) != int(taint): raise Exception("The kernel is tainted: %r (%r)" % (taint, self.__taint))
@@ -1988,8 +2028,15 @@ class Launcher: logf = None log = None
+ no_pid_ns = test_value(desc, 'opts', '--use-fork-pid') + zdtm_no_pid_ns = "1" if no_pid_ns else "0" + # load `pin_memory.ko`,`--pin-memory` option must be used with + # `--use-fork-pid`, so don't care `--pin-memory` option + self.modprobe_pin_memory(no_pid_ns) + sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], - env=dict(os.environ, CR_CT_TEST_INFO=arg), + env=dict(os.environ, CR_CT_TEST_INFO=arg, + ZDTM_NO_PID_NS=zdtm_no_pid_ns), stdout=log, stderr=subprocess.STDOUT, close_fds=True) @@ -2000,7 +2047,8 @@ class Launcher: "start": time.time() }
- if test_flag(desc, 'excl'): + # pin memory function don't support concurrency + if test_flag(desc, 'excl') or test_value(desc, "opts", "--pin-memory"): self.wait()
def __wait_one(self, flags): @@ -2336,6 +2384,12 @@ def run_tests(opts): launcher.skip(t, "remote lazy pages are not supported") continue
+ cmdline = tdesc.get('cmdline', '') + if cmdline and criu.check_cmdline(cmdline): + launcher.skip( + t, f"cmdline '{cmdline}' isn't support, or don't set") + continue + test_flavs = tdesc.get('flavor', 'h ns uns').split() opts_flavs = (opts['flavor'] or 'h,ns,uns').split(',') if opts_flavs != ['best']: @@ -2365,6 +2419,7 @@ def run_tests(opts): if fail: sys.exit(1)
+ sti_fmt = "%-40s%-10s%s"
@@ -2644,8 +2699,8 @@ rp.add_argument("--pre-dump-mode", choices=['splice', 'read'], default='splice') rp.add_argument("--kdat", - help="Path to criu.kdat, default '/run/criu.kdat'", - default="/run/criu.kdat") + help="Path to criu.kdat, default '/run/criu.kdat'", + default="/run/criu.kdat")
lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) @@ -2680,6 +2735,7 @@ if opts['action'] == 'run': kdat = pathlib.Path(opts['kdat']) if kdat.exists(): kdat.unlink() + for tst in test_classes.values(): tst.available()
diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile index 563b7b1..82348f2 100644 --- a/test/zdtm/customization/Makefile +++ b/test/zdtm/customization/Makefile @@ -3,9 +3,21 @@ LIB := $(LIBDIR)/libzdtmtst.a LDLIBS += $(LIB) CPPFLAGS += -I$(LIBDIR)
-TST = \ - ipc +TST_NOFILE = \ + ipc \ + maps01 \ + maps02 \ + maps04 \ + maps05 \ + maps007 \ + maps008
+TST_FILE = \ + maps00 \ + maps06 \ + maps_file_prot + +TST = $(TST_NOFILE) $(TST_FILE) SRC = $(TST:%=%.c) OBJ = $(SRC:%.c=%.o) DEP = $(SRC:%.c=%.d) @@ -18,9 +30,12 @@ all: $(TST) install: all .PHONY: all install
-$(TST:%=%.pid): %.pid: % +$(TST_NOFILE:%=%.pid): %.pid: % $(<D)/$(<F) --pidfile=$@ --outfile=$<.out
+$(TST_FILE:%=%.pid): %.pid: % + $(<D)/$(<F) --pidfile=$@ --outfile=$<.out --filename=$<.test + %.out: %.pid % -kill -TERM `cat $<`
@@ -43,6 +58,8 @@ wait_stop:
$(TST): | $(LIB)
+maps02: get_smaps_bits.o + %: %.sh cp $< $@ chmod +x $@ diff --git a/test/zdtm/customization/get_smaps_bits.c b/test/zdtm/customization/get_smaps_bits.c new file mode 100644 index 0000000..9253f4d --- /dev/null +++ b/test/zdtm/customization/get_smaps_bits.c @@ -0,0 +1,127 @@ +#include <string.h> +#include <sys/mman.h> +#include "zdtmtst.h" + +#ifndef MAP_HUGETLB +# define MAP_HUGETLB 0x40000 +#endif + +#ifndef MADV_HUGEPAGE +# define MADV_HUGEPAGE 14 +#endif + +#ifndef MADV_NOHUGEPAGE +# define MADV_NOHUGEPAGE 15 +#endif + +#ifndef MADV_DONTDUMP +# define MADV_DONTDUMP 16 +#endif + +static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) +{ + char *tok; + + if (!buf[0]) + return; + + tok = strtok(buf, " \n"); + if (!tok) + return; + +#define _vmflag_match(_t, _s) (_t[0] == _s[0] && _t[1] == _s[1]) + + do { + /* mmap() block */ + if (_vmflag_match(tok, "gd")) + *flags |= MAP_GROWSDOWN; + else if (_vmflag_match(tok, "lo")) + *flags |= MAP_LOCKED; + else if (_vmflag_match(tok, "nr")) + *flags |= MAP_NORESERVE; + else if (_vmflag_match(tok, "ht")) + *flags |= MAP_HUGETLB; + + /* madvise() block */ + if (_vmflag_match(tok, "sr")) + *madv |= (1ul << MADV_SEQUENTIAL); + else if (_vmflag_match(tok, "rr")) + *madv |= (1ul << MADV_RANDOM); + else if (_vmflag_match(tok, "dc")) + *madv |= (1ul << MADV_DONTFORK); + else if (_vmflag_match(tok, "dd")) + *madv |= (1ul << MADV_DONTDUMP); + else if (_vmflag_match(tok, "mg")) + *madv |= (1ul << MADV_MERGEABLE); + else if (_vmflag_match(tok, "hg")) + *madv |= (1ul << MADV_HUGEPAGE); + else if (_vmflag_match(tok, "nh")) + *madv |= (1ul << MADV_NOHUGEPAGE); + + /* + * Anything else is just ignored. 
+ */ + } while ((tok = strtok(NULL, " \n"))); + +#undef _vmflag_match +} + +#define is_hex_digit(c) \ + (((c) >= '0' && (c) <= '9') || \ + ((c) >= 'a' && (c) <= 'f') || \ + ((c) >= 'A' && (c) <= 'F')) + +static int is_vma_range_fmt(char *line, unsigned long *start, unsigned long *end) +{ + char *p = line; + while (*line && is_hex_digit(*line)) + line++; + + if (*line++ != '-') + return 0; + + while (*line && is_hex_digit(*line)) + line++; + + if (*line++ != ' ') + return 0; + + sscanf(p, "%lx-%lx", start, end); + return 1; +} + +int get_smaps_bits(unsigned long where, unsigned long *flags, unsigned long *madv) +{ + unsigned long start = 0, end = 0; + FILE *smaps = NULL; + char buf[1024]; + int found = 0; + + if (!where) + return 0; + + smaps = fopen("/proc/self/smaps", "r"); + if (!smaps) { + pr_perror("Can't open smaps"); + return -1; + } + + while (fgets(buf, sizeof(buf), smaps)) { + is_vma_range_fmt(buf, &start, &end); + + if (!strncmp(buf, "VmFlags: ", 9) && start == where) { + found = 1; + parse_vmflags(buf, flags, madv); + break; + } + } + + fclose(smaps); + + if (!found) { + pr_perror("VmFlags not found for %lx", where); + return -1; + } + + return 0; +} diff --git a/test/zdtm/customization/get_smaps_bits.h b/test/zdtm/customization/get_smaps_bits.h new file mode 100644 index 0000000..ce1070d --- /dev/null +++ b/test/zdtm/customization/get_smaps_bits.h @@ -0,0 +1,6 @@ +#ifndef ZDTM_GET_SMAPS_BITS_H_ +#define ZDTM_GET_SMAPS_BITS_H_ + +extern int get_smaps_bits(unsigned long where, unsigned long *flags, unsigned long *madv); + +#endif /* ZDTM_GET_SMAPS_BITS_H_ */ diff --git a/test/zdtm/customization/ipc.desc b/test/zdtm/customization/ipc.desc index 63df42a..4c127a0 100644 --- a/test/zdtm/customization/ipc.desc +++ b/test/zdtm/customization/ipc.desc @@ -1 +1 @@ -{'flavor': 'h'} +{'arch': 'aarch64', 'flavor': 'h'} diff --git a/test/zdtm/customization/maps00.c b/test/zdtm/customization/maps00.c new file mode 100644 index 0000000..83533f8 --- /dev/null +++ 
b/test/zdtm/customization/maps00.c @@ -0,0 +1,271 @@ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <sys/mman.h> +#include <setjmp.h> +#include <sys/types.h> +#include <sys/stat.h> +#include "zdtmtst.h" + +const char *test_doc = "Create all sorts of maps and compare /proc/pid/maps\n" + "before and after migration\n"; +const char *test_author = "Pavel Emelianov xemul@parallels.com"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +const static int map_prots[] = { + PROT_NONE, + PROT_READ, + PROT_READ | PROT_WRITE, + PROT_READ | PROT_WRITE | PROT_EXEC, +}; +#define NUM_MPROTS sizeof(map_prots) / sizeof(int) +#define RW_PROT(x) ((x) & (PROT_READ | PROT_WRITE)) +#define X_PROT(x) ((x) & PROT_EXEC) + +int check_prot(int src_prot, int dst_prot) +{ + if (RW_PROT(src_prot) != RW_PROT(dst_prot)) + return 0; + /* If exec bit will be enabled may depend on NX capability of CPUs of + * source and destination nodes. 
In any case, migrated mapping should + * not have less permissions than newly created one + ** + * A is a subset of B iff (A & B) == A + */ + return (X_PROT(dst_prot) & X_PROT(src_prot)) == X_PROT(dst_prot); +} + +const static int map_flags[] = { + MAP_PRIVATE, + MAP_SHARED, + MAP_PRIVATE | MAP_ANONYMOUS, + MAP_SHARED | MAP_ANONYMOUS +}; +#define NUM_MFLAGS sizeof(map_flags) / sizeof(int) +#define NUM_MAPS NUM_MPROTS * NUM_MFLAGS +#define ONE_MAP_SIZE 0x2000 + +struct map +{ + int prot; + int prot_real; + int flag; + char filename[256]; + int fd; + void *ptr; +}; + +static void init_map(struct map *map, int prot_no, int flag_no) +{ + map->fd = -1; + map->prot = map_prots[prot_no]; + map->flag = map_flags[flag_no]; +} + +static int make_map(struct map *map) +{ + uint32_t crc; + uint8_t buf[ONE_MAP_SIZE]; + static int i = 0; + + if (!(map->flag & MAP_ANONYMOUS)) { + /* need file */ + if (snprintf(map->filename, sizeof(map->filename), + "%s-%02d", filename, i++) >= sizeof(map->filename)) { + pr_perror("filename %s is too long", filename); + return -1; + } + + map->fd = open(map->filename, O_RDWR | O_CREAT, 0600); + if (map->fd < 0) { + pr_perror("can't open %s", map->filename); + return -1; + } + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + if (write(map->fd, buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("failed to write %s", map->filename); + return -1; + } + } + + map->ptr = mmap(NULL, ONE_MAP_SIZE, map->prot, map->flag, map->fd, 0); + if (map->ptr == MAP_FAILED) { + pr_perror("can't create mapping"); + return -1; + } + + if ((map->flag & MAP_ANONYMOUS) && (map->prot & PROT_WRITE)) { + /* can't fill it with data otherwise */ + crc = ~0; + datagen(map->ptr, ONE_MAP_SIZE, &crc); + } + + test_msg("map: ptr %p flag %8x prot %8x\n", + map->ptr, map->flag, map->prot); + + return 0; +} + +static sigjmp_buf segv_ret; /* we need sig*jmp stuff, otherwise SIGSEGV will reset our handler */ +static void segfault(int signo) +{ + siglongjmp(segv_ret, 1); +} + +/* + * after 
test func should be placed check map, because size of test_func + * is calculated as (check_map-test_func) + */ +int test_func(void) +{ + return 1; +} +static int check_map(struct map *map) +{ + int prot = PROT_WRITE | PROT_READ | PROT_EXEC; + + if (signal(SIGSEGV, segfault) == SIG_ERR) + { + fail("setting SIGSEGV handler failed: %m\n"); + return -1; + } + if (!sigsetjmp(segv_ret, 1)) + { + uint32_t crc = ~0; + if (datachk(map->ptr, ONE_MAP_SIZE, &crc)) /* perform read access */ + if (!(map->flag & MAP_ANONYMOUS) || + (map->prot & PROT_WRITE)) { /* anon maps could only be filled when r/w */ + fail("CRC mismatch: ptr %p flag %8x prot %8x\n", + map->ptr, map->flag, map->prot); + return -1; + } + /* prot |= PROT_READ// need barrier before this line, + because compiler change order commands. + I finded one method: look at next lines*/ + } else + prot &= PROT_WRITE | !PROT_READ | PROT_EXEC; + + if (signal(SIGSEGV, segfault) == SIG_ERR) + { + fail("setting SIGSEGV handler failed: %m\n"); + return -1; + } + + if (!sigsetjmp(segv_ret, 1)) + { + * (int *) (map->ptr) = 1234; /* perform write access */ + } else + prot &= !PROT_WRITE | PROT_READ | PROT_EXEC; + + if (signal(SIGSEGV, segfault) == SIG_ERR) + { + fail("restoring SIGSEGV handler failed: %m\n"); + return -1; + } + + if (!sigsetjmp(segv_ret, 1)) + { + if (map->prot & PROT_WRITE) { + memcpy(map->ptr,test_func, ONE_MAP_SIZE); + __builtin___clear_cache(map->ptr, map->ptr+ONE_MAP_SIZE); + } else { + if (!(map->flag & MAP_ANONYMOUS)) { + uint8_t funlen = (uint8_t *)check_map - (uint8_t *)test_func; + lseek(map->fd,0,SEEK_SET); + if (write(map->fd,test_func,funlen)<funlen) { + pr_perror("failed to write %s", map->filename); + return -1; + } + } + } + if (!(map->flag & MAP_ANONYMOUS) || (map->prot & PROT_WRITE)) { + /* Function body has been copied into the mapping */ + ((int (*)(void))map->ptr)(); /* perform exec access */ + } else { + /* No way to copy function body into mapping, + * clear exec bit from effective 
protection + */ + prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; + } + } else + prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; + + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) + { + fail("restoring SIGSEGV handler failed: %m\n"); + return -1; + } + + return prot; +} + +static void destroy_map(struct map *map) +{ + munmap(map->ptr, ONE_MAP_SIZE); + + if (map->fd >= 0) + { + close(map->fd); + unlink(map->filename); + } +} + + +#define MAPS_LEN 0x10000 + +int main(int argc, char ** argv) +{ + struct map maps[NUM_MAPS] = {}, maps_compare[NUM_MAPS] = {}; + int i, j, k; + test_init(argc, argv); + + k = 0; + for (i = 0; i < NUM_MPROTS; i++) + for (j = 0; j < NUM_MFLAGS; j++) + init_map(maps + k++, i, j); + + for (i = 0; i < NUM_MAPS; i++) + if (make_map(maps + i)) + goto err; + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NUM_MAPS; i++) + if ((maps[i].prot_real=check_map(maps + i))<0) + goto err; + k=0; + for (i = 0; i < NUM_MPROTS; i++) + for (j = 0; j < NUM_MFLAGS; j++) + init_map(maps_compare + k++, i, j); + for (i = 0; i < NUM_MAPS; i++) + if (make_map(maps_compare+ i)) + goto err; + for (i = 0; i < NUM_MAPS; i++) + if ((maps_compare[i].prot_real=check_map(maps_compare + i))<0) + goto err; + for (i = 0; i< NUM_MAPS; i++) + if (!check_prot(maps[i].prot_real, maps_compare[i].prot_real)){ + fail("protection on %i (flag=%d prot=%d) maps has changed (prot=%d(expected %d))", + i, maps[i].flag, maps[i].prot, maps[i].prot_real, maps_compare[i].prot_real); + goto err; + } + + pass(); + + for (i = 0; i < NUM_MAPS; i++) { + destroy_map(maps + i); + destroy_map(maps_compare + i); + } + return 0; + +err: + return 1; +} diff --git a/test/zdtm/customization/maps00.desc b/test/zdtm/customization/maps00.desc new file mode 100644 index 0000000..dad462e --- /dev/null +++ b/test/zdtm/customization/maps00.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'flavor': 'h', 'opts': '--pin-memory --use-fork-pid', 'flags': 'suid', 'cmdline': 'pinmemory max_pin_pid_num'} diff --git 
a/test/zdtm/customization/maps007.c b/test/zdtm/customization/maps007.c new file mode 100644 index 0000000..ee5e7c7 --- /dev/null +++ b/test/zdtm/customization/maps007.c @@ -0,0 +1,178 @@ + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <sys/mman.h> +#include <signal.h> +#include <sys/wait.h> +#include <sys/uio.h> +#include <asm/unistd.h> + +#include "zdtmtst.h" +#include "lock.h" + +#define MAP_SIZE (1UL << 20) +#define MEM_SIZE (1UL << 29) + +const char *test_doc = "create random mappings and touch memory"; + +int sys_process_vm_readv(pid_t pid, void *addr, void *buf, int size) +{ + struct iovec lvec = {.iov_base = buf, .iov_len = size }; + struct iovec rvec = {.iov_base = addr, .iov_len = size }; + /* workaround bug in glibc with sixth argument of syscall */ + char nop[PAGE_SIZE]; + + memset(nop, 0, sizeof(nop)); + + return syscall(__NR_process_vm_readv, pid, &lvec, 1, &rvec, 1, 0); +} + +/* The child follows the parents two steps behind. 
*/ +#define MAX_DELTA 1000 +int main(int argc, char **argv) +{ + void *start, *end, *p; + pid_t child; + struct { + futex_t delta; + futex_t stop; + } *shm; + uint32_t v; + unsigned long long count = 0; + int i; + + test_init(argc, argv); + + /* shared memory for synchronization */ + shm = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (shm == MAP_FAILED) + return -1; + + /* allocate workspace */ + start = mmap(NULL, MEM_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (start == MAP_FAILED) + return -1; + + test_msg("%p-%p\n", start, start + MEM_SIZE); + + end = start + MEM_SIZE; + + v = 0; + futex_set(&shm->delta, v); + futex_set(&shm->stop, 0); + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + while (1) { + void *ret; + unsigned long size; + int prot = PROT_NONE; + + if (child) { + if (!test_go()) + break; + futex_wait_while_gt(&shm->delta, 2 * MAX_DELTA); + futex_inc_and_wake(&shm->delta); + } else { + if (!futex_get(&shm->stop)) + /* shm->delta must be always bigger than MAX_DELTA */ + futex_wait_while_lt(&shm->delta, MAX_DELTA + 2); + else if (count % 100 == 0) + test_msg("count %llu delta %d\n", + count, futex_get(&shm->delta)); /* heartbeat */ + + if (futex_get(&shm->stop) && atomic_get(&shm->delta.raw) == MAX_DELTA) + break; + futex_dec_and_wake(&shm->delta); + } + + count++; + if (child && count == MAX_DELTA + 1) + test_daemon(); + + p = start + ((lrand48() * PAGE_SIZE) % MEM_SIZE); + size = lrand48() * PAGE_SIZE; + size %= (end - p); + size %= MAP_SIZE; + if (size == 0) + size = PAGE_SIZE; + + if (lrand48() % 2) + prot |= PROT_READ; + if (lrand48() % 2) + prot |= PROT_EXEC; + if (lrand48() % 2) + prot |= PROT_WRITE; + + ret = mmap(p, size, prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (ret == MAP_FAILED) { + pr_perror("%p-%p", p, p + size); + goto err; + } + + if (!(prot & PROT_WRITE)) + continue; + + for (i = 0; i < lrand48() % 50; i++) { + char *t = p + 
(lrand48() * PAGE_SIZE) % (size); + t[0] = lrand48(); + } + } + test_msg("count %llu\n", count); + + if (child == 0) { + if (!test_go()) + pr_perror("unexpected state"); + futex_set_and_wake(&shm->stop, 2); + test_waitsig(); + return 0; + } else { + int readable = 0, status = -1; + + /* stop the child */ + futex_set(&shm->stop, 1); + futex_add_and_wake(&shm->delta, MAX_DELTA); + /* wait until the child will be in the same point */ + futex_wait_until(&shm->stop, 2); + + /* check that child and parent have the identical content of memory */ + for (p = start; p < end; p += PAGE_SIZE) { + char rbuf[PAGE_SIZE], lbuf[PAGE_SIZE]; + int rret, lret; + + lret = sys_process_vm_readv(getpid(), p, lbuf, PAGE_SIZE); + rret = sys_process_vm_readv(child, p, rbuf, PAGE_SIZE); + if (rret != lret) { + pr_perror("%p %d %d", p, lret, rret); + goto err; + } + if (lret < 0) + continue; + readable++; + if (memcmp(rbuf, lbuf, PAGE_SIZE)) { + pr_perror("%p", p); + goto err; + } + } + test_msg("readable %d\n", readable); + kill(child, SIGTERM); + wait(&status); + if (status != 0) { + pr_perror("Non-zero exit code: %d", status); + goto err; + } + pass(); + } + + return 0; +err: + kill(child, SIGSEGV); + *((volatile int *) 0) = 0; + return 1; +} diff --git a/test/zdtm/customization/maps007.desc b/test/zdtm/customization/maps007.desc new file mode 100644 index 0000000..9ed7e46 --- /dev/null +++ b/test/zdtm/customization/maps007.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flags': 'suid', 'flavor': 'h', 'cmdline': 'pinmemory max_pin_pid_num'} diff --git a/test/zdtm/customization/maps008.c b/test/zdtm/customization/maps008.c new file mode 100644 index 0000000..7ed7c10 --- /dev/null +++ b/test/zdtm/customization/maps008.c @@ -0,0 +1,514 @@ +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <signal.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include 
<sys/wait.h> +#include <linux/limits.h> +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "ps tree with anon shared vmas for dedup"; + +/* + * 1. ps tree with non triavial anon shmem vmas is created first. + * 2. Each process gets its portion of shmem vmas. + * 3. Each process continuously datagens its portion until + * criu dump is finished. + * 4. Each process datachecks all its shmem portions after restore. + * 5. Contents of anon shmem vmas are checked for equality in + * different processes. + */ + +typedef int (*proc_func_t)(task_waiter_t *setup_waiter); + +static pid_t fork_and_setup(proc_func_t pfunc) +{ + task_waiter_t setup_waiter; + pid_t pid; + + task_waiter_init(&setup_waiter); + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + exit(1); + } + + if (pid == 0) + exit(pfunc(&setup_waiter)); + + task_waiter_wait4(&setup_waiter, pid); + task_waiter_fini(&setup_waiter); + return pid; +} + +static void cont_and_wait_child(pid_t pid) +{ + int status; + + kill(pid, SIGTERM); + waitpid(pid, &status, 0); + if (WIFEXITED(status)) { + if (WEXITSTATUS(status)) + exit(WEXITSTATUS(status)); + } else + exit(1); +} + +static void *mmap_ashmem(size_t size) +{ + void *mem = mmap(NULL, size, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + pr_perror("Can't map shmem %zx", size); + exit(1); + } + return mem; +} + +static void *mmap_proc_mem(pid_t pid, unsigned long addr, + unsigned long size) +{ + int fd; + void *mem; + char path[PATH_MAX]; + + snprintf(path, PATH_MAX, "/proc/%d/map_files/%lx-%lx", + (int)pid, addr, addr + size); + fd = open(path, O_RDWR); + if (fd == -1) { + pr_perror("Can't open file %s", path); + exit(1); + } + + mem = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + close(fd); + if (mem == MAP_FAILED) { + pr_perror("Can't map file %s", path); + exit(1); + } + return mem; +} + +static void check_mem_eq(void *addr1, size_t size1, void *addr2, size_t size2) +{ + unsigned long 
min_size = size1 < size2 ? size1 : size2; + + if (memcmp(addr1, addr2, min_size)) { + pr_err("Mem differs %lx %lx %lx", (unsigned long)addr1, + (unsigned long)addr2, min_size); + exit(1); + } +} + +static void xmunmap(void *map, size_t size) +{ + if (munmap(map, size)) { + pr_err("xmunmap"); + exit(1); + } +} + +static void chk_proc_mem_eq(pid_t pid1, void *addr1, unsigned long size1, + pid_t pid2, void *addr2, unsigned long size2) +{ + void *map1, *map2; + + map1 = mmap_proc_mem(pid1, (unsigned long)addr1, size1); + map2 = mmap_proc_mem(pid2, (unsigned long)addr2, size2); + check_mem_eq(map1, size1, map2, size2); + xmunmap(map1, size1); + xmunmap(map2, size2); +} + +/* + * ps tree: + * proc1_______________ + * | | | + * proc11___ proc12 proc13 + * | | | + * proc111 proc112 proc131 + */ +#define PROC_CNT 7 + +#define PROC1_PGIX 0 +#define PROC11_PGIX 1 +#define PROC12_PGIX 2 +#define PROC13_PGIX 3 +#define PROC111_PGIX 4 +#define PROC112_PGIX 5 +#define PROC131_PGIX 6 +#define ZERO_PGIX 7 +/* unused pgix: 8 */ +#define MEM_PERIOD (9 * PAGE_SIZE) + +struct pstree { + pid_t proc1; + pid_t proc11; + pid_t proc12; + pid_t proc13; + pid_t proc111; + pid_t proc112; + pid_t proc131; +}; +struct pstree *pstree; + +struct test_sync { + futex_t datagen; + futex_t datagen_exit_cnt; +}; +struct test_sync *test_sync; + +size_t mem1_size, mem2_size, mem3_size; +uint8_t *mem1, *mem2, *mem3; + +#define CRC_EPOCH_OFFSET (PAGE_SIZE - sizeof(uint32_t)) + +static void read_each_pg(volatile uint8_t *mem, size_t size, size_t off) +{ + if (!mem) + return; + + while (off < size) { + (mem + off)[0]; + off += MEM_PERIOD; + } +} + +void datagen_each_pg(uint8_t *mem, size_t size, size_t off, uint32_t crc_epoch) +{ + if (!mem) + return; + + while (futex_get(&test_sync->datagen) && (off < size)) { + uint32_t crc = crc_epoch; + + datagen(mem + off, CRC_EPOCH_OFFSET, &crc); + *(uint32_t *)(mem + off + CRC_EPOCH_OFFSET) = crc_epoch; + off += MEM_PERIOD; + } +} + +void datachck_each_pg(uint8_t 
*mem, size_t size, size_t off) +{ + if (!mem) + return; + + while (off < size) { + uint32_t crc = *(uint32_t *)(mem + off + CRC_EPOCH_OFFSET); + + if (datachk(mem + off, CRC_EPOCH_OFFSET, &crc)) + exit(1); + off += MEM_PERIOD; + } +} + +static void mems_read_each_pgix(size_t pgix) +{ + const size_t off = pgix * PAGE_SIZE; + + read_each_pg(mem1, mem1_size, off); + read_each_pg(mem2, mem2_size, off); + read_each_pg(mem3, mem3_size, off); +} + +static void mems_datagen_each_pgix(size_t pgix, uint32_t *crc_epoch) +{ + const size_t off = pgix * PAGE_SIZE; + + ++(*crc_epoch); + datagen_each_pg(mem1, mem1_size, off, *crc_epoch); + datagen_each_pg(mem2, mem2_size, off, *crc_epoch); + datagen_each_pg(mem3, mem3_size, off, *crc_epoch); +} + +static void mems_datachck_each_pgix(size_t pgix) +{ + const size_t off = pgix * PAGE_SIZE; + + datachck_each_pg(mem1, mem1_size, off); + datachck_each_pg(mem2, mem2_size, off); + datachck_each_pg(mem3, mem3_size, off); +} + +static int proc131_func(task_waiter_t *setup_waiter) +{ + uint32_t crc_epoch = 0; + + pstree->proc131 = getpid(); + mems_datagen_each_pgix(PROC131_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC131_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC131_PGIX); + return 0; +} + +static int proc13_func(task_waiter_t *setup_waiter) +{ + size_t MEM1_HOLE_START = 2 * MEM_PERIOD; + size_t MEM1_HOLE_SIZE = 1 * MEM_PERIOD; + uint32_t crc_epoch = 0; + + pstree->proc13 = getpid(); + xmunmap(mem1 + MEM1_HOLE_START, MEM1_HOLE_SIZE); + xmunmap(mem2, mem2_size); + xmunmap(mem3, mem3_size); + mem2 = mem1 + MEM1_HOLE_START + MEM1_HOLE_SIZE; + mem2_size = mem1_size - (mem2 - mem1); + mem1_size = MEM1_HOLE_START; + mem3 = mmap_ashmem(mem3_size); + mems_datagen_each_pgix(PROC13_PGIX, &crc_epoch); + fork_and_setup(proc131_func); + task_waiter_complete_current(setup_waiter); + + 
while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC13_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC13_PGIX); + + chk_proc_mem_eq(pstree->proc13, mem1, mem1_size, + pstree->proc131, mem1, mem1_size); + chk_proc_mem_eq(pstree->proc13, mem2, mem2_size, + pstree->proc131, mem2, mem2_size); + chk_proc_mem_eq(pstree->proc13, mem3, mem3_size, + pstree->proc131, mem3, mem3_size); + + cont_and_wait_child(pstree->proc131); + return 0; +} + +static int proc12_func(task_waiter_t *setup_waiter) +{ + uint32_t crc_epoch = 0; + + pstree->proc12 = getpid(); + mems_datagen_each_pgix(PROC12_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC12_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC12_PGIX); + + return 0; +} + +static int proc111_func(task_waiter_t *setup_waiter) +{ + uint32_t crc_epoch = 0; + + pstree->proc111 = getpid(); + mems_datagen_each_pgix(PROC111_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC111_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC111_PGIX); + return 0; +} + +static int proc112_func(task_waiter_t *setup_waiter) +{ + uint32_t crc_epoch = 0; + + pstree->proc112 = getpid(); + mems_datagen_each_pgix(PROC112_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC112_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC112_PGIX); + return 0; +} + +static int proc11_func(task_waiter_t *setup_waiter) +{ + const size_t MEM3_START_CUT = 1 * MEM_PERIOD; + const size_t MEM3_END_CUT = 2 * MEM_PERIOD; + void 
*mem3_old = mem3; + size_t mem3_size_old = mem3_size; + uint32_t crc_epoch = 0; + uint8_t *proc1_mem3; + + pstree->proc11 = getpid(); + xmunmap(mem3, MEM3_START_CUT); + mem3 += MEM3_START_CUT; + mem3_size -= MEM3_START_CUT; + fork_and_setup(proc111_func); + fork_and_setup(proc112_func); + xmunmap(mem3 + mem3_size - MEM3_END_CUT, MEM3_END_CUT); + mem3_size -= MEM3_END_CUT; + mems_datagen_each_pgix(PROC11_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC11_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC11_PGIX); + + chk_proc_mem_eq(pstree->proc11, mem1, mem1_size, + pstree->proc111, mem1, mem1_size); + chk_proc_mem_eq(pstree->proc11, mem1, mem1_size, + pstree->proc112, mem1, mem1_size); + + chk_proc_mem_eq(pstree->proc11, mem2, mem2_size, + pstree->proc111, mem2, mem2_size); + chk_proc_mem_eq(pstree->proc11, mem2, mem2_size, + pstree->proc112, mem2, mem2_size); + + chk_proc_mem_eq(pstree->proc11, mem3, mem3_size, + pstree->proc111, mem3, mem3_size + MEM3_END_CUT); + chk_proc_mem_eq(pstree->proc11, mem3, mem3_size, + pstree->proc112, mem3, mem3_size + MEM3_END_CUT); + + proc1_mem3 = mmap_proc_mem(pstree->proc1, + (unsigned long)mem3_old, mem3_size_old); + check_mem_eq(mem3, mem3_size, proc1_mem3 + MEM3_START_CUT, mem3_size); + xmunmap(proc1_mem3, mem3_size_old); + + cont_and_wait_child(pstree->proc111); + cont_and_wait_child(pstree->proc112); + return 0; +} + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MB(n) ((n) * (1UL << 20)) + +static int proc1_func(void) +{ + uint32_t crc_epoch = 0; + uint8_t *mem2_old = NULL; + + /* + * Min mem size: + * At least 5 mem periods for mem pages and vma holes. + * At least 1 MB mem size not to test on tiny working set. 
+ */ + mem1_size = MEM_PERIOD * MAX(5, MB(1) / MEM_PERIOD + 1); + mem2_size = mem1_size * 2; + mem3_size = mem2_size * 3; + + futex_set(&test_sync->datagen, 1); + pstree->proc1 = getpid(); + mem1 = mmap_ashmem(mem1_size); + mem2 = mmap_ashmem(mem2_size); + mem3 = mmap_ashmem(mem3_size); + mems_datagen_each_pgix(PROC1_PGIX, &crc_epoch); + mems_read_each_pgix(ZERO_PGIX); + + fork_and_setup(proc11_func); + fork_and_setup(proc12_func); + fork_and_setup(proc13_func); + + xmunmap(mem1, mem1_size); + if (mremap(mem2, mem2_size, mem1_size, MREMAP_MAYMOVE | MREMAP_FIXED, + mem1) != mem1) { + pr_perror("proc1 mem2 remap"); + exit(1); + } + mem2_old = mem2; + mem2 = NULL; + + test_daemon(); + while (test_go()) + mems_datagen_each_pgix(PROC1_PGIX, &crc_epoch); + test_waitsig(); + futex_set(&test_sync->datagen_exit_cnt, 0); + futex_set(&test_sync->datagen, 0); + futex_wait_while(&test_sync->datagen_exit_cnt, PROC_CNT); + + mems_datachck_each_pgix(PROC1_PGIX); + + chk_proc_mem_eq(pstree->proc1, mem1, mem1_size, + pstree->proc11, mem2_old, mem2_size); + chk_proc_mem_eq(pstree->proc1, mem1, mem1_size, + pstree->proc12, mem2_old, mem2_size); + + chk_proc_mem_eq(pstree->proc1, mem3, mem3_size, + pstree->proc12, mem3, mem3_size); + + cont_and_wait_child(pstree->proc11); + cont_and_wait_child(pstree->proc12); + cont_and_wait_child(pstree->proc13); + + pass(); + return 0; +} + +static void kill_pstree_from_root(void) +{ + if (getpid() != pstree->proc1) + return; + + kill(pstree->proc11, SIGKILL); + kill(pstree->proc12, SIGKILL); + kill(pstree->proc13, SIGKILL); + kill(pstree->proc111, SIGKILL); + kill(pstree->proc112, SIGKILL); + kill(pstree->proc131, SIGKILL); +} + +static void sigchld_hand(int signo, siginfo_t *info, void *ucontext) +{ + if (info->si_code != CLD_EXITED) + return; + if (!info->si_status) + return; + + /* + * If we are not ps tree root then propagate child error to parent. 
+ * If we are ps tree root then also call all + * atexit handlers set up by zdtm test framework and this test. + * exit() is not async signal safe but it's ok for testing purposes. + * exit() usage allows us to use very simple error handling + * and pstree killing logic. + */ + exit(info->si_status); +} + +int main(int argc, char **argv) +{ + struct sigaction sa = { + .sa_sigaction = sigchld_hand, + .sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP + }; + sigemptyset(&sa.sa_mask); + + test_init(argc, argv); + + pstree = (struct pstree *)mmap_ashmem(PAGE_SIZE); + test_sync = (struct test_sync *)mmap_ashmem(sizeof(*test_sync)); + + if (sigaction(SIGCHLD, &sa, NULL)) { + pr_perror("SIGCHLD handler setup"); + exit(1); + }; + + if (atexit(kill_pstree_from_root)) { + pr_err("Can't setup atexit cleanup func"); + exit(1); + } + return proc1_func(); +} diff --git a/test/zdtm/customization/maps008.desc b/test/zdtm/customization/maps008.desc new file mode 100644 index 0000000..154ef8c --- /dev/null +++ b/test/zdtm/customization/maps008.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h', 'flags': 'suid', 'cmdline': 'pinmemory max_pin_pid_num'} diff --git a/test/zdtm/customization/maps01.c b/test/zdtm/customization/maps01.c new file mode 100644 index 0000000..119d7a6 --- /dev/null +++ b/test/zdtm/customization/maps01.c @@ -0,0 +1,183 @@ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <linux/limits.h> +#include "zdtmtst.h" + +#define MEM_SIZE (1LU << 30) +#define MEM_OFFSET (1LU << 29) +#define MEM_OFFSET2 (MEM_SIZE - PAGE_SIZE) +#define MEM_OFFSET3 (20LU * PAGE_SIZE) + +const char *test_doc = "Test shared memory"; +const char *test_author = "Andrew Vagin <avagin@openvz.org"; + +int main(int argc, char ** argv) +{ + void *m, *m2, *p, *p2; + char path[PATH_MAX]; 
+ uint32_t crc; + pid_t pid = -1; + int status, fd; + task_waiter_t t; + + test_init(argc, argv); + + task_waiter_init(&t); + + m = mmap(NULL, MEM_SIZE, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + if (m == MAP_FAILED) { + pr_err("Failed to mmap %lu Mb shared anonymous R/W memory\n", + MEM_SIZE >> 20); + goto err; + } + + p = mmap(NULL, MEM_SIZE, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + if (p == MAP_FAILED) { + pr_err("Failed to mmap %ld Mb shared anonymous R/W memory\n", + MEM_SIZE >> 20); + goto err; + } + + p2 = mmap(NULL, MEM_OFFSET, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p2 == MAP_FAILED) { + pr_err("Failed to mmap %lu Mb anonymous memory\n", + MEM_OFFSET >> 20); + goto err; + } + + pid = test_fork(); + if (pid < 0) { + pr_err("Fork failed with %d\n", pid); + goto err; + } else if (pid == 0) { + void *p3; + + p3 = mmap(NULL, MEM_OFFSET3, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p3 == MAP_FAILED) { + pr_err("Failed to mmap %lu Mb anonymous R/W memory\n", + MEM_OFFSET3 >> 20); + goto err; + } + + crc = ~0; + datagen(m + MEM_OFFSET, PAGE_SIZE, &crc); + crc = ~0; + datagen(m + MEM_OFFSET2, PAGE_SIZE, &crc); + crc = ~0; + datagen(p + MEM_OFFSET + MEM_OFFSET3, PAGE_SIZE, &crc); + crc = ~0; + datagen(p + MEM_OFFSET + 2 * MEM_OFFSET3, PAGE_SIZE, &crc); + crc = ~0; + datagen(p + MEM_OFFSET3, PAGE_SIZE, &crc); + crc = ~0; + datagen(p3, PAGE_SIZE, &crc); + + task_waiter_complete(&t, 1); + + test_waitsig(); + + crc = ~0; + status = datachk(m + MEM_OFFSET, PAGE_SIZE, &crc); + if (status) + return 1; + crc = ~0; + status = datachk(m + MEM_OFFSET2, PAGE_SIZE, &crc); + if (status) + return 1; + crc = ~0; + status = datachk(m + PAGE_SIZE, PAGE_SIZE, &crc); + if (status) + return 1; + crc = ~0; + status = datachk(p + MEM_OFFSET + 2 * MEM_OFFSET3, PAGE_SIZE, &crc); + if (status) + return 1; + crc = ~0; + status = datachk(p + MEM_OFFSET3, PAGE_SIZE, &crc); + if (status) + return 1; + crc = 
~0; + status = datachk(p3, PAGE_SIZE, &crc); + if (status) + return 1; + return 0; + } + task_waiter_wait4(&t, 1); + + munmap(p, MEM_OFFSET); + p2 = mremap(p + MEM_OFFSET, MEM_OFFSET, MEM_OFFSET, MREMAP_FIXED | MREMAP_MAYMOVE, p2); + if (p2 == MAP_FAILED) + goto err; + + snprintf(path, PATH_MAX, "/proc/self/map_files/%lx-%lx", + (unsigned long) m, + (unsigned long) m + MEM_SIZE); + fd = open(path, O_RDWR); + if (fd == -1) { + pr_perror("Can't open file %s", path); + goto err; + } + + m2 = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED, fd, MEM_OFFSET3); + if (m2 == MAP_FAILED) { + pr_perror("Can't map file %s", path); + goto err; + } + close(fd); + + munmap(m, PAGE_SIZE); + munmap(m + PAGE_SIZE * 10, PAGE_SIZE); + munmap(m + MEM_OFFSET2, PAGE_SIZE); + + crc = ~0; + datagen(m + PAGE_SIZE, PAGE_SIZE, &crc); + + crc = ~0; + datagen(m2, PAGE_SIZE, &crc); + + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + wait(&status); + if (WIFEXITED(status)) { + if (WEXITSTATUS(status)) + goto err; + } else + goto err; + + crc = ~0; + if (datachk(m + MEM_OFFSET, PAGE_SIZE, &crc)) + goto err; + + crc = ~0; + if (datachk(m2, PAGE_SIZE, &crc)) + goto err; + + crc = ~0; + if (datachk(p2 + MEM_OFFSET3, PAGE_SIZE, &crc)) + goto err; + + pass(); + + return 0; +err: + if (waitpid(-1, NULL, WNOHANG) == 0) { + kill(pid, SIGTERM); + wait(NULL); + } + return 1; +} diff --git a/test/zdtm/customization/maps01.desc b/test/zdtm/customization/maps01.desc new file mode 100644 index 0000000..dad462e --- /dev/null +++ b/test/zdtm/customization/maps01.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'flavor': 'h', 'opts': '--pin-memory --use-fork-pid', 'flags': 'suid', 'cmdline': 'pinmemory max_pin_pid_num'} diff --git a/test/zdtm/customization/maps02.c b/test/zdtm/customization/maps02.c new file mode 100644 index 0000000..eb7c09b --- /dev/null +++ b/test/zdtm/customization/maps02.c @@ -0,0 +1,111 @@ +#include <sys/mman.h> +#include "zdtmtst.h" +#include "get_smaps_bits.h" + +#ifndef 
MADV_DONTDUMP +#define MADV_DONTDUMP 16 +#endif + +const char *test_doc = "Test shared memory with advises"; +const char *test_author = "Cyrill Gorcunov gorcunov@openvz.org"; + +struct mmap_data { + void *start; + unsigned long orig_flags; + unsigned long orig_madv; + unsigned long new_flags; + unsigned long new_madv; +}; + +#define MEM_SIZE (8192) + +static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) +{ + m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, + flags, -1, 0); + if (m->start == MAP_FAILED) { + pr_perror("mmap failed"); + return -1; + } + + if (madvise(m->start, MEM_SIZE, adv)) { + if (errno == EINVAL) { + test_msg("madvise failed, no kernel support\n"); + munmap(m->start, MEM_SIZE); + *m = (struct mmap_data){ }; + } else { + pr_perror("madvise failed"); + return -1; + } + } + + return 0; +} + +int main(int argc, char **argv) +{ + struct mmap_data m[5] = { }; + size_t i; + + test_init(argc, argv); + + test_msg("Alloc growsdown\n"); + if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) + return -1; + + test_msg("Alloc locked/sequential\n"); + if (alloc_anon_mmap(&m[1], MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, MADV_SEQUENTIAL)) + return -1; + + test_msg("Alloc noreserve/dontdump\n"); + if (alloc_anon_mmap(&m[2], MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, MADV_DONTDUMP)) + return -1; + + test_msg("Alloc hugetlb/hugepage\n"); + if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) + return -1; + + test_msg("Alloc dontfork/random|mergeable\n"); + if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, MADV_MERGEABLE)) + return -1; + + test_msg("Fetch existing flags/adv\n"); + for (i = 0; i < sizeof(m)/sizeof(m[0]); i++) { + if (get_smaps_bits((unsigned long)m[i].start, + &m[i].orig_flags, + &m[i].orig_madv)) + return -1; + } + + test_daemon(); + test_waitsig(); + + test_msg("Fetch restored flags/adv\n"); + for (i = 0; i < sizeof(m)/sizeof(m[0]); i++) { + if (get_smaps_bits((unsigned 
long)m[i].start, + &m[i].new_flags, + &m[i].new_madv)) + return -1; + + if (m[i].orig_flags != m[i].new_flags) { + pr_perror("Flags are changed %lx %lx -> %lx (%zu)", + (unsigned long)m[i].start, + m[i].orig_flags, m[i].new_flags, i); + fail(); + return -1; + } + + if (m[i].orig_madv != m[i].new_madv) { + pr_perror("Madvs are changed %lx %lx -> %lx (%zu)", + (unsigned long)m[i].start, + m[i].orig_madv, m[i].new_madv, i); + fail(); + return -1; + } + + } + + pass(); + + return 0; +} diff --git a/test/zdtm/customization/maps02.desc b/test/zdtm/customization/maps02.desc new file mode 100644 index 0000000..f14d661 --- /dev/null +++ b/test/zdtm/customization/maps02.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h', 'cmdline': 'pinmemory max_pin_pid_num'} diff --git a/test/zdtm/customization/maps04.c b/test/zdtm/customization/maps04.c new file mode 100644 index 0000000..780c566 --- /dev/null +++ b/test/zdtm/customization/maps04.c @@ -0,0 +1,57 @@ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <linux/limits.h> +#include "zdtmtst.h" + +#define MEM_SIZE (1L << 29) + +const char *test_doc = "Test big mappings"; +const char *test_author = "Andrew Vagin <avagin@openvz.org"; + +int main(int argc, char ** argv) +{ + void *m; + uint32_t crc; + int i; + + test_init(argc, argv); + + m = mmap(NULL, MEM_SIZE, PROT_WRITE | PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (m == MAP_FAILED) { + fail(); + return 1; + } + + crc = ~0; + datagen(m, MEM_SIZE, &crc); + + for (i = 0; i < MEM_SIZE / (1<<20); i++) + if (mprotect(m + (lrand48() * PAGE_SIZE % MEM_SIZE), PAGE_SIZE, PROT_NONE)) { + pr_perror("mprotect"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (mprotect(m, MEM_SIZE, PROT_READ)) + pr_perror("mprotect"); + + crc = ~0; + if (datachk(m, 
MEM_SIZE, &crc)) + fail("Mem corrupted"); + else + pass(); + + return 0; +} diff --git a/test/zdtm/customization/maps04.desc b/test/zdtm/customization/maps04.desc new file mode 100644 index 0000000..2db7603 --- /dev/null +++ b/test/zdtm/customization/maps04.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h', 'timeout': '60', 'cmdline': 'pinmemory max_pin_pid_num'} diff --git a/test/zdtm/customization/maps05.c b/test/zdtm/customization/maps05.c new file mode 100644 index 0000000..faa09ee --- /dev/null +++ b/test/zdtm/customization/maps05.c @@ -0,0 +1,91 @@ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <sys/mman.h> +#include <setjmp.h> +#include <sys/types.h> +#include <sys/stat.h> +#include "zdtmtst.h" + +const char *test_doc = "Create a bunch of small VMAs and test they survive transferring\n"; +const char *test_author = "Cyrill Gorcunov gorcunov@openvz.org"; + +#define NR_MAPS 4096 + +#define NR_MAPS_1 (NR_MAPS + 0) +#define NR_MAPS_2 (NR_MAPS + 1) + +#define MAPS_SIZE_1 (140 << 10) +#define MAPS_SIZE_2 (8192) + +int main(int argc, char *argv[]) +{ + void *map[NR_MAPS + 2] = { }, *addr; + size_t i, summary; + + test_init(argc, argv); + + summary = NR_MAPS * 2 * 4096 + MAPS_SIZE_1 + MAPS_SIZE_2 + (1 << 20); + + addr = mmap(NULL, summary, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't mmap"); + return 1; + } + munmap(addr, summary); + + for (i = 0; i < NR_MAPS; i++) { + map[i] = mmap(i > 0 ? 
map[i - 1] + 8192 : addr, 4096, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map[i] == MAP_FAILED) { + pr_perror("Can't mmap"); + return 1; + } else { + /* Dirtify it */ + int *v = (void *)map[i]; + *v = i; + } + } + + map[NR_MAPS_1] = mmap(map[NR_MAPS_1 - 1] + 8192, MAPS_SIZE_1, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); + if (map[NR_MAPS_1] == MAP_FAILED) { + pr_perror("Can't mmap"); + return 1; + } else { + /* Dirtify it */ + int *v = (void *)map[NR_MAPS_1]; + *v = i; + test_msg("map-1: %p %p\n", map[NR_MAPS_1], map[NR_MAPS_1] + MAPS_SIZE_1); + } + + map[NR_MAPS_2] = mmap(map[NR_MAPS_1] + MAPS_SIZE_1, MAPS_SIZE_2, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); + if (map[NR_MAPS_2] == MAP_FAILED) { + pr_perror("Can't mmap"); + return 1; + } else { + /* Dirtify it */ + int *v = (void *)map[NR_MAPS_2]; + *v = i; + test_msg("map-2: %p %p\n", map[NR_MAPS_2], map[NR_MAPS_2] + MAPS_SIZE_2); + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NR_MAPS; i++) { + int *v = (void *)map[i]; + + if (*v != i) { + fail("Data corrupted at page %lu", (unsigned long)i); + return 1; + } + } + + pass(); + return 0; +} diff --git a/test/zdtm/customization/maps05.desc b/test/zdtm/customization/maps05.desc new file mode 100644 index 0000000..f14d661 --- /dev/null +++ b/test/zdtm/customization/maps05.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h', 'cmdline': 'pinmemory max_pin_pid_num'} diff --git a/test/zdtm/customization/maps06.c b/test/zdtm/customization/maps06.c new file mode 100644 index 0000000..7480d6b --- /dev/null +++ b/test/zdtm/customization/maps06.c @@ -0,0 +1,70 @@ +#include "zdtmtst.h" +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +const char *test_doc = "Create a lot of file vma-s"; +const char *test_author = "Andrei Vagin avagin@openvz.org"; + +char *filename; 
+TEST_OPTION(filename, string, "file name", 1); +/* Interleave one-page file-backed and anonymous VMAs, then verify their first words after test_waitsig(). */ +int main(int argc, char ** argv) +{ + void *start; + int fd, i; + int ps = sysconf(_SC_PAGESIZE); + int test_size; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT, 0666); + if (fd < 0) + return 1; + + /* Size the file to one page so each one-page file mapping below is fully backed. */ + if (ftruncate(fd, ps)) + return 1; + + /* Use fewer iterations when the page size is not 4K. */ + test_size = (ps == 0x1000) ? 10240 : 512; + + start = mmap(0, ps * test_size * 4, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); /* anonymous mapping: fd is -1 */ + if (start == MAP_FAILED) + return 1; + + for (i = 0; i < test_size; i++) { + int *addr; + addr = mmap(start + i * 3 * ps, ps, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_FILE | MAP_FIXED, fd, 0); + if (addr == MAP_FAILED) + return 1; + addr[0] = i * 2; + addr = mmap(start + (i * 3 + 1) * ps, ps, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (addr == MAP_FAILED) + return 1; + addr[0] = i; + } + + test_daemon(); + + test_waitsig(); + + for (i = 0; i < test_size; i++) { + int *file_map = start + i * 3 * ps; + int *anon_map = start + (i * 3 + 1) * ps; + /* Stop at the first mismatch so pass() is never reported after fail(). */ + if (file_map[0] != i * 2 || anon_map[0] != i) { + fail(); + return 1; + } + } + + pass(); + + return 0; +} diff --git a/test/zdtm/customization/maps06.desc b/test/zdtm/customization/maps06.desc new file mode 100644 index 0000000..f14d661 --- /dev/null +++ b/test/zdtm/customization/maps06.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h', 'cmdline': 'pinmemory max_pin_pid_num'} diff --git a/test/zdtm/customization/maps_file_prot.c b/test/zdtm/customization/maps_file_prot.c new file mode 100644 index 0000000..3b28c1f --- /dev/null +++ b/test/zdtm/customization/maps_file_prot.c @@ -0,0 +1,53 @@ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <linux/limits.h> +#include "zdtmtst.h" + +const char *test_doc = "Test mappings of same file with different prot"; +const char *test_author = "Jamie Liu jamieliu@google.com"; + +char 
*filename; +TEST_OPTION(filename, string, "file name", 1); + +#define die(fmt, arg...) do { pr_perror(fmt, ## arg); return 1; } while (0) + +int main(int argc, char ** argv) +{ + void *ro_map, *rw_map; + int fd; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT, 0644); + if (fd < 0) + die("open failed"); + if (ftruncate(fd, 2 * PAGE_SIZE)) + die("ftruncate failed"); + + ro_map = mmap(NULL, 2 * PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); + if (ro_map == MAP_FAILED) + die("mmap failed"); + rw_map = ro_map + PAGE_SIZE; + if (mprotect(rw_map, PAGE_SIZE, PROT_READ | PROT_WRITE)) + die("mprotect failed"); + + close(fd); + + test_daemon(); + test_waitsig(); + + /* Check that rw_map is still writeable */ + *(volatile char *)rw_map = 1; + + if (mprotect(ro_map, PAGE_SIZE, PROT_READ | PROT_WRITE)) { + fail("mprotect after restore failed"); + return 1; + } + + pass(); + return 0; +} diff --git a/test/zdtm/customization/maps_file_prot.desc b/test/zdtm/customization/maps_file_prot.desc new file mode 100644 index 0000000..0ec4023 --- /dev/null +++ b/test/zdtm/customization/maps_file_prot.desc @@ -0,0 +1 @@ +{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h'} diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 5495d61..186b360 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -9,6 +9,7 @@ #include <sys/stat.h> #include <fcntl.h> #include <errno.h> +#include <string.h>
#ifndef CLONE_NEWTIME #define CLONE_NEWTIME 0x00000080 /* New time namespace */ @@ -73,13 +74,23 @@ int main(int argc, char **argv) { pid_t pid; int status; + char *val = getenv("ZDTM_NO_PID_NS"); + int flags = CLONE_NEWNS | CLONE_NEWNET | CLONE_NEWIPC; + + /* + * Some customizing mechanisms don't support pid namespaces, so every + * customizing feature testcase sets the 'ZDTM_NO_PID_NS' environment + * variable to "1"; CLONE_NEWPID is skipped in that case. + */ + if (val == NULL || strcmp(val, "1") != 0) + flags |= CLONE_NEWPID;
/* * pidns is used to avoid conflicts * mntns is used to mount /proc * net is used to avoid conflicts of parasite sockets */ - if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) + if (unshare(flags)) return 1; pid = fork(); if (pid == 0) {