From ca02c47075b69c3387d03ae2a09ab9499d5bd27f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 15 Dec 2019 20:38:46 +0000 Subject: [PATCH] kerndat: detect if system supports clone3() with set_tid Linux kernel 5.5 extends clone3() with set_tid to allow processes to specify the PID of a newly created process. This introduces detection of the clone3() syscall and whether set_tid is supported. This first implementation is X86_64 only. Signed-off-by: Adrian Reber --- .../arch/arm/plugins/std/syscalls/syscall.def | 1 + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + .../plugins/std/syscalls/syscall-s390.tbl | 1 + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + .../plugins/include/uapi/std/syscall-types.h | 1 + criu/cr-check.c | 12 ++++++ criu/include/kerndat.h | 1 + criu/include/sched.h | 33 +++++++++++++++ criu/kerndat.c | 41 +++++++++++++++++++ 10 files changed, 93 insertions(+) create mode 100644 criu/include/sched.h diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index d5bdc677e..f7ebc8527 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -115,3 +115,4 @@ ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *t fsopen 430 430 (char *fsname, unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) +clone3 435 435 (struct clone_args *uargs, size_t size) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index 4e283d5e9..1afaf1e70 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -111,3 +111,4 @@ __NR_ppoll 281 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct __NR_fsopen 
430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index fd48e3950..ae6fdb5f8 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -111,3 +111,4 @@ __NR_ppoll 302 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index 038aeb4f7..7a487110d 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -99,3 +99,4 @@ __NR_ppoll 309 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 215f32026..6667c07db 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -110,3 +110,4 @@ __NR_ppoll 271 sys_ppoll 
(struct pollfd *fds, unsigned int nfds, const struc __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) diff --git a/compel/plugins/include/uapi/std/syscall-types.h b/compel/plugins/include/uapi/std/syscall-types.h index 57865e741..031e773bb 100644 --- a/compel/plugins/include/uapi/std/syscall-types.h +++ b/compel/plugins/include/uapi/std/syscall-types.h @@ -39,6 +39,7 @@ struct msghdr; struct rusage; struct iocb; struct pollfd; +struct clone_args; typedef unsigned long aio_context_t; diff --git a/criu/cr-check.c b/criu/cr-check.c index 17dd29b42..80df3f7cd 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1224,6 +1224,16 @@ static int check_uffd_noncoop(void) return 0; } +static int check_clone3_set_tid(void) +{ + /* Fail the check when the kernel lacks clone3() set_tid. */ + if (kdat.has_clone3_set_tid) + return 0; + + pr_warn("clone3() with set_tid not supported\n"); + return -1; +} + static int check_can_map_vdso(void) { if (kdat_can_map_vdso() == 1) @@ -1373,6 +1383,7 @@ int cr_check(void) ret |= check_sk_netns(); ret |= check_kcmp_epoll(); ret |= check_net_diag_raw(); + ret |= check_clone3_set_tid(); } /* @@ -1476,6 +1487,7 @@ static struct feature_list feature_list[] = { { "link_nsid", check_link_nsid}, { "kcmp_epoll", check_kcmp_epoll}, { "external_net_ns", check_external_net_ns}, + { "clone3_set_tid", check_clone3_set_tid}, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 771195860..27c870bb8 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -66,6 +66,7 @@ struct kerndat_s { bool has_inotify_setnextwd; bool has_kcmp_epoll_tfd; bool has_fsopen; + bool has_clone3_set_tid; }; extern struct kerndat_s kdat; diff --git a/criu/include/sched.h b/criu/include/sched.h new file mode 100644 index 
000000000..78f65e3b7 --- /dev/null +++ b/criu/include/sched.h @@ -0,0 +1,33 @@ +#ifndef __CR_SCHED_H__ +#define __CR_SCHED_H__ + +#include <linux/types.h> + +#ifndef ptr_to_u64 +#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) +#endif +#ifndef u64_to_ptr +#define u64_to_ptr(x) ((void *)(uintptr_t)(x)) +#endif + +/* + * This structure is needed by clone3(). The kernel + * calls it 'struct clone_args'. As CRIU will always + * need at least this part of the structure (VER1) + * to be able to test if clone3() with set_tid works, + * the structure is defined here as 'struct _clone_args'. + */ + +struct _clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; +}; +#endif /* __CR_SCHED_H__ */ diff --git a/criu/kerndat.c b/criu/kerndat.c index d1afde71d..0772828bc 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -41,6 +41,7 @@ #include "uffd.h" #include "vdso.h" #include "kcmp.h" +#include "sched.h" struct kerndat_s kdat = { }; @@ -986,6 +987,44 @@ static int kerndat_tun_netns(void) return check_tun_netns_cr(&kdat.tun_ns); } +static int kerndat_has_clone3_set_tid(void) +{ + pid_t pid; + struct _clone_args args = {}; + +#ifndef CONFIG_X86_64 + /* + * Currently the CRIU PIE assembler clone3() wrapper is + * only implemented for X86_64. + */ + kdat.has_clone3_set_tid = false; + return 0; +#endif + + args.set_tid = -1; + /* + * Probe with set_tid set but set_tid_size left at 0. + * Without clone3() this fails with ENOSYS, with clone3() + * but without set_tid with E2BIG, and with set_tid support + * with EINVAL. The result is stored in kdat; any other + * outcome is reported as an error (-1). 
+ */ + pid = syscall(__NR_clone3, &args, sizeof(args)); + + if (pid == -1 && (errno == ENOSYS || errno == E2BIG)) { + kdat.has_clone3_set_tid = false; + return 0; + } + if (pid == -1 && errno == EINVAL) { + kdat.has_clone3_set_tid = true; + } else { + pr_perror("Unexpected error from clone3"); + return -1; + } + + return 0; +} + int kerndat_init(void) { int ret; @@ -1059,6 +1098,8 @@ int kerndat_init(void) ret = has_kcmp_epoll_tfd(); if (!ret) ret = kerndat_has_fsopen(); + if (!ret) + ret = kerndat_has_clone3_set_tid(); kerndat_lsm(); kerndat_mmap_min_addr();