kerndat: detect if the system supports clone3() with set_tid

Linux kernel 5.4 extends clone3() with set_tid to allow processes to
specify the PID of a newly created process. This commit introduces
detection of whether the clone3() syscall exists and whether it supports
set_tid.

This first implementation is X86_64 only.

Signed-off-by: Adrian Reber <areber@redhat.com>
This commit is contained in:
Adrian Reber 2019-12-15 20:38:46 +00:00 committed by Andrei Vagin
parent 8fea2647b6
commit ca02c47075
10 changed files with 93 additions and 0 deletions

View file

@ -115,3 +115,4 @@ ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *t
fsopen 430 430 (char *fsname, unsigned int flags)
fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux)
fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags)
clone3 435 435 (struct clone_args *uargs, size_t size)

View file

@ -111,3 +111,4 @@ __NR_ppoll 281 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct
__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags)
__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux)
__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags)
__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)

View file

@ -111,3 +111,4 @@ __NR_ppoll 302 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct
__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags)
__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux)
__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags)
__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)

View file

@ -99,3 +99,4 @@ __NR_ppoll 309 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct
__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags)
__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux)
__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags)
__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)

View file

@ -110,3 +110,4 @@ __NR_ppoll 271 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struc
__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags)
__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux)
__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags)
__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)

View file

@ -39,6 +39,7 @@ struct msghdr;
struct rusage;
struct iocb;
struct pollfd;
struct clone_args;
typedef unsigned long aio_context_t;

View file

@ -1224,6 +1224,16 @@ static int check_uffd_noncoop(void)
return 0;
}
/*
 * Feature check for clone3() with set_tid.
 *
 * Reads the cached kdat.has_clone3_set_tid flag. Returns 0 when the flag
 * is set; otherwise emits a warning and returns -1.
 */
static int check_clone3_set_tid(void)
{
	if (kdat.has_clone3_set_tid)
		return 0;

	pr_warn("clone3() with set_tid not supported\n");
	return -1;
}
static int check_can_map_vdso(void)
{
if (kdat_can_map_vdso() == 1)
@ -1373,6 +1383,7 @@ int cr_check(void)
ret |= check_sk_netns();
ret |= check_kcmp_epoll();
ret |= check_net_diag_raw();
ret |= check_clone3_set_tid();
}
/*
@ -1476,6 +1487,7 @@ static struct feature_list feature_list[] = {
{ "link_nsid", check_link_nsid},
{ "kcmp_epoll", check_kcmp_epoll},
{ "external_net_ns", check_external_net_ns},
{ "clone3_set_tid", check_clone3_set_tid},
{ NULL, NULL },
};

View file

@ -66,6 +66,7 @@ struct kerndat_s {
bool has_inotify_setnextwd;
bool has_kcmp_epoll_tfd;
bool has_fsopen;
bool has_clone3_set_tid;
};
extern struct kerndat_s kdat;

33
criu/include/sched.h Normal file
View file

@ -0,0 +1,33 @@
#ifndef __CR_SCHED_H__
#define __CR_SCHED_H__

/* uintptr_t is used by the conversion macros below. */
#include <stdint.h>
#include <linux/types.h>

/*
 * Helpers to carry a pointer in a 64-bit integer field and back.
 * Going through uintptr_t keeps the conversion well defined when
 * pointers are narrower than 64 bits. Every macro argument is
 * parenthesised so that compound expressions, e.g.
 * u64_to_ptr(base | offset), are converted as a whole.
 */
#ifndef ptr_to_u64
#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
#endif
#ifndef u64_to_ptr
#define u64_to_ptr(x) ((void *)(uintptr_t)(x))
#endif

/*
 * This structure is needed by clone3(). The kernel
 * calls it 'struct clone_args'. As CRIU will always
 * need at least this part of the structure (VER1)
 * to be able to test if clone3() with set_tid works,
 * the structure is defined here as 'struct _clone_args'.
 *
 * The structure is handed to the kernel as is, so the
 * member order and sizes must not be changed.
 */
struct _clone_args {
	__aligned_u64 flags;
	__aligned_u64 pidfd;
	__aligned_u64 child_tid;
	__aligned_u64 parent_tid;
	__aligned_u64 exit_signal;
	__aligned_u64 stack;
	__aligned_u64 stack_size;
	__aligned_u64 tls;
	__aligned_u64 set_tid;
	__aligned_u64 set_tid_size;
};

#endif /* __CR_SCHED_H__ */

View file

@ -41,6 +41,7 @@
#include "uffd.h"
#include "vdso.h"
#include "kcmp.h"
#include "sched.h"
struct kerndat_s kdat = {
};
@ -986,6 +987,44 @@ static int kerndat_tun_netns(void)
return check_tun_netns_cr(&kdat.tun_ns);
}
/*
 * Probe whether the running kernel provides clone3() and understands
 * the set_tid member of its argument structure. The verdict is stored
 * in kdat.has_clone3_set_tid.
 *
 * NOTE(review): the return type is bool, yet the value is used as a
 * status code: 0 when the probe reached a verdict, non-zero (-1
 * converts to true) when clone3() behaved in a way the probe does not
 * understand. The caller assigns the result to an int; consider
 * switching the return type to int.
 */
static bool kerndat_has_clone3_set_tid(void)
{
#ifndef CONFIG_X86_64
	/*
	 * Currently the CRIU PIE assembler clone3() wrapper is
	 * only implemented for X86_64.
	 */
	kdat.has_clone3_set_tid = false;
	return 0;
#else
	struct _clone_args args = {};
	pid_t pid;

	/*
	 * The request is deliberately bogus: the call is only made to
	 * see which error the kernel answers with, not to create a
	 * process.
	 */
	args.set_tid = -1;

	/*
	 * On a system without clone3() this will return ENOSYS.
	 * On a system with clone3() but without set_tid this
	 * will return E2BIG.
	 * On a system with clone3() and set_tid it will return
	 * EINVAL.
	 */
	pid = syscall(__NR_clone3, &args, sizeof(args));
	if (pid != -1) {
		/* errno is meaningless after a successful call. */
		pr_err("Unexpected success: clone3() returned %d\n", pid);
		return -1;
	}

	switch (errno) {
	case ENOSYS:
	case E2BIG:
		kdat.has_clone3_set_tid = false;
		return 0;
	case EINVAL:
		kdat.has_clone3_set_tid = true;
		return 0;
	default:
		pr_perror("Unexpected error from clone3");
		return -1;
	}
#endif
}
int kerndat_init(void)
{
int ret;
@ -1059,6 +1098,8 @@ int kerndat_init(void)
ret = has_kcmp_epoll_tfd();
if (!ret)
ret = kerndat_has_fsopen();
if (!ret)
ret = kerndat_has_clone3_set_tid();
kerndat_lsm();
kerndat_mmap_min_addr();