cr-service: add pidfd_store_sk option to rpc.proto

The pidfd_store_sk option will be used later to store tasks' pidfds
between pre-dumps so that pid reuse can be detected reliably.
pidfd_store_sk should be the fd of a connectionless unix socket.

init_pidfd_store_sk() steals the socket from the RPC client using
pidfd_getfd, checks that it is a connectionless unix socket, and
checks whether it has already been initialized (a socket that has
not been initialized yet is still unnamed).
If it is not initialized, the socket is first bound to an abstract
name (a combination of the real pid/fd to avoid overlap), then it is
connected to itself, which allows us to store the pidfds in the
receive queue of the socket (this is similar to how fdstore_init()
works).

v2:
	- avoid close(pidfd) overriding errno of SYS_pidfd_open in
	  init_pidfd_store_sk()
	- close pidfd_store_sk before stealing a new socket, because
	  one may be left over from a previous iteration

Signed-off-by: Zeyad Yasser <zeyady98@gmail.com>
This commit is contained in:
Zeyad Yasser 2021-03-16 14:21:19 +02:00 committed by Andrei Vagin
parent a9508c9864
commit e3c9c3429a
6 changed files with 117 additions and 0 deletions

View file

@ -40,6 +40,7 @@
#include "proc_parse.h"
#include "common/scm.h"
#include "uffd.h"
#include "mem.h"
#include "setproctitle.h"
@ -688,6 +689,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
}
}
if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk))
goto err;
if (req->orphan_pts_master)
opts.orphan_pts_master = true;

View file

@ -50,4 +50,6 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta);
int unmap_guard_pages(struct pstree_item *t);
int prepare_mappings(struct pstree_item *t);
bool should_dump_page(VmaEntry *vmae, u64 pme);
/*
 * Take over socket descriptor fd of process pid and set it up as the
 * pidfd store socket. Returns 0 on success and -1 on failure.
 */
int init_pidfd_store_sk(pid_t pid, int fd);
#endif /* __CR_MEM_H__ */

View file

@ -30,10 +30,107 @@
#include "fault-injection.h"
#include "prctl.h"
#include "compel/infect-util.h"
#include "compel/plugins/std/syscall-codes.h"
#include "protobuf.h"
#include "images/pagemap.pb-c.h"
static int pidfd_store_sk = -1;

/*
 * First-time setup of the pidfd store socket: enlarge its buffers, bind
 * it to an abstract name built from pid/sk and connect it to itself, so
 * that whatever is sent to the socket lands in its own receive queue.
 * This mirrors what fdstore_init() does.
 *
 * @addr must already carry the socket family (as filled by getsockname()).
 *
 * Returns 0 on success, -1 on failure (the socket is left for the caller
 * to close).
 */
static int pidfd_store_sk_bind_self(struct sockaddr_un *addr, pid_t pid, int sk)
{
	/* In kernel a bufsize has type int and a value is doubled. */
	uint32_t bufsz[2] = { INT_MAX / 2, INT_MAX / 2 };
	unsigned int namelen;

	if (setsockopt(pidfd_store_sk, SOL_SOCKET, SO_SNDBUFFORCE, &bufsz[0], sizeof(bufsz[0])) < 0 ||
	    setsockopt(pidfd_store_sk, SOL_SOCKET, SO_RCVBUFFORCE, &bufsz[1], sizeof(bufsz[1])) < 0) {
		pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE");
		return -1;
	}

	/*
	 * The leading 'X' is only a placeholder: it is overwritten with a
	 * NUL byte below, which puts the name into the abstract namespace.
	 */
	namelen = snprintf(addr->sun_path, sizeof(addr->sun_path), "X/criu-pidfd-store-%d-%d", pid, sk);
	namelen += sizeof(addr->sun_family);
	addr->sun_path[0] = 0;

	if (bind(pidfd_store_sk, (struct sockaddr *)addr, namelen)) {
		pr_perror("Unable to bind a socket");
		return -1;
	}

	if (connect(pidfd_store_sk, (struct sockaddr *)addr, namelen)) {
		pr_perror("Unable to connect a socket");
		return -1;
	}

	return 0;
}

/*
 * Steal descriptor @sk from process @pid (the RPC client) and keep it in
 * pidfd_store_sk. Any socket kept from an earlier call is closed first.
 *
 * The stolen descriptor has to be an AF_UNIX SOCK_DGRAM socket. If it is
 * still unnamed it gets initialized (bound and connected to itself);
 * an already named socket is used as is.
 *
 * Returns 0 on success. On failure returns -1 and pidfd_store_sk is
 * closed.
 */
int init_pidfd_store_sk(pid_t pid, int sk)
{
	struct sockaddr_un addr;
	unsigned int addrlen = sizeof(addr);
	int type;
	socklen_t optlen = sizeof(type);
	int pidfd;

	/* Both syscalls are required to grab the client's descriptor. */
	if (!kdat.has_pidfd_open) {
		pr_err("pidfd_open syscall is not supported\n");
		return -1;
	}

	if (!kdat.has_pidfd_getfd) {
		pr_err("pidfd_getfd syscall is not supported\n");
		return -1;
	}

	/* Steal pidfd store socket from RPC client */
	pidfd = syscall(SYS_pidfd_open, pid, 0);
	if (pidfd == -1) {
		pr_perror("Can't get pidfd of (pid: %d)", pid);
		goto err;
	}

	/* Drop whatever was left from a previous iteration. */
	close_safe(&pidfd_store_sk);

	pidfd_store_sk = syscall(SYS_pidfd_getfd, pidfd, sk, 0);
	if (pidfd_store_sk == -1) {
		/* Report first, so that close() can't clobber errno. */
		pr_perror("Can't steal fd %d using pidfd_getfd", sk);
		close(pidfd);
		goto err;
	}
	close(pidfd);

	/* The stolen descriptor must be a datagram socket ... */
	if (getsockopt(pidfd_store_sk, SOL_SOCKET, SO_TYPE, &type, &optlen) != 0) {
		pr_perror("Can't get socket type (fd: %d)", pidfd_store_sk);
		goto err;
	}

	if (type != SOCK_DGRAM) {
		pr_err("Pidfd store socket must be of type SOCK_DGRAM\n");
		goto err;
	}

	/* ... and it must belong to the unix domain. */
	if (getsockname(pidfd_store_sk, (struct sockaddr *)&addr, &addrlen) != 0) {
		pr_perror("Can't get socket bound name (fd: %d)", pidfd_store_sk);
		goto err;
	}

	if (addr.sun_family != AF_UNIX) {
		pr_err("Pidfd store socket must be AF_UNIX\n");
		goto err;
	}

	/*
	 * An address consisting of nothing but the family field means the
	 * socket is still unnamed, i.e. nobody has initialized it yet.
	 * Once it has been bound to its abstract name the reported length
	 * is larger, so the setup below is done only once.
	 */
	if (addrlen == sizeof(sa_family_t)) {
		if (pidfd_store_sk_bind_self(&addr, pid, sk) != 0)
			goto err;
	}

	return 0;

err:
	close_safe(&pidfd_store_sk);
	return -1;
}
static int task_reset_dirty_track(int pid)
{
int ret;

View file

@ -129,6 +129,7 @@ message criu_opts {
optional bool tls_no_cn_verify = 59;
optional string cgroup_yard = 60;
optional criu_pre_dump_mode pre_dump_mode = 61 [default = SPLICE];
optional int32 pidfd_store_sk = 62;
/* optional bool check_mounts = 128; */
}

View file

@ -1782,3 +1782,14 @@ int criu_get_orphan_pts_master_fd(void)
{
return orphan_pts_master_fd;
}
/*
 * Store sk as the pidfd_store_sk value in the RPC options carried by
 * opts and mark that optional field as present.
 */
void criu_local_set_pidfd_store_sk(criu_opts *opts, int sk)
{
	opts->rpc->pidfd_store_sk = sk;
	opts->rpc->has_pidfd_store_sk = true;
}
/*
 * Set the pidfd store socket on global_opts by forwarding sk to
 * criu_local_set_pidfd_store_sk().
 */
void criu_set_pidfd_store_sk(int sk)
{
criu_local_set_pidfd_store_sk(global_opts, sk);
}

View file

@ -102,6 +102,7 @@ int criu_add_inherit_fd(int fd, const char *key);
int criu_add_external(const char *key);
int criu_set_page_server_address_port(const char *address, int port);
int criu_set_pre_dump_mode(enum criu_pre_dump_mode mode);
void criu_set_pidfd_store_sk(int sk);
/*
* The criu_notify_arg_t na argument is an opaque
@ -260,6 +261,7 @@ int criu_local_add_inherit_fd(criu_opts *opts, int fd, const char *key);
int criu_local_add_external(criu_opts *opts, const char *key);
int criu_local_set_page_server_address_port(criu_opts *opts, const char *address, int port);
int criu_local_set_pre_dump_mode(criu_opts *opts, enum criu_pre_dump_mode mode);
void criu_local_set_pidfd_store_sk(criu_opts *opts, int sk);
void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na));