#include <stdio.h>
#include <stdlib.h>

#include <linux/securebits.h>
#include <linux/capability.h>
#include <linux/aio_abi.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/shm.h>
#include <fcntl.h>
#include <unistd.h>
#include <sched.h>
#include <sys/resource.h>
#include <signal.h>

#include "linux/userfaultfd.h"

#include "common/config.h"
#include "int.h"
#include "types.h"
#include "common/compiler.h"
#include <compel/plugins/std/syscall.h>
#include <compel/plugins/std/log.h>
#include <compel/ksigset.h>
#include "signal.h"
#include "prctl.h"
#include "criu-log.h"
#include "util.h"
#include "image.h"
#include "sk-inet.h"
#include "vma.h"
#include "uffd.h"

#include "common/lock.h"
#include "restorer.h"
#include "aio.h"
#include "seccomp.h"

#include "images/creds.pb-c.h"
#include "images/mm.pb-c.h"

#include "shmem.h"
#include "restorer.h"
#include "namespaces.h"

#define __sys(foo)	sys_##foo
#define __sys_err(ret)	ret

#include "../ns-common.c"

#ifndef PR_SET_PDEATHSIG
#define PR_SET_PDEATHSIG 1
#endif

#define sys_prctl_safe(opcode, val1, val2, val3)			\
	({								\
		long __ret = sys_prctl(opcode, val1, val2, val3, 0);	\
		if (__ret)						\
			pr_err("prctl failed @%d with %ld\n", __LINE__, __ret);\
		__ret;							\
	})

static struct task_entries *task_entries_local;
static futex_t thread_inprogress;
static pid_t *helpers;
static int n_helpers;
static pid_t *zombies;
static int n_zombies;
static enum faults fi_strategy;
bool fault_injected(enum faults f)
{
	return __fault_injected(f, fi_strategy);
}

/*
 * These are stubs for std compel plugin.
 */
int parasite_daemon_cmd(int cmd, void *args)
{
	return 0;
}

int parasite_trap_cmd(int cmd, void *args)
{
	return 0;
}

void parasite_cleanup(void)
{
}

extern void cr_restore_rt (void) asm ("__cr_restore_rt")
			__attribute__ ((visibility ("hidden")));

static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
{
	char *r;
	int i;

	/* We can ignore helpers that die, we expect them to after
	 * CR_STATE_RESTORE is finished. */
	for (i = 0; i < n_helpers; i++)
		if (siginfo->si_pid == helpers[i])
			return;

	for (i = 0; i < n_zombies; i++)
		if (siginfo->si_pid == zombies[i])
			return;

	if (siginfo->si_code == CLD_EXITED)
		r = "exited, status=";
	else if (siginfo->si_code == CLD_KILLED)
		r = "killed by signal";
	else if (siginfo->si_code == CLD_DUMPED)
		r = "terminated abnormally with";
	else if (siginfo->si_code == CLD_TRAPPED)
		r = "trapped with";
	else if (siginfo->si_code == CLD_STOPPED)
		r = "stopped with";
	else
		r = "disappeared with";

	pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status);

	futex_abort_and_wake(&task_entries_local->nr_in_progress);
	/* sa_restorer may be unmapped, so we can't go back to userspace */
	sys_kill(sys_getpid(), SIGSTOP);
	sys_exit_group(1);
}

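/*
 * Write the task's LSM label (if any) into
 * /proc/self/task/<tid>/attr/current via the pre-opened procfs fd.
 */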
static int lsm_set_label(char *label, int procfd)
{
	int ret = -1, len, lsmfd;
	char path[STD_LOG_SIMPLE_CHUNK];

	if (!label)
		return 0;

	pr_info("restoring lsm profile %s\n", label);

	std_sprintf(path, "self/task/%ld/attr/current", sys_gettid());

	lsmfd = sys_openat(procfd, path, O_WRONLY, 0);
	if (lsmfd < 0) {
		pr_err("failed openat %d\n", lsmfd);
		return -1;
	}

	for (len = 0; label[len]; len++)
		;

	ret = sys_write(lsmfd, label, len);
	sys_close(lsmfd);
	if (ret < 0) {
		pr_err("can't write lsm profile %d\n", ret);
		return -1;
	}

	return 0;
}

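/*
 * Restore the thread's credentials: supplementary groups, xids,
 * securebits, the capability bounding set and capability sets, and
 * finally the LSM label. The order of the steps matters, see the
 * numbered comments below.
 */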
static int restore_creds(struct thread_creds_args *args, int procfd)
{
	CredsEntry *ce = &args->creds;
	int b, i, ret;
	struct cap_header hdr;
	struct cap_data data[_LINUX_CAPABILITY_U32S_3];

	/*
	 * We're still root here and thus can do it without failures.
	 */

	/*
	 * Setup supplementary group IDs early.
	 */
	if (args->groups) {
		ret = sys_setgroups(ce->n_groups, args->groups);
		if (ret) {
			pr_err("Can't setup supplementary group IDs: %d\n", ret);
			return -1;
		}
	}

	/*
	 * First -- set the SECURE_NO_SETUID_FIXUP bit not to
	 * lose caps bits when changing xids.
	 */

	ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0);
	if (ret) {
		pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret);
		return -1;
	}

	/*
	 * Second -- restore xids. Since we still have the CAP_SETUID
	 * capability nothing should fail. But call the setfsXid last
	 * to override the setresXid settings.
	 */

	if (ce->uid != NS_INVALID_XID || ce->euid != NS_INVALID_XID || ce->suid != NS_INVALID_XID) {
		ret = sys_setresuid(ce->uid, ce->euid, ce->suid);
		if (ret) {
			pr_err("Unable to set real, effective and saved user ID: %d\n", ret);
			return -1;
		}
	}

	if (ce->fsuid != NS_INVALID_XID) {
		sys_setfsuid(ce->fsuid);
		if (sys_setfsuid(-1) != ce->fsuid) {
			pr_err("Unable to set fsuid\n");
			return -1;
		}
	}

	if (ce->gid != NS_INVALID_XID || ce->egid != NS_INVALID_XID || ce->sgid != NS_INVALID_XID) {
		ret = sys_setresgid(ce->gid, ce->egid, ce->sgid);
		if (ret) {
			pr_err("Unable to set real, effective and saved group ID: %d\n", ret);
			return -1;
		}
	}

	if (ce->fsgid != NS_INVALID_XID) {
		sys_setfsgid(ce->fsgid);
		if (sys_setfsgid(-1) != ce->fsgid) {
			pr_err("Unable to set fsgid\n");
			return -1;
		}
	}

	/*
	 * Third -- restore securebits. We don't need them in any
	 * special state any longer.
	 */

	ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0);
	if (ret) {
		pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret);
		return -1;
	}

	/*
	 * Fourth -- trim bset. This can only be done while
	 * having the CAP_SETPCAP capability.
	 */

	for (b = 0; b < CR_CAP_SIZE; b++) {
		for (i = 0; i < 32; i++) {
			if (b * 32 + i > args->cap_last_cap)
				break;
			if (args->cap_bnd[b] & (1 << i))
				/* already set */
				continue;
			ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0);
			if (ret) {
				pr_err("Unable to drop capability %d: %d\n",
						i + b * 32, ret);
				return -1;
			}
		}
	}

	/*
	 * Fifth -- restore caps. Nothing but cap bits are changed
	 * at this stage, so just do it.
	 */

	hdr.version = _LINUX_CAPABILITY_VERSION_3;
	hdr.pid = 0;

	BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE);

	for (i = 0; i < CR_CAP_SIZE; i++) {
		data[i].eff = args->cap_eff[i];
		data[i].prm = args->cap_prm[i];
		data[i].inh = args->cap_inh[i];
	}

	ret = sys_capset(&hdr, data);
	if (ret) {
		pr_err("Unable to restore capabilities: %d\n", ret);
		return -1;
	}

	if (lsm_set_label(args->lsm_profile, procfd) < 0)
		return -1;
	return 0;
}

/*
 * This should be done after creds restore, as
 * some creds changes might drop the value back
 * to zero.
 */

static inline int restore_pdeath_sig(struct thread_restore_args *ta)
{
	if (ta->pdeath_sig)
		return sys_prctl(PR_SET_PDEATHSIG, ta->pdeath_sig, 0, 0, 0);
	else
		return 0;
}

static int restore_dumpable_flag(MmEntry *mme)
{
	int current_dumpable;
	int ret;

	if (!mme->has_dumpable) {
		pr_warn("Dumpable flag not present in criu dump.\n");
		return 0;
	}

	if (mme->dumpable == 0 || mme->dumpable == 1) {
		ret = sys_prctl(PR_SET_DUMPABLE, mme->dumpable, 0, 0, 0);
		if (ret) {
			pr_err("Unable to set PR_SET_DUMPABLE: %d\n", ret);
			return -1;
		}
		return 0;
	}

	/*
	 * If the dumpable flag is present but is not 0 or 1, then we can not
	 * use prctl to set it back. Check whether it is already correct
	 * (which is likely if sysctl fs.suid_dumpable was the same when dump
	 * and restore were run), in which case there is nothing to do.
	 * Otherwise, set dumpable to 0, which should be a secure fallback.
	 */
	current_dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);
	if (mme->dumpable != current_dumpable) {
		pr_warn("Dumpable flag [%d] does not match current [%d]. "
			"Will fallback to setting it to 0 to disable it.\n",
			mme->dumpable, current_dumpable);
		ret = sys_prctl(PR_SET_DUMPABLE, 0, 0, 0, 0);
		if (ret) {
			pr_err("Unable to set PR_SET_DUMPABLE: %d\n", ret);
			return -1;
		}
	}
	return 0;
}

static void restore_sched_info(struct rst_sched_param *p)
{
	struct sched_param parm;

	pr_info("Restoring scheduler params %d.%d.%d\n",
			p->policy, p->nice, p->prio);

	sys_setpriority(PRIO_PROCESS, 0, p->nice);
	parm.sched_priority = p->prio;
	sys_sched_setscheduler(0, p->policy, &parm);
}

static void restore_rlims(struct task_restore_args *ta)
{
	int r;

	for (r = 0; r < ta->rlims_n; r++) {
		struct krlimit krlim;

		krlim.rlim_cur = ta->rlims[r].rlim_cur;
		krlim.rlim_max = ta->rlims[r].rlim_max;
		sys_setrlimit(r, &krlim);
	}
}

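/*
 * Re-queue the signals that were pending at dump time: shared (group)
 * signals via rt_sigqueueinfo(), per-thread ones via rt_tgsigqueueinfo().
 */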
static int restore_signals(siginfo_t *ptr, int nr, bool group)
{
	int ret, i;

	for (i = 0; i < nr; i++) {
		siginfo_t *info = ptr + i;

		pr_info("Restore signal %d group %d\n", info->si_signo, group);
		if (group)
			ret = sys_rt_sigqueueinfo(sys_getpid(), info->si_signo, info);
		else
			ret = sys_rt_tgsigqueueinfo(sys_getpid(),
					sys_gettid(), info->si_signo, info);
		if (ret) {
			pr_err("Unable to send siginfo %d %x with code %d\n",
					info->si_signo, info->si_code, ret);
			return -1;
		}
	}

	return 0;
}

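/* Re-apply the dumped seccomp state: strict mode or the filter chain. */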
static int restore_seccomp(struct task_restore_args *args)
{
	int ret;

	switch (args->seccomp_mode) {
	case SECCOMP_MODE_DISABLED:
		return 0;
	case SECCOMP_MODE_STRICT:
		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
		if (ret < 0) {
			pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
			goto die;
		}
		return 0;
	case SECCOMP_MODE_FILTER: {
		int i;
		void *filter_data;

		filter_data = &args->seccomp_filters[args->seccomp_filters_n];

		for (i = 0; i < args->seccomp_filters_n; i++) {
			struct sock_fprog *fprog = &args->seccomp_filters[i];

			fprog->filter = filter_data;

			/* We always TSYNC here, since we require that the
			 * creds for all threads be the same; this means we
			 * don't have to restore_seccomp() in threads, and that
			 * future TSYNC behavior will be correct.
			 */
			ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
			if (ret < 0) {
				pr_err("sys_seccomp() returned %d\n", ret);
				goto die;
			}

			filter_data += fprog->len * sizeof(struct sock_filter);
		}

		return 0;
	}
	default:
		goto die;
	}

	return 0;
die:
	return -1;
}

static int restore_robust_futex(struct thread_restore_args *args)
{
	uint32_t futex_len = args->futex_rla_len;
	int ret;

	if (!args->futex_rla_len)
		return 0;

	/*
	 * XXX: We check here *task's* mode, not *thread's*.
	 * But it's possible to write an application with mixed
	 * threads (on x86): some in 32-bit mode, some in 64-bit.
	 * Quite unlikely that such an application exists at all.
	 */
	if (args->ta->compatible_mode) {
		uint32_t futex = (uint32_t)args->futex_rla;
		ret = set_compat_robust_list(futex, futex_len);
	} else {
		void *futex = decode_pointer(args->futex_rla);
		ret = sys_set_robust_list(futex, futex_len);
	}

	if (ret)
		pr_err("Failed to recover futex robust list: %d\n", ret);

	return ret;
}

static int restore_thread_common(struct thread_restore_args *args)
{
	sys_set_tid_address((int *)decode_pointer(args->clear_tid_addr));

	if (restore_robust_futex(args))
		return -1;

	restore_sched_info(&args->sp);

	if (restore_nonsigframe_gpregs(&args->gpregs))
		return -1;

	restore_tls(&args->tls);

	return 0;
}

static void noinline rst_sigreturn(unsigned long new_sp,
		struct rt_sigframe *sigframe)
{
	ARCH_RT_SIGRETURN(new_sp, sigframe);
}

/*
 * Threads restoration via sigreturn. Note it's a locked
 * routine and calls for unlock at the end.
 */
long __export_restore_thread(struct thread_restore_args *args)
{
	bool check_only = args->check_only;
	struct rt_sigframe *rt_sigframe;
	k_rtsigset_t to_block;
	unsigned long new_sp;
	int my_pid = sys_gettid();
	int i, fd, ret;

	for (i = 0; i < MAX_NS_NESTING; i++)
		if (args->pid[i] == 0)
			break;

	if (my_pid != args->pid[i-1]) {
		pr_err("Thread pid mismatch %d/%d\n", my_pid, args->pid[i-1]);
		goto core_restore_end;
	}

	/* All signals must be handled by thread leader */
	ksigfillset(&to_block);
	ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
	if (ret) {
		pr_err("Unable to block signals %d\n", ret);
		goto core_restore_end;
	}

	rt_sigframe = (void *)&args->mz->rt_sigframe;

	if (restore_thread_common(args))
		goto core_restore_end;

	fd = args->pfc_ns_fd;
	if (fd >= 0) {
		ret = sys_setns(fd, CLONE_NEWPID);
		if (ret) {
			pr_err("Can't setns: ret=%d\n", ret);
			goto core_restore_end;
		}
		sys_close(fd);
	}

	ret = restore_creds(args->creds_args, args->ta->proc_fd);
	if (ret)
		goto core_restore_end;

	ret = restore_dumpable_flag(&args->ta->mm);
	if (ret)
		goto core_restore_end;

	pr_info("%ld: Restored\n", sys_gettid());

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE);

	if (restore_signals(args->siginfo, args->siginfo_n, false))
		goto core_restore_end;

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
	restore_pdeath_sig(args);

	if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
		pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);

	futex_dec_and_wake(&thread_inprogress);

	if (check_only)
		restore_finish_stage(task_entries_local, CR_STATE_COMPLETE);

	new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe);
	rst_sigreturn(new_sp, rt_sigframe);

core_restore_end:
	pr_err("Restorer abnormal termination for %ld\n", sys_getpid());
	futex_abort_and_wake(&task_entries_local->nr_in_progress);
	sys_exit_group(1);
	return -1;
}

static long restore_self_exe_late(struct task_restore_args *args)
{
	int fd = args->fd_exe_link, ret;

	pr_info("Restoring EXE link\n");
	ret = sys_prctl_safe(PR_SET_MM, PR_SET_MM_EXE_FILE, fd, 0);
	if (ret)
		pr_err("Can't restore EXE link (%d)\n", ret);
	sys_close(fd);

	return ret;
}

#ifndef ARCH_HAS_SHMAT_HOOK
unsigned long arch_shmat(int shmid, void *shmaddr,
			int shmflg, unsigned long size)
{
	return sys_shmat(shmid, shmaddr, shmflg);
}
#endif

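/*
 * Map a single VMA at its original address: SysV shmem segments are
 * attached with shmat(), everything else is mmap()'ed with MAP_FIXED.
 */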
static unsigned long restore_mapping(VmaEntry *vma_entry)
{
	int prot = vma_entry->prot;
	int flags = vma_entry->flags | MAP_FIXED;
	unsigned long addr;

	if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC)) {
		int att_flags;
		void *shmaddr = decode_pointer(vma_entry->start);
		unsigned long shmsize = (vma_entry->end - vma_entry->start);
		/*
		 * See comment in open_shmem_sysv() for what SYSV_SHMEM_SKIP_FD
		 * means and why we check for PROT_EXEC a few lines below.
		 */
		if (vma_entry->fd == SYSV_SHMEM_SKIP_FD)
			return vma_entry->start;

		if (vma_entry->prot & PROT_EXEC) {
			att_flags = 0;
			vma_entry->prot &= ~PROT_EXEC;
		} else
			att_flags = SHM_RDONLY;

		pr_info("Attach SYSV shmem %d at %"PRIx64"\n", (int)vma_entry->fd, vma_entry->start);
		return arch_shmat(vma_entry->fd, shmaddr, att_flags, shmsize);
	}

	/*
	 * Restoring shared mappings is tricky, since we open
	 * anonymous mappings via map_files/; MAP_ANONYMOUS should
	 * be eliminated so the fd is taken into account by the kernel.
	 */
	if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL))
		flags &= ~MAP_ANONYMOUS;

	/* See comment in premap_private_vma() for this flag change */
	if (vma_entry_is(vma_entry, VMA_AREA_AIORING))
		flags |= MAP_ANONYMOUS;

	/* A mapping of file with MAP_SHARED is up to date */
	if ((vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED)) &&
			!(vma_entry->status & VMA_NO_PROT_WRITE))
		prot |= PROT_WRITE;

	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
			vma_entry->start, vma_entry->end,
			prot, flags, (int)vma_entry->fd);
	/*
	 * Should map memory here. Note we map them as
	 * writable since we're going to restore page
	 * contents.
	 */
	addr = sys_mmap(decode_pointer(vma_entry->start),
			vma_entry_len(vma_entry),
			prot, flags,
			vma_entry->fd,
			vma_entry->pgoff);

	if ((vma_entry->fd != -1) &&
			(vma_entry->status & VMA_CLOSE))
		sys_close(vma_entry->fd);

	return addr;
}

/*
 * This restores the aio ring header, content, head and in-kernel position
 * of the tail. To set the tail, we write to /dev/null and use the fact this
 * operation is synchronous for the device. Also, we unmap the temporary
 * anonymous area, used to store the content of the ring buffer during restore
 * and mapped in premap_private_vma().
 */
static int restore_aio_ring(struct rst_aio_ring *raio)
{
	struct aio_ring *ring = (void *)raio->addr, *new;
	int i, maxr, count, fd, ret;
	unsigned head = ring->head;
	unsigned tail = ring->tail;
	struct iocb *iocb, **iocbp;
	unsigned long ctx = 0;
	unsigned size;
	char buf[1];

	ret = sys_io_setup(raio->nr_req, &ctx);
	if (ret < 0) {
		pr_err("Ring setup failed with %d\n", ret);
		return -1;
	}

	new = (struct aio_ring *)ctx;
	i = (raio->len - sizeof(struct aio_ring)) / sizeof(struct io_event);
	if (tail >= ring->nr || head >= ring->nr || ring->nr != i ||
	    new->nr != ring->nr) {
		pr_err("wrong aio: tail=%x head=%x req=%x old_nr=%x new_nr=%x expect=%x\n",
			tail, head, raio->nr_req, ring->nr, new->nr, i);

		return -1;
	}

	if (tail == 0 && head == 0)
		goto populate;

	fd = sys_open("/dev/null", O_WRONLY, 0);
	if (fd < 0) {
		pr_err("Can't open /dev/null for aio\n");
		return -1;
	}

	/*
	 * If tail < head, we have to do a full turn and then submit
	 * tail more requests, i.e. ring->nr + tail.
	 * If we do not do a full turn, the in-kernel completed_events
	 * will be initialized wrong.
	 *
	 * The maximum number of reqs we can submit at once is ring->nr-1,
	 * so we won't allocate more.
	 */
	if (tail < head)
		count = ring->nr + tail;
	else
		count = tail;
	maxr = min_t(unsigned, count, ring->nr-1);

	/*
	 * Since we are only interested in moving the tail, the requests
	 * may be anything. We submit count identical requests.
	 */
	size = sizeof(struct iocb) + maxr * sizeof(struct iocb *);
	iocb = (void *)sys_mmap(NULL, size, PROT_READ|PROT_WRITE,
				MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	iocbp = (void *)iocb + sizeof(struct iocb);

	if (IS_ERR(iocb)) {
		pr_err("Can't mmap aio tmp buffer: %ld\n", PTR_ERR(iocb));
		return -1;
	}

	iocb->aio_fildes = fd;
	iocb->aio_buf = (unsigned long)buf;
	iocb->aio_nbytes = 1;
	iocb->aio_lio_opcode = IOCB_CMD_PWRITE; /* Write is nop, read populates buf */

	for (i = 0; i < maxr; i++)
		iocbp[i] = iocb;

	i = 0;
	do {
		ret = sys_io_submit(ctx, count - i, iocbp);
		if (ret < 0) {
			pr_err("Can't submit aio iocbs: ret=%d\n", ret);
			return -1;
		}
		i += ret;

		/*
		 * We may submit less than requested, because of too big
		 * count OR behaviour of get_reqs_available(), which
		 * takes available requests only if their number is
		 * aliquot to kioctx::req_batch. Free part of the buffer
		 * for the next iteration.
		 *
		 * Directly setting the head is equivalent to a
		 * sys_io_getevents() call, and faster. See the kernel
		 * for the details.
		 */
		((struct aio_ring *)ctx)->head = i < head ? i : head;
	} while (i < count);

	sys_munmap(iocb, size);
	sys_close(fd);

populate:
	i = offsetof(struct aio_ring, io_events);
	memcpy((void *)ctx + i, (void *)ring + i, raio->len - i);

	/*
	 * If we failed to get the proper nr_req right and
	 * created a smaller or larger ring, then this remap
	 * will (should) fail, since AIO rings have an immutable
	 * size.
	 *
	 * This is not great, but anyway better than putting
	 * a ring of the wrong size into the correct place.
	 *
	 * Also, this unmaps the temporary anonymous area at raio->addr.
	 */

	ctx = sys_mremap(ctx, raio->len, raio->len,
				MREMAP_FIXED | MREMAP_MAYMOVE,
				raio->addr);
	if (ctx != raio->addr) {
		pr_err("Ring remap failed with %ld\n", ctx);
		return -1;
	}
	return 0;
}

static void rst_tcp_repair_off(struct rst_tcp_sock *rts)
{
	int aux, ret;

	aux = rts->reuseaddr;
	pr_debug("pie: Turning repair off for %d (reuse %d)\n", rts->sk, aux);
	tcp_repair_off(rts->sk);

	ret = sys_setsockopt(rts->sk, SOL_SOCKET, SO_REUSEADDR, &aux, sizeof(aux));
	if (ret < 0)
		pr_err("Failed to restore SO_REUSEADDR on socket (%d)\n", ret);
}

static void rst_tcp_socks_all(struct task_restore_args *ta)
{
	int i;

	for (i = 0; i < ta->tcp_socks_n; i++)
		rst_tcp_repair_off(&ta->tcp_socks[i]);
}

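/*
 * Register the [addr, addr + len) range with userfaultfd, so missing
 * pages can be filled in later by the lazy-pages daemon.
 */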
static int enable_uffd(int uffd, unsigned long addr, unsigned long len)
{
	int rc;
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;

	/*
	 * If uffd == -1, this means that userfaultfd is not enabled
	 * or it is not available.
	 */
	if (uffd == -1)
		return 0;

	uffdio_register.range.start = addr;
	uffdio_register.range.len = len;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;

	pr_info("lazy-pages: register: %lx, len %lx\n", addr, len);

	rc = sys_ioctl(uffd, UFFDIO_REGISTER, (unsigned long) &uffdio_register);
	if (rc != 0) {
		pr_err("lazy-pages: register %lx failed: rc:%d, \n", addr, rc);
		return -1;
	}

	expected_ioctls = (1 << _UFFDIO_WAKE) | (1 << _UFFDIO_COPY) | (1 << _UFFDIO_ZEROPAGE);

	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
		pr_err("lazy-pages: unexpected missing uffd ioctl for anon memory\n");
	}

	return 0;
}

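/*
 * Move a premmapped VMA from its temporary location to the address the
 * dumped task had it at; a guard page is used when the source and
 * target ranges overlap (see the comment inside).
 */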
static int vma_remap(VmaEntry *vma_entry, int uffd)
{
	unsigned long src = vma_premmaped_start(vma_entry);
	unsigned long dst = vma_entry->start;
	unsigned long len = vma_entry_len(vma_entry);
	unsigned long guard = 0, tmp;

	pr_info("Remap %lx->%lx len %lx\n", src, dst, len);

	if (src - dst < len)
		guard = dst;
	else if (dst - src < len)
		guard = dst + len - PAGE_SIZE;

	if (src == dst)
		return 0;

	if (guard != 0) {
		/*
		 * mremap() returns an error if the target and source vma-s
		 * overlap. In this case the source vma is remapped to
		 * a temporary place and then remapped to the target address.
		 * Here is one hack to find a non-overlapped temporary place.
		 *
		 * 1. initial placement. We need to move src -> tgt.
		 *	| |+++++src+++++|
		 *	|-----tgt-----| |
		 *
		 * 2. map a guard page at the non-overlapped border of the target vma.
		 *	| |+++++src+++++|
		 *	|G|----tgt----| |
		 *
		 * 3. remap src to any other place.
		 *    G prevents src from being remapped onto tgt again
		 *	| |-------------| -> |+++++src+++++|
		 *	|G|---tgt-----| |
		 *
		 * 4. remap src to tgt, no overlapping any longer
		 *	|+++++src+++++| <---- |-------------|
		 *	|G|---tgt-----| |
		 */

		unsigned long addr;

		/* Map guard page (step 2) */
		tmp = sys_mmap((void *) guard, PAGE_SIZE, PROT_NONE,
				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
		if (tmp != guard) {
			pr_err("Unable to map a guard page %lx (%lx)\n", guard, tmp);
			return -1;
		}

		/* Move src to non-overlapping place (step 3) */
		addr = sys_mmap(NULL, len, PROT_NONE,
				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
		if (addr == (unsigned long) MAP_FAILED) {
			pr_err("Unable to reserve memory (%lx)\n", addr);
			return -1;
		}

		tmp = sys_mremap(src, len, len,
				MREMAP_MAYMOVE | MREMAP_FIXED, addr);
		if (tmp != addr) {
			pr_err("Unable to remap %lx -> %lx (%lx)\n", src, addr, tmp);
			return -1;
		}

		src = addr;
	}

	tmp = sys_mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst);
	if (tmp != dst) {
		pr_err("Unable to remap %lx -> %lx\n", src, dst);
		return -1;
	}

	/*
	 * If running in userfaultfd/lazy-pages mode, pages with
	 * MAP_ANONYMOUS and MAP_PRIVATE are remapped but without the
	 * real content.
	 * The function enable_uffd() marks the page(s) as userfaultfd
	 * pages, so that the processes will hang until the memory is
	 * injected via userfaultfd.
	 */
	if (vma_entry_can_be_lazy(vma_entry))
		if (enable_uffd(uffd, dst, len) != 0)
			return -1;

	return 0;
}

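/* Re-arm timerfd timers with their saved values and tick counters. */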
static int timerfd_arm(struct task_restore_args *args)
{
	int i;

	for (i = 0; i < args->timerfd_n; i++) {
		struct restore_timerfd *t = &args->timerfd[i];
		int ret;

		pr_debug("timerfd: arm for fd %d (%d)\n", t->fd, i);

		if (t->settime_flags & TFD_TIMER_ABSTIME) {
			struct timespec ts;

			/*
			 * We might need to adjust the value because the checkpoint
			 * and restore procedure takes some time itself. Note
			 * we don't adjust nanoseconds, since the result may
			 * overflow the limit NSEC_PER_SEC FIXME
			 */
			if (sys_clock_gettime(t->clockid, &ts)) {
				pr_err("Can't get current time\n");
				return -1;
			}

			t->val.it_value.tv_sec += (time_t)ts.tv_sec;

			pr_debug("Adjust id %#x it_value(%llu, %llu) -> it_value(%llu, %llu)\n",
				 t->id, (unsigned long long)ts.tv_sec,
				 (unsigned long long)ts.tv_nsec,
				 (unsigned long long)t->val.it_value.tv_sec,
				 (unsigned long long)t->val.it_value.tv_nsec);
		}

		ret = sys_timerfd_settime(t->fd, t->settime_flags, &t->val, NULL);
		if (t->ticks)
			ret |= sys_ioctl(t->fd, TFD_IOC_SET_TICKS, (unsigned long)&t->ticks);
		if (ret) {
			pr_err("Can't restore ticks/time for timerfd - %d\n", i);
			return ret;
		}
	}
	return 0;
}

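/*
 * Re-create POSIX timers with their original ids: the kernel hands out
 * timer ids sequentially, so temporary timers are created and deleted
 * until the expected id comes up.
 */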
static int create_posix_timers(struct task_restore_args *args)
{
	int ret, i;
	kernel_timer_t next_id;
	struct sigevent sev;

	for (i = 0; i < args->posix_timers_n; i++) {
		sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify;
		sev.sigev_signo = args->posix_timers[i].spt.si_signo;
		sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr;

		while (1) {
			ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id);
			if (ret < 0) {
				pr_err("Can't create posix timer - %d\n", i);
				return ret;
			}

			if (next_id == args->posix_timers[i].spt.it_id)
				break;

			ret = sys_timer_delete(next_id);
			if (ret < 0) {
				pr_err("Can't remove temporary posix timer 0x%x\n", next_id);
				return ret;
			}

			if ((long)next_id > args->posix_timers[i].spt.it_id) {
				pr_err("Can't create timers, kernel doesn't give them out consecutively\n");
				return -1;
			}
		}
	}

	return 0;
}

static void restore_posix_timers(struct task_restore_args *args)
{
	int i;
	struct restore_posix_timer *rt;

	for (i = 0; i < args->posix_timers_n; i++) {
		rt = &args->posix_timers[i];
		sys_timer_settime((kernel_timer_t)rt->spt.it_id, 0, &rt->val, NULL);
	}
}

/*
 * sys_munmap must not return here. The control process must
 * trap us on the exit from sys_munmap.
 */
#ifdef CONFIG_VDSO
unsigned long vdso_rt_size = 0;
#else
#define vdso_rt_size	(0)
#endif

void *bootstrap_start = NULL;
unsigned int bootstrap_len = 0;

void __export_unmap(void)
{
	sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size);
}

/*
 * This function unmaps all VMAs which don't belong to
 * the restored process or the restorer.
 *
 * The restorer memory is two regions -- the area with the restorer, its stack
 * and arguments, and the one with private vmas of the tasks we restore
 * (a.k.a. premmaped area):
 *
 * 0                                task_size
 * +----+====+----+====+---+
 *
 * Thus to unmap old memory we have to do 3 unmaps:
 * [ 0 -- 1st area start ]
 * [ 1st end -- 2nd start ]
 * [ 2nd end -- task_size ]
 */
static int unmap_old_vmas(void *premmapped_addr, unsigned long premmapped_len,
			void *bootstrap_start, unsigned long bootstrap_len,
			unsigned long task_size)
{
	unsigned long s1, s2;
	void *p1, *p2;
	int ret;

	if (premmapped_addr < bootstrap_start) {
		p1 = premmapped_addr;
		s1 = premmapped_len;
		p2 = bootstrap_start;
		s2 = bootstrap_len;
	} else {
		p2 = premmapped_addr;
		s2 = premmapped_len;
		p1 = bootstrap_start;
		s1 = bootstrap_len;
	}

	ret = sys_munmap(NULL, p1 - NULL);
	if (ret) {
		pr_err("Unable to unmap (%p-%p): %d\n", NULL, p1, ret);
		return -1;
	}

	ret = sys_munmap(p1 + s1, p2 - (p1 + s1));
	if (ret) {
		pr_err("Unable to unmap (%p-%p): %d\n", p1 + s1, p2, ret);
		return -1;
	}

	ret = sys_munmap(p2 + s2, task_size - (unsigned long)(p2 + s2));
	if (ret) {
		pr_err("Unable to unmap (%p-%p): %d\n",
				p2 + s2, (void *)task_size, ret);
		return -1;
	}

	return 0;
}

static int wait_helpers(struct task_restore_args *task_args)
{
	int i;

	for (i = 0; i < task_args->helpers_n; i++) {
		int status;
		pid_t pid = task_args->helpers[i];

		/* Check that a helper completed. */
		if (sys_wait4(pid, &status, 0, NULL) == -ECHILD) {
			/* It has already been waited for in sigchld_handler */
			continue;
		}
		if (!WIFEXITED(status) || WEXITSTATUS(status)) {
			pr_err("%d exited with non-zero code (%d,%d)\n", pid,
				WEXITSTATUS(status), WTERMSIG(status));
			return -1;
		}
	}

	return 0;
}

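/*
 * Make sure every restored zombie has been reparented to us, checking
 * with WNOWAIT so the zombies are not actually reaped.
 */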
static int wait_zombies(struct task_restore_args *task_args)
{
	int i;

	for (i = 0; i < task_args->zombies_n; i++) {
		int ret, nr_in_progress;

		nr_in_progress = futex_get(&task_entries_local->nr_in_progress);

		ret = sys_waitid(P_PID, task_args->zombies[i], NULL, WNOWAIT | WEXITED, NULL);
		if (ret == -ECHILD) {
			/* The process isn't reparented to this task yet.
			 * Let's wait until someone completes this stage
			 * and try again.
			 */
			futex_wait_while_eq(&task_entries_local->nr_in_progress,
							nr_in_progress);
			i--;
			continue;
		}
		if (ret < 0) {
			pr_err("Wait on %d zombie failed: %d\n", task_args->zombies[i], ret);
			return -1;
		}
		pr_debug("%ld: Collect a zombie with pid %d\n",
			sys_getpid(), task_args->zombies[i]);
	}

	return 0;
}

static bool vdso_unmapped(struct task_restore_args *args)
{
	unsigned int i;

	/* Don't park rt-vdso or rt-vvar if dumpee doesn't have them */
	for (i = 0; i < args->vmas_n; i++) {
		VmaEntry *vma = &args->vmas[i];

		if (vma_entry_is(vma, VMA_AREA_VDSO) ||
				vma_entry_is(vma, VMA_AREA_VVAR))
			return false;
	}

	return true;
}

static bool vdso_needs_parking(struct task_restore_args *args)
{
	/* Compatible vDSO will be mapped, not moved */
	if (args->compatible_mode)
		return false;

	if (args->can_map_vdso)
		return false;

	return !vdso_unmapped(args);
}

/*
 * The main routine to restore a task via sigreturn.
 * This one is very special: we never return from it,
 * but use the sigreturn facility to restore core registers
 * and jump execution to some predefined ip read from
 * the core file.
 */
long __export_restore_task(struct task_restore_args *args)
{
	long ret = -1;
	int i, k, fd, self_thread;
	VmaEntry *vma_entry;
	unsigned long va;
	struct restore_vma_io *rio;
	struct rt_sigframe *rt_sigframe;
	struct prctl_mm_map prctl_map;
	unsigned long new_sp;
	k_rtsigset_t to_block;
	pid_t my_pid = sys_getpid();
	rt_sigaction_t act;

	bootstrap_start = args->bootstrap_start;
	bootstrap_len = args->bootstrap_len;

#ifdef CONFIG_VDSO
	vdso_rt_size = args->vdso_rt_size;
#endif

	fi_strategy = args->fault_strategy;

	task_entries_local = args->task_entries;
	helpers = args->helpers;
	n_helpers = args->helpers_n;
	zombies = args->zombies;
	n_zombies = args->zombies_n;
	*args->breakpoint = rst_sigreturn;

	ksigfillset(&act.rt_sa_mask);
	act.rt_sa_handler = sigchld_handler;
	act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
	act.rt_sa_restorer = cr_restore_rt;
	sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

	ksigemptyset(&to_block);
	ksigaddset(&to_block, SIGCHLD);
	ret = sys_sigprocmask(SIG_UNBLOCK, &to_block, NULL, sizeof(k_rtsigset_t));

	std_log_set_fd(args->logfd);
	std_log_set_loglevel(args->loglevel);
	std_log_set_start(&args->logstart);

	pr_info("Switched to the restorer %d\n", my_pid);

	if (args->uffd > -1) {
		pr_debug("lazy-pages: uffd %d\n", args->uffd);
	}

	if (vdso_needs_parking(args)) {
		if (vdso_do_park(&args->vdso_maps_rt,
				args->vdso_rt_parked_at, vdso_rt_size))
			goto core_restore_end;
	}

	if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len,
				bootstrap_start, bootstrap_len, args->task_size))
		goto core_restore_end;

	/* Map vdso that wasn't parked */
	if (!vdso_unmapped(args) && args->can_map_vdso) {
		if (arch_map_vdso(args->vdso_rt_parked_at,
				args->compatible_mode) < 0) {
			goto core_restore_end;
		}
	}

	/* Shift private vma-s to the left */
	for (i = 0; i < args->vmas_n; i++) {
		vma_entry = args->vmas + i;

		if (!vma_entry_is(vma_entry, VMA_PREMMAPED))
			continue;

		if (vma_entry->end >= args->task_size)
			continue;

		if (vma_entry->start > vma_entry->shmid)
			break;

		if (vma_remap(vma_entry, args->uffd))
			goto core_restore_end;
	}

	/* Shift private vma-s to the right */
	for (i = args->vmas_n - 1; i >= 0; i--) {
		vma_entry = args->vmas + i;

		if (!vma_entry_is(vma_entry, VMA_PREMMAPED))
			continue;

		if (vma_entry->start > args->task_size)
			continue;

		if (vma_entry->start < vma_entry->shmid)
			break;

		if (vma_remap(vma_entry, args->uffd))
			goto core_restore_end;
	}

	if (args->uffd > -1) {
		/* re-enable THP if we disabled it previously */
		if (args->has_thp_enabled) {
			if (sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) {
				pr_err("Cannot re-enable THP\n");
				goto core_restore_end;
			}
		}

		pr_debug("lazy-pages: closing uffd %d\n", args->uffd);
		/*
		 * All userfaultfd configuration has finished at this point.
		 * Let's close the UFFD file descriptor, so that the restored
		 * process does not keep an open UFFD FD forever.
		 */
		sys_close(args->uffd);
	}

	/*
	 * OK, let's try to map the new ones.
	 */
	for (i = 0; i < args->vmas_n; i++) {
		vma_entry = args->vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) &&
				!vma_entry_is(vma_entry, VMA_AREA_AIORING))
			continue;

		if (vma_entry_is(vma_entry, VMA_PREMMAPED))
			continue;

		va = restore_mapping(vma_entry);

		if (va != vma_entry->start) {
			pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va);
			goto core_restore_end;
		}
	}

	/*
	 * Now read the contents (if any)
	 */

	rio = args->vma_ios;
	for (i = 0; i < args->vma_ios_n; i++) {
		struct iovec *iovs = rio->iovs;
		int nr = rio->nr_iovs;
		ssize_t r;

		while (nr) {
			pr_debug("Preadv %lx:%d... (%d iovs)\n",
					(unsigned long)iovs->iov_base,
					(int)iovs->iov_len, nr);
			r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
			if (r < 0) {
				pr_err("Can't read pages data (%d)\n", (int)r);
				goto core_restore_end;
			}

			pr_debug("`- returned %ld\n", (long)r);
			rio->off += r;
			/* Advance the iovecs */
			do {
				if (iovs->iov_len <= r) {
					pr_debug(" `- skip pagemap\n");
					r -= iovs->iov_len;
					iovs++;
					nr--;
					continue;
				}

				iovs->iov_base += r;
				iovs->iov_len -= r;
				break;
			} while (nr > 0);
		}

		rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs);
	}

	sys_close(args->vma_ios_fd);

#ifdef CONFIG_VDSO
	/*
	 * Proxify vDSO.
	 */
	if (!args->check_only)
		if (vdso_proxify(&args->vdso_maps_rt.sym, args->vdso_rt_parked_at,
				args->vmas, args->vmas_n, args->compatible_mode,
				fault_injected(FI_VDSO_TRAMPOLINES)))
			goto core_restore_end;
#endif

	/*
	 * Walk through all VMAs again to drop PROT_WRITE
	 * if it was not there.
	 */
	for (i = 0; i < args->vmas_n; i++) {
		vma_entry = args->vmas + i;

		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
			continue;

		if ((vma_entry->prot & PROT_WRITE) ||
				(vma_entry->status & VMA_NO_PROT_WRITE))
			continue;

		sys_mprotect(decode_pointer(vma_entry->start),
				vma_entry_len(vma_entry),
				vma_entry->prot);
	}

	/*
	 * Now that all VMAs are in their places it is time to set
	 * up the AIO rings.
	 */

	if (!args->check_only) {
		for (i = 0; i < args->rings_n; i++)
			if (restore_aio_ring(&args->rings[i]) < 0)
				goto core_restore_end;
	}

	/*
	 * Finally restore madvise() bits
	 */
	for (i = 0; i < args->vmas_n; i++) {
		unsigned long m;

		vma_entry = args->vmas + i;
		if (!vma_entry->has_madv || !vma_entry->madv)
			continue;

		for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
			if (vma_entry->madv & (1ul << m)) {
				ret = sys_madvise(vma_entry->start,
						vma_entry_len(vma_entry),
						m);
				if (ret) {
					pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
							"failed with %ld\n",
							vma_entry->start,
							vma_entry_len(vma_entry),
							m, ret);
					goto core_restore_end;
				}
			}
		}
	}

	ret = 0;

	/*
	 * Tune up the task fields.
	 */
	ret = sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);
	if (ret)
		goto core_restore_end;

	/*
	 * The new kernel interface with @PR_SET_MM_MAP will become
	 * more widespread once newer kernels get deployed over the world.
	 * Thus let's be opportunistic and try the new interface first.
	 */
	prctl_map = (struct prctl_mm_map) {
		.start_code = args->mm.mm_start_code,
		.end_code = args->mm.mm_end_code,
		.start_data = args->mm.mm_start_data,
		.end_data = args->mm.mm_end_data,
		.start_stack = args->mm.mm_start_stack,
		.start_brk = args->mm.mm_start_brk,
		.brk = args->mm.mm_brk,
		.arg_start = args->mm.mm_arg_start,
		.arg_end = args->mm.mm_arg_end,
		.env_start = args->mm.mm_env_start,
		.env_end = args->mm.mm_env_end,
		.auxv = (void *)args->mm_saved_auxv,
		.auxv_size = args->mm_saved_auxv_size,
		.exe_fd = args->fd_exe_link,
	};
	ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP, (long)&prctl_map, sizeof(prctl_map), 0);
	if (ret == -EINVAL) {
		ret  = sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0);
		ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size);

		/*
		 * Because of requirements applied from the kernel side
		 * we need to restore the /proc/pid/exe symlink late,
		 * after old existing VMAs are superseded with
		 * new ones from the image file.
		 */
		ret |= restore_self_exe_late(args);
	} else {
		if (ret)
			pr_err("sys_prctl(PR_SET_MM, PR_SET_MM_MAP) failed with %d\n", (int)ret);
		sys_close(args->fd_exe_link);
	}

	if (ret)
		goto core_restore_end;

	/*
	 * We need to prepare a valid sigframe here, so
	 * after sigreturn the kernel will pick up the
	 * registers from the frame, set them up and
	 * finally pass execution to the new IP.
	 */
	rt_sigframe = (void *)&args->t->mz->rt_sigframe;

	if (restore_thread_common(args->t))
		goto core_restore_end;

	/*
	 * Threads restoration. This requires some more comments. This
	 * restorer routine and the thread restorer routine have the following
	 * memory map, prepared by the caller code.
	 *
	 * | <-- low addresses                                   high addresses --> |
	 * +-------------------------------------------------------+-----------------------+
	 * | this proc body | own stack | rt_sigframe space | thread restore zone |
	 * +-------------------------------------------------------+-----------------------+
	 *
	 * where each thread restore zone is the following
	 *
	 * | <-- low addresses                              high addresses --> |
	 * +--------------------------------------------------------------------------+
	 * | thread restore proc | thread1 stack | thread1 rt_sigframe |
	 * +--------------------------------------------------------------------------+
	 */
	self_thread = 0;
	if (args->nr_threads > 1) {
		struct thread_restore_args *thread_args = args->thread_args;
		long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
					CLONE_THREAD | CLONE_SYSVSEM | CLONE_FS;
		long last_pid_len;
		long parent_tid;
		int i;

		fd = -1;
		if (thread_args[0].pid[1] == 0) {
			/* One-level pid ns hierarchy */
			fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0);
			if (fd < 0) {
				pr_err("can't open last pid fd %d\n", fd);
				goto core_restore_end;
			}
		}

		mutex_lock(&task_entries_local->last_pid_mutex);

		for (i = 0; i < args->nr_threads; i++) {
			char last_pid_buf[16], *s;
			/* skip self */
			if (thread_args[i].pid[0] == args->t->pid[0]) {
				self_thread = i;
				continue;
			}

			if (fd >= 0) {
				/* One-level pid ns hierarchy */
				last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid[0] - 1, &s);
				sys_lseek(fd, 0, SEEK_SET);
				ret = sys_write(fd, s, last_pid_len);
				if (ret < 0) {
					pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
					mutex_unlock(&task_entries_local->last_pid_mutex);
					sys_close(fd);
					goto core_restore_end;
				}
			} else {
				for (k = 0; k < MAX_NS_NESTING; k++) {
					if (thread_args[i].pid[k] == 0)
						break;
					if (request_set_next_pid(args->pid_ns_id[k], thread_args[i].pid[k], args->transport_fd) < 0) {
						pr_err("Can't request to set pid\n");
						mutex_unlock(&task_entries_local->last_pid_mutex);
						goto core_restore_end;
					}
				}
			}
			new_sp = restorer_stack(thread_args[i].mz);

			/*
			 * To achieve functionality like libc's clone()
			 * we need pure assembly here, because the clone()'ed
			 * thread will run with its own stack and we must not
			 * have any additional instructions... oh, dear...
			 */

			RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
		}

		mutex_unlock(&task_entries_local->last_pid_mutex);
		if (fd >= 0)
			sys_close(fd);
	}

	restore_rlims(args);

	ret = create_posix_timers(args);
	if (ret < 0) {
		pr_err("Can't restore posix timers %ld\n", ret);
		goto core_restore_end;
	}

	ret = timerfd_arm(args);
	if (ret < 0) {
		pr_err("Can't restore timerfd %ld\n", ret);
		goto core_restore_end;
	}

	pr_info("%ld: Restored\n", sys_getpid());

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE);

	if (wait_helpers(args) < 0)
		goto core_restore_end;
	if (wait_zombies(args) < 0)
		goto core_restore_end;

	ksigfillset(&to_block);
	ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
	if (ret) {
		pr_err("Unable to block signals %ld\n", ret);
		goto core_restore_end;
	}

	if (!args->compatible_mode) {
		sys_sigaction(SIGCHLD, &args->sigchld_act,
				NULL, sizeof(k_rtsigset_t));
	} else {
		void *stack = alloc_compat_syscall_stack();

		if (!stack) {
			pr_err("Failed to allocate 32-bit stack for sigaction\n");
			goto core_restore_end;
		}
		arch_compat_rt_sigaction(stack, SIGCHLD,
				(void*)&args->sigchld_act);
		free_compat_syscall_stack(stack);
	}

	ret = restore_signals(args->siginfo, args->siginfo_n, true);
	if (ret)
		goto core_restore_end;

	ret = restore_signals(args->t->siginfo, args->t->siginfo_n, false);
	if (ret)
		goto core_restore_end;

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);

	rst_tcp_socks_all(args);

	fd = args->thread_args[self_thread].pfc_ns_fd;
	if (fd >= 0) {
		ret = sys_setns(fd, CLONE_NEWPID);
		if (ret) {
			pr_err("Can't setns: ret=%d\n", (int)ret);
			goto core_restore_end;
		}
		sys_close(fd);
	}

	/* The kernel restricts setting seccomp to uid 0 in the current user
	 * ns, so we must do this before restore_creds.
	 */
	pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
	if (restore_seccomp(args))
		goto core_restore_end;

	/*
	 * Writing to last-pid is CAP_SYS_ADMIN protected,
	 * turning off TCP repair is CAP_NET_ADMIN protected,
	 * thus restore* creds _after_ all of the above.
	 */
	ret = restore_creds(args->t->creds_args, args->proc_fd);
	ret = ret || restore_dumpable_flag(&args->mm);
	ret = ret || restore_pdeath_sig(args->t);

	futex_set_and_wake(&thread_inprogress, args->nr_threads);

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);

	if (ret)
		BUG();

	/* Wait until children stop using args->task_entries */
	futex_wait_while_gt(&thread_inprogress, 1);

	if (args->check_only) {
		pr_info("Restore check was successful.\n");
		while (1) {
			pid_t pid;
			int status;

			pid = sys_wait4(-1, &status, 0, NULL);
			if (pid < 0) {
				if (pid == -ECHILD)
					break;
				pr_err("Unable to wait a child: %d\n", pid);
				goto core_restore_end;
			}
			if (status) {
				pr_err("The %d process exited with %d\n", pid, status);
				goto core_restore_end;
			}
		}
		sys_exit_group(0);
	}

	sys_close(args->proc_fd);
	sys_close(args->transport_fd);
	std_log_set_fd(-1);

	/*
	 * The code that prepared the itimers makes sure the
	 * code below doesn't fail due to bad timing values.
	 */

#define itimer_armed(args, i)				\
		(args->itimers[i].it_interval.tv_sec ||	\
		 args->itimers[i].it_interval.tv_usec)

	if (itimer_armed(args, 0))
		sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
	if (itimer_armed(args, 1))
		sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
	if (itimer_armed(args, 2))
		sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

	restore_posix_timers(args);

	sys_munmap(args->rst_mem, args->rst_mem_size);

	/*
	 * Sigframe stack.
	 */
	new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe);

	/*
	 * Prepare the stack and call for sigreturn,
	 * pure assembly since we don't need any additional
	 * code insns from gcc.
	 */
	rst_sigreturn(new_sp, rt_sigframe);

core_restore_end:
	futex_abort_and_wake(&task_entries_local->nr_in_progress);
	pr_err("Restorer fail %ld\n", sys_getpid());
	sys_exit_group(1);
	return -1;
}

/*
 * For most of the restorer's objects -fstack-protector is disabled.
 * But we share some of them with CRIU, which may have it enabled.
 */
void __stack_chk_fail(void)
{
	pr_err("Restorer stack smash detected %ld\n", sys_getpid());
	sys_exit_group(1);
	BUG();
}