mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-23 10:16:41 +00:00
A freezer cgroup can contain tasks which will not be dumped.
criu unfreezes the group, so we need to freeze all extra
tasks with ptrace, like we do for target tasks.
Currently we attach and send interrupt signals to these tasks,
but we don't call waitpid() for them, so a later waitpid(-1, ...)
returns these tasks where we don't expect to see them.
v2: execute freezer_detach() only if opts.freeze_cgroup is set
calculate extra tasks in a freezer cgroup correctly
v3: s/frozen_processes/processes_to_wait/
Signed-off-by: Andrew Vagin <avagin@virtuozzo.com>
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
336 lines
7.6 KiB
C
336 lines
7.6 KiB
C
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <stdarg.h>
|
|
#include <string.h>
|
|
#include <errno.h>
|
|
#include <unistd.h>
|
|
#include <stdbool.h>
|
|
#include <limits.h>
|
|
#include <signal.h>
|
|
|
|
#include <sys/ptrace.h>
|
|
#include <sys/types.h>
|
|
#include <sys/time.h>
|
|
#include <sys/resource.h>
|
|
#include <sys/wait.h>
|
|
|
|
#include "compiler.h"
|
|
#include "asm/types.h"
|
|
#include "util.h"
|
|
#include "ptrace.h"
|
|
#include "proc_parse.h"
|
|
#include "crtools.h"
|
|
#include "security.h"
|
|
#include "seccomp.h"
|
|
|
|
int unseize_task(pid_t pid, int orig_st, int st)
|
|
{
|
|
pr_debug("\tUnseizing %d into %d\n", pid, st);
|
|
|
|
if (st == TASK_DEAD) {
|
|
kill(pid, SIGKILL);
|
|
return 0;
|
|
} else if (st == TASK_STOPPED) {
|
|
/*
|
|
* Task might have had STOP in queue. We detected such
|
|
* guy as TASK_STOPPED, but cleared signal to run the
|
|
* parasite code. hus after detach the task will become
|
|
* running. That said -- STOP everyone regardless of
|
|
* the initial state.
|
|
*/
|
|
kill(pid, SIGSTOP);
|
|
} else if (st == TASK_ALIVE) {
|
|
/*
|
|
* Same as in the comment above -- there might be a
|
|
* task with STOP in queue that would get lost after
|
|
* detach, so stop it again.
|
|
*/
|
|
if (orig_st == TASK_STOPPED)
|
|
kill(pid, SIGSTOP);
|
|
} else
|
|
pr_err("Unknown final state %d\n", st);
|
|
|
|
if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) {
|
|
pr_perror("Unable to detach from %d", pid);
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int suspend_seccomp(pid_t pid)
|
|
{
|
|
if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) {
|
|
pr_perror("suspending seccomp failed");
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Attach to task @pid with PTRACE_SEIZE and immediately ask it to stop
 * with PTRACE_INTERRUPT. The resulting stop is not waited for here.
 *
 * Returns 0 on success, or the non-zero ptrace() result if either
 * request fails. When the interrupt fails the task is detached again.
 */
int seize_catch_task(pid_t pid)
{
	int ret;

	ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
	if (ret) {
		/*
		 * ptrace API doesn't allow to distinguish
		 * attaching to zombie from other errors.
		 * All errors will be handled in seize_wait_task().
		 */
		pr_warn("Unable to interrupt task: %d (%s)\n", pid, strerror(errno));
		return ret;
	}

	/*
	 * If we SEIZE-d the task stop it before going
	 * and reading its stat from proc. Otherwise task
	 * may die _while_ we're doing it and we'll have
	 * inconsistent seize/state pair.
	 *
	 * If task dies after we seize it but before we
	 * do this interrupt, we'll notice it via proc.
	 */
	ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
	if (ret < 0) {
		/* The message must end with a newline like every other pr_warn(). */
		pr_warn("SEIZE %d: can't interrupt task: %s\n", pid, strerror(errno));
		if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
			pr_perror("Unable to detach from %d", pid);
	}

	return ret;
}
|
|
|
|
static int skip_sigstop(int pid, int nr_signals)
|
|
{
|
|
int i, status, ret;
|
|
|
|
/*
|
|
* 1) SIGSTOP is queued, but isn't handled yet:
|
|
* SGISTOP can't be blocked, so we need to wait when the kernel
|
|
* handles this signal.
|
|
*
|
|
* Otherwise the process will be stopped immediatly after
|
|
* starting it.
|
|
*
|
|
* 2) A seized task was stopped:
|
|
* PTRACE_SEIZE doesn't affect signal or group stop state.
|
|
* Currently ptrace reported that task is in stopped state.
|
|
* We need to start task again, and it will be trapped
|
|
* immediately, because we sent PTRACE_INTERRUPT to it.
|
|
*/
|
|
for (i = 0; i < nr_signals; i++) {
|
|
ret = ptrace(PTRACE_CONT, pid, 0, 0);
|
|
if (ret) {
|
|
pr_perror("Unable to start process");
|
|
return -1;
|
|
}
|
|
|
|
ret = wait4(pid, &status, __WALL, NULL);
|
|
if (ret < 0) {
|
|
pr_perror("SEIZE %d: can't wait task", pid);
|
|
return -1;
|
|
}
|
|
|
|
if (!WIFSTOPPED(status)) {
|
|
pr_err("SEIZE %d: task not stopped after seize\n", pid);
|
|
return -1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
 * This routine waits for a task to stop and puts it into a
 * special state where we can manipulate the task via ptrace
 * interface, and finally we can detach ptrace out of
 * it so the task would not know if it was saddled
 * up with someone else.
 *
 * @pid:   task to wait for
 * @ppid:  expected parent pid, or -1 to skip the parent check
 * @creds: in/out; when *creds is NULL a copy of the task's creds is
 *         allocated and stored there, otherwise the task's creds must
 *         compare equal to *creds
 *
 * Returns TASK_DEAD for a zombie, TASK_STOPPED when at least one
 * SIGSTOP was accounted for and skipped, TASK_ALIVE when the task
 * stopped with SIGTRAP, and -1 on any error. On the paths that go
 * through the err label the task is detached before returning.
 */
int seize_wait_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
{
	siginfo_t si;
	int status, nr_sigstop;
	int ret = 0, ret2, wait_errno = 0;
	struct proc_status_creds cr;

	/*
	 * For the comparison below, let's zero out any padding.
	 */
	memzero(&cr, sizeof(struct proc_status_creds));

	/*
	 * It's ugly, but the ptrace API doesn't allow to distinguish
	 * attaching to zombie from other errors. Thus we have to parse
	 * the target's /proc/pid/stat. Sad, but parse whatever else
	 * we might need at that early point.
	 */

	/* Account for this task once; undone below if the wait fails. */
	processes_to_wait--;
try_again:

	ret = wait4(pid, &status, __WALL, NULL);
	if (ret < 0) {
		/*
		 * wait4() can expectedly fail only the first time,
		 * if the task is a zombie. If we are here from try_again,
		 * this means that we are tracing this task.
		 *
		 * processes_to_wait should be decremented only once in this
		 * function, and only if the first wait succeeded.
		 */
		processes_to_wait++;
		/* Keep errno for the diagnostic below; later calls may change it. */
		wait_errno = errno;
	}

	/* Fill cr for this pid; a non-zero result is treated as an error. */
	ret2 = parse_pid_status(pid, &cr);
	if (ret2)
		goto err;

	if (!may_dump(&cr)) {
		pr_err("Check uid (pid: %d) failed\n", pid);
		goto err;
	}

	/*
	 * The wait failed or reported the task as gone: this is only
	 * acceptable if the parsed state says it is a zombie ('Z').
	 */
	if (ret < 0 || WIFEXITED(status) || WIFSIGNALED(status)) {
		if (cr.state != 'Z') {
			if (pid == getpid())
				pr_err("The criu itself is within dumped tree.\n");
			else
				pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n",
						pid, cr.state, ret, wait_errno);
			return -1;
		}

		return TASK_DEAD;
	}

	/* A different parent than expected is reported as pid reuse. */
	if ((ppid != -1) && (cr.ppid != ppid)) {
		pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
				pid, ppid, cr.ppid);
		goto err;
	}

	if (!WIFSTOPPED(status)) {
		pr_err("SEIZE %d: task not stopped after seize\n", pid);
		goto err;
	}

	ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si);
	if (ret < 0) {
		pr_perror("SEIZE %d: can't read signfo", pid);
		goto err;
	}

	if (SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) {
		/*
		 * Kernel notifies us about the task being seized received some
		 * event other than the STOP, i.e. -- a signal. Let the task
		 * handle one and repeat.
		 */

		/* Resume the task, passing si.si_signo on to it. */
		if (ptrace(PTRACE_CONT, pid, NULL,
					(void *)(unsigned long)si.si_signo)) {
			pr_perror("Can't continue signal handling, aborting");
			goto err;
		}

		ret = 0;
		goto try_again;
	}

	/* First task seen: remember its creds. Later ones must match them. */
	if (*creds == NULL) {
		*creds = xzalloc(sizeof(struct proc_status_creds));
		if (!*creds)
			goto err;

		**creds = cr;

	} else if (!proc_status_creds_eq(*creds, &cr)) {
		pr_err("creds don't match %d %d\n", pid, ppid);
		goto err;
	}

	if (cr.seccomp_mode != SECCOMP_MODE_DISABLED && suspend_seccomp(pid) < 0)
		goto err;

	/*
	 * Count the SIGSTOPs to skip: one for each of the sigpnd and
	 * shdpnd masks that has the SIGSTOP bit set, plus one if the
	 * current stop itself was caused by SIGSTOP.
	 */
	nr_sigstop = 0;
	if (cr.sigpnd & (1 << (SIGSTOP - 1)))
		nr_sigstop++;
	if (cr.shdpnd & (1 << (SIGSTOP - 1)))
		nr_sigstop++;
	if (si.si_signo == SIGSTOP)
		nr_sigstop++;

	if (nr_sigstop) {
		if (skip_sigstop(pid, nr_sigstop))
			goto err_stop;

		return TASK_STOPPED;
	}

	if (si.si_signo == SIGTRAP)
		return TASK_ALIVE;
	else {
		pr_err("SEIZE %d: unsupported stop signal %d\n", pid, si.si_signo);
		goto err;
	}

err_stop:
	/* Skipping the SIGSTOPs failed: send SIGSTOP again before detaching. */
	kill(pid, SIGSTOP);
err:
	if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
		pr_perror("Unable to detach from %d", pid);
	return -1;
}
|
|
|
|
/*
 * Read @bytes bytes from the memory of traced task @pid, starting at
 * remote address @addr, into the local buffer @dst. The data is
 * transferred one machine word per PTRACE_PEEKDATA request.
 *
 * Returns 0 on success, -1 if @bytes is negative or not a multiple of
 * sizeof(long), -2 if a PTRACE_PEEKDATA request fails (errno is then
 * the one set by ptrace()). On success the caller's errno is preserved.
 */
int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes)
{
	unsigned long *d = dst, *a = addr;
	unsigned long w, nr_words;
	int saved_errno = errno;

	/* A negative size would turn into a huge unsigned word count below. */
	if (bytes < 0 || (bytes & (sizeof(long) - 1)))
		return -1;

	nr_words = bytes / sizeof(long);
	for (w = 0; w < nr_words; w++) {
		/*
		 * A successful PEEKDATA may legitimately return -1 as data,
		 * so errno has to be cleared before the request and checked
		 * afterwards. The comparison must use the full width of long:
		 * -1U is only 0xffffffff and never matches -1L on LP64.
		 */
		errno = 0;
		d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
		if (d[w] == -1UL && errno)
			return -2;
	}

	errno = saved_errno;
	return 0;
}
|
|
|
|
/*
 * Write @bytes bytes from the local buffer @src into the memory of
 * traced task @pid at remote address @addr, one machine word per
 * PTRACE_POKEDATA request.
 *
 * Returns 0 on success, -1 if @bytes is not a multiple of
 * sizeof(long), -2 as soon as a PTRACE_POKEDATA request fails.
 */
int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes)
{
	unsigned long *from = src, *to = addr;
	unsigned long idx = 0;

	if ((bytes & (sizeof(long) - 1)) != 0)
		return -1;

	while (idx < bytes / sizeof(long)) {
		if (ptrace(PTRACE_POKEDATA, pid, to + idx, from[idx]) != 0)
			return -2;
		idx++;
	}

	return 0;
}
|
|
|
|
/*
 * Exchange @bytes bytes between the local buffer @src and the memory
 * of traced task @pid at remote address @dst: the remote area receives
 * the contents of @src and @src receives the previous remote contents.
 *
 * The previous remote contents are kept in a stack buffer, so don't
 * swap big space, it might overflow the stack.
 *
 * Returns 0 on success; -1 if @bytes is negative or not a multiple of
 * sizeof(long), if reading the remote area fails, or if writing fails
 * but the original remote contents were written back; -2 if writing
 * fails and writing the original contents back fails as well.
 */
int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes)
{
	void *t;

	/* Validate the size before it is handed to alloca(). */
	if (bytes < 0 || (bytes & (sizeof(long) - 1)))
		return -1;

	t = alloca(bytes);

	if (ptrace_peek_area(pid, t, dst, bytes))
		return -1;

	if (ptrace_poke_area(pid, src, dst, bytes)) {
		/* Try to put the saved remote contents back. */
		if (ptrace_poke_area(pid, t, dst, bytes))
			return -2;
		return -1;
	}

	memcpy(src, t, bytes);

	return 0;
}
|