criu/seize.c
Andrew Vagin 391efc9d7e seize: don't wory if a cgroup contains some extra tasks (v3)
A freezer cgroup can contain tasks which will be not dumped,
criu unfreezes the group, so we need to freeze all extra
task with ptrace like we do for target tasks.

Currently we attache and send an interrupt signals to these tasks,
but we don't call waitpid() for them, so then waitpid(-1, ...)
returns these tasks where we don't expect to see them.

v2: execute freezer_detach() only if opts.freeze_cgroup is set
    calculate extra tasks in a freezer cgroup correctly
v3: s/frozen_processes/processes_to_wait/

Signed-off-by: Andrew Vagin <avagin@virtuozzo.com>
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
2015-12-14 15:55:30 +03:00

659 lines
14 KiB
C

#include <stdbool.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <time.h>
#include "compiler.h"
#include "cr_options.h"
#include "cr-errno.h"
#include "pstree.h"
#include "ptrace.h"
#include "seize.h"
#include "stats.h"
#include "xmalloc.h"
#include "util.h"
#define NR_ATTEMPTS 5
static const char frozen[] = "FROZEN";
static const char freezing[] = "FREEZING";
static const char thawed[] = "THAWED";
static const char *get_freezer_state(int fd)
{
char state[32];
int ret;
BUILD_BUG_ON((sizeof(state) < sizeof(frozen)) ||
(sizeof(state) < sizeof(freezing)) ||
(sizeof(state) < sizeof(thawed)));
lseek(fd, 0, SEEK_SET);
ret = read(fd, state, sizeof(state) - 1);
if (ret <= 0) {
pr_perror("Unable to get a current state");
goto err;
}
if (state[ret - 1] == '\n')
state[ret - 1] = 0;
else
state[ret] = 0;
pr_debug("freezer.state=%s\n", state);
if (strcmp(state, frozen) == 0)
return frozen;
else if (strcmp(state, freezing) == 0)
return freezing;
else if (strcmp(state, thawed) == 0)
return thawed;
pr_err("Unknown freezer state: %s\n", state);
err:
return NULL;
}
static bool freezer_thawed;
static int freezer_restore_state(void)
{
int fd;
char path[PATH_MAX];
if (!opts.freeze_cgroup || freezer_thawed)
return 0;
snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
fd = open(path, O_RDWR);
if (fd < 0) {
pr_perror("Unable to open %s", path);
return -1;
}
if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
pr_perror("Unable to freeze tasks");
close(fd);
return -1;
}
close(fd);
return 0;
}
static int seize_cgroup_tree(char *root_path, const char *state)
{
DIR *dir;
struct dirent *de;
char path[PATH_MAX];
FILE *f;
/*
* New tasks can appear while a freezer state isn't
* frozen, so we need to catch all new tasks.
*/
snprintf(path, sizeof(path), "%s/tasks", root_path);
f = fopen(path, "r");
if (f == NULL) {
pr_perror("Unable to open %s", path);
return -1;
}
while (fgets(path, sizeof(path), f)) {
pid_t pid;
int ret;
pid = atoi(path);
/* Here we are going to skip tasks which are already traced. */
ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
if (ret == 0)
continue;
if (errno != ESRCH) {
pr_perror("Unexpected error");
fclose(f);
return -1;
}
if (!seize_catch_task(pid)) {
pr_debug("SEIZE %d: success\n", pid);
processes_to_wait++;
} else if (state == frozen) {
char buf[] = "/proc/XXXXXXXXXX/exe";
struct stat st;
/* skip kernel threads */
snprintf(buf, sizeof(buf), "/proc/%d/exe", pid);
if (stat(buf, &st) == -1 && errno == ENOENT)
continue;
/* fails when meets a zombie */
pr_err("zombie found while seizing\n");
fclose(f);
return -1;
}
}
fclose(f);
dir = opendir(root_path);
if (!dir) {
pr_perror("Unable to open %s", root_path);
return -1;
}
while ((de = readdir(dir))) {
struct stat st;
if (dir_dots(de))
continue;
sprintf(path, "%s/%s", root_path, de->d_name);
if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) {
pr_perror("stat of %s failed", path);
closedir(dir);
return -1;
}
if (!S_ISDIR(st.st_mode))
continue;
if (seize_cgroup_tree(path, state) < 0) {
closedir(dir);
return -1;
}
}
closedir(dir);
return 0;
}
/* A number of tasks in a freezer cgroup which are not going to be dumped */
int processes_to_wait;
/*
* A freezer cgroup can contain tasks which will not be dumped
* and we need to wait them, because the are interupted them by ptrace.
*/
static int freezer_wait_processes()
{
int i;
for (i = 0; i < processes_to_wait; i++) {
int status;
pid_t pid;
/*
* Here we are going to skip tasks which are already traced.
* Ptraced tasks looks like children for us, so if
* a task isn't ptraced yet, waitpid() will return a error.
*/
pid = waitpid(-1, &status, 0);
if (pid < 0) {
pr_perror("Unable to wait processes");
return -1;
}
pr_warn("Unexpected process %d in the freezer cgroup (status 0x%x)\n", pid, status);
}
return 0;
}
static int freezer_detach(void)
{
char path[PATH_MAX];
FILE *f;
if (!opts.freeze_cgroup)
return 0;
snprintf(path, sizeof(path), "%s/tasks", opts.freeze_cgroup);
f = fopen(path, "r");
if (f == NULL) {
pr_perror("Unable to open %s", path);
return -1;
}
while (fgets(path, sizeof(path), f)) {
pid_t pid;
pid = atoi(path);
if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
pr_perror("Unable to detach from %d\n", pid);
}
fclose(f);
return 0;
}
static int freeze_processes(void)
{
int i, fd, exit_code = -1;
char path[PATH_MAX];
const char *state = thawed;
snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
fd = open(path, O_RDWR);
if (fd < 0) {
pr_perror("Unable to open %s", path);
return -1;
}
state = get_freezer_state(fd);
if (!state) {
close(fd);
return -1;
}
if (state == thawed) {
freezer_thawed = true;
lseek(fd, 0, SEEK_SET);
if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
pr_perror("Unable to freeze tasks");
close(fd);
return -1;
}
}
/*
* There is not way to wait a specified state, so we need to poll the
* freezer.state.
* Here is one extra attempt to check that everything are frozen.
*/
for (i = 0; i <= NR_ATTEMPTS; i++) {
struct timespec req = {};
u64 timeout;
if (seize_cgroup_tree(opts.freeze_cgroup, state) < 0)
goto err;
if (state == frozen)
break;
state = get_freezer_state(fd);
if (!state)
goto err;
if (state == frozen) {
/*
* Enumerate all tasks one more time to collect all new
* tasks, which can be born while the cgroup is being frozen.
*/
continue;
}
timeout = 100000000 * (i + 1); /* 100 msec */
req.tv_nsec = timeout % 1000000000;
req.tv_sec = timeout / 1000000000;
nanosleep(&req, NULL);
}
if (i > NR_ATTEMPTS) {
pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
goto err;
}
exit_code = 0;
err:
if (exit_code == 0 || freezer_thawed) {
lseek(fd, 0, SEEK_SET);
if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
pr_perror("Unable to thaw tasks");
exit_code = -1;
}
}
if (close(fd)) {
pr_perror("Unable to thaw tasks");
return -1;
}
return exit_code;
}
static inline bool child_collected(struct pstree_item *i, pid_t pid)
{
struct pstree_item *c;
list_for_each_entry(c, &i->children, sibling)
if (c->pid.real == pid)
return true;
return false;
}
static int collect_task(struct pstree_item *item);
static int collect_children(struct pstree_item *item)
{
pid_t *ch;
int ret, i, nr_children, nr_inprogress;
ret = parse_children(item->pid.real, &ch, &nr_children);
if (ret < 0)
return ret;
nr_inprogress = 0;
for (i = 0; i < nr_children; i++) {
struct pstree_item *c;
pid_t pid = ch[i];
/* Is it already frozen? */
if (child_collected(item, pid))
continue;
nr_inprogress++;
pr_info("Seized task %d, state %d\n", pid, ret);
c = alloc_pstree_item();
if (c == NULL) {
ret = -1;
goto free;
}
if (!opts.freeze_cgroup)
/* fails when meets a zombie */
seize_catch_task(pid);
ret = seize_wait_task(pid, item->pid.real, &dmpi(c)->pi_creds);
if (ret < 0) {
/*
* Here is a race window between parse_children() and seize(),
* so the task could die for these time.
* Don't worry, will try again on the next attempt. The number
* of attempts is restricted, so it will exit if something
* really wrong.
*/
ret = 0;
xfree(c);
continue;
}
c->pid.real = pid;
c->parent = item;
c->state = ret;
list_add_tail(&c->sibling, &item->children);
/* Here is a recursive call (Depth-first search) */
ret = collect_task(c);
if (ret < 0)
goto free;
}
free:
xfree(ch);
return ret < 0 ? ret : nr_inprogress;
}
static void unseize_task_and_threads(const struct pstree_item *item, int st)
{
int i;
if (item->state == TASK_DEAD)
return;
/*
* The st is the state we want to switch tasks into,
* the item->state is the state task was in when we seized one.
*/
unseize_task(item->pid.real, item->state, st);
if (st == TASK_DEAD || opts.freeze_cgroup)
return;
for (i = 1; i < item->nr_threads; i++)
if (ptrace(PTRACE_DETACH, item->threads[i].real, NULL, NULL))
pr_perror("Unable to detach from %d", item->threads[i].real);
}
static void pstree_wait(struct pstree_item *root_item)
{
struct pstree_item *item = root_item;
int pid, status, i;
for_each_pstree_item(item) {
if (item->state == TASK_DEAD)
continue;
for (i = 0; i < item->nr_threads; i++) {
pid = wait4(-1, &status, __WALL, NULL);
if (pid < 0) {
pr_perror("wait4 failed");
break;
} else {
if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) {
pr_err("Unexpected exit code %d of %d\n", status, pid);
BUG();
}
}
}
}
pid = wait4(-1, &status, __WALL, NULL);
if (pid > 0) {
pr_err("Unexpected child %d\n", pid);
BUG();
}
}
void pstree_switch_state(struct pstree_item *root_item, int st)
{
struct pstree_item *item = root_item;
if (st != TASK_DEAD)
freezer_restore_state();
pr_info("Unfreezing tasks into %d\n", st);
for_each_pstree_item(item)
unseize_task_and_threads(item, st);
freezer_detach();
if (st == TASK_DEAD)
pstree_wait(root_item);
}
static pid_t item_ppid(const struct pstree_item *item)
{
item = item->parent;
return item ? item->pid.real : -1;
}
static inline bool thread_collected(struct pstree_item *i, pid_t tid)
{
int t;
if (i->pid.real == tid) /* thread leader is collected as task */
return true;
for (t = 0; t < i->nr_threads; t++)
if (tid == i->threads[t].real)
return true;
return false;
}
static int collect_threads(struct pstree_item *item)
{
struct pid *threads = NULL;
int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
ret = parse_threads(item->pid.real, &threads, &nr_threads);
if (ret < 0)
goto err;
if ((item->state == TASK_DEAD) && (nr_threads > 1)) {
pr_err("Zombies with threads are not supported\n");
goto err;
}
/* The number of threads can't be less than allready frozen */
item->threads = xrealloc(item->threads, nr_threads * sizeof(struct pid));
if (item->threads == NULL)
return -1;
if (item->nr_threads == 0) {
item->threads[0].real = item->pid.real;
item->nr_threads = 1;
}
nr_inprogress = 0;
for (i = 0; i < nr_threads; i++) {
pid_t pid = threads[i].real;
if (thread_collected(item, pid))
continue;
nr_inprogress++;
pr_info("\tSeizing %d's %d thread\n",
item->pid.real, pid);
if (!opts.freeze_cgroup && seize_catch_task(pid))
continue;
ret = seize_wait_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
if (ret < 0) {
/*
* Here is a race window between parse_threads() and seize(),
* so the task could die for these time.
* Don't worry, will try again on the next attempt. The number
* of attempts is restricted, so it will exit if something
* really wrong.
*/
continue;
}
BUG_ON(item->nr_threads + 1 > nr_threads);
item->threads[item->nr_threads].real = pid;
item->nr_threads++;
if (ret == TASK_DEAD) {
pr_err("Zombie thread not supported\n");
goto err;
}
if (ret == TASK_STOPPED) {
nr_stopped++;
}
}
if (nr_stopped && nr_stopped != nr_inprogress) {
pr_err("Individually stopped threads not supported\n");
goto err;
}
xfree(threads);
return nr_inprogress;
err:
xfree(threads);
return -1;
}
static int collect_loop(struct pstree_item *item,
int (*collect)(struct pstree_item *))
{
int attempts = NR_ATTEMPTS, nr_inprogress = 1;
if (opts.freeze_cgroup)
attempts = 1;
/*
* While we scan the proc and seize the children/threads
* new ones can appear (with clone(CLONE_PARENT) or with
* pthread_create). Thus, after one go, we need to repeat
* the scan-and-freeze again collecting new arrivals. As
* new guys may appear again we do NR_ATTEMPTS passes and
* fail to seize the item if new tasks/threads still
* appear.
*/
while (nr_inprogress > 0 && attempts >= 0) {
attempts--;
nr_inprogress = collect(item);
}
pr_info("Collected (%d attempts, %d in_progress)\n", attempts, nr_inprogress);
/*
* We may fail to collect items or run out of attempts.
* In the former case nr_inprogress will be negative, in
* the latter -- positive. Thus it's enough just to check
* for "no more new stuff" and say "we're OK" if so.
*/
return (nr_inprogress == 0) ? 0 : -1;
}
static int collect_task(struct pstree_item *item)
{
int ret;
ret = collect_loop(item, collect_threads);
if (ret < 0)
goto err_close;
/* Depth-first search (DFS) is used for traversing a process tree. */
ret = collect_loop(item, collect_children);
if (ret < 0)
goto err_close;
if ((item->state == TASK_DEAD) && !list_empty(&item->children)) {
pr_err("Zombie with children?! O_o Run, run, run!\n");
goto err_close;
}
if (pstree_alloc_cores(item))
goto err_close;
pr_info("Collected %d in %d state\n", item->pid.real, item->state);
return 0;
err_close:
close_pid_proc();
return -1;
}
int collect_pstree(pid_t pid)
{
int ret;
timing_start(TIME_FREEZING);
if (opts.freeze_cgroup && freeze_processes())
return -1;
root_item = alloc_pstree_item();
if (root_item == NULL)
return -1;
root_item->pid.real = pid;
if (!opts.freeze_cgroup && seize_catch_task(pid)) {
set_cr_errno(ESRCH);
goto err;
}
ret = seize_wait_task(pid, -1, &dmpi(root_item)->pi_creds);
if (ret < 0)
goto err;
pr_info("Seized task %d, state %d\n", pid, ret);
root_item->state = ret;
ret = collect_task(root_item);
if (ret < 0)
goto err;
if (opts.freeze_cgroup && freezer_wait_processes())
return -1;
timing_stop(TIME_FREEZING);
timing_start(TIME_FROZEN);
return 0;
err:
pstree_switch_state(root_item, TASK_ALIVE);
return -1;
}