diff --git a/Makefile.config b/Makefile.config index ce4b8d844..d2b5a3dd8 100644 --- a/Makefile.config +++ b/Makefile.config @@ -35,6 +35,9 @@ endif ifeq ($(call try-cc,$(PTRACE_PEEKSIGINFO_TEST),),y) $(Q) @echo '#define CONFIG_HAS_PEEKSIGINFO_ARGS' >> $@ endif +ifeq ($(call try-cc,$(PTRACE_SUSPEND_SECCOMP_TEST),),y) + $(Q) @echo '#define CONFIG_HAS_SUSPEND_SECCOMP' >> $@ +endif ifeq ($(VDSO),y) $(Q) @echo '#define CONFIG_VDSO' >> $@ endif diff --git a/cr-dump.c b/cr-dump.c index f865967db..ad7899865 100644 --- a/cr-dump.c +++ b/cr-dump.c @@ -19,6 +19,8 @@ #include #include +#include + #include "protobuf.h" #include "protobuf/fdinfo.pb-c.h" #include "protobuf/fs.pb-c.h" @@ -672,6 +674,12 @@ static int dump_task_core_all(struct pstree_item *item, if (ret < 0) goto err; + if (item->seccomp_mode != SECCOMP_MODE_DISABLED) { + pr_info("got seccomp mode %d for %d\n", item->seccomp_mode, item->pid.virt); + core->tc->has_seccomp_mode = true; + core->tc->seccomp_mode = item->seccomp_mode; + } + strncpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); core->tc->flags = stat->flags; core->tc->task_state = item->state; @@ -801,7 +809,7 @@ static int collect_children(struct pstree_item *item) goto free; } - ret = seize_task(pid, item->pid.real); + ret = seize_task(pid, item->pid.real, &c->seccomp_mode); if (ret < 0) { /* * Here is a race window between parse_children() and seize(), @@ -913,7 +921,14 @@ static int collect_threads(struct pstree_item *item) pr_info("\tSeizing %d's %d thread\n", item->pid.real, pid); - ret = seize_task(pid, item_ppid(item)); + /* + * FIXME: The NULL here is wrong; we really want to get the + * seccomp state of this thread, but we have nowhere to put it, + * so for now we ignore it. We should at least check to see + * that the seccomp state is the same for all threads; we'll do + * this in a future series. + */ + ret = seize_task(pid, item_ppid(item), NULL); if (ret < 0) { /* * Here is a race window between parse_threads() and seize(), @@ -1063,7 +1078,7 @@ static int collect_pstree(pid_t pid) return -1; root_item->pid.real = pid; - ret = seize_task(pid, -1); + ret = seize_task(pid, -1, &root_item->seccomp_mode); if (ret < 0) goto err; pr_info("Seized task %d, state %d\n", pid, ret); diff --git a/cr-exec.c b/cr-exec.c index 9f6ebfe04..9d7162a48 100644 --- a/cr-exec.c +++ b/cr-exec.c @@ -129,7 +129,7 @@ int cr_exec(int pid, char **opt) goto out; } - prev_state = ret = seize_task(pid, -1); + prev_state = ret = seize_task(pid, -1, NULL); if (ret < 0) { pr_err("Can't seize task %d\n", pid); goto out; diff --git a/cr-restore.c b/cr-restore.c index de3749101..756e29022 100644 --- a/cr-restore.c +++ b/cr-restore.c @@ -24,6 +24,8 @@ #include +#include + #include "ptrace.h" #include "compiler.h" #include "asm/types.h" @@ -1060,6 +1062,7 @@ static inline int fork_with_pid(struct pstree_item *item) item->state = ca.core->tc->task_state; rsti(item)->cg_set = ca.core->tc->cg_set; + item->seccomp_mode = ca.core->tc->seccomp_mode; if (item->state == TASK_DEAD) rsti(item->parent)->nr_zombies++; @@ -1682,6 +1685,17 @@ static void finalize_restore(int status) goto detach; /* Unmap the restorer blob */ + + /* + * Suspend seccomp if necessary. We need to do this because + * although seccomp is restored at the very end of the + * restorer blob (and the final sigreturn is ok), here we're + * doing an munmap in the process, which may be blocked by + * seccomp and cause the task to be killed. + */ + if (item->seccomp_mode != SECCOMP_MODE_DISABLED && suspend_seccomp(pid) < 0) + pr_err("failed to suspend seccomp, restore will probably fail...\n"); + ctl = parasite_prep_ctl(pid, NULL); if (ctl == NULL) goto detach; @@ -2886,6 +2900,8 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core) #undef remap_array + task_args->seccomp_mode = core->tc->seccomp_mode; + if (lsm) task_args->creds.lsm_profile = rst_mem_remap_ptr(lsm_pos, RM_PRIVATE); else diff --git a/include/prctl.h b/include/prctl.h index 441135e5d..bfede51d7 100644 --- a/include/prctl.h +++ b/include/prctl.h @@ -9,6 +9,9 @@ #ifndef PR_GET_NAME # define PR_GET_NAME 16 #endif +#ifndef PR_SET_SECCOMP +# define PR_SET_SECCOMP 22 +#endif #ifndef PR_CAPBSET_READ # define PR_CAPBSET_READ 23 #endif diff --git a/include/proc_parse.h b/include/proc_parse.h index ebb53511d..3daf9a285 100644 --- a/include/proc_parse.h +++ b/include/proc_parse.h @@ -84,6 +84,8 @@ struct proc_status_creds { char state; int ppid; + + int seccomp_mode; }; struct mount_info; diff --git a/include/pstree.h b/include/pstree.h index c0fdac64b..f0cd53c30 100644 --- a/include/pstree.h +++ b/include/pstree.h @@ -24,6 +24,14 @@ struct pstree_item { int state; /* TASK_XXX constants */ + /* + * We keep the seccomp mode here temporarily between seizing and + * dumping the task to avoid parsing /proc/pid/status twice. We also + * use it on restore to hold the seccomp mode so that we don't have to + * keep track of each task's core entry in the main criu process. + */ + int seccomp_mode; + int nr_threads; /* number of threads */ struct pid *threads; /* array of threads */ CoreEntry **core; diff --git a/include/ptrace.h b/include/ptrace.h index 0d89788b3..bb4411dd1 100644 --- a/include/ptrace.h +++ b/include/ptrace.h @@ -11,6 +11,10 @@ # define PTRACE_SEIZE 0x4206 #endif +#ifndef PTRACE_O_SUSPEND_SECCOMP +# define PTRACE_O_SUSPEND_SECCOMP (1 << 21) +#endif + #ifndef PTRACE_INTERRUPT # define PTRACE_INTERRUPT 0x4207 #endif @@ -62,7 +66,8 @@ struct ptrace_peeksiginfo_args { #define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8) -extern int seize_task(pid_t pid, pid_t ppid); +extern int seize_task(pid_t pid, pid_t ppid, int *seccomp_mode); +extern int suspend_seccomp(pid_t pid); extern int unseize_task(pid_t pid, int orig_state, int state); extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); diff --git a/include/restorer.h b/include/restorer.h index ab7d39d5a..dcfa1758b 100644 --- a/include/restorer.h +++ b/include/restorer.h @@ -165,6 +165,8 @@ struct task_restore_args { */ int proc_fd; + int seccomp_mode; + #ifdef CONFIG_VDSO unsigned long vdso_rt_size; struct vdso_symtable vdso_sym_rt; /* runtime vdso symbols */ diff --git a/pie/restorer.c b/pie/restorer.c index fd860ca2b..4150b4958 100644 --- a/pie/restorer.c +++ b/pie/restorer.c @@ -40,6 +40,18 @@ #define PR_SET_PDEATHSIG 1 #endif +#ifndef SECCOMP_MODE_DISABLED +#define SECCOMP_MODE_DISABLED 0 +#endif + +#ifndef SECCOMP_MODE_STRICT +#define SECCOMP_MODE_STRICT 1 +#endif + +#ifndef SECCOMP_MODE_FILTER +#define SECCOMP_MODE_FILTER 2 +#endif + #define sys_prctl_safe(opcode, val1, val2, val3) \ ({ \ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ @@ -339,6 +351,32 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group) return 0; } +static void restore_seccomp(int seccomp_mode) +{ + switch (seccomp_mode) { + case SECCOMP_MODE_DISABLED: + return; + case SECCOMP_MODE_STRICT: + if (sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0)) + goto die; + return; + case SECCOMP_MODE_FILTER: + goto die; + default: + goto die; + } + +die: + /* + * If preparing any seccomp state failed, we should make sure this + * process doesn't continue so that it can't do things outside the + * sandbox. Unfortunately, the rest of the restore has to continue + * since we're too late in the process to stop it and have unlocked the + * network. + */ + sys_exit_group(1); +} + static int restore_thread_common(struct rt_sigframe *sigframe, struct thread_restore_args *args) { @@ -417,9 +455,15 @@ long __export_restore_thread(struct thread_restore_args *args) restore_finish_stage(CR_STATE_RESTORE_SIGCHLD); restore_pdeath_sig(args); + + if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED) + pr_info("restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid()); + restore_finish_stage(CR_STATE_RESTORE_CREDS); futex_dec_and_wake(&thread_inprogress); + restore_seccomp(args->ta->seccomp_mode); + new_sp = (long)rt_sigframe + SIGFRAME_OFFSET; rst_sigreturn(new_sp); @@ -1199,6 +1243,13 @@ long __export_restore_task(struct task_restore_args *args) futex_set_and_wake(&thread_inprogress, args->nr_threads); + /* + * We have to close the log before restoring seccomp, because + * SECCOMP_MODE_STRICT blocks close(). + */ + if (args->seccomp_mode != SECCOMP_MODE_DISABLED) + pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid()); + restore_finish_stage(CR_STATE_RESTORE_CREDS); if (ret) @@ -1230,6 +1281,8 @@ long __export_restore_task(struct task_restore_args *args) sys_munmap(args->rst_mem, args->rst_mem_size); + restore_seccomp(args->seccomp_mode); + /* * Sigframe stack. */ diff --git a/proc_parse.c b/proc_parse.c index 979568923..9c14a27a3 100644 --- a/proc_parse.c +++ b/proc_parse.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "asm/types.h" #include "list.h" @@ -779,7 +780,7 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr) if (bfdopenr(&f)) return -1; - while (done < 8) { + while (done < 9) { str = breadline(&f); if (str == NULL) break; @@ -840,9 +841,22 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr) done++; } + + if (!strncmp(str, "Seccomp:", 8)) { + if (sscanf(str + 9, "%d", &cr->seccomp_mode) != 1) { + goto err_parse; + } + + if (cr->seccomp_mode == SECCOMP_MODE_FILTER) { + pr_err("SECCOMP_MODE_FILTER not currently supported\n"); + goto err_parse; + } + + done++; + } } - if (done == 8) + if (done == 9) ret = 0; err_parse: diff --git a/protobuf/core.proto b/protobuf/core.proto index 9f70da929..fd78f5cc8 100644 --- a/protobuf/core.proto +++ b/protobuf/core.proto @@ -10,6 +10,15 @@ import "siginfo.proto"; import "opts.proto"; +/* + * These match the SECCOMP_MODE_* flags from . + */ +enum seccomp_mode { + disabled = 0; + strict = 1; + filter = 2; +}; + message task_core_entry { required uint32 task_state = 1; required uint32 exit_code = 2; @@ -26,6 +35,8 @@ message task_core_entry { optional uint32 cg_set = 9; optional signal_queue_entry signals_s = 10; + + optional seccomp_mode seccomp_mode = 11; } message task_kobj_ids_entry { diff --git a/ptrace.c b/ptrace.c index be6b67b2d..f572f29a4 100644 --- a/ptrace.c +++ b/ptrace.c @@ -8,11 +8,14 @@ #include #include +#include #include #include #include #include +#include + #include "compiler.h" #include "asm/types.h" #include "util.h" @@ -30,14 +33,32 @@ int unseize_task(pid_t pid, int orig_st, int st) else if (st == TASK_STOPPED) { if (orig_st == TASK_ALIVE) kill(pid, SIGSTOP); - } else if (st == TASK_ALIVE) + } else if (st == TASK_ALIVE) { /* do nothing */ ; - else + } else pr_err("Unknown final state %d\n", st); return ptrace(PTRACE_DETACH, pid, NULL, NULL); } +#ifdef CONFIG_HAS_SUSPEND_SECCOMP +int suspend_seccomp(pid_t pid) +{ + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { + pr_perror("suspending seccomp failed"); + return -1; + } + + return 0; +} +#else +int suspend_seccomp(pid_t pid) +{ + pr_err("seccomp enabled and seccomp suspending not supported\n"); + return -1; +} +#endif + /* * This routine seizes task putting it into a special * state where we can manipulate the task via ptrace @@ -46,7 +67,7 @@ int unseize_task(pid_t pid, int orig_st, int st) * up with someone else. */ -int seize_task(pid_t pid, pid_t ppid) +int seize_task(pid_t pid, pid_t ppid, int *seccomp_mode) { siginfo_t si; int status; @@ -90,6 +111,9 @@ try_again: if (ret2) goto err; + if (seccomp_mode) + *seccomp_mode = cr.seccomp_mode; + if (!may_dump(&cr)) { pr_err("Check uid (pid: %d) failed\n", pid); goto err; @@ -142,6 +166,9 @@ try_again: goto try_again; } + if (cr.seccomp_mode != SECCOMP_MODE_DISABLED && suspend_seccomp(pid) < 0) + goto err_stop; + if (si.si_signo == SIGTRAP) return TASK_ALIVE; else if (si.si_signo == SIGSTOP) { diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index 519eb5269..ec7972a64 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -92,3 +92,14 @@ int main(int argc, char *argv[], char *envp[]) } endef + +define PTRACE_SUSPEND_SECCOMP_TEST + +#include + +int main(void) +{ + return PTRACE_O_SUSPEND_SECCOMP; +} + +endef