diff --git a/Makefile b/Makefile
index ed2a1992e..208557e33 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,6 @@ ARCH ?= $(shell uname -m | sed \
 		-e s/sun4u/sparc64/ \
 		-e s/s390x/s390/ \
 		-e s/parisc64/parisc/ \
-		-e s/ppc.*/powerpc/ \
 		-e s/mips.*/mips/ \
 		-e s/sh[234].*/sh/)

@@ -85,6 +84,20 @@ ifeq ($(SRCARCH),arm)
 export PROTOUFIX
 endif

+#
+# The PowerPC 64-bit architecture can be big or little endian.
+# Both are handled in the same way.
+#
+ifeq ($(shell echo $(ARCH) | sed -e 's/ppc64.*/ppc64/'),ppc64)
+	ifeq ($(ARCH),ppc64)
+		error := $(error ppc64 big endian not yet supported)
+	endif
+	SRCARCH	:= ppc64
+	DEFINES	:= -DCONFIG_PPC64
+	LDARCH	:= powerpc:common64
+	VDSO	:= y
+endif
+
 SRCARCH		?= $(ARCH)
 LDARCH		?= $(SRCARCH)

@@ -193,6 +206,9 @@ PROGRAM-BUILTINS	+= $(ARCH_DIR)/vdso-pie.o
 ifeq ($(SRCARCH),aarch64)
 PROGRAM-BUILTINS	+= $(ARCH_DIR)/intraprocedure.o
 endif
+ifeq ($(SRCARCH),ppc64)
+PROGRAM-BUILTINS	+= $(ARCH_DIR)/vdso-trampoline.o
+endif
 endif

 PROGRAM-BUILTINS	+= pie/util-fd.o
diff --git a/arch/ppc64/Makefile b/arch/ppc64/Makefile
new file mode 100644
index 000000000..c5d332364
--- /dev/null
+++ b/arch/ppc64/Makefile
@@ -0,0 +1,55 @@
+targets		+= syscalls
+targets		+= crtools
+
+SYS-ASM		:= syscalls.S
+
+syscalls-asm-y	+= $(SYS-ASM:.S=).o
+crtools-obj-y	+= crtools.o
+crtools-obj-y	+= cpu.o
+
+SYS-DEF		:= syscall-ppc64.def
+SYS-ASM-COMMON	:= syscall-common-ppc64.S
+
+SYS-TYPES	:= include/syscall-types.h
+SYS-CODES	:= include/syscall-codes.h
+SYS-PROTO	:= include/syscall.h
+
+SYS-GEN		:= syscalls-ppc64.sh
+
+SYS-EXEC-TBL	:= sys-exec-tbl.c
+
+syscalls-asm-y-asmflags	:= -fpie -Wstrict-prototypes -Wa,--noexecstack
+syscalls-asm-y-asmflags	+= -nostdlib -fomit-frame-pointer -I$(obj)
+
+ASMFLAGS	+= -D__ASSEMBLY__
+
+$(obj)/$(SYS-ASM): $(obj)/$(SYS-GEN) $(obj)/$(SYS-DEF) $(obj)/$(SYS-ASM-COMMON) $(SYS-TYPES)
+	$(E) "  GEN     " $@
+	$(Q) $(SH)				\
+		$(obj)/$(SYS-GEN) --asm		\
+		$(obj)/$(SYS-DEF)		\
+		$(SYS-CODES)			\
+		$(SYS-PROTO)			\
+		$(obj)/$(SYS-ASM)		\
+		$(SYS-ASM-COMMON)		\
+		$(SYS-TYPES)
+
+$(obj)/syscalls.o: $(obj)/$(SYS-ASM)
+
+$(obj)/$(SYS-EXEC-TBL): $(obj)/$(SYS-GEN) $(obj)/$(SYS-DEF)
+	$(E) "  GEN     " $@
+	$(Q) $(SH)				\
+		$(obj)/$(SYS-GEN) --exec	\
+		$(obj)/$(SYS-DEF)		\
+		$(obj)/$(SYS-EXEC-TBL)
+
+_all += $(obj)/$(SYS-EXEC-TBL)
+
+cleanup-y += $(obj)/$(SYS-EXEC-TBL) $(obj)/$(SYS-ASM)
+cleanup-y += $(SYS-CODES)
+cleanup-y += $(SYS-PROTO)
+
+ifneq ($(MAKECMDGOALS),clean)
+deps-after := $(obj)/$(SYS-ASM)
+incdeps := y
+endif
diff --git a/arch/ppc64/cpu.c b/arch/ppc64/cpu.c
new file mode 100644
index 000000000..040fe14fc
--- /dev/null
+++ b/arch/ppc64/cpu.c
@@ -0,0 +1,45 @@
+#undef	LOG_PREFIX
+#define LOG_PREFIX "cpu: "
+
+#include
+#include "cpu.h"
+
+bool cpu_has_feature(unsigned int feature)
+{
+	return false;
+}
+
+int cpu_init(void)
+{
+	return 0;
+}
+
+int cpu_dump_cpuinfo(void)
+{
+	return 0;
+}
+
+int cpu_validate_cpuinfo(void)
+{
+	return 0;
+}
+
+int cpu_dump_cpuinfo_single(void)
+{
+	return -ENOTSUP;
+}
+
+int cpu_validate_image_cpuinfo_single(void)
+{
+	return -ENOTSUP;
+}
+
+int cpuinfo_dump(void)
+{
+	return -ENOTSUP;
+}
+
+int cpuinfo_check(void)
+{
+	return -ENOTSUP;
+}
diff --git a/arch/ppc64/crtools.c b/arch/ppc64/crtools.c
new file mode 100644
index 000000000..31cef5d22
--- /dev/null
+++ b/arch/ppc64/crtools.c
@@ -0,0 +1,293 @@
+#include
+#include
+#include
+#include
+
+#include "asm/types.h"
+#include "asm/fpu.h"
+#include "asm/restorer.h"
+
+#include "cr_options.h"
+#include "compiler.h"
+#include "ptrace.h"
+#include "parasite-syscall.h"
+#include "syscall.h"
+#include "log.h"
+#include "util.h" +#include "cpu.h" +#include "errno.h" + +#include "protobuf.h" +#include "protobuf/core.pb-c.h" +#include "protobuf/creds.pb-c.h" + +/* + * Injected syscall instruction + */ +const u32 code_syscall[] = { + 0x44000002, /* sc */ + 0x0fe00000 /* twi 31,0,0 */ +}; + +const int code_syscall_size = sizeof(code_syscall); + +static inline void __check_code_syscall(void) +{ + BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->nip = new_ip; + if (stack) + regs->gpr[1] = (unsigned long) stack; + regs->trap = 0; +} + +bool arch_can_dump_task(pid_t pid) +{ + /* + * TODO: We should detect 32bit task when BE support is done. + */ + return true; +} + +int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.gpr[0] = (unsigned long)nr; + regs.gpr[3] = arg1; + regs.gpr[4] = arg2; + regs.gpr[5] = arg3; + regs.gpr[6] = arg4; + regs.gpr[7] = arg5; + regs.gpr[8] = arg6; + + err = __parasite_execute_syscall(ctl, ®s); + + *ret = regs.gpr[3]; + return err; +} + +/* This is the layout of the POWER7 VSX registers and the way they + * overlap with the existing FPR and VMX registers. + * + * VSR doubleword 0 VSR doubleword 1 + * ---------------------------------------------------------------- + * VSR[0] | FPR[0] | | + * ---------------------------------------------------------------- + * VSR[1] | FPR[1] | | + * ---------------------------------------------------------------- + * | ... | | + * ---------------------------------------------------------------- + * VSR[30] | FPR[30] | | + * ---------------------------------------------------------------- + * VSR[31] | FPR[31] | | + * ---------------------------------------------------------------- + * VSR[32] | VR[0] | + * ---------------------------------------------------------------- + * VSR[33] | VR[1] | + * ---------------------------------------------------------------- + * | ... | + * ---------------------------------------------------------------- + * VSR[62] | VR[30] | + * ---------------------------------------------------------------- + * VSR[63] | VR[31] | + * ---------------------------------------------------------------- + * + * PTRACE_GETFPREGS returns FPR[0..31] + FPSCR + * PTRACE_GETVRREGS returns VR[0..31] + VSCR + VRSAVE + * PTRACE_GETVSRREGS returns VSR[0..31] + * + * PTRACE_GETVSRREGS and PTRACE_GETFPREGS are required since we need + * to save FPSCR too. 
+ */
+static int get_fpu_regs(pid_t pid, CoreEntry *core)
+{
+	elf_fpregset_t fpregs;
+	UserPpc64FpstateEntry *fpe;
+	int i;
+
+	if (ptrace(PTRACE_GETFPREGS, pid, 0, (void *)&fpregs) < 0) {
+		pr_err("Couldn't get floating-point registers\n");
+		return -1;
+	}
+
+	fpe = xmalloc(sizeof(UserPpc64FpstateEntry));
+	if (!fpe)
+		return -1;
+	user_ppc64_fpstate_entry__init(fpe);
+
+	fpe->n_fpregs = NFPREG;
+	fpe->fpregs = xmalloc(fpe->n_fpregs * sizeof(fpe->fpregs[0]));
+	if (!fpe->fpregs) {
+		xfree(fpe);
+		return -1;
+	}
+
+	/* FPSCR is the last (33rd) register in the set */
+	for (i = 0; i < fpe->n_fpregs; i++)
+		fpe->fpregs[i] = fpregs[i];
+
+	core->ti_ppc64->fpstate = fpe;
+	return 0;
+}
+
+static void put_fpu_regs(mcontext_t *mc, UserPpc64FpstateEntry *fpe)
+{
+	int i;
+
+	for (i = 0; i < fpe->n_fpregs; i++)
+		mc->fp_regs[i] = (double)(fpe->fpregs[i]);
+}
+
+int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
+{
+	int i;
+
+	pr_info("Dumping GP/FPU registers for %d\n", pid);
+
+	/*
+	 * This is inspired by the kernel function check_syscall_restart in
+	 * arch/powerpc/kernel/signal.c
+	 */
+#ifndef TRAP
+#define TRAP(r)	((r).trap & ~0xF)
+#endif
+
+	if (TRAP(regs) == 0x0C00 && regs.ccr & 0x10000000) {
+		/* Restart the system call */
+		switch (regs.gpr[3]) {
+		case ERESTARTNOHAND:
+		case ERESTARTSYS:
+		case ERESTARTNOINTR:
+			regs.gpr[3] = regs.orig_gpr3;
+			regs.nip -= 4;
+			break;
+		case ERESTART_RESTARTBLOCK:
+			regs.gpr[0] = __NR_restart_syscall;
+			regs.nip -= 4;
+			break;
+		}
+	}
+
+	/* Resetting trap since we are now coming from user space. */
+	regs.trap = 0;
+
+#define assign_reg(dst, src, e) do {			\
+	dst->e = (__typeof__(dst->e))src.e;		\
+} while (0)
+
+	for (i = 0; i < 32; i++)
+		assign_reg(core->ti_ppc64->gpregs, regs, gpr[i]);
+
+	assign_reg(core->ti_ppc64->gpregs, regs, nip);
+	assign_reg(core->ti_ppc64->gpregs, regs, msr);
+	assign_reg(core->ti_ppc64->gpregs, regs, orig_gpr3);
+	assign_reg(core->ti_ppc64->gpregs, regs, ctr);
+	assign_reg(core->ti_ppc64->gpregs, regs, link);
+	assign_reg(core->ti_ppc64->gpregs, regs, xer);
+	assign_reg(core->ti_ppc64->gpregs, regs, ccr);
+	assign_reg(core->ti_ppc64->gpregs, regs, trap);
+#undef assign_reg
+
+	if (get_fpu_regs(pid, core))
+		return -1;
+
+	return 0;
+}
+
+int arch_alloc_thread_info(CoreEntry *core)
+{
+	ThreadInfoPpc64 *ti_ppc64;
+	UserPpc64RegsEntry *regs;
+
+	ti_ppc64 = xmalloc(sizeof(*ti_ppc64));
+	if (!ti_ppc64)
+		goto err;
+	thread_info_ppc64__init(ti_ppc64);
+	CORE_THREAD_ARCH_INFO(core) = ti_ppc64;
+
+	/* user_ppc64_regs_entry */
+	regs = xmalloc(sizeof(*regs));
+	if (!regs)
+		goto err;
+	user_ppc64_regs_entry__init(regs);
+
+	regs->gpr = xmalloc(32 * sizeof(uint64_t));
+	if (!regs->gpr)
+		goto err;
+	regs->n_gpr = 32;
+
+	ti_ppc64->gpregs = regs;
+
+	return 0;
+err:
+	return -1;
+}
+
+void arch_free_thread_info(CoreEntry *core)
+{
+	if (CORE_THREAD_ARCH_INFO(core)) {
+		if (CORE_THREAD_ARCH_INFO(core)->fpstate) {
+			xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->fpregs);
+			xfree(CORE_THREAD_ARCH_INFO(core)->fpstate);
+		}
+		xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->gpr);
+		xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
+		xfree(CORE_THREAD_ARCH_INFO(core));
+		CORE_THREAD_ARCH_INFO(core) = NULL;
+	}
+}
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+	if (CORE_THREAD_ARCH_INFO(core)->fpstate)
+		put_fpu_regs(&sigframe->uc.uc_mcontext,
+			     CORE_THREAD_ARCH_INFO(core)->fpstate);
+	return 0;
+}
+
+int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r)
+{
+	int i;
+
+	/* r0 to r31 */
+	for (i = 0; i < 32; i++)
+		f->uc.uc_mcontext.gp_regs[i] = r->gpr[i];
+
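+	/* The special-purpose registers live past the 32 GPRs in gp_regs[];
+	 * the PT_* constants below are their indices as defined by the kernel
+	 * in arch/powerpc/include/uapi/asm/ptrace.h. */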
+	f->uc.uc_mcontext.gp_regs[PT_NIP] = r->nip;
+	f->uc.uc_mcontext.gp_regs[PT_MSR] = r->msr;
+	f->uc.uc_mcontext.gp_regs[PT_ORIG_R3] = r->orig_gpr3;
+	f->uc.uc_mcontext.gp_regs[PT_CTR] = r->ctr;
+	f->uc.uc_mcontext.gp_regs[PT_LNK] = r->link;
+	f->uc.uc_mcontext.gp_regs[PT_XER] = r->xer;
+	f->uc.uc_mcontext.gp_regs[PT_CCR] = r->ccr;
+	f->uc.uc_mcontext.gp_regs[PT_TRAP] = r->trap;
+
+	return 0;
+}
+
+void *mmap_seized(struct parasite_ctl *ctl,
+		  void *addr, size_t length, int prot,
+		  int flags, int fd, off_t offset)
+{
+	unsigned long map = 0;
+	int err;
+
+	err = syscall_seized(ctl, __NR_mmap, &map,
+			     (unsigned long)addr, length, prot, flags, fd, offset);
+	if (err < 0 || (long)map < 0)
+		map = 0;
+
+	return (void *)map;
+}
diff --git a/arch/ppc64/include/asm/atomic.h b/arch/ppc64/include/asm/atomic.h
new file mode 100644
index 000000000..4fa33b1c7
--- /dev/null
+++ b/arch/ppc64/include/asm/atomic.h
@@ -0,0 +1,112 @@
+#ifndef __CR_ATOMIC_H__
+#define __CR_ATOMIC_H__
+
+/*
+ * PowerPC atomic operations
+ *
+ * Copied from kernel header file arch/powerpc/include/asm/atomic.h
+ */
+
+typedef struct {
+	int counter;
+} atomic_t;
+
+#include "asm/cmpxchg.h"
+
+#define PPC_ATOMIC_ENTRY_BARRIER	"lwsync	\n"
+#define PPC_ATOMIC_EXIT_BARRIER		"sync	\n"
+
+#define ATOMIC_INIT(i)		{ (i) }
+
+static __inline__ int atomic_read(const atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__("lwz%U1%X1 %0,%1" : "=r"(t) : "m"(v->counter));
+
+	return t;
+}
+
+static __inline__ void atomic_set(atomic_t *v, int i)
+{
+	__asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i));
+}
+
+#define ATOMIC_OP(op, asm_op)						\
+static __inline__ void atomic_##op(int a, atomic_t *v)			\
+{									\
+	int t;								\
+									\
+	__asm__ __volatile__(						\
+"1:	lwarx	%0,0,%3		# atomic_" #op "\n"			\
+	#asm_op " %0,%2,%0\n"						\
+"	stwcx.	%0,0,%3 \n"						\
+"	bne-	1b\n"							\
+	: "=&r" (t), "+m" (v->counter)					\
+	: "r" (a), "r" (&v->counter)					\
+	: "cc");							\
+}									\
+
+ATOMIC_OP(add, add)
+ATOMIC_OP(sub, subf)
+
+#undef ATOMIC_OP
+
+static __inline__ void atomic_inc(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%2		# atomic_inc\n\
+	addic	%0,%0,1\n"
+"	stwcx.	%0,0,%2 \n\
+	bne-	1b"
+	: "=&r" (t), "+m" (v->counter)
+	: "r" (&v->counter)
+	: "cc", "xer");
+}
+
+static __inline__ int atomic_inc_return(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+	PPC_ATOMIC_ENTRY_BARRIER \
+"1:	lwarx	%0,0,%1		# atomic_inc_return\n\
+	addic	%0,%0,1\n"
+"	stwcx.	%0,0,%1 \n\
+	bne-	1b \n" \
+	PPC_ATOMIC_EXIT_BARRIER
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "xer", "memory");
+
+	return t;
+}
+
+/*
+ * atomic_inc_and_test - increment and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+
+static __inline__ void atomic_dec(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%2		# atomic_dec\n\
+	addic	%0,%0,-1\n"
+"	stwcx.	%0,0,%2\n\
+	bne-	1b"
+	: "=&r" (t), "+m" (v->counter)
+	: "r" (&v->counter)
+	: "cc", "xer");
+}
+
+#define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
+
+#endif /* __CR_ATOMIC_H__ */
diff --git a/arch/ppc64/include/asm/bitops.h b/arch/ppc64/include/asm/bitops.h
new file mode 100644
index 000000000..f310c5284
--- /dev/null
+++ b/arch/ppc64/include/asm/bitops.h
@@ -0,0 +1,11 @@
+#ifndef __CR_BITOPS_H__
+#define __CR_BITOPS_H__
+
+#include "compiler.h"
+/*
+ * TODO: create some optimized version instead of falling back to the
+ * generic ones.
+ */ +#include "asm-generic/bitops.h" + +#endif /* __CR_BITOPS_H__ */ diff --git a/arch/ppc64/include/asm/bitsperlong.h b/arch/ppc64/include/asm/bitsperlong.h new file mode 100644 index 000000000..d95727d19 --- /dev/null +++ b/arch/ppc64/include/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 64 + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/arch/ppc64/include/asm/cmpxchg.h b/arch/ppc64/include/asm/cmpxchg.h new file mode 100644 index 000000000..b93fbdef0 --- /dev/null +++ b/arch/ppc64/include/asm/cmpxchg.h @@ -0,0 +1,96 @@ +#ifndef __CR_CMPXCHG_H__ +#define __CR_CMPXCHG_H__ + +/* + * Copied from kernel header file arch/powerpc/include/asm/cmpxchg.h + */ + +#define PPC_ACQUIRE_BARRIER "isync \n" +#define PPC_RELEASE_BARRIER "lwsync \n" + +/* + * Compare and exchange - if *p == old, set it to new, + * and return the old value of *p. + */ + +static __always_inline unsigned long +__cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) +{ + unsigned int prev; + + __asm__ __volatile__ ( + PPC_RELEASE_BARRIER \ +"1: lwarx %0,0,%2 # __cmpxchg_u32\n\ + cmpw 0,%0,%3\n\ + bne- 2f\n" +" stwcx. %4,0,%2\n\ + bne- 1b \n" \ + PPC_ACQUIRE_BARRIER + "\n\ +2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new) +{ + unsigned long prev; + + __asm__ __volatile__ ( + PPC_RELEASE_BARRIER \ +"1: ldarx %0,0,%2 # __cmpxchg_u64\n\ + cmpd 0,%0,%3\n\ + bne- 2f\n\ + stdcx. %4,0,%2\n\ + bne- 1b \n" \ + PPC_ACQUIRE_BARRIER + "\n\ +2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +/* This function doesn't exist, so you'll get a linker error + if something tries to do an invalid cmpxchg(). 
+#ifdef CR_DEBUG
+static inline void __cmpxchg_called_with_bad_pointer(void)
+{
+	__asm__ __volatile__ (
+		"1:	twi 31,0,0	# trap\n"
+		"	b 1b"
+		: : : "memory");
+}
+#else
+extern void __cmpxchg_called_with_bad_pointer(void);
+#endif
+
+static __always_inline unsigned long
+__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
+	  unsigned int size)
+{
+	switch (size) {
+	case 4:
+		return __cmpxchg_u32(ptr, old, new);
+	case 8:
+		return __cmpxchg_u64(ptr, old, new);
+	}
+	__cmpxchg_called_with_bad_pointer();
+	return old;
+}
+
+#define cmpxchg(ptr, o, n)						 \
+  ({									 \
+     __typeof__(*(ptr)) _o_ = (o);					 \
+     __typeof__(*(ptr)) _n_ = (n);					 \
+     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,		 \
+				    (unsigned long)_n_, sizeof(*(ptr))); \
+  })
+
+#endif /* __CR_CMPXCHG_H__ */
diff --git a/arch/ppc64/include/asm/cpu.h b/arch/ppc64/include/asm/cpu.h
new file mode 100644
index 000000000..59118c211
--- /dev/null
+++ b/arch/ppc64/include/asm/cpu.h
@@ -0,0 +1 @@
+#include
diff --git a/arch/ppc64/include/asm/dump.h b/arch/ppc64/include/asm/dump.h
new file mode 100644
index 000000000..1505fd298
--- /dev/null
+++ b/arch/ppc64/include/asm/dump.h
@@ -0,0 +1,11 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+
+#define core_put_tls(core, tls)
+
+#endif
diff --git a/arch/ppc64/include/asm/fpu.h b/arch/ppc64/include/asm/fpu.h
new file mode 100644
index 000000000..7f476d541
--- /dev/null
+++ b/arch/ppc64/include/asm/fpu.h
@@ -0,0 +1,4 @@
+#ifndef __CR_ASM_FPU_H__
+#define __CR_ASM_FPU_H__
+
+#endif /* __CR_ASM_FPU_H__ */
diff --git a/arch/ppc64/include/asm/int.h b/arch/ppc64/include/asm/int.h
new file mode 100644
index 000000000..642804e9b
--- /dev/null
+++ b/arch/ppc64/include/asm/int.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_INT_H__
+#define __CR_ASM_INT_H__
+
+#include "asm-generic/int.h"
+
+#endif /* __CR_ASM_INT_H__ */
diff --git a/arch/ppc64/include/asm/linkage.h b/arch/ppc64/include/asm/linkage.h
new file mode 100644
index 000000000..03e01dc96
--- /dev/null
+++ b/arch/ppc64/include/asm/linkage.h
@@ -0,0 +1,20 @@
+#ifndef __CR_LINKAGE_H__
+#define __CR_LINKAGE_H__
+
+#ifdef __ASSEMBLY__
+
+#define GLOBAL(name)		\
+	.globl name;		\
+	name:
+
+#define ENTRY(name)		\
+	.globl	name;		\
+	.type	name, @function; \
+	name:
+
+#define END(sym)		\
+	.size	sym, . - sym
+
+#endif  /* __ASSEMBLY__ */
+
+#endif /* __CR_LINKAGE_H__ */
diff --git a/arch/ppc64/include/asm/page.h b/arch/ppc64/include/asm/page.h
new file mode 100644
index 000000000..169c6943d
--- /dev/null
+++ b/arch/ppc64/include/asm/page.h
@@ -0,0 +1,23 @@
+#ifndef __CR_ASM_PAGE_H__
+#define __CR_ASM_PAGE_H__
+
+/*
+ * The default config for Pseries is to use 64K pages.
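+ * (A PAGE_SHIFT of 16 below gives 1UL << 16 = 64KB; a kernel built with
+ * 4KB pages would need PAGE_SHIFT 12 here instead.)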
+ * See kernel file arch/powerpc/configs/pseries_*defconfig
+ */
+#ifndef PAGE_SHIFT
+# define PAGE_SHIFT	16
+#endif
+
+#ifndef PAGE_SIZE
+# define PAGE_SIZE	(1UL << PAGE_SHIFT)
+#endif
+
+#ifndef PAGE_MASK
+# define PAGE_MASK	(~(PAGE_SIZE - 1))
+#endif
+
+#define PAGE_PFN(addr)	((addr) / PAGE_SIZE)
+#define page_size()	PAGE_SIZE
+
+#endif /* __CR_ASM_PAGE_H__ */
diff --git a/arch/ppc64/include/asm/parasite-syscall.h b/arch/ppc64/include/asm/parasite-syscall.h
new file mode 100644
index 000000000..7665e207b
--- /dev/null
+++ b/arch/ppc64/include/asm/parasite-syscall.h
@@ -0,0 +1,17 @@
+#ifndef __CR_ASM_PARASITE_SYSCALL_H__
+#define __CR_ASM_PARASITE_SYSCALL_H__
+
+struct parasite_ctl;
+
+#define ARCH_SI_TRAP TRAP_BRKPT
+
+extern const char code_syscall[];
+extern const int code_syscall_size;
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
+
+void *mmap_seized(struct parasite_ctl *ctl,
+		  void *addr, size_t length, int prot,
+		  int flags, int fd, off_t offset);
+
+#endif
diff --git a/arch/ppc64/include/asm/parasite.h b/arch/ppc64/include/asm/parasite.h
new file mode 100644
index 000000000..fdbc340b0
--- /dev/null
+++ b/arch/ppc64/include/asm/parasite.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_PARASITE_H__
+#define __ASM_PARASITE_H__
+
+/* TLS is accessed through r13, which is already processed */
+static inline void arch_get_tls(tls_t *ptls) { (void)ptls; }
+
+#endif
diff --git a/arch/ppc64/include/asm/prlimit.h b/arch/ppc64/include/asm/prlimit.h
new file mode 100644
index 000000000..6746ba0e6
--- /dev/null
+++ b/arch/ppc64/include/asm/prlimit.h
@@ -0,0 +1,14 @@
+#ifndef __CR_PRLIMIT_H__
+#define __CR_PRLIMIT_H__
+
+#include
+#include
+#include
+
+#include "config.h"
+
+#ifndef CONFIG_HAS_PRLIMIT
+extern int prlimit(pid_t pid, int resource, const struct rlimit *new_rlimit, struct rlimit *old_rlimit);
+#endif
+
+#endif /* __CR_PRLIMIT_H__ */
diff --git a/arch/ppc64/include/asm/processor-flags.h b/arch/ppc64/include/asm/processor-flags.h
new file mode 100644
index 000000000..c1888af36
--- /dev/null
+++ b/arch/ppc64/include/asm/processor-flags.h
@@ -0,0 +1,4 @@
+#ifndef __CR_PROCESSOR_FLAGS_H__
+#define __CR_PROCESSOR_FLAGS_H__
+
+#endif
diff --git a/arch/ppc64/include/asm/restore.h b/arch/ppc64/include/asm/restore.h
new file mode 100644
index 000000000..3ca0c534d
--- /dev/null
+++ b/arch/ppc64/include/asm/restore.h
@@ -0,0 +1,33 @@
+#ifndef __CR_ASM_RESTORE_H__
+#define __CR_ASM_RESTORE_H__
+
+#include "asm/restorer.h"
+
+#include "protobuf/core.pb-c.h"
+
+/*
+ * Set R2 to blob + 0x8000, which is the default value
+ * Jump to restore_task_exec_start + 8 since R2 is already set (local call)
+ */
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start,	\
+			      task_args)			\
+	asm volatile(						\
+		"mr	1,%0	\n"				\
+		"mr	3,%1	\n"				\
+		"mtctr	3	\n"				\
+		"mr	3,%2	\n"				\
+		"mr	2,%3	\n"				\
+		"bctr		\n"				\
+		:						\
+		: "r"(new_sp),					\
+		  "r"((unsigned long)restore_task_exec_start),	\
+		  "r"(task_args),				\
+		  "r"((unsigned long)task_args->bootstrap_start + 0x8000) \
+		: "sp", "1", "2", "3", "memory")
+
+/* There is nothing to do since TLS is accessed through r13 */
+#define core_get_tls(pcore, ptls)
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
+
+#endif /* __CR_ASM_RESTORE_H__ */
diff --git a/arch/ppc64/include/asm/restorer.h b/arch/ppc64/include/asm/restorer.h
new file mode 100644
index 000000000..0549992de
--- /dev/null
+++ b/arch/ppc64/include/asm/restorer.h
@@ -0,0 +1,136 @@
+#ifndef __CR_ASM_RESTORER_H__
+#define __CR_ASM_RESTORER_H__
+
+#include
+#include
+#include
+
+/*
+ * The sigcontext structure is defined in the file
+ * /usr/include/powerpc64le-linux-gnu/bits/sigcontext.h,
+ * included from /usr/include/signal.h
+ *
+ * The kernel definition can be found in arch/powerpc/include/uapi/asm/sigcontext.h
+ */
+#include
+
+// XXX: the identifier rt_sigcontext is expected to be a struct by the CRIU code
+#define rt_sigcontext sigcontext
+
+#include "sigframe.h"
+#define SIGFRAME_OFFSET 0
+
+/* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */
+#define USER_REDZONE_SIZE	512
+
+/* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */
+#define TRAMP_SIZE		6
+
+/*
+ * ucontext is defined in /usr/include/powerpc64le-linux-gnu/sys/ucontext.h
+ */
+struct rt_sigframe {
+	/* sys_rt_sigreturn requires the ucontext be the first field */
+	struct ucontext uc;
+#if 1
+	/*
+	 * XXX: Assuming that transactional memory is enabled by default in
+	 * most Linux distributions.
+	 */
+	struct ucontext uc_transact;
+#endif
+	unsigned long _unused[2];
+	unsigned int tramp[TRAMP_SIZE];
+	struct rt_siginfo *pinfo;
+	void *puc;
+	struct rt_siginfo info;
+	/* New 64 bit little-endian ABI allows a redzone of 512 bytes below sp */
+	char abigap[USER_REDZONE_SIZE];
+} __attribute__ ((aligned (16)));
+
+#define ARCH_RT_SIGRETURN(new_sp)				\
+	asm volatile(						\
+		"mr	1, %0 \n"				\
+		"li	0, "__stringify(__NR_rt_sigreturn)" \n"	\
+		"sc	\n"					\
+		:						\
+		: "r"(new_sp)					\
+		: "1", "memory")
+
+/*
+ * Clone trampoline
+ *
+ * See glibc sysdeps/powerpc/powerpc64/sysdep.h for the FRAME_MIN_SIZE defines
+ */
+#if _CALL_ELF != 2
+#error Only supporting ABIv2.
+#else
+#define FRAME_MIN_SIZE_PARM	96
+#endif
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid,	\
+			     thread_args, clone_restore_fn)		\
+	asm volatile( 							\
+		"clone_emul:					\n"	\
+		"/* Save fn, args, stack across syscall. */	\n"	\
+		"mr	14, %5	/* clone_restore_fn in r14 */	\n"	\
+		"mr	15, %6	/* &thread_args[i] in r15 */	\n"	\
+		"mr	3, %1	/* clone_flags */		\n"	\
+		"ld	4, %2	/* new_sp */			\n"	\
+		"mr	5, %3	/* &parent_tid */		\n"	\
+		"li	6, 0	/* tls = 0 ? */			\n"	\
+		"mr	7, %4	/* &thread_args[i].pid */	\n"	\
+		"li	0,"__stringify(__NR_clone)"		\n"	\
+		"sc						\n"	\
+		"/* Check for child process. */			\n"	\
*/ \n" \ + "cmpdi cr1,3,0 \n" \ + "crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \ + "bne- cr1,clone_end \n" \ + "/* child */ \n" \ + "addi 14, 14, 8 /* jump over r2 fixup */ \n" \ + "mtctr 14 \n" \ + "mr 3,15 \n" \ + "bctr \n" \ + "clone_end: \n" \ + "mr %0,3 \n" \ + : "=r"(ret) /* %0 */ \ + : "r"(clone_flags), /* %1 */ \ + "m"(new_sp), /* %2 */ \ + "r"(&parent_tid), /* %3 */ \ + "r"(&thread_args[i].pid), /* %4 */ \ + "r"(clone_restore_fn), /* %5 */ \ + "r"(&thread_args[i]) /* %6 */ \ + : "memory","0","3","4","5","6","7","14","15") + +#define RT_SIGFRAME_UC(rt_sigframe) rt_sigframe->uc +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.gp_regs[PT_NIP]) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) +#define RT_SIGFRAME_FPU(rt_sigframe) ((rt_sigframe)->uc.uc_mcontext) + +int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r); +int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r); + +/* Nothing to do, TLS is accessed through r13 */ +static inline void restore_tls(tls_t *ptls) { (void)ptls; } + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +static inline int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, + mcontext_t *sigcontext) +{ + return 0; +} + +/* + * Defined in arch/ppc64/syscall-common-ppc64.S + */ +int sys_shmat(int shmid, const void *shmaddr, int shmflg); + +#endif /*__CR_ASM_RESTORER_H__*/ diff --git a/arch/ppc64/include/asm/string.h b/arch/ppc64/include/asm/string.h new file mode 100644 index 000000000..034442781 --- /dev/null +++ b/arch/ppc64/include/asm/string.h @@ -0,0 +1,11 @@ +#ifndef __CR_ASM_STRING_H__ +#define __CR_ASM_STRING_H__ + +#include "compiler.h" + +/* + * TODO : We may optimized some code here instead of using the generic ones. 
+ */ +#include "asm-generic/string.h" + +#endif /* __CR_ASM_STRING_H__ */ diff --git a/arch/ppc64/include/asm/types.h b/arch/ppc64/include/asm/types.h new file mode 100644 index 000000000..67b7fe2ec --- /dev/null +++ b/arch/ppc64/include/asm/types.h @@ -0,0 +1,111 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include +#include "protobuf/core.pb-c.h" + +#include "asm/page.h" +#include "asm/bitops.h" +#include "asm/int.h" + +/* + * Copied from kernel header include/uapi/asm-generic/signal-defs.h + */ +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define SIGMAX_OLD 31 +#define SIGMAX 64 + +/*Copied from the Linux kernel arch/powerpc/include/uapi/asm/signal.h */ +#define _KNSIG 64 +#define _NSIG_BPW 64 +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + uint64_t sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +static inline void ksigfillset(k_rtsigset_t *set) +{ + int i; + for (i = 0; i < _KNSIG_WORDS; i++) + set->sig[i] = (unsigned long)-1; +} + +/* Copied from the Linux kernel arch/powerpc/include/uapi/asm/signal.h */ +#define SA_RESTORER 0x04000000U + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; /* mask last for extensibility */ +} rt_sigaction_t; + +/* + * Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h + */ +typedef struct { + unsigned long gpr[32]; + unsigned long nip; + unsigned long msr; + unsigned long orig_gpr3; /* Used for restarting system calls */ + unsigned long ctr; + unsigned long link; + unsigned long xer; + unsigned long ccr; + unsigned long softe; /* Soft enabled/disabled */ + unsigned long trap; /* Reason for being here */ + /* N.B. for critical exceptions on 4xx, the dar and dsisr + fields are overloaded to hold srr0 and srr1. */ + unsigned long dar; /* Fault registers */ + unsigned long dsisr; /* on 4xx/Book-E used for ESR */ + unsigned long result; /* Result of a system call */ +} user_regs_struct_t; + +typedef UserPpc64RegsEntry UserRegsEntry; + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__PPC64 + +#define ASSIGN_TYPED(a, b) do { a = (typeof(a))b; } while (0) +#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0) + +#define REG_RES(regs) ((u64)(regs).gpr[3]) +#define REG_IP(regs) ((u64)(regs).nip) +#define REG_SYSCALL_NR(regs) ((u64)(regs).gpr[0]) + + +#define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64 + +/* + * Copied from the following kernel header files : + * include/linux/auxvec.h + * arch/powerpc/include/uapi/asm/auxvec.h + * include/linux/mm_types.h + */ +#define AT_VECTOR_SIZE_BASE 20 +#define AT_VECTOR_SIZE_ARCH 6 +#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) + +typedef uint64_t auxv_t; + +/* Not used but the structure parasite_dump_thread needs a tls_t field */ +typedef uint64_t tls_t; + +/* + * Copied for the Linux kernel arch/powerpc/include/asm/processor.h + * + * NOTE: 32bit task are supported. 
+ */
+#define TASK_SIZE_USER64 (0x0000400000000000UL)
+#define TASK_SIZE TASK_SIZE_USER64
+
+static inline void *decode_pointer(uint64_t v) { return (void*)v; }
+static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; }
+
+#endif /* __CR_ASM_TYPES_H__ */
diff --git a/arch/ppc64/include/asm/vdso.h b/arch/ppc64/include/asm/vdso.h
new file mode 100644
index 000000000..8d089dde3
--- /dev/null
+++ b/arch/ppc64/include/asm/vdso.h
@@ -0,0 +1,172 @@
+#ifndef __CR_ASM_VDSO_H__
+#define __CR_ASM_VDSO_H__
+
+#include
+
+#include "asm/int.h"
+#include "protobuf/vma.pb-c.h"
+
+struct parasite_ctl;
+struct vm_area_list;
+
+#define VDSO_PROT		(PROT_READ | PROT_EXEC)
+#define VVAR_PROT		(PROT_READ)
+
+#define VDSO_BAD_ADDR		(-1ul)
+#define VVAR_BAD_ADDR		VDSO_BAD_ADDR
+#define VDSO_BAD_PFN		(-1ull)
+#define VVAR_BAD_PFN		VDSO_BAD_PFN
+
+struct vdso_symbol {
+	char			name[32];
+	unsigned long		offset;
+};
+
+#define VDSO_SYMBOL_INIT	{ .offset = VDSO_BAD_ADDR, }
+
+/* Check if the symbol is present in the symtable */
+static inline bool vdso_symbol_empty(struct vdso_symbol *s)
+{
+	return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
+}
+
+/*
+ * Picked from kernel file arch/powerpc/kernel/vdso64/vdso64.lds.S
+ *
+ * Note that '__kernel_datapage_offset' is not a service but mostly data
+ * inside the text page which should not be used as is from user space.
+ */
+enum {
+	VDSO_SYMBOL_CLOCK_GETRES,
+	VDSO_SYMBOL_CLOCK_GETTIME,
+	VDSO_SYMBOL_GET_SYSCALL_MAP,
+	VDSO_SYMBOL_GET_TBFREQ,
+	VDSO_SYMBOL_GETCPU,
+	VDSO_SYMBOL_GETTIMEOFDAY,
+	VDSO_SYMBOL_SIGTRAMP_RT64,
+	VDSO_SYMBOL_SYNC_DICACHE,
+	VDSO_SYMBOL_SYNC_DICACHE_P5,
+	VDSO_SYMBOL_TIME,
+
+	VDSO_SYMBOL_MAX
+};
+
+#define VDSO_SYMBOL_CLOCK_GETRES_NAME		"__kernel_clock_getres"
+#define VDSO_SYMBOL_CLOCK_GETTIME_NAME		"__kernel_clock_gettime"
+#define VDSO_SYMBOL_GET_SYSCALL_MAP_NAME	"__kernel_get_syscall_map"
+#define VDSO_SYMBOL_GET_TBFREQ_NAME		"__kernel_get_tbfreq"
+#define VDSO_SYMBOL_GETCPU_NAME			"__kernel_getcpu"
+#define VDSO_SYMBOL_GETTIMEOFDAY_NAME		"__kernel_gettimeofday"
+#define VDSO_SYMBOL_SIGTRAMP_RT64_NAME		"__kernel_sigtramp_rt64"
+#define VDSO_SYMBOL_SYNC_DICACHE_NAME		"__kernel_sync_dicache"
+#define VDSO_SYMBOL_SYNC_DICACHE_P5_NAME	"__kernel_sync_dicache_p5"
+#define VDSO_SYMBOL_TIME_NAME			"__kernel_time"
+
+struct vdso_symtable {
+	unsigned long		vma_start;
+	unsigned long		vma_end;
+	unsigned long		vvar_start;
+	unsigned long		vvar_end;
+	struct vdso_symbol	symbols[VDSO_SYMBOL_MAX];
+};
+
+#define VDSO_SYMTABLE_INIT						\
+	{								\
+		.vma_start	= VDSO_BAD_ADDR,			\
+		.vma_end	= VDSO_BAD_ADDR,			\
+		.vvar_start	= VVAR_BAD_ADDR,			\
+		.vvar_end	= VVAR_BAD_ADDR,			\
+		.symbols		= {				\
+			[0 ... VDSO_SYMBOL_MAX - 1] =			\
+				(struct vdso_symbol)VDSO_SYMBOL_INIT,	\
+			},						\
+	}
+
+/* Size of the VMA associated with the vdso */
+static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
+{
+	return t->vma_end - t->vma_start;
+}
+
+static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
+{
+	return t->vvar_end - t->vvar_start;
+}
+/*
+ * Special mark which allows us to identify the runtime vdso where
+ * calls from the proxy vdso are redirected. This mark is usually
+ * placed at the start of the vdso area where the Elf header lives.
+ * Since such a runtime vdso is solely used by the proxy and
+ * nobody else is supposed to access it, it's more or less
+ * safe to screw the Elf header with @signature and
+ * @proxy_addr.
+ *
+ * The @proxy_addr deserves a few comments.
When we redirect
+ * the calls from the proxy to the runtime vdso, at the next checkpoint
+ * it won't be possible to find which VMA is the proxy, thus
+ * we save its address in the member.
+ */
+struct vdso_mark {
+	u64			signature;
+	unsigned long		proxy_vdso_addr;
+
+	unsigned long		version;
+
+	/*
+	 * In case of the new vDSO format the VVAR area address is
+	 * needed for easier discovery of where it lives without
+	 * relying on procfs output.
+	 */
+	unsigned long		proxy_vvar_addr;
+};
+
+#define VDSO_MARK_SIGNATURE	(0x6f73647675697263ULL)	/* Magic number (criuvdso) */
+#define VDSO_MARK_SIGNATURE_V2	(0x4f53447675697263ULL)	/* Magic number (criuvDSO) */
+#define VDSO_MARK_CUR_VERSION	(2)
+
+static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
+{
+	struct vdso_mark *m = where;
+
+	m->signature		= VDSO_MARK_SIGNATURE_V2;
+	m->proxy_vdso_addr	= proxy_vdso_addr;
+	m->version		= VDSO_MARK_CUR_VERSION;
+	m->proxy_vvar_addr	= proxy_vvar_addr;
+}
+
+static inline bool is_vdso_mark(void *addr)
+{
+	struct vdso_mark *m = addr;
+
+	if (m->signature == VDSO_MARK_SIGNATURE_V2) {
+		/*
+		 * New format
+		 */
+		return true;
+	} else if (m->signature == VDSO_MARK_SIGNATURE) {
+		/*
+		 * Old format -- simply extend the mark up
+		 * to the version we support.
+		 */
+		vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
+		return true;
+	}
+	return false;
+}
+
+
+extern struct vdso_symtable vdso_sym_rt;
+extern u64 vdso_pfn;
+
+extern int vdso_init(void);
+extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
+extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
+extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+			unsigned long vdso_rt_parked_at, size_t index,
+			VmaEntry *vmas, size_t nr_vmas);
+
+extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+			       struct vm_area_list *vma_area_list);
+extern void write_intraprocedure_branch(void *to, void *from);
+
+#endif /* __CR_ASM_VDSO_H__ */
diff --git a/arch/ppc64/parasite-head.S b/arch/ppc64/parasite-head.S
new file mode 100644
index 000000000..c7e5bdc66
--- /dev/null
+++ b/arch/ppc64/parasite-head.S
@@ -0,0 +1,44 @@
+#include "asm/linkage.h"
+#include "parasite.h"
+
+	.section	.head.text
+	.align		8
+
+ENTRY(__export_parasite_head_start)
+
+	// int __used parasite_service(unsigned int cmd, void *args)
+	// cmd  = r3 = *__export_parasite_cmd (u32 ?)
+	// args = r4 = @parasite_args_ptr + @pc
+
+	bl	0f
+0:	mflr	2
+
+#define LOAD_REG_ADDR(reg, name)		\
+	addis	reg,2,(name - 0b)@ha;		\
+	addi	reg,2,(name - 0b)@l;
+
+	LOAD_REG_ADDR(3,__export_parasite_cmd)
+	lwz	3,0(3)
+
+	LOAD_REG_ADDR(4,parasite_args_ptr)
+	lwz	4,0(4)
+	add	4,4,2		// Fix up ptr
+
+	// Set the TOC pointer
+	LOAD_REG_ADDR(5,parasite_toc_ptr)
+	ld	5,0(5)
+	add	2,2,5		// Fix up ptr
+
+	bl	parasite_service
+	twi	31,0,0		// Should generate SIGTRAP
+
+parasite_args_ptr:
+	.long	__export_parasite_args - (0b - __export_parasite_head_start)
+
+__export_parasite_cmd:
+	.long	0
+
+parasite_toc_ptr:
+	.long	.TOC. - (0b - __export_parasite_head_start)
+
+END(__export_parasite_head_start)
diff --git a/arch/ppc64/restorer-trampoline.S b/arch/ppc64/restorer-trampoline.S
new file mode 100644
index 000000000..5e15615ae
--- /dev/null
+++ b/arch/ppc64/restorer-trampoline.S
@@ -0,0 +1,33 @@
+#include "asm/linkage.h"
+#include "parasite.h"
+
+	.section	.head.text
+	.align		8
+
+	// Called through parasite_unmap
+	// This trampoline is there to restore r2 before jumping back to the
+	// C code.
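+	// r2 is the TOC pointer of the ELFv2 ABI: it is saved into
+	// restorer_r2 by __export_restore_task_trampoline below and
+	// reloaded here before branching back into C code.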
+#define LOAD_REG_ADDR(reg, name)		\
+	addis	reg,7,(name - 0b)@ha;		\
+	addi	reg,7,(name - 0b)@l;
+
+ENTRY(__export_unmap_trampoline)
+	bl	0f
+0:	mflr	7
+	LOAD_REG_ADDR(8,restorer_r2)
+	ld	2,0(8)
+	b	__export_unmap
+	//END(__export_restore_unmap_trampoline)
+
+	// Called from JUMP_TO_RESTORER_BLOB, ctr contains the address where
+	// to jump to, and r3 etc. contain the parameters.
+	// Assuming up to 4 parameters here since we are using r7 and r8.
+ENTRY(__export_restore_task_trampoline)
+	bl	0f
+0:	mflr	7
+	LOAD_REG_ADDR(8,restorer_r2)
+	std	2,0(8)
+	b	__export_restore_task
+
+restorer_r2:
+	.long	0
diff --git a/arch/ppc64/restorer.c b/arch/ppc64/restorer.c
new file mode 100644
index 000000000..c5e19d9fb
--- /dev/null
+++ b/arch/ppc64/restorer.c
@@ -0,0 +1,14 @@
+#include
+
+#include "restorer.h"
+#include "asm/restorer.h"
+#include "asm/fpu.h"
+
+#include "syscall.h"
+#include "log.h"
+//#include "cpu.h"
+
+int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r)
+{
+	return 0;
+}
diff --git a/arch/ppc64/syscall-common-ppc64.S b/arch/ppc64/syscall-common-ppc64.S
new file mode 100644
index 000000000..78bc1b7e6
--- /dev/null
+++ b/arch/ppc64/syscall-common-ppc64.S
@@ -0,0 +1,32 @@
+#include "asm/linkage.h"
+#include		/* for __NR_ipc */
+
+#define SYSCALL(name, opcode)	\
+	ENTRY(name);		\
+	li	0, opcode;	\
+	b	__syscall_common;	\
+	END(name)
+
+	.text
+	.align	4
+
+ENTRY(__syscall_common)
+	sc
+	bnslr+			/* if no error return to LR */
+	neg	3,3		/* r3 = -r3 to return -errno value */
+	blr
+END(__syscall_common)
+
+ENTRY(__cr_restore_rt)
+	li	0, __NR_rt_sigreturn
+	b	__syscall_common
+END(__cr_restore_rt)
+
+	# On Power, shmat is done through the ipc system call.
+ENTRY(sys_shmat)
+	mr	7, 4		# shmaddr -> ptr
+	mr	4, 3		# shmid -> first
+	li	3, 21		# call = SHMAT
+	li	0, __NR_ipc
+	b	__syscall_common
+END(sys_shmat)
diff --git a/arch/ppc64/syscall-ppc64.def b/arch/ppc64/syscall-ppc64.def
new file mode 100644
index 000000000..d8ae4491c
--- /dev/null
+++ b/arch/ppc64/syscall-ppc64.def
@@ -0,0 +1,99 @@
+#
+# System calls table, please make sure the table contains only the syscalls
+# really used somewhere in the project.
+#
+# The template is (name and arguments are optional if you only need __NR_x
+# defined, but no real entry point in the syscalls lib).
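+#
+# For example, the row for __NR_getpid below is expanded by
+# syscalls-ppc64.sh into (sketch):
+#	#define __NR_getpid 20			(in syscall-codes.h)
+#	extern long  sys_getpid (void) ;	(in syscall.h)
+#	SYSCALL( sys_getpid , 20 )		(in syscalls.S)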
+# +# name code name arguments +# ----------------------------------------------------------------------- +# +__NR_read 3 sys_read (int fd, void *buf, unsigned long count) +__NR_write 4 sys_write (int fd, const void *buf, unsigned long count) +__NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode) +__NR_close 6 sys_close (int fd) +__NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_mmap 90 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_munmap 91 sys_munmap (void *addr, unsigned long len) +__NR_brk 45 sys_brk (void *addr) +__NR_rt_sigaction 173 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 174 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigreturn 172 sys_rt_sigreturn (void) +__NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_pread64 179 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_mincore 206 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 205 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_pause 29 sys_pause (void) +__NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_getpid 20 sys_getpid (void) +__NR_socket 326 sys_socket (int domain, int type, int protocol) +__NR_connect 328 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 335 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 337 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_sendmsg 341 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 342 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_shutdown 338 sys_shutdown (int sockfd, int how) +__NR_bind 327 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_setsockopt 339 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 340 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid) +__NR_exit 1 sys_exit (unsigned long error_code) +__NR_wait4 114 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_kill 37 sys_kill (long pid, int sig) +__NR_fcntl 55 sys_fcntl (int fd, int type, long arg) +__NR_flock 143 sys_flock (int fd, unsigned long cmd) +__NR_mkdir 39 sys_mkdir (const char *name, int mode) +__NR_rmdir 40 sys_rmdir (const char *name) +__NR_unlink 10 sys_unlink (char *pathname) +__NR_readlink 85 sys_readlink (const char *path, char *buf, int bufsize) +__NR_umask 60 sys_umask (int mask) +__NR_getgroups 80 sys_getgroups (int gsize, unsigned int *groups) +__NR_setresuid 164 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 165 sys_getresuid (int *uid, int *euid, int *suid) 
+__NR_setresgid		169	sys_setresgid		(int gid, int egid, int sgid)
+__NR_getresgid		170	sys_getresgid		(int *gid, int *egid, int *sgid)
+__NR_getpgid		132	sys_getpgid		(pid_t pid)
+__NR_setfsuid		138	sys_setfsuid		(int fsuid)
+__NR_setfsgid		139	sys_setfsgid		(int fsgid)
+__NR_getsid		147	sys_getsid		(void)
+__NR_capget		183	sys_capget		(struct cap_header *h, struct cap_data *d)
+__NR_capset		184	sys_capset		(struct cap_header *h, struct cap_data *d)
+__NR_rt_sigqueueinfo	177	sys_rt_sigqueueinfo	(pid_t pid, int sig, siginfo_t *info)
+__NR_sigaltstack	185	sys_sigaltstack		(const void *uss, void *uoss)
+__NR_personality	136	sys_personality		(unsigned int personality)
+__NR_setpriority	97	sys_setpriority		(int which, int who, int nice)
+__NR_sched_setscheduler	156	sys_sched_setscheduler	(int pid, int policy, struct sched_param *p)
+__NR_prctl		171	sys_prctl		(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
+__NR_setrlimit		75	sys_setrlimit		(int resource, struct krlimit *rlim)
+__NR_mount		21	sys_mount		(char *dev_name, char *dir_name, char *type, unsigned long flags, void *data)
+__NR_umount2		52	sys_umount2		(char *name, int flags)
+__NR_gettid		207	sys_gettid		(void)
+__NR_futex		221	sys_futex		(u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
+__NR_set_tid_address	232	sys_set_tid_address	(int *tid_addr)
+__NR_restart_syscall	0	sys_restart_syscall	(void)
+__NR_sys_timer_create	240	sys_timer_create	(clockid_t which_clock, struct sigevent *timer_event_spec, timer_t *created_timer_id)
+__NR_sys_timer_settime	241	sys_timer_settime	(timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting)
+__NR_sys_timer_gettime	242	sys_timer_gettime	(int timer_id, const struct itimerspec *setting)
+__NR_sys_timer_getoverrun 243	sys_timer_getoverrun	(int timer_id)
+__NR_sys_timer_delete	244	sys_timer_delete	(timer_t timer_id)
+__NR_clock_gettime	246	sys_clock_gettime	(const clockid_t which_clock, const struct timespec *tp)
+__NR_exit_group		234	sys_exit_group		(int error_code)
+__NR_set_robust_list	300	sys_set_robust_list	(struct robust_list_head *head, size_t len)
+__NR_get_robust_list	299	sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
+__NR_vmsplice		285	sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
+__NR_timerfd_settime	311	sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
+__NR_signalfd4		313	sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
+__NR_rt_tgsigqueueinfo	322	sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+__NR_fanotify_init	323	sys_fanotify_init	(unsigned int flags, unsigned int event_f_flags)
+__NR_fanotify_mark	324	sys_fanotify_mark	(int fanotify_fd, unsigned int flags, u64 mask, int dfd, const char *pathname)
+__NR_prlimit64		325	sys_prlimit64		(pid_t pid, unsigned int resource, const struct rlimit64 *new_rlim, struct rlimit64 *old_rlim)
+__NR_open_by_handle_at	346	sys_open_by_handle_at	(int mountdirfd, struct file_handle *handle, int flags)
+__NR_setns		350	sys_setns		(int fd, int nstype)
+__NR_kcmp		354	sys_kcmp		(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
+__NR_memfd_create	360	sys_memfd_create	(const char *name, unsigned int flags)
+__NR_io_setup		227	sys_io_setup		(unsigned nr_events, aio_context_t *ctx_idp)
+__NR_io_getevents	229	sys_io_getevents	(aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
\ No newline at end of file
diff --git a/arch/ppc64/syscalls-ppc64.sh b/arch/ppc64/syscalls-ppc64.sh
new file mode 100644
index 000000000..22c81293d
--- /dev/null
+++ b/arch/ppc64/syscalls-ppc64.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+
+gen_asm() {
+	in=$1
+	codesout=$2
+	codesinc=`echo $2 | sed -e 's/.*include\///g'`
+	protosout=$3
+	asmout=$4
+	asmcommon=`echo $5 | sed -e 's/.*include\///g'`
+	prototypes=`echo $6 | sed -e 's/.*include\///g'`
+
+	codesdef=`echo $codesout | sed -e 's/.*include\///g' | tr "[[:space:]].-" _`
+	protosdef=`echo $protosout | sed -e 's/.*include\///g' | tr "[[:space:]].-" _`
+
+	echo "/* Autogenerated, don't edit */"	>  $codesout
+	echo "#ifndef $codesdef"		>> $codesout
+	echo "#define $codesdef"		>> $codesout
+
+	echo "/* Autogenerated, don't edit */"	>  $protosout
+	echo "#ifndef $protosdef"		>> $protosout
+	echo "#define $protosdef"		>> $protosout
+	echo "#include \"$prototypes\""		>> $protosout
+	echo "#include \"$codesinc\""		>> $protosout
+
+	echo "/* Autogenerated, don't edit */"	>  $asmout
+	echo "#include \"$codesinc\""		>> $asmout
+	echo "#include \"$asmcommon\""		>> $asmout
+
+	cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "#define", $1, $2}' >> $codesout
+	cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "extern long ", $3, $4, ";"}' >> $protosout
+	cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "SYSCALL(", $3, ",", $2, ")"}' >> $asmout
+
+	echo "#endif /* $codesdef */"		>> $codesout
+	echo "#endif /* $protosdef */"		>> $protosout
+}
+
+gen_exec() {
+	in=$1
+	codecout=$2
+
+	echo "/* Autogenerated, don't edit */"	>  $codecout
+
+	cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "SYSCALL(", substr($3, 5), ",", $2, ")"}' >> $codecout
+}
+
+if [ "$1" = "--asm" ]; then
+	shift
+	gen_asm $@
+fi
+
+if [ "$1" = "--exec" ]; then
+	shift
+	gen_exec $@
+fi
diff --git a/arch/ppc64/vdso-pie.c b/arch/ppc64/vdso-pie.c
new file mode 100644
index 000000000..8219e4af1
--- /dev/null
+++ b/arch/ppc64/vdso-pie.c
@@ -0,0 +1,594 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "asm/string.h"
+#include "asm/types.h"
+
+#include "syscall.h"
+#include "image.h"
+#include "vdso.h"
+#include "vma.h"
+#include "log.h"
+#include "bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+/* These symbols are defined in vdso-trampoline.S */
+extern char *vdso_trampoline, *vdso_trampoline_end;
+
+static inline void invalidate_caches(unsigned long at)
+{
+	asm volatile("isync		\n"	\
+		     "li 	3,0	\n"	\
+		     "dcbf	3,%0	\n"	\
+		     "sync		\n"	\
+		     "icbi 	3,%0	\n"	\
+		     "isync		\n"	\
+		     : /* no output */		\
+		     : "r"(at)			\
+		     : "memory", "r3");
+}
+
+/* This is the size of the trampoline call :
+ * 	mflr	r0
+ *	bl	trampoline
+ *	<64 bit address>
+ */
+#define TRAMP_CALL_SIZE	(2*sizeof(uint32_t) + sizeof(uint64_t))
+
+/*
+ * put_trampoline does 2 things :
+ *
+ *   1. it looks for a place in the checkpointed vDSO where to put the
+ * trampoline code (see vdso-trampoline.S).
+ *
+ *   2. for each symbol from the checkpointed vDSO, it checks that there is
+ * enough room to put the call to the vDSO trampoline (see
+ * TRAMP_CALL_SIZE's comment above).
+ * This is done by checking that there are no interesting symbols in the
+ * range of the current one's offset -> (current one's offset + TRAMP_CALL_SIZE).
+ * Unfortunately the symbols are not sorted by address, so we have to scan
+ * the complete table each time.
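+ * (Each patched entry point needs TRAMP_CALL_SIZE = 2*4 + 8 = 16 bytes.)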
Since the vDSO is small, this is
+ * not a big issue.
+ */
+static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym)
+{
+	int i, j;
+	unsigned long size;
+	unsigned long trampoline = 0;
+
+	/* First of all we have to find a place where to put the trampoline
+	 * code.
+	 */
+	size = (unsigned long)&vdso_trampoline_end
+		- (unsigned long)&vdso_trampoline;
+
+	for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) {
+		if (vdso_symbol_empty(&sym->symbols[i]))
+			continue;
+
+		pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name,
+			 sym->symbols[i].offset);
+
+		/* find the nearest following symbol we are interested in */
+		for (j = 0; j < ARRAY_SIZE(sym->symbols); j++) {
+			if (i == j || vdso_symbol_empty(&sym->symbols[j]))
+				continue;
+
+			/* pr_debug("next:%s(%lx)\n", sym->symbols[j].name, */
+			/* 	 sym->symbols[j].offset); */
+
+			if (sym->symbols[j].offset <= sym->symbols[i].offset)
+				/* this symbol is above the current one */
+				continue;
+
+			if ((sym->symbols[i].offset + TRAMP_CALL_SIZE) >
+			    sym->symbols[j].offset) {
+				/* we have a major issue here since we cannot
+				 * even put the trampoline call for this symbol
+				 */
+				pr_err("Can't handle small vDSO symbol %s\n",
+				       sym->symbols[i].name);
+				return 0;
+			}
+
+			if (trampoline)
+				/* no need to put it twice */
+				continue;
+
+			if ((sym->symbols[j].offset -
+			     (sym->symbols[i].offset + TRAMP_CALL_SIZE)) <= size)
+				/* not enough room */
+				continue;
+
+			/* We can put the trampoline there */
+			trampoline = at + sym->symbols[i].offset;
+			trampoline += TRAMP_CALL_SIZE;
+
+			pr_debug("Putting vDSO trampoline in %s at %lx\n",
+				 sym->symbols[i].name, trampoline);
+			builtin_memcpy((void *)trampoline, &vdso_trampoline,
+				       size);
+			invalidate_caches(trampoline);
+		}
+	}
+
+	return trampoline;
+}
+
+static inline void put_trampoline_call(unsigned long at, unsigned long to,
+				       unsigned long tr)
+{
+	uint32_t *addr = (uint32_t *)at;
+
+	*addr++ = 0x7C0802a6;					/* mflr	r0 */
+	*addr++ = 0x48000001 | ((long)(tr-at-4) & 0x3fffffc);	/* bl tr */
+	*(uint64_t *)addr = to;	/* the address to read by the trampoline */
+
+	invalidate_caches(at);
+}
+
+static int vdso_redirect_calls(unsigned long base_to,
+			       unsigned long base_from,
+			       struct vdso_symtable *to,
+			       struct vdso_symtable *from)
+{
+	unsigned int i;
+	unsigned long trampoline;
+
+	trampoline = (unsigned long)put_trampoline(base_from, from);
+	if (!trampoline)
+		return 1;
+
+	for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
+		if (vdso_symbol_empty(&from->symbols[i]))
+			continue;
+
+		pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n",
+			 base_from, from->symbols[i].offset,
+			 base_to, to->symbols[i].offset, i,
+			 from->symbols[i].name);
+
+		put_trampoline_call(base_from + from->symbols[i].offset,
+				    base_to + to->symbols[i].offset,
+				    trampoline);
+	}
+
+	return 0;
+}
+
+/* Check if pointer is out-of-bound */
+static bool __ptr_oob(void *ptr, void *start, size_t size)
+{
+	void *end = (void *)((unsigned long)start + size);
+	return ptr > end || ptr < start;
+}
+
+/*
+ * Elf hash, see format specification.
+ */
+static unsigned long elf_hash(const unsigned char *name)
+{
+	unsigned long h = 0, g;
+
+	while (*name) {
+		h = (h << 4) + *name++;
+		g = h & 0xf0000000ul;
+		if (g)
+			h ^= g >> 24;
+		h &= ~g;
+	}
+	return h;
+}
+
+/*
+ * TODO :
+ * PIE linking doesn't work for this kind of definition.
+ * When built for the parasite code, the pointers to the strings are
+ * computed from the start of the object, but the generated code
+ * assumes that the pointers are fixed up by the loader.
+ *
+ * In addition, GCC creates a call to the C library memcpy when the table
+ * contains more than 9 items. Since the parasite code is not linked
+ * with the C library, an undefined symbol error is raised at build time.
+ * By initialising the table at run time, we are working around this
+ * issue.
+ */
+#ifdef __pie__
+static const char *VDSO_SYMBOL(int i)
+{
+	static char *vdso_symbols[VDSO_SYMBOL_MAX];
+	static int init_done = 0;
+
+#define SET_VDSO_SYM(s) vdso_symbols[VDSO_SYMBOL_##s] = VDSO_SYMBOL_##s##_NAME
+	if (!init_done) {
+		SET_VDSO_SYM(CLOCK_GETRES);
+		SET_VDSO_SYM(CLOCK_GETTIME);
+		SET_VDSO_SYM(GET_SYSCALL_MAP);
+		SET_VDSO_SYM(GET_TBFREQ);
+		SET_VDSO_SYM(GETCPU);
+		SET_VDSO_SYM(GETTIMEOFDAY);
+		SET_VDSO_SYM(SIGTRAMP_RT64);
+		SET_VDSO_SYM(SYNC_DICACHE);
+		SET_VDSO_SYM(SYNC_DICACHE_P5);
+		SET_VDSO_SYM(TIME);
+		init_done = 1;
+	}
+	return vdso_symbols[i];
+}
+#else
+#define SET_VDSO_SYM(s) [VDSO_SYMBOL_##s] = VDSO_SYMBOL_##s##_NAME
+const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
+	SET_VDSO_SYM(CLOCK_GETRES),
+	SET_VDSO_SYM(CLOCK_GETTIME),
+	SET_VDSO_SYM(GET_SYSCALL_MAP),
+	SET_VDSO_SYM(GET_TBFREQ),
+	SET_VDSO_SYM(GETCPU),
+	SET_VDSO_SYM(GETTIMEOFDAY),
+	SET_VDSO_SYM(SIGTRAMP_RT64),
+	SET_VDSO_SYM(SYNC_DICACHE),
+	SET_VDSO_SYM(SYNC_DICACHE_P5),
+	SET_VDSO_SYM(TIME)
+};
+#define VDSO_SYMBOL(i) vdso_symbols[i]
+#endif
+
+int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
+{
+	Elf64_Phdr *dynamic = NULL, *load = NULL;
+	Elf64_Ehdr *ehdr = (void *)mem;
+	Elf64_Dyn *dyn_strtab = NULL;
+	Elf64_Dyn *dyn_symtab = NULL;
+	Elf64_Dyn *dyn_strsz = NULL;
+	Elf64_Dyn *dyn_syment = NULL;
+	Elf64_Dyn *dyn_hash = NULL;
+	Elf64_Word *hash = NULL;
+	Elf64_Phdr *phdr;
+	Elf64_Dyn *d;
+
+	Elf64_Word *bucket, *chain;
+	Elf64_Word nbucket, nchain;
+
+	/*
+	 * See the Elf specification for these magic values.
+	 */
+	static const char elf_ident[] = {
+		0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	};
+
+	char *dynsymbol_names;
+	unsigned int i, j, k;
+
+	BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
+
+	pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
+
+	/*
+	 * Make sure it's a file we support.
+	 */
+	if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
+		pr_err("Elf header magic mismatch\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * We need PT_LOAD and PT_DYNAMIC here. Each once.
+	 */
+	phdr = (void *)&mem[ehdr->e_phoff];
+	for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+		if (__ptr_oob(phdr, mem, size))
+			goto err_oob;
+		switch (phdr->p_type) {
+		case PT_DYNAMIC:
+			if (dynamic) {
+				pr_err("Second PT_DYNAMIC header\n");
+				return -EINVAL;
+			}
+			dynamic = phdr;
+			break;
+		case PT_LOAD:
+			if (load) {
+				pr_err("Second PT_LOAD header\n");
+				return -EINVAL;
+			}
+			load = phdr;
+			break;
+		}
+	}
+
+	if (!load || !dynamic) {
+		pr_err("One of the required program headers is missing\n");
+		return -EINVAL;
+	}
+
+	pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
+
+	/*
+	 * Dynamic section tags should provide us the rest of the information
+	 * needed. Note that we're interested in a small set of tags.
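+	 * (namely DT_STRTAB, DT_SYMTAB, DT_STRSZ, DT_SYMENT and DT_HASH --
+	 * just enough to walk the Elf hash table and resolve the symbols
+	 * listed above).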
+	 */
+	d = (void *)&mem[dynamic->p_offset];
+	for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
+		if (__ptr_oob(d, mem, size))
+			goto err_oob;
+
+		if (d->d_tag == DT_NULL) {
+			break;
+		} else if (d->d_tag == DT_STRTAB) {
+			dyn_strtab = d;
+			pr_debug("DT_STRTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
+		} else if (d->d_tag == DT_SYMTAB) {
+			dyn_symtab = d;
+			pr_debug("DT_SYMTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
+		} else if (d->d_tag == DT_STRSZ) {
+			dyn_strsz = d;
+			pr_debug("DT_STRSZ: %lx\n", (unsigned long)d->d_un.d_val);
+		} else if (d->d_tag == DT_SYMENT) {
+			dyn_syment = d;
+			pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val);
+		} else if (d->d_tag == DT_HASH) {
+			dyn_hash = d;
+			pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr);
+		}
+	}
+
+	if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
+		pr_err("Not all dynamic entries are present\n");
+		return -EINVAL;
+	}
+
+	dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
+	if (__ptr_oob(dynsymbol_names, mem, size))
+		goto err_oob;
+
+	hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
+	if (__ptr_oob(hash, mem, size))
+		goto err_oob;
+
+	nbucket = hash[0];
+	nchain = hash[1];
+	bucket = &hash[2];
+	chain = &hash[nbucket + 2];
+
+	pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n",
+		 (long)nbucket, (long)nchain, (unsigned long)bucket, (unsigned long)chain);
+
+	for (i = 0; i < VDSO_SYMBOL_MAX; i++) {
+		const char *symbol = VDSO_SYMBOL(i);
+		k = elf_hash((const unsigned char *)symbol);
+
+		for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
+			Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
+			char *name;
+
+			sym = &sym[j];
+			if (__ptr_oob(sym, mem, size))
+				continue;
+
+			if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
+			    ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
+				continue;
+
+			name = &dynsymbol_names[sym->st_name];
+			if (__ptr_oob(name, mem, size))
+				continue;
+
+			if (builtin_strcmp(name, symbol))
+				continue;
+
+			builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
+			t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
+			break;
+		}
+	}
+
+	return 0;
+
+err_oob:
+	pr_err("Corrupted Elf data\n");
+	return -EFAULT;
+}
+
+static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
+{
+	unsigned long addr;
+
+	pr_debug("Remap %s %lx -> %lx\n", who, from, to);
+
+	addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
+	if (addr != to) {
+		pr_err("Unable to remap %lx -> %lx %lx\n",
+		       from, to, addr);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Park the runtime vDSO in some safe place where it can be accessible from the restorer */
+int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
+{
+	int ret;
+
+	BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
+
+	if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
+		if (sym_rt->vma_start < sym_rt->vvar_start) {
+			ret  = vdso_remap("rt-vdso", sym_rt->vma_start,
+					  park_at, vdso_vma_size(sym_rt));
+			park_at += vdso_vma_size(sym_rt);
+			ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
+					  park_at, vvar_vma_size(sym_rt));
+		} else {
+			ret  = vdso_remap("rt-vvar", sym_rt->vvar_start,
+					  park_at, vvar_vma_size(sym_rt));
+			park_at += vvar_vma_size(sym_rt);
+			ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
+					  park_at, vdso_vma_size(sym_rt));
+		}
+	} else
+		ret = vdso_remap("rt-vdso", sym_rt->vma_start,
+				 park_at, vdso_vma_size(sym_rt));
+}
+
+int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+		 unsigned long vdso_rt_parked_at, size_t index,
+		 VmaEntry *vmas, size_t nr_vmas)
+{
+	VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
+	struct vdso_symtable s = VDSO_SYMTABLE_INIT;
+	bool remap_rt = false;
+
+	/*
+	 * Figure out which kind of vdso tuple we get.
+	 */
+	if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
+		vma_vdso = &vmas[index];
+	else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
+		vma_vvar = &vmas[index];
+
+	if (index < (nr_vmas - 1)) {
+		if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
+			vma_vdso = &vmas[index + 1];
+		else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
+			vma_vvar = &vmas[index + 1];
+	}
+
+	if (!vma_vdso) {
+		pr_err("Can't find vDSO area in image\n");
+		return -1;
+	}
+
+	/*
+	 * The vDSO mark overwrites the ELF program header of the proxy
+	 * vDSO, so it must never be larger than a program header.
+	 */
+	BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
+
+	/*
+	 * Find symbols in the vDSO zone read from the image.
+	 */
+	if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
+		return -1;
+
+	/*
+	 * Proxification strategy
+	 *
+	 *  - There might be two vDSO zones: vdso code and optionally vvar data
+	 *  - To be able to use in-place remapping we need:
+	 *
+	 *    a) the size and order of the vDSO zones to match,
+	 *    b) the symbol offsets to match,
+	 *    c) the number of vDSO zones to be the same.
+	 */
+	if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
+		size_t i;
+
+		for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
+			if (s.symbols[i].offset != sym_rt->symbols[i].offset)
+				break;
+		}
+
+		if (i == ARRAY_SIZE(s.symbols)) {
+			if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
+				remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
+				if (remap_rt) {
+					long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
+					long delta_this = vma_vvar->start - vma_vdso->start;
+
+					/*
+					 * The XOR sign bit is set iff the deltas
+					 * have opposite signs, i.e. the vdso/vvar
+					 * order in the image differs from the
+					 * runtime one.
+					 */
+					remap_rt = (delta_rt ^ delta_this) >= 0;
+				}
+			} else
+				remap_rt = true;
+		}
+	}
+
+	pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
+		 vma_vdso->start, vma_vdso->end,
+		 vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
+		 vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
+
+	/*
+	 * Easy case -- the vdso from the image has the same offsets, order
+	 * and size as the runtime one, so we simply remap the runtime vdso
+	 * to the dumpee position without generating any proxy.
+	 *
+	 * Note we may remap the VVAR area as well, which might not have been
+	 * mapped by the caller yet. So drop VMA_AREA_REGULAR from it and the
+	 * caller will not touch it anymore.
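+	 *
+	 * One ordering subtlety in the remap below (a clarifying note that
+	 * matches the code that follows): whichever of vdso/vvar sat at the
+	 * lower address in the dumpee is restored first from the parked
+	 * area, and the park address is advanced past it before the second
+	 * zone is remapped.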
+	 */
+	if (remap_rt) {
+		int ret = 0;
+
+		pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
+
+		if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
+			pr_err("Failed to unmap %s\n", who);
+			return -1;
+		}
+
+		if (vma_vvar) {
+			if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
+				pr_err("Failed to unmap %s\n", who);
+				return -1;
+			}
+
+			if (vma_vdso->start < vma_vvar->start) {
+				ret  = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+				vdso_rt_parked_at += vdso_vma_size(sym_rt);
+				ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+			} else {
+				ret  = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+				vdso_rt_parked_at += vvar_vma_size(sym_rt);
+				ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+			}
+		} else
+			ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+
+		return ret;
+	}
+
+	/*
+	 * Now the complex case -- we need to proxify calls. We redirect
+	 * calls from the dumpee vdso to the runtime vdso, making the
+	 * dumpee vdso operate as a proxy.
+	 */
+	pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
+
+	/*
+	 * Don't forget to shift if vvar is before vdso.
+	 */
+	if (sym_rt->vvar_start != VVAR_BAD_ADDR &&
+	    sym_rt->vvar_start < sym_rt->vma_start)
+		vdso_rt_parked_at += vvar_vma_size(sym_rt);
+
+	if (vdso_redirect_calls(vdso_rt_parked_at,
+				vma_vdso->start,
+				sym_rt, &s)) {
+		pr_err("Failed to proxify dumpee contents\n");
+		return -1;
+	}
+
+	/*
+	 * Put a special mark into the runtime vdso so that the next
+	 * checkpoint can detect it and skip dumping it, since it is
+	 * auto-generated anew for every session where a proxy is required.
+	 */
+	sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
+	vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
+	sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
+	return 0;
+}
diff --git a/arch/ppc64/vdso-trampoline.S b/arch/ppc64/vdso-trampoline.S
new file mode 100644
index 000000000..54a224537
--- /dev/null
+++ b/arch/ppc64/vdso-trampoline.S
@@ -0,0 +1,11 @@
+#include "asm/linkage.h"
+
+	.section	.text
+
+GLOBAL(vdso_trampoline)
+	mflr	12		/* r12 = vdso_ptr's address */
+	mtlr	0		/* restore lr */
+	ld	12,0(12)	/* read value stored in vdso_ptr */
+	mtctr	12		/* branch to it */
+	bctr
+GLOBAL(vdso_trampoline_end)
diff --git a/arch/ppc64/vdso.c b/arch/ppc64/vdso.c
new file mode 100644
index 000000000..43d9637f0
--- /dev/null
+++ b/arch/ppc64/vdso.c
@@ -0,0 +1,309 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "asm/types.h"
+#include "asm/parasite-syscall.h"
+
+#include "parasite-syscall.h"
+#include "parasite.h"
+#include "compiler.h"
+#include "kerndat.h"
+#include "vdso.h"
+#include "util.h"
+#include "log.h"
+#include "mem.h"
+#include "vma.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+struct vdso_symtable vdso_sym_rt = VDSO_SYMTABLE_INIT;
+u64 vdso_pfn = VDSO_BAD_PFN;
+/*
+ * The VMAs list might have proxy vdso/vvar areas left from a previous
+ * dump/restore cycle, so we need to detect them and eliminate them from
+ * the VMAs list; they will be generated again on restore if needed.
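+ *
+ * Detection (described here for clarity; this matches the code below) is
+ * a two-step affair: the parasite is asked to look for the vdso mark in
+ * every candidate VMA, and the VMA's page frame number -- read from
+ * /proc/<pid>/pagemap at offset (start / PAGE_SIZE) * sizeof(u64) -- is
+ * compared against the pfn of the vdso mapped in criu's own address space.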
+ */
+int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+			struct vm_area_list *vma_area_list)
+{
+	unsigned long proxy_vdso_addr = VDSO_BAD_ADDR;
+	unsigned long proxy_vvar_addr = VVAR_BAD_ADDR;
+	struct vma_area *proxy_vdso_marked = NULL;
+	struct vma_area *proxy_vvar_marked = NULL;
+	struct parasite_vdso_vma_entry *args;
+	struct vma_area *vma;
+	int fd, ret = -1;
+	off_t off;
+	u64 pfn;
+
+	args = parasite_args(ctl, struct parasite_vdso_vma_entry);
+	fd = open_proc(pid, "pagemap");
+	if (fd < 0)
+		return -1;
+
+	list_for_each_entry(vma, &vma_area_list->h, list) {
+		if (!vma_area_is(vma, VMA_AREA_REGULAR))
+			continue;
+
+		if (vma_area_is(vma, VMA_FILE_SHARED) ||
+		    vma_area_is(vma, VMA_FILE_PRIVATE))
+			continue;
+		/*
+		 * This might be the VVAR area of a marked vDSO zone. We
+		 * need to detect it before the VDSO_PROT test because
+		 * VVAR_PROT is a subset of it, but we must not 'continue'
+		 * past the remaining checks here.
+		 */
+		BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT));
+
+		if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) {
+			if (proxy_vvar_addr != VVAR_BAD_ADDR &&
+			    proxy_vvar_addr == vma->e->start) {
+				BUG_ON(proxy_vvar_marked);
+				proxy_vvar_marked = vma;
+				continue;
+			}
+		}
+
+		if ((vma->e->prot & VDSO_PROT) != VDSO_PROT)
+			continue;
+
+		if (vma->e->prot != VDSO_PROT) {
+			pr_debug("Dropping %lx using extra protection test\n",
+				 vma->e->start);
+			continue;
+		}
+
+		if (vma->e->start > TASK_SIZE)
+			continue;
+
+		if (vma->e->flags & MAP_GROWSDOWN)
+			continue;
+
+		/*
+		 * We need to poke every potentially marked vma; otherwise,
+		 * if the task never called any vdso function, the page
+		 * frame number won't be reported.
+		 */
+		args->start = vma->e->start;
+		args->len = vma_area_len(vma);
+
+		if (parasite_execute_daemon(PARASITE_CMD_CHECK_VDSO_MARK, ctl)) {
+			pr_err("vdso: Parasite failed to poke for mark\n");
+			ret = -1;
+			goto err;
+		}
+
+		/*
+		 * Defer handling the marked vdso until we have walked over
+		 * all vmas and restored the status of any potentially
+		 * remapped vDSO area.
+		 */
+		if (unlikely(args->is_marked)) {
+			if (proxy_vdso_marked) {
+				pr_err("Ow! Second vdso mark detected!\n");
+				ret = -1;
+				goto err;
+			}
+			proxy_vdso_marked = vma;
+			proxy_vdso_addr = args->proxy_vdso_addr;
+			proxy_vvar_addr = args->proxy_vvar_addr;
+			continue;
+		}
+
+		off = (vma->e->start / PAGE_SIZE) * sizeof(u64);
+		ret = pread(fd, &pfn, sizeof(pfn), off);
+		if (ret < 0 || ret != sizeof(pfn)) {
+			pr_perror("Can't read pme for pid %d", pid);
+			ret = -1;
+			goto err;
+		}
+
+		pfn = PME_PFRAME(pfn);
+		if (!pfn) {
+			pr_err("Unexpected page frame number 0 for pid %d\n", pid);
+			ret = -1;
+			goto err;
+		}
+
+		/*
+		 * Set up the proper VMA status. Note that starting with 3.16
+		 * the [vdso]/[vvar] marks are reported correctly even when
+		 * they are remapped into a new place, but only since that
+		 * particular version of the kernel!
+		 */
+		if (pfn == vdso_pfn) {
+			if (!vma_area_is(vma, VMA_AREA_VDSO)) {
+				pr_debug("vdso: Restore vDSO status by pfn at %lx\n",
+					 (long)vma->e->start);
+				vma->e->status |= VMA_AREA_VDSO;
+			}
+		} else {
+			if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) {
+				pr_debug("vdso: Drop mishinted vDSO status at %lx\n",
+					 (long)vma->e->start);
+				vma->e->status &= ~VMA_AREA_VDSO;
+			}
+		}
+	}
+
+	/*
+	 * A marked vdso means it was auto-generated, so it must be dropped
+	 * from the vma list.
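+	 * (It is regenerated on every restore where a proxy is required,
+	 * as noted in vdso_proxify() above, so dumping it would be
+	 * redundant.)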
+	 */
+	if (proxy_vdso_marked) {
+		pr_debug("vdso: Found marked at %lx (proxy vDSO at %lx VVAR at %lx)\n",
+			 (long)proxy_vdso_marked->e->start,
+			 (long)proxy_vdso_addr, (long)proxy_vvar_addr);
+
+		/*
+		 * Don't forget to restore the proxy vdso/vvar status, since
+		 * it's unknown to the kernel.
+		 */
+		list_for_each_entry(vma, &vma_area_list->h, list) {
+			if (vma->e->start == proxy_vdso_addr) {
+				vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO;
+				pr_debug("vdso: Restore proxy vDSO status at %lx\n",
+					 (long)vma->e->start);
+			} else if (vma->e->start == proxy_vvar_addr) {
+				vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR;
+				pr_debug("vdso: Restore proxy VVAR status at %lx\n",
+					 (long)vma->e->start);
+			}
+		}
+
+		pr_debug("vdso: Dropping marked vdso at %lx\n",
+			 (long)proxy_vdso_marked->e->start);
+		list_del(&proxy_vdso_marked->list);
+		xfree(proxy_vdso_marked);
+		vma_area_list->nr--;
+
+		if (proxy_vvar_marked) {
+			pr_debug("vdso: Dropping marked vvar at %lx\n",
+				 (long)proxy_vvar_marked->e->start);
+			list_del(&proxy_vvar_marked->list);
+			xfree(proxy_vvar_marked);
+			vma_area_list->nr--;
+		}
+	}
+	ret = 0;
+err:
+	close(fd);
+	return ret;
+}
+
+static int vdso_fill_self_symtable(struct vdso_symtable *s)
+{
+	char buf[512];
+	int ret = -1;
+	FILE *maps;
+
+	*s = (struct vdso_symtable)VDSO_SYMTABLE_INIT;
+
+	maps = fopen_proc(PROC_SELF, "maps");
+	if (!maps) {
+		pr_perror("Can't open self-vma");
+		return -1;
+	}
+
+	while (fgets(buf, sizeof(buf), maps)) {
+		unsigned long start, end;
+		char *has_vdso, *has_vvar;
+
+		has_vdso = strstr(buf, "[vdso]");
+		if (!has_vdso)
+			has_vvar = strstr(buf, "[vvar]");
+		else
+			has_vvar = NULL;
+
+		if (!has_vdso && !has_vvar)
+			continue;
+
+		ret = sscanf(buf, "%lx-%lx", &start, &end);
+		if (ret != 2) {
+			ret = -1;
+			pr_err("Can't find vDSO/VVAR bounds\n");
+			goto err;
+		}
+
+		if (has_vdso) {
+			if (s->vma_start != VDSO_BAD_ADDR) {
+				pr_err("Got second vDSO entry\n");
+				ret = -1;
+				goto err;
+			}
+			s->vma_start = start;
+			s->vma_end = end;
+
+			ret = vdso_fill_symtable((void *)start, end - start, s);
+			if (ret)
+				goto err;
+		} else {
+			if (s->vvar_start != VVAR_BAD_ADDR) {
+				pr_err("Got second VVAR entry\n");
+				ret = -1;
+				goto err;
+			}
+			s->vvar_start = start;
+			s->vvar_end = end;
+		}
+	}
+
+	/*
+	 * Validate the layout -- with the new vDSO format the areas must
+	 * look like
+	 *
+	 * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso]
+	 * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar]
+	 *
+	 * though they may appear in reverse order:
+	 *
+	 * 7fffc3502000-7fffc3504000 r--p 00000000 00:00 0 [vvar]
+	 * 7fffc3504000-7fffc3506000 r-xp 00000000 00:00 0 [vdso]
+	 */
+	ret = 0;
+	if (s->vma_start != VDSO_BAD_ADDR) {
+		if (s->vvar_start != VVAR_BAD_ADDR) {
+			if (s->vma_end != s->vvar_start &&
+			    s->vvar_end != s->vma_start) {
+				ret = -1;
+				pr_err("Unexpected rt vDSO area bounds\n");
+				goto err;
+			}
+		}
+	} else {
+		ret = -1;
+		pr_err("Can't find rt vDSO\n");
+		goto err;
+	}
+
+	pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n",
+		 s->vma_start, s->vma_end,
+		 s->vvar_start, s->vvar_end);
+
+err:
+	fclose(maps);
+	return ret;
+}
+
+int vdso_init(void)
+{
+	if (vdso_fill_self_symtable(&vdso_sym_rt))
+		return -1;
+	return vaddr_to_pfn(vdso_sym_rt.vma_start, &vdso_pfn);
+}
diff --git a/cr-restore.c b/cr-restore.c
index 9d28e69e2..e100164d4 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -2751,8 +2751,13 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	 * might be completely unused so it's here just for convenience.
 	 */
 	restore_thread_exec_start	= restorer_sym(exec_mem_hint, __export_restore_thread);
+#ifdef CONFIG_PPC64
+	restore_task_exec_start		= restorer_sym(exec_mem_hint, __export_restore_task_trampoline);
+	rsti(current)->munmap_restorer	= restorer_sym(exec_mem_hint, __export_unmap_trampoline);
+#else
 	restore_task_exec_start		= restorer_sym(exec_mem_hint, __export_restore_task);
 	rsti(current)->munmap_restorer	= restorer_sym(exec_mem_hint, __export_unmap);
+#endif
 
 	exec_mem_hint += restorer_len;
diff --git a/include/image.h b/include/image.h
index 55e63dd9c..c13ead0e5 100644
--- a/include/image.h
+++ b/include/image.h
@@ -11,7 +11,11 @@
 #include "bfd.h"
 #include "bug.h"
 
+#ifdef _ARCH_PPC64
+#define PAGE_IMAGE_SIZE	65536
+#else
 #define PAGE_IMAGE_SIZE	4096
+#endif /* _ARCH_PPC64 */
 
 #define PAGE_RSS	1
 #define PAGE_ANON	2
diff --git a/pie/Makefile b/pie/Makefile
index c0e8f62ce..11620d7dc 100644
--- a/pie/Makefile
+++ b/pie/Makefile
@@ -10,6 +10,9 @@ obj-e += $(ARCH_DIR)/vdso-pie.o
 ifeq ($(SRCARCH),aarch64)
 asm-e += $(ARCH_DIR)/intraprocedure.o
 endif
+ifeq ($(SRCARCH), ppc64)
+asm-e += $(ARCH_DIR)/vdso-trampoline.o
+endif
 endif
 
 parasite-obj-y += parasite.o
@@ -18,6 +21,9 @@ parasite-libs-e += $(SYSCALL-LIB)
 
 restorer-obj-y += restorer.o
 restorer-obj-e += $(ARCH_DIR)/restorer.o
+ifeq ($(SRCARCH), ppc64)
+restorer-asm-e += $(ARCH_DIR)/restorer-trampoline.o
+endif
 restorer-libs-e += $(SYSCALL-LIB)
 
 #
diff --git a/pie/pie.lds.S.in b/pie/pie.lds.S.in
index f1dc526ef..9e9c97f00 100644
--- a/pie/pie.lds.S.in
+++ b/pie/pie.lds.S.in
@@ -12,6 +12,8 @@ SECTIONS
 		. = ALIGN(32);
 		*(.got*)
 		. = ALIGN(32);
+		*(.toc*)
+		. = ALIGN(32);
 	} =0x00000000,
 
 	/DISCARD/ : {
diff --git a/protobuf/Makefile b/protobuf/Makefile
index d4e177462..0b1185203 100644
--- a/protobuf/Makefile
+++ b/protobuf/Makefile
@@ -3,6 +3,7 @@ proto-obj-y += core.o
 proto-obj-y += core-x86.o
 proto-obj-y += core-arm.o
 proto-obj-y += core-aarch64.o
+proto-obj-y += core-ppc64.o
 proto-obj-y += cpuinfo.o
 proto-obj-y += inventory.o
 proto-obj-y += fdinfo.o
diff --git a/protobuf/core-ppc64.proto b/protobuf/core-ppc64.proto
new file mode 100644
index 000000000..b874ccf88
--- /dev/null
+++ b/protobuf/core-ppc64.proto
@@ -0,0 +1,23 @@
+message user_ppc64_regs_entry {
+	// Following is the list of registers, starting at r0.
+	repeated uint64 gpr		= 1;
+	required uint64 nip		= 2;
+	required uint64 msr		= 3;
+	required uint64 orig_gpr3	= 4;
+	required uint64 ctr		= 5;
+	required uint64 link		= 6;
+	required uint64 xer		= 7;
+	required uint64 ccr		= 8;
+	required uint64 trap		= 9;
+}
+
+message user_ppc64_fpstate_entry {
+	// Following is the list of registers, starting at fpr0.
+	repeated uint64 fpregs		= 1;
+}
+
+message thread_info_ppc64 {
+	required uint64				clear_tid_addr	= 1;
+	required user_ppc64_regs_entry		gpregs		= 2;
+	optional user_ppc64_fpstate_entry	fpstate		= 3;
+}
diff --git a/protobuf/core.proto b/protobuf/core.proto
index 1f44a470c..9f70da929 100644
--- a/protobuf/core.proto
+++ b/protobuf/core.proto
@@ -1,6 +1,7 @@
 import "core-x86.proto";
 import "core-arm.proto";
 import "core-aarch64.proto";
+import "core-ppc64.proto";
 
 import "rlimit.proto";
 import "timer.proto";
@@ -70,12 +71,14 @@ message core_entry {
 		X86_64	= 1;
 		ARM	= 2;
 		AARCH64	= 3;
+		PPC64	= 4;
 	}
 	required march	mtype		= 1;
 	optional thread_info_x86	thread_info	= 2;
 	optional thread_info_arm	ti_arm		= 6;
 	optional thread_info_aarch64	ti_aarch64	= 8;
+	optional thread_info_ppc64	ti_ppc64	= 9;
 
 	optional task_core_entry	tc		= 3;
 	optional task_kobj_ids_entry	ids		= 4;