From f1d465448fa3d464da6f8bc31500f5ce005b72da Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 26 Sep 2024 12:24:52 +0100 Subject: [PATCH 001/257] amdgpu: remove exec permissions on source files This patch fixes the following warnings that appear when building an RPM package: + /usr/lib/rpm/redhat/brp-mangle-shebangs *** WARNING: ./usr/src/debug/criu-4.0-1.fc42.x86_64/plugins/amdgpu/amdgpu_plugin_util.c is executable but has no shebang, removing executable bit *** WARNING: ./usr/src/debug/criu-4.0-1.fc42.x86_64/plugins/amdgpu/amdgpu_plugin_util.h is executable but has no shebang, removing executable bit Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_util.c | 0 plugins/amdgpu/amdgpu_plugin_util.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 plugins/amdgpu/amdgpu_plugin_util.c mode change 100755 => 100644 plugins/amdgpu/amdgpu_plugin_util.h diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c old mode 100755 new mode 100644 diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h old mode 100755 new mode 100644 From 4f8f6f2883689546c4f0f793ac5d5dc6bd5a937e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 26 Sep 2024 10:59:32 +0100 Subject: [PATCH 002/257] Makefile.config: set CR_PLUGIN_DEFAULT variable By default, CRIU uses the path "/usr/lib/criu" to install and load plugins at runtime. This path is defined by the `PLUGINDIR` variable in Makefile.install and `CR_PLUGIN_DEFAULT` in `criu/include/plugin.h`. However, some distribution packages might install the CRIU plugins at "/usr/lib64/criu" instead. This patch updates the makefile to align the path defined by `CR_PLUGIN_DEFAULT` with the value of `PLUGINDIR`. 
Signed-off-by: Radostin Stoyanov --- Makefile.config | 4 ++++ plugins/amdgpu/Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile.config b/Makefile.config index 52c250b21..5ab689d41 100644 --- a/Makefile.config +++ b/Makefile.config @@ -59,6 +59,10 @@ endif export LIBS += $(LIBS_FEATURES) +ifneq ($(PLUGINDIR),) + FEATURE_DEFINES += -DCR_PLUGIN_DEFAULT="\"$(PLUGINDIR)\"" +endif + CONFIG_FILE = .config $(CONFIG_FILE): diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 7d3388b80..a20d1d163 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -15,7 +15,7 @@ DEPS_NOK := ; __nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk -PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" +PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu ifeq ($(CONFIG_AMDGPU),y) From 3322d1e94c8a91d795a1f341de07a1c130dce254 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 13 Jun 2024 20:00:09 +0530 Subject: [PATCH 003/257] images: Add protobuf definition for pidfd We only use the last pid from the list in NSpid entry (from /proc//fdinfo/) while restoring pidfds. The last pid refers to the pid of the process in the most deeply nested pid namespace. Since CRIU does not currently support nested pid namespaces, this entry is the one we want. After Linux 6.9, inode numbers can be used to compare pidfds. pidfds referring to the same process will have the same inode numbers. We use inode numbers to restore pidfds that point to dead processes. 
Signed-off-by: Bhavik Sachdev --- images/Makefile | 1 + images/fdinfo.proto | 3 +++ images/pidfd.proto | 13 +++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 images/pidfd.proto diff --git a/images/Makefile b/images/Makefile index ca85b1a21..855d894da 100644 --- a/images/Makefile +++ b/images/Makefile @@ -73,6 +73,7 @@ proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o proto-obj-y += rseq.o +proto-obj-y += pidfd.o CFLAGS += -iquote $(obj)/ diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 88f1c1186..32ec13cf4 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -17,6 +17,7 @@ import "ext-file.proto"; import "sk-unix.proto"; import "fifo.proto"; import "pipe.proto"; +import "pidfd.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; @@ -42,6 +43,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + PIDFD = 20; /* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -78,4 +80,5 @@ message file_entry { optional tty_file_entry tty = 19; optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; + optional pidfd_entry pidfd = 22; } diff --git a/images/pidfd.proto b/images/pidfd.proto new file mode 100644 index 000000000..a9da3e454 --- /dev/null +++ b/images/pidfd.proto @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "fown.proto"; + +message pidfd_entry { + required uint32 id = 1; + required uint32 ino = 2; + required uint32 flags = 3; + required int32 nspid = 4; + required fown_entry fown = 5; +} From 1ce408ffa4a723e7110cbc0d68c68bfc5871b287 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 13 Jun 2024 21:18:51 +0530 Subject: [PATCH 004/257] criu: Support C/R of pidfds Process file descriptors (pidfds) were introduced to provide a stable handle on a process. They solve the problem of pid recycling. 
For a detailed explanation, see https://lwn.net/Articles/801319/ and http://www.corsix.org/content/what-is-a-pidfd Before Linux 6.9, anonymous inodes were used for the implementation of pidfds. So, we detect them in a fashion similar to other fd types that use anonymous inodes by calling `readlink()`. After 6.9, pidfs (a file system for pidfds) was introduced. In 6.9 `S_ISREG()` returned true for pidfds, but this again changed with 6.10. (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/pidfs.c?h=v6.11-rc2#n285) After this change, pidfs inodes have no file type in st_mode in userspace. We use `PID_FS_MAGIC` to detect pidfds for kernel >= 6.9. Hence, the check for pidfds occurs before the check for regular files. For pidfds that refer to dead processes, we lose the pid of the process as the Pid and NSpid fields in /proc/<pid>/fdinfo/<pidfd> change to -1. So, we create a temporary process for each unique inode and open pidfds that refer to this process. After all pidfds have been opened we kill this temporary process. This commit does not include support for pidfds that point to a specific thread, i.e., pidfds opened with `PIDFD_THREAD` flag. 
Fixes: #2258 Signed-off-by: Bhavik Sachdev --- criu/Makefile.crtools | 1 + criu/cr-restore.c | 3 +- criu/files.c | 17 +++ criu/image-desc.c | 1 + criu/include/fs-magic.h | 4 + criu/include/image-desc.h | 1 + criu/include/magic.h | 1 + criu/include/pidfd.h | 16 ++ criu/include/protobuf-desc.h | 1 + criu/pidfd.c | 287 +++++++++++++++++++++++++++++++++++ criu/proc_parse.c | 29 ++++ criu/protobuf-desc.c | 1 + 12 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 criu/include/pidfd.h create mode 100644 criu/pidfd.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 3ddf45cd7..ba6132d2f 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -101,6 +101,7 @@ obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o obj-y += hugetlb.o +obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 4d4dfbe6f..d5b6c8037 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -79,6 +79,7 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" +#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" @@ -280,7 +281,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo }; /* These images are required to restore namespaces */ diff --git a/criu/files.c b/criu/files.c index 3b653e24b..a57fb860f 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -544,6 +545,8 @@ static int dump_one_file(struct pid *pid, 
int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_pidfd_link(link)) + ops = &pidfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -554,6 +557,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } + if (p.fs_type == PID_FS_MAGIC) { + ops = &pidfd_dump_ops; + return do_dump_gen_file(&p, lfd, ops, e); + } + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; @@ -1778,6 +1786,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; + case FD_TYPES__PIDFD: + ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); + break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); @@ -1800,5 +1811,11 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); + + if (init_dead_pidfd_hash()) { + pr_err("Could not initialise hash map for dead pidfds\n"); + return -1; + } + return collect_image(&files_cinfo); } diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c098..2d87c7381 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ad34f4891..ffc0455d5 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -57,4 +57,8 @@ #define OVERLAYFS_SUPER_MAGIC 0x794c7630 #endif +#ifndef PID_FS_MAGIC +#define 
PID_FS_MAGIC 0x50494446 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 9f369be64..79e1ac111 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -113,6 +113,7 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, + CR_FD_PIDFD, CR_FD_AUTOFS, diff --git a/criu/include/magic.h b/criu/include/magic.h index 0e8c37234..6f0aff26d 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -100,6 +100,7 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define PIDFD_MAGIC 0x54435556 /* Ufa */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h new file mode 100644 index 000000000..4d2d71700 --- /dev/null +++ b/criu/include/pidfd.h @@ -0,0 +1,16 @@ +#ifndef __CR_PIDFD_H__ +#define __CR_PIDFD_H__ + +#include "files.h" +#include "pidfd.pb-c.h" + +extern const struct fdtype_ops pidfd_dump_ops; +extern struct collect_image_info pidfd_cinfo; +extern int is_pidfd_link(char *link); +extern int init_dead_pidfd_hash(void); +struct pidfd_dump_info { + PidfdEntry pidfe; + pid_t pid; +}; + +#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101..c4241be55 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,7 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_PIDFD, /* PB_AUTOGEN_STOP */ diff --git a/criu/pidfd.c b/criu/pidfd.c new file mode 100644 index 000000000..fdf5dec60 --- /dev/null +++ b/criu/pidfd.c @@ -0,0 +1,287 @@ +#include "common/lock.h" +#include "imgset.h" +#include "pidfd.h" +#include "fdinfo.h" +#include "pidfd.pb-c.h" +#include "protobuf.h" +#include "pstree.h" +#include +#include +#include +#include "common/bug.h" +#include "rst-malloc.h" + +#undef LOG_PREFIX +#define 
LOG_PREFIX "pidfd: " + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +struct pidfd_info { + PidfdEntry *pidfe; + struct file_desc d; +}; + +struct dead_pidfd { + unsigned int ino; + int pid; + size_t count; + mutex_t pidfd_lock; + struct hlist_node hash; +}; + +#define DEAD_PIDFD_HASH_SIZE 32 +static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; +static mutex_t *dead_pidfd_hash_lock; + +int init_dead_pidfd_hash(void) +{ + for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) + INIT_HLIST_HEAD(&dead_pidfd_hash[i]); + + dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock)); + if (!dead_pidfd_hash_lock) + return -1; + + mutex_init(dead_pidfd_hash_lock); + + return 0; +} + +static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) +{ + struct dead_pidfd *dead; + struct hlist_head *chain; + + mutex_lock(dead_pidfd_hash_lock); + chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; + hlist_for_each_entry(dead, chain, hash) { + if (dead->ino == ino) { + mutex_unlock(dead_pidfd_hash_lock); + return dead; + } + } + mutex_unlock(dead_pidfd_hash_lock); + + return NULL; +} + +int is_pidfd_link(char *link) +{ + /* + * pidfs was introduced in Linux 6.9 + * before which anonymous-inodes were used + */ + return is_anon_link_type(link, "[pidfd]"); +} + +static void pr_info_pidfd(char *action, PidfdEntry *pidfe) +{ + pr_info("%s: id %#08x flags %u NSpid %d ino %u\n", + action, pidfe->id, pidfe->flags, pidfe->nspid, pidfe->ino + ); +} + +static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p) +{ + struct pidfd_dump_info pidfd_info = {.pidfe = PIDFD_ENTRY__INIT}; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info)) + return -1; + + if (p->flags & PIDFD_THREAD) { + pr_err("PIDFD_THREAD flag is currently not supported\n"); + return -1; + } + + /* + * Check if the pid pidfd refers to is part of process tree + * This ensures the process will exist on restore. 
+ */ + if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) { + pr_err("pidfd pid %d is not a part of process tree..\n", + pidfd_info.pid); + return -1; + } + + pidfd_info.pidfe.id = id; + pidfd_info.pidfe.flags = (p->flags & ~O_RDWR); + pidfd_info.pidfe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__PIDFD; + fe.id = pidfd_info.pidfe.id; + fe.pidfd = &pidfd_info.pidfe; + + pr_info_pidfd("Dumping", &pidfd_info.pidfe); + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops pidfd_dump_ops = { + .type = FD_TYPES__PIDFD, + .dump = dump_one_pidfd, +}; + +static int pidfd_open(pid_t pid, int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int create_tmp_process(void) +{ + int tmp_process; + tmp_process = fork(); + if (tmp_process < 0) { + pr_perror("Could not fork"); + return -1; + } else if (tmp_process == 0) { + while(1) + sleep(1); + } + return tmp_process; +} + +static int free_dead_pidfd(struct dead_pidfd *dead) +{ + int status; + + if (kill(dead->pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", + dead->pid); + goto err; + } + + if (waitpid(dead->pid, &status, 0) != dead->pid) { + pr_perror("Could not wait on temporary process with pid: %d", + dead->pid); + goto err; + } + + if (!WIFSIGNALED(status)) { + pr_err("Expected temporary process to be terminated by a signal\n"); + goto err; + } + + if (WTERMSIG(status) != SIGKILL) { + pr_err("Expected temporary process to be terminated by SIGKILL\n"); + goto err; + } + + mutex_lock(dead_pidfd_hash_lock); + hlist_del(&dead->hash); + mutex_unlock(dead_pidfd_hash_lock); + return 0; +err: + return -1; +} + +static int open_one_pidfd(struct file_desc *d, int *new_fd) +{ + struct pidfd_info *info; + struct dead_pidfd *dead = NULL; + int pidfd; + + info = container_of(d, struct pidfd_info, d); + if (info->pidfe->nspid != -1) { + pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags); + if (pidfd < 0) { 
+ pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + goto err_close; + } + goto out; + } + + dead = lookup_dead_pidfd(info->pidfe->ino); + BUG_ON(!dead); + + mutex_lock(&dead->pidfd_lock); + BUG_ON(dead->count == 0); + dead->count--; + if (dead->pid == -1) { + dead->pid = create_tmp_process(); + if (dead->pid < 0) { + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + } + + pidfd = pidfd_open(dead->pid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + + if (dead->count == 0) { + if (free_dead_pidfd(dead)) { + pr_err("Failed to delete dead_pidfd struct\n"); + mutex_unlock(&dead->pidfd_lock); + close(pidfd); + goto err_close; + } + } + mutex_unlock(&dead->pidfd_lock); + +out: + if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { + goto err_close; + } + + *new_fd = pidfd; + return 0; +err_close: + pr_err("Can't create pidfd %#08x NSpid: %d flags: %u\n", + info->pidfe->id, info->pidfe->nspid, info->pidfe->flags); + return -1; +} + +static struct file_desc_ops pidfd_desc_ops = { + .type = FD_TYPES__PIDFD, + .open = open_one_pidfd +}; + +static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct dead_pidfd *dead; + struct pidfd_info *info = obj; + + info->pidfe = pb_msg(msg, PidfdEntry); + pr_info_pidfd("Collected ", info->pidfe); + + if (info->pidfe->nspid != -1) + goto out; + + dead = lookup_dead_pidfd(info->pidfe->ino); + if (dead) { + mutex_lock(&dead->pidfd_lock); + dead->count++; + mutex_unlock(&dead->pidfd_lock); + goto out; + } + + dead = shmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate shared memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->ino = info->pidfe->ino; + dead->count = 1; + dead->pid = -1; + mutex_init(&dead->pidfd_lock); + + mutex_lock(dead_pidfd_hash_lock); + hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % 
DEAD_PIDFD_HASH_SIZE]); + mutex_unlock(dead_pidfd_hash_lock); +out: + return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); +} + +struct collect_image_info pidfd_cinfo = { + .fd_type = CR_FD_PIDFD, + .pb_type = PB_PIDFD, + .priv_size = sizeof(struct pidfd_info), + .collect = collect_one_pidfd, +}; diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 92655a484..eb869dbbd 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -42,10 +42,12 @@ #include "fault-injection.h" #include "memfd.h" #include "hugetlb.h" +#include "pidfd.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" +#include "pidfd.pb-c.h" #include "plugin.h" #include @@ -2165,6 +2167,33 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) if (ret) goto parse_err; + entry_met = true; + continue; + } + if (fdinfo_field(str, "ino") || fdinfo_field(str, "NSpid") || fdinfo_field(str, "Pid")) { + struct pidfd_dump_info *pidfd_info = arg; + + if (type != FD_TYPES__PIDFD) + continue; + + if (fdinfo_field(str, "ino")) { + ret = sscanf(str, "%*s %u", &pidfd_info->pidfe.ino); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "Pid")) { + ret = sscanf(str, "%*s %d", &pidfd_info->pid); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "NSpid")) { + char *last; + + last = strrchr(str, '\t'); + if (!last || sscanf(last, "%d", &pidfd_info->pidfe.nspid) != 1) { + pr_err("Unable to parse: %s\n", str); + goto parse_err; + } + } + entry_met = true; continue; } diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5b..e0dbfccc2 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "images/pidfd.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; From 3096df9ea3cfd494905bf0497a31c77688a49cf6 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Tue, 9 Jul 2024 19:58:29 
+0530 Subject: [PATCH 005/257] zdtm: Check pidfd fdinfo entry is consistent Ensures that entries in /proc//fdinfo/ are same. Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_self.c | 140 ++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 test/zdtm/static/pidfd_self.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 1e891f0ba..a2e852d73 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -53,6 +53,7 @@ TST_NOFILE := \ shm \ shm-mp \ ptrace_sig \ + pidfd_self \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_self.c b/test/zdtm/static/pidfd_self.c new file mode 100644 index 000000000..2730ee123 --- /dev/null +++ b/test/zdtm/static/pidfd_self.c @@ -0,0 +1,140 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check pidfd /proc/self/fdinfo/ entry remains consistent after checkpoint/restore\n"; +const char *test_author = "Bhavik Sachdev "; + +struct pidfd_status { + unsigned int flags; + pid_t pid; +}; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static void show_pidfd(char *prefix, struct pidfd_status *s) +{ + test_msg("\n\t%s\n\tflags: 0%o\n\tpid: %d\n", prefix, s->flags, s->pid); +} + +static int parse_self_fdinfo(int pidfd, struct pidfd_status *s) +{ + char buf[256]; + int ret = -1; + FILE *f; + + sprintf(buf, "/proc/self/fdinfo/%d", pidfd); + f = fopen(buf, "r"); + if (!f) { + perror("Can't open /proc/self/fdinfo/ to parse"); + return -1; + } + + memset(s, 0, sizeof(*s)); + + /* + * flags: file access mode (octal) 02000002 => [O_RDWR | O_CLOEXEC] + * pid: the pid to which we have pidfd open + */ + while (fgets(buf, sizeof(buf), f)) { + if (!fgets(buf, sizeof(buf), f)) + 
goto parse_err; + + if (sscanf(buf, "flags: 0%o", &s->flags) != 1) { + goto parse_err; + } + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (sscanf(buf, "Pid: %d", &s->pid) != 1) + goto parse_err; + ret = 0; + break; + } + + if (ret) + goto parse_err; +err: + fclose(f); + return ret; + +parse_err: + pr_perror("Format error"); + goto err; +} + +static int check_pidfd(int fd, struct pidfd_status *old) +{ + struct pidfd_status new; + + if (parse_self_fdinfo(fd, &new)) + return -1; + + show_pidfd("restored", &new); + + if (old->flags != new.flags || old->pid != new.pid) + return -1; + + return 0; +} + +int main(int argc, char* argv[]) +{ + struct pidfd_status old; + int pidfd, ret; + + test_init(argc, argv); + + pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + parse_self_fdinfo(pidfd, &old); + + show_pidfd("old", &old); + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = check_pidfd(pidfd, &old); + if (ret) { + fail(); + goto err; + } + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + fail(); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +} From 2899d46000be4ee85af7000068b2414400ed66be Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Tue, 9 Jul 2024 20:01:00 +0530 Subject: [PATCH 006/257] zdtm: Check pidfd can send signal after C/R Ensure `pidfd_send_signal()` syscall works as expected after C/R. 
Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_child.c | 66 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 test/zdtm/static/pidfd_child.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index a2e852d73..0268ae492 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_child \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_child.c b/test/zdtm/static/pidfd_child.c new file mode 100644 index 000000000..ec559605d --- /dev/null +++ b/test/zdtm/static/pidfd_child.c @@ -0,0 +1,66 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks pidfd sends signal to child process after restore\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + int pidfd, status; + pid_t child; + + test_init(argc, argv); + + child = fork(); + if (child < 0) { + pr_perror("Unable to fork a new process"); + return 1; + } else if (child == 0) { + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + fail("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + goto err_close; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + goto err_close; + } + + pass(); + close(pidfd); + return 0; +err_close: + close(pidfd); + return 1; +} From 
3f30ec0eda1a98ec18c8b102c49a0b7988c92c6d Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Mon, 8 Jul 2024 22:25:00 +0530 Subject: [PATCH 007/257] zdtm: Check pidfd can kill descendant processes Validate that pidfds can been used to send signals to different processes after C/R using the `pidfd_send_signal()` syscall. Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_kill.c | 128 ++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 test/zdtm/static/pidfd_kill.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 0268ae492..ab45b580a 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -55,6 +55,7 @@ TST_NOFILE := \ ptrace_sig \ pidfd_self \ pidfd_child \ + pidfd_kill \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_kill.c b/test/zdtm/static/pidfd_kill.c new file mode 100644 index 000000000..6232d033a --- /dev/null +++ b/test/zdtm/static/pidfd_kill.c @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Kill child and grandchild process using pidfds\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int wait_for_child(int child) +{ + int status; + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + test_msg("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + } + + return 0; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, gchild, cpidfd, gpidfd, gchild_pid, ret; + int p[2]; + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + test_init(argc, argv); + + child = 
fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + gchild = fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } + + if (gchild == 0) { + test_waitsig(); + return 0; + } + + close(p[READ]); + if (write(p[WRITE], &gchild, sizeof(gchild)) + != sizeof(gchild)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + test_waitsig(); + return wait_for_child(gchild); + } + + cpidfd = pidfd_open(child, 0); + if (cpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &gchild_pid, sizeof(gchild_pid)) + != sizeof(gchild_pid)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + + gpidfd = pidfd_open(gchild_pid, 0); + if (gpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(gpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + if (pidfd_send_signal(cpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + ret = wait_for_child(child); + if (ret) + goto fail_close; + + pass(); + close(cpidfd); + close(gpidfd); + return 0; + +fail_close: + fail(); + close(cpidfd); + close(gpidfd); + return 1; +} From 2e6f348458b83a5228693d31dd53611df56fd8f3 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 25 Jul 2024 01:12:36 +0530 Subject: [PATCH 008/257] zdtm: Check dead pidfd is restored correctly After, C/R of pidfds that point to dead processes their inodes might change. But if two pidfds point to same dead process they should continue to do so after C/R. This test ensures that this happens by calling `statx()` on pidfds after C/R and then comparing their inode numbers. Support for comparing pidfds by using `statx()` and inode numbers was introduced alongside pidfs. So if `f_type` of pidfd is not equal to `PID_FS_MAGIC` then we skip this test. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_dead.c | 244 ++++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+) create mode 100644 test/zdtm/static/pidfd_dead.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index ab45b580a..20e4bc272 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_dead \ pidfd_child \ pidfd_kill \ pipe00 \ diff --git a/test/zdtm/static/pidfd_dead.c b/test/zdtm/static/pidfd_dead.c new file mode 100644 index 000000000..9c825899d --- /dev/null +++ b/test/zdtm/static/pidfd_dead.c @@ -0,0 +1,244 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of pidfds that point to dead processes\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- grandchild + * + * main opens a pidfd for both child and grandchild. + * Before C/R we kill both child and grandchild. + * We end up with two unique dead pidfds. 
+ */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int open_pidfd_pair(int pidfd[2], int pid) +{ + pidfd[0] = pidfd_open(pid, 0); + if (pidfd[0] < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + pidfd[1] = pidfd_open(pid, 0); + if (pidfd[1] < 0) { + close(pidfd[0]); + pr_perror("pidfd_open() failed"); + return 1; + } + return 0; +} + +static int compare_pidfds(int pidfd[2]) +{ + /* + * After linux 6.9 we can compare inode numbers + * to determine if two pidfds point to the same process. + * While the inode number may change before and after C/R + * pidfds pointing to the same pid should have the same inode number. + */ + struct statx stats[2]; + statx(pidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(pidfd[1], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino != stats[1].stx_ino) + return 1; + return 0; +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, ret, gchild, p[2], status; + int cpidfd[2], gpidfd[2]; + struct statx stats[2]; + + test_init(argc, argv); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. 
skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild = test_fork(); + close(p[READ]); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[WRITE]); + while(1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + + return 0; + } + } + + ret = open_pidfd_pair(cpidfd, child); + if (ret) + return 1; + + close(p[WRITE]); + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[READ]); + + ret = open_pidfd_pair(gpidfd, gchild); + if (ret) + return 1; + + /* + * We kill grandchild and child processes only after opening pidfds. 
+ */ + if (pidfd_send_signal(gpidfd[0], SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + goto fail_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto fail_close; + } + + if (!WIFEXITED(status)) { + fail("Expected child to exit normally"); + goto fail_close; + } + + if (WEXITSTATUS(status) != 0) { + fail("Expected child to exit with 0"); + goto fail_close; + } + usleep(1000); + + if (kill(gchild, 0) != -1 && errno != ESRCH) { + fail("Expected grand child to not exist"); + goto fail_close; + } + + if (kill(child, 0) != -1 && errno != ESRCH) { + fail("Expected child to not exist"); + goto fail_close; + } + + test_daemon(); + test_waitsig(); + + ret = compare_pidfds(cpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + ret = compare_pidfds(gpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + statx(cpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(gpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino == stats[1].stx_ino) { + fail("pidfds pointing to diff pids should have diff inodes"); + goto fail_close; + } + + pass(); + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 0; + +fail_close: + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 1; +} From 7a64004dc81d6d8fe8e5dbb8e31c787b54c96982 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Fri, 16 Aug 2024 21:20:57 +0530 Subject: [PATCH 009/257] zdtm: Check fd from pidfd_getfd is C/Red correctly We get the read end of a pipe using `pidfd_getfd` and check if we can read from it after C/R. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/fd_from_pidfd.c | 108 +++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 test/zdtm/static/fd_from_pidfd.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 20e4bc272..f4dbb1d96 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -57,6 +57,7 @@ TST_NOFILE := \ pidfd_dead \ pidfd_child \ pidfd_kill \ + fd_from_pidfd \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/fd_from_pidfd.c b/test/zdtm/static/fd_from_pidfd.c new file mode 100644 index 000000000..1f863d6c0 --- /dev/null +++ b/test/zdtm/static/fd_from_pidfd.c @@ -0,0 +1,108 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if fd obtained from pidfd_get_fd is C/R correctly\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) +{ + return syscall(__NR_pidfd_getfd, pidfd, targetfd, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int pidfd, child, p[2], child_read, read_data, status; + int data = 42; + + test_init(argc, argv); + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + close(p[WRITE]); + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + close(p[READ]); + if (write(p[WRITE], &data, sizeof(data)) != sizeof(data)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + child_read = pidfd_getfd(pidfd, p[READ], 0); + if 
(child_read < 0) { + pr_perror("pidfd_getfd"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (read(child_read, &read_data, sizeof(read_data)) != sizeof(read_data)) { + pr_perror("read"); + goto err_close; + } + + if (read_data != data) { + fail("data from fd obtained using pidfd_getfd incorrect"); + goto err_close; + } + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + pr_perror("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + + pass(); + close(child_read); + close(pidfd); + return 0; +err_close: + close(child_read); + close(pidfd); + return 1; +} From f29e655df9d5320c8bcab1ec26bac2b0315af4a5 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Mon, 26 Aug 2024 20:56:14 +0530 Subject: [PATCH 010/257] zdtm: Check pidfd for thread is valid after C/R We open a pidfd to a thread using `PIDFD_THREAD` flag and after C/R ensure that we can send signals using it with `PIDFD_SIGNAL_THREAD`. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_of_thread.c | 114 ++++++++++++++++++++++++++ test/zdtm/static/pidfd_of_thread.desc | 1 + 3 files changed, 116 insertions(+) create mode 100644 test/zdtm/static/pidfd_of_thread.c create mode 100644 test/zdtm/static/pidfd_of_thread.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index f4dbb1d96..44ac64fe5 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_of_thread \ pidfd_dead \ pidfd_child \ pidfd_kill \ diff --git a/test/zdtm/static/pidfd_of_thread.c b/test/zdtm/static/pidfd_of_thread.c new file mode 100644 index 000000000..d232c7ac1 --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check C/R of pidfds that point to threads\n"; +const char *test_author = "Bhavik Sachdev "; + +/* see also: https://codebrowser.dev/glibc/glibc/sysdeps/unix/sysv/linux/tst-clone3.c.html */ + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +#ifndef PIDFD_SIGNAL_THREAD +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#endif + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int thread_func(void *a) +{ + test_waitsig(); + return 0; +} + +#define CTID_INIT_VAL 1 + +int main(int argc, char* argv[]) +{ + char st[64 * 1024] __attribute__ ((aligned)); + pid_t tid; + int pidfd, test_pidfd; + futex_t exited; + + int clone_flags = 
CLONE_THREAD; + clone_flags |= CLONE_VM | CLONE_SIGHAND; + clone_flags |= CLONE_CHILD_CLEARTID; + + test_init(argc, argv); + + test_pidfd = pidfd_open(getpid(), 0); + if (test_pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + /* PIDFD_THREAD, PIDFD_SIGNAL_THREAD are supported only with pidfs */ + if (get_fs_type(test_pidfd) != PID_FS_MAGIC) { + test_daemon(); + test_waitsig(); + skip("pidfs not supported."); + close(test_pidfd); + return 0; + } + close(test_pidfd); + + futex_set(&exited, CTID_INIT_VAL); + + tid = clone(thread_func, st + sizeof(st), clone_flags, NULL, NULL, NULL, &(exited.raw)); + if (tid == -1) { + pr_perror("clone() failed"); + return 1; + } + + test_msg("Successfully created a thread with tid: %d\n", tid); + pidfd = pidfd_open(tid, PIDFD_THREAD); + if (pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, PIDFD_SIGNAL_THREAD)) { + pr_perror("pidfd_send_signal() failed"); + fail(); + close(pidfd); + return 1; + } + + test_msg("Waiting for thread to exit\n"); + futex_wait_until(&exited, 0); + + pass(); + close(pidfd); + return 0; +} diff --git a/test/zdtm/static/pidfd_of_thread.desc b/test/zdtm/static/pidfd_of_thread.desc new file mode 100644 index 000000000..802caed65 --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.desc @@ -0,0 +1 @@ +{'flags': 'noauto crfail'} From 88aa7e2c10a83a61226447cfe41d9e50ce001178 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 12:39:18 +0100 Subject: [PATCH 011/257] make/lint: use 'ruff check ' The command `ruff ` has been deprecated and removed: https://astral.sh/blog/ruff-v0.5.0#removed-deprecated-features Signed-off-by: Radostin Stoyanov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 97b4dc211..46d9adef3 100644 --- a/Makefile +++ b/Makefile @@ -437,7 +437,7 @@ help: ruff: @ruff --version - ruff ${RUFF_FLAGS} 
--config=scripts/ruff.toml \ + ruff check ${RUFF_FLAGS} --config=scripts/ruff.toml \ test/zdtm.py \ test/inhfd/*.py \ test/others/rpc/config_file.py \ From b524dab32f03f15a66b637057233ac28ef7b0091 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 13:58:41 +0100 Subject: [PATCH 012/257] pycriu: fix lint errors This patch fixes the following errors reported by ruff: lib/pycriu/images/pb2dict.py:307:24: E721 Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks | 305 | elif field.type in _basic_cast: 306 | cast = _basic_cast[field.type] 307 | if pretty and (cast == int): | ^^^^^^^^^^^ E721 308 | if is_hex: 309 | # Fields that have (criu).hex = true option set | lib/pycriu/images/pb2dict.py:379:13: E721 Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks | 377 | elif field.type in _basic_cast: 378 | cast = _basic_cast[field.type] 379 | if (cast == int) and is_string(value): | ^^^^^^^^^^^ E721 380 | if _marked_as_dev(field): 381 | return encode_dev(field, value) | Signed-off-by: Radostin Stoyanov --- lib/pycriu/images/pb2dict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 0d1a24692..e3dd95ac0 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -304,7 +304,7 @@ def _pb2dict_cast(field, value, pretty=False, is_hex=False): return field.enum_type.values_by_number.get(value, None).name elif field.type in _basic_cast: cast = _basic_cast[field.type] - if pretty and (cast == int): + if pretty and cast is int: if is_hex: # Fields that have (criu).hex = true option set # should be stored in hex string format. 
@@ -376,7 +376,7 @@ def _dict2pb_cast(field, value): return field.enum_type.values_by_name.get(value, None).number elif field.type in _basic_cast: cast = _basic_cast[field.type] - if (cast == int) and is_string(value): + if cast is int and is_string(value): if _marked_as_dev(field): return encode_dev(field, value) From 5335b35f72da90a62ad3d771ca175d59ab1bd8b1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 4 Oct 2024 12:14:29 +0100 Subject: [PATCH 013/257] images/inventory: add field for enabled plugins This patch extends the inventory image with a `plugins` field that contains an array of plugins which were used during checkpoint, for example, to save GPU state. In particular, the CUDA and AMDGPU plugins are added to this field only when the checkpoint contains GPU state. This allows us to disable unnecessary plugins during restore, show appropriate error messages if required CRIU plugins are missing, and migrate a process that does not use a GPU from a GPU-enabled system to a CPU-only environment. We use the `optional plugins_entry` for backwards compatibility. This entry allows us to distinguish between an *empty* and a *missing* field: - When the field is missing, it indicates that the checkpoint was created with a previous version of CRIU, and all plugins should be *enabled* during restore. - When the field is empty, it indicates that no plugins were used during checkpointing. Thus, all plugins can be *disabled* during restore.
Signed-off-by: Radostin Stoyanov --- criu/cr-restore.c | 6 +- criu/image.c | 124 +++++++++++++++++++++++++++++++++ criu/include/image.h | 4 ++ criu/plugin.c | 3 + images/inventory.proto | 8 +++ plugins/amdgpu/amdgpu_plugin.c | 31 +++++++++ plugins/cuda/cuda_plugin.c | 22 +++++- 7 files changed, 193 insertions(+), 5 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index d5b6c8037..646300bdb 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2354,12 +2354,12 @@ int cr_restore_tasks(void) if (init_service_fd()) return 1; - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) - return -1; - if (check_img_inventory(/* restore = */ true) < 0) goto err; + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; + if (init_stats(RESTORE_STATS)) goto err; diff --git a/criu/image.c b/criu/image.c index 9fb390ab7..9589167fb 100644 --- a/criu/image.c +++ b/criu/image.c @@ -26,6 +26,14 @@ TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +struct inventory_plugin { + struct list_head node; + char *name; +}; + +struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list); +static int n_inventory_plugins; + int check_img_inventory(bool restore) { int ret = -1; @@ -99,6 +107,19 @@ int check_img_inventory(bool restore) } else { opts.network_lock_method = he->network_lock_method; } + + if (!he->plugins_entry) { + /* backwards compatibility: if the 'plugins_entry' field is missing, + * all plugins should be enabled during restore. + */ + n_inventory_plugins = -1; + } else { + PluginsEntry *pe = he->plugins_entry; + for (int i = 0; i < pe->n_plugins; i++) { + if (add_inventory_plugin(pe->plugins[i])) + goto out_err; + } + } } ret = 0; @@ -110,8 +131,92 @@ out_close: return ret; } +/** + * Check if the 'plugins' field in the inventory image contains + * the specified plugin name. If found, the plugin is removed + * from the linked list. 
+ */ +bool check_and_remove_inventory_plugin(const char *name, size_t n) +{ + if (n_inventory_plugins == -1) + return true; /* backwards compatibility */ + + if (n_inventory_plugins > 0) { + struct inventory_plugin *p, *tmp; + + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + if (!strncmp(name, p->name, n)) { + xfree(p->name); + list_del(&p->node); + xfree(p); + n_inventory_plugins--; + return true; + } + } + } + + return false; +} + +/** + * We expect during restore all loaded plugins to be removed from + * the inventory_plugins_list. If the list is not empty, show an + * error message for each missing plugin. + */ +int check_inventory_plugins(void) +{ + struct inventory_plugin *p; + + if (n_inventory_plugins <= 0) + return 0; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pr_err("Missing required plugin: %s\n", p->name); + } + + return -1; +} + +/** + * Add plugin name to the inventory image. These values + * can be used to identify required plugins during restore. 
+ */ +int add_inventory_plugin(const char *name) +{ + struct inventory_plugin *p; + + p = xmalloc(sizeof(struct inventory_plugin)); + if (p == NULL) + return -1; + + p->name = xstrdup(name); + if (!p->name) { + xfree(p); + return -1; + } + list_add(&p->node, &inventory_plugins_list); + n_inventory_plugins++; + + return 0; +} + +void free_inventory_plugins_list(void) +{ + struct inventory_plugin *p, *tmp; + + if (!list_empty(&inventory_plugins_list)) { + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + xfree(p->name); + list_del(&p->node); + xfree(p); + } + } + n_inventory_plugins = 0; +} + int write_img_inventory(InventoryEntry *he) { + PluginsEntry pe = PLUGINS_ENTRY__INIT; struct cr_img *img; int ret; @@ -121,8 +226,27 @@ int write_img_inventory(InventoryEntry *he) if (!img) return -1; + if (!list_empty(&inventory_plugins_list)) { + struct inventory_plugin *p; + int i = 0; + + pe.n_plugins = n_inventory_plugins; + pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *)); + if (!pe.plugins) + return -1; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pe.plugins[i] = p->name; + i++; + } + } + he->plugins_entry = &pe; + ret = pb_write_one(img, he, PB_INVENTORY); + free_inventory_plugins_list(); + xfree(pe.plugins); + xfree(he->root_ids); close_image(img); if (ret < 0) diff --git a/criu/include/image.h b/criu/include/image.h index a17aae35c..afa7d5e12 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -177,4 +177,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size); extern void close_image(struct cr_img *); +extern int add_inventory_plugin(const char *name); +extern int check_inventory_plugins(void); +extern bool check_and_remove_inventory_plugin(const char *name, size_t n); + #endif /* __CR_IMAGE_H__ */ diff --git a/criu/plugin.c b/criu/plugin.c index 58b5ea5bf..65e79a069 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -256,6 +256,9 @@ int cr_plugin_init(int stage) goto err; } + if (stage == 
CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins()) + goto err; + exit_code = 0; err: closedir(d); diff --git a/images/inventory.proto b/images/inventory.proto index a735bad1d..7f655031b 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -10,6 +10,13 @@ enum lsmtype { APPARMOR = 2; } +// It is not possible to distinguish between an empty repeated field +// and unset repeated field. To solve this problem and provide backwards +// compabibility, we use the 'plugins_entry' message. +message plugins_entry { + repeated string plugins = 12; +}; + message inventory_entry { required uint32 img_version = 1; optional bool fdinfo_per_id = 2; @@ -21,4 +28,5 @@ message inventory_entry { optional uint32 pre_dump_mode = 9; optional bool tcp_close = 10; optional uint32 network_lock_method = 11; + optional plugins_entry plugins_entry = 12; } diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index b56ba6d14..96c086162 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -60,6 +60,10 @@ static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; +bool plugin_added_to_inventory = false; + +bool plugin_disabled = false; + /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -332,6 +336,13 @@ void getenv_size_t(const char *var, size_t *value) int amdgpu_plugin_init(int stage) { + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); topology_init(&src_topology); @@ -365,6 +376,9 @@ int amdgpu_plugin_init(int stage) void amdgpu_plugin_fini(int stage, int ret) { + if (plugin_disabled) + return; + pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); if (stage == CR_PLUGIN_STAGE__RESTORE) @@ -414,6 +428,14 @@ 
int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) if (ret) pr_perror("%s(), Can't handle VMAs of input device", __func__); + if (!ret && !plugin_added_to_inventory) { + ret = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (ret) + pr_err("Failed to add AMDGPU plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + return ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -1540,6 +1562,9 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; + if (plugin_disabled) + return -ENOTSUP; + pr_info("Initialized kfd plugin restorer with ID = %d\n", id); snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); @@ -1746,6 +1771,9 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const char *p_end; bool is_kfd = false, is_renderD = false; + if (plugin_disabled) + return -ENOTSUP; + plugin_log_msg("Enter %s\n", __func__); strncpy(path, in_path, sizeof(path)); @@ -1805,6 +1833,9 @@ int amdgpu_plugin_resume_devices_late(int target_pid) struct kfd_ioctl_criu_args args = { 0 }; int fd, exit_code = 0; + if (plugin_disabled) + return -ENOTSUP; + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 23c3f4b1a..c4fc67fa9 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -38,6 +38,8 @@ */ bool plugin_disabled = false; +bool plugin_added_to_inventory = false; + struct pid_info { int pid; char checkpointed; @@ -319,7 +321,7 @@ int cuda_plugin_checkpoint_devices(int pid) k_rtsigset_t save_sigset; if (plugin_disabled) { - return 0; + return -ENOTSUP; } restore_tid = get_cuda_restore_tid(pid); @@ -354,6 +356,15 @@ int cuda_plugin_checkpoint_devices(int pid) pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); } } + + if (!status && !plugin_added_to_inventory) { 
+ status = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (status) + pr_err("Failed to add CUDA plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); @@ -367,7 +378,7 @@ int cuda_plugin_pause_devices(int pid) char msg_buf[CUDA_CKPT_BUF_SIZE]; if (plugin_disabled) { - return 0; + return -ENOTSUP; } restore_tid = get_cuda_restore_tid(pid); @@ -463,6 +474,13 @@ int cuda_plugin_init(int stage) { int ret; + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n"); plugin_disabled = true; From 5ca4400699cc50fcd6de7d994358136b502d1374 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 13:36:22 +0100 Subject: [PATCH 014/257] zdtm: add inventory test plugins This patch adds two test plugins to verify that CRIU plugins listed in the inventory image are enabled, while those that are not listed can be disabled. 
Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 1 + test/plugins/Makefile | 16 +++++++++++++++- test/plugins/inventory_test_disabled_plugin.c | 17 +++++++++++++++++ test/plugins/inventory_test_enabled_plugin.c | 17 +++++++++++++++++ test/zdtm.py | 2 +- 5 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 test/plugins/inventory_test_disabled_plugin.c create mode 100644 test/plugins/inventory_test_enabled_plugin.c diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 38b7b5097..b472e954c 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -362,5 +362,6 @@ make -C plugins/amdgpu/ test_topology_remap ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/busyloop00 --criu-plugin inventory_test_enabled inventory_test_disabled ./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138 diff --git a/test/plugins/Makefile b/test/plugins/Makefile index 7827b655c..4f620ad50 100644 --- a/test/plugins/Makefile +++ b/test/plugins/Makefile @@ -1,5 +1,13 @@ SRC_DIR := ../../plugins -PLUGIN_TARGETS := amdgpu_plugin.so cuda_plugin.so +PLUGIN_TARGETS := inventory_test_enabled_plugin.so inventory_test_disabled_plugin.so amdgpu_plugin.so cuda_plugin.so + +ARCH := x86 + +PLUGIN_INCLUDE := -iquote../../include +PLUGIN_INCLUDE += -iquote../../criu/include +PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ +PLUGIN_INCLUDE += -iquote../../ +PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC # Silent make rules. 
Q := @ @@ -12,6 +20,12 @@ amdgpu_plugin.so: $(SRC_DIR)/amdgpu/amdgpu_plugin.so cuda_plugin.so: $(SRC_DIR)/cuda/cuda_plugin.so $(Q) cp $< $@ +inventory_test_enabled_plugin.so: inventory_test_enabled_plugin.c + $(Q) $(CC) $(PLUGIN_CFLAGS) $< -o $@ $(PLUGIN_INCLUDE) + +inventory_test_disabled_plugin.so: inventory_test_disabled_plugin.c + $(Q) $(CC) $(PLUGIN_CFLAGS) $< -o $@ $(PLUGIN_INCLUDE) + clean: $(Q) $(RM) $(PLUGIN_TARGETS) diff --git a/test/plugins/inventory_test_disabled_plugin.c b/test/plugins/inventory_test_disabled_plugin.c new file mode 100644 index 000000000..468fe924b --- /dev/null +++ b/test/plugins/inventory_test_disabled_plugin.c @@ -0,0 +1,17 @@ +#include "criu-plugin.h" +#include "image.h" + +int inventory_test_disabled_plugin_init(int stage) +{ + if (stage == CR_PLUGIN_STAGE__RESTORE) + return check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)); + + return 0; +} + +void inventory_test_disabled_plugin_fini(int stage, int ret) +{ + return; +} + +CR_PLUGIN_REGISTER("inventory_test_disabled_plugin", inventory_test_disabled_plugin_init, inventory_test_disabled_plugin_fini) \ No newline at end of file diff --git a/test/plugins/inventory_test_enabled_plugin.c b/test/plugins/inventory_test_enabled_plugin.c new file mode 100644 index 000000000..89e684e2a --- /dev/null +++ b/test/plugins/inventory_test_enabled_plugin.c @@ -0,0 +1,17 @@ +#include "criu-plugin.h" +#include "image.h" + +int inventory_test_enabled_plugin_init(int stage) +{ + if (stage == CR_PLUGIN_STAGE__RESTORE) + return !check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)); + + return add_inventory_plugin(CR_PLUGIN_DESC.name); +} + +void inventory_test_enabled_plugin_fini(int stage, int ret) +{ + return; +} + +CR_PLUGIN_REGISTER("inventory_test_enabled_plugin", inventory_test_enabled_plugin_init, inventory_test_enabled_plugin_fini) \ No newline at end of file diff --git a/test/zdtm.py b/test/zdtm.py index 6b2132cc3..37ebe63b7 100755 
--- a/test/zdtm.py +++ b/test/zdtm.py @@ -2877,7 +2877,7 @@ def get_cli_args(): rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") rp.add_argument("--criu-plugin", help="Run tests with CRIU plugin", - choices=['amdgpu', 'cuda'], + choices=['amdgpu', 'cuda', 'inventory_test_enabled', 'inventory_test_disabled'], nargs='+', default=None) rp.add_argument("--mocked-cuda-checkpoint", From c49eb18f9f00d18162684f840a1bed4dce9c1d13 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Wed, 9 Oct 2024 09:50:28 +0100 Subject: [PATCH 015/257] pidfd: block SIGCHLD during tmp process creation This patch blocks SIGCHLD during temporary process creation to prevent a race condition between kill() and waitpid() where sigchld_handler() causes `criu restore` to fail with an error. Fixes: #2490 Signed-off-by: Bhavik Sachdev Signed-off-by: Radostin Stoyanov --- criu/pidfd.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/criu/pidfd.c b/criu/pidfd.c index fdf5dec60..3ea3c9309 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -145,6 +145,20 @@ static int create_tmp_process(void) static int free_dead_pidfd(struct dead_pidfd *dead) { int status; + sigset_t blockmask, oldmask; + + /* + * Block SIGCHLD to prevent interference from sigchld_handler() + * and to properly handle the tmp process termination without + * a race condition. A similar approach is used in cr_system().
+ */ + sigemptyset(&oldmask); + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { + pr_perror("Cannot set mask of blocked signals"); + goto err; + } if (kill(dead->pid, SIGKILL) < 0) { pr_perror("Could not kill temporary process with pid: %d", @@ -158,6 +172,12 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } + /* Restore the original signal mask after tmp process has terminated */ + if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { + pr_perror("Cannot clear blocked signals"); + goto err; + } + if (!WIFSIGNALED(status)) { pr_err("Expected temporary process to be terminated by a signal\n"); goto err; From d8f93e7baccb299e2f056beeab8c110654af9325 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 11:49:50 -0700 Subject: [PATCH 016/257] include: add common header files for riscv64 Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- - rebased - imported a page_size() type fix (authored by Cryolitia PukNgae) Signed-off-by: PukNgae Cryolitia Signed-off-by: Alexander Mikhalitsyn --- include/common/arch/riscv64/asm/atomic.h | 109 ++++++++++++++++++ include/common/arch/riscv64/asm/bitops.h | 50 ++++++++ include/common/arch/riscv64/asm/bitsperlong.h | 6 + include/common/arch/riscv64/asm/linkage.h | 23 ++++ include/common/arch/riscv64/asm/page.h | 44 +++++++ 5 files changed, 232 insertions(+) create mode 100644 include/common/arch/riscv64/asm/atomic.h create mode 100644 include/common/arch/riscv64/asm/bitops.h create mode 100644 include/common/arch/riscv64/asm/bitsperlong.h create mode 100644 include/common/arch/riscv64/asm/linkage.h create mode 100644 include/common/arch/riscv64/asm/page.h diff --git a/include/common/arch/riscv64/asm/atomic.h b/include/common/arch/riscv64/asm/atomic.h new file mode 100644 index 000000000..4b08bd9fd --- /dev/null +++ b/include/common/arch/riscv64/asm/atomic.h @@ -0,0 +1,109 @@ +#ifndef __CR_ATOMIC_H__ +#define 
__CR_ATOMIC_H__ + +typedef struct { + int counter; +} atomic_t; + +/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ + +#define nop() __asm__ __volatile__("nop") + +#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") + +/* These barriers need to enforce ordering on both devices or memory. */ +#define mb() RISCV_FENCE(iorw, iorw) +#define rmb() RISCV_FENCE(ir, ir) +#define wmb() RISCV_FENCE(ow, ow) + +/* These barriers do not need to enforce ordering on devices, just memory. */ +#define __smp_mb() RISCV_FENCE(rw, rw) +#define __smp_rmb() RISCV_FENCE(r, r) +#define __smp_wmb() RISCV_FENCE(w, w) + +#define __smp_store_release(p, v) \ + do { \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(rw, w); \ + WRITE_ONCE(*p, v); \ + } while (0) + +#define __smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(r, rw); \ + ___p1; \ + }) + +/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + int result; + + asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(v->counter), "=r"(result) : "r"(i) : "memory"); + __smp_mb(); + return result + i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +static inline int atomic_inc(atomic_t *v) +{ + return atomic_add_return(1, v) - 1; +} + +static inline int atomic_add(int val, atomic_t *v) +{ + return atomic_add_return(val, v) - val; +} + +static inline int atomic_dec(atomic_t *v) +{ + return atomic_sub_return(1, v) + 1; +} + +/* true if the result is 0, or false for all other cases. 
*/ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + unsigned long tmp; + int oldval; + + __smp_mb(); + + asm volatile("1:\n" + " lr.w %1, %2\n" + " bne %1, %3, 2f\n" + " sc.w %0, %4, %2\n" + " bnez %0, 1b\n" + "2:" + : "=&r"(tmp), "=&r"(oldval), "+A"(ptr->counter) + : "r"(old), "r"(new) + : "memory"); + + __smp_mb(); + return oldval; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/riscv64/asm/bitops.h b/include/common/arch/riscv64/asm/bitops.h new file mode 100644 index 000000000..400cc3e15 --- /dev/null +++ b/include/common/arch/riscv64/asm/bitops.h @@ -0,0 +1,50 @@ +#ifndef __CR_ASM_BITOPS_H__ +#define __CR_ASM_BITOPS_H__ + +#include "common/compiler.h" +#include "common/asm-generic/bitops.h" + +#define BITS_PER_LONG 64 + +#define BIT_MASK(nr) ((1##UL) << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) + +#define __AMO(op) "amo" #op ".d" + +#define __test_and_op_bit_ord(op, mod, nr, addr, ord) \ + ({ \ + unsigned long __res, __mask; \ + __mask = BIT_MASK(nr); \ + __asm__ __volatile__(__AMO(op) #ord " %0, %2, %1" \ + : "=r"(__res), "+A"(addr[BIT_WORD(nr)]) \ + : "r"(mod(__mask)) \ + : "memory"); \ + ((__res & __mask) != 0); \ + }) + +#define __op_bit_ord(op, mod, nr, addr, ord) \ + __asm__ __volatile__(__AMO(op) #ord " zero, %1, %0" \ + : "+A"(addr[BIT_WORD(nr)]) \ + : "r"(mod(BIT_MASK(nr))) \ + : "memory"); + +#define __test_and_op_bit(op, mod, nr, addr) __test_and_op_bit_ord(op, mod, nr, addr, .aqrl) +#define __op_bit(op, mod, nr, addr) __op_bit_ord(op, mod, nr, addr, ) + +/* Bitmask modifiers */ +#define __NOP(x) (x) +#define __NOT(x) (~(x)) + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation may be reordered on other architectures 
than x86. + */ +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +{ + return __test_and_op_bit(or, __NOP, nr, addr); +} + +#endif /* __CR_ASM_BITOPS_H__ */ diff --git a/include/common/arch/riscv64/asm/bitsperlong.h b/include/common/arch/riscv64/asm/bitsperlong.h new file mode 100644 index 000000000..d95727d19 --- /dev/null +++ b/include/common/arch/riscv64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 64 + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/riscv64/asm/linkage.h b/include/common/arch/riscv64/asm/linkage.h new file mode 100644 index 000000000..c6d40f750 --- /dev/null +++ b/include/common/arch/riscv64/asm/linkage.h @@ -0,0 +1,23 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#ifdef __ASSEMBLY__ + +#define __ALIGN .align 4, 0x00 +#define __ALIGN_STR ".align 4, 0x00" + +#define GLOBAL(name) \ + .globl name; \ +name: + +#define ENTRY(name) \ + .globl name; \ + .type name, @function; \ + __ALIGN; \ +name: + +#define END(sym) .size sym, .- sym + +#endif /* __ASSEMBLY__ */ + +#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/riscv64/asm/page.h b/include/common/arch/riscv64/asm/page.h new file mode 100644 index 000000000..5113cb6db --- /dev/null +++ b/include/common/arch/riscv64/asm/page.h @@ -0,0 +1,44 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include <string.h> /* ffsl() */ +#include <unistd.h> /* _SC_PAGESIZE */ + +extern unsigned __page_size; +extern unsigned __page_shift; + +static inline unsigned page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +/* + * Don't add ifdefs for PAGE_SIZE: if any header defines it as a constant + * on aarch64, then we need to refrain from using PAGE_SIZE in criu
and use + * page_size() across sources (as it may differ on aarch64). + */ +#define PAGE_SIZE page_size() +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#define PAGE_SHIFT page_shift() + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) + +#else /* CR_NOGLIBC */ + +extern unsigned long page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ +#endif /* __CR_ASM_PAGE_H__ */ From 95359a62aa4dfb613d2a2cf8f7491b3ec766d348 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 11:58:26 -0700 Subject: [PATCH 017/257] compel: add riscv64 support Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- - rebased - added a membarrier() to syscall table (fix authored by Cryolitia PukNgae) Signed-off-by: PukNgae Cryolitia Signed-off-by: Alexander Mikhalitsyn --- Makefile | 6 +- compel/Makefile | 4 +- .../riscv64/plugins/include/asm/prologue.h | 35 +++ .../plugins/include/asm/syscall-types.h | 28 +++ .../arch/riscv64/plugins/include/features.h | 4 + .../arch/riscv64/plugins/std/parasite-head.S | 7 + .../plugins/std/syscalls/Makefile.syscalls | 59 +++++ .../plugins/std/syscalls/gen-sys-exec-tbl.pl | 43 ++++ .../plugins/std/syscalls/gen-syscalls.pl | 99 ++++++++ .../plugins/std/syscalls/syscall-aux.S | 37 +++ .../plugins/std/syscalls/syscall-aux.h | 3 + .../plugins/std/syscalls/syscall-common.S | 17 ++ .../riscv64/plugins/std/syscalls/syscall.def | 125 ++++++++++ .../riscv64/plugins/std/syscalls/syscalls.S | 112 +++++++++ compel/arch/riscv64/scripts/compel-pack.lds.S | 32 +++ compel/arch/riscv64/src/lib/cpu.c | 78 ++++++ compel/arch/riscv64/src/lib/handle-elf-host.c | 1 + compel/arch/riscv64/src/lib/handle-elf.c | 32 +++ compel/arch/riscv64/src/lib/include/cpu.h | 0 .../arch/riscv64/src/lib/include/handle-elf.h | 12 + compel/arch/riscv64/src/lib/include/syscall.h | 8 + .../src/lib/include/uapi/asm/breakpoints.h | 15 ++ .../riscv64/src/lib/include/uapi/asm/cpu.h | 7 + .../riscv64/src/lib/include/uapi/asm/fpu.h | 4 + 
.../src/lib/include/uapi/asm/infect-types.h | 52 ++++ .../include/uapi/asm/instruction_formats.h | 26 ++ .../lib/include/uapi/asm/processor-flags.h | 4 + .../src/lib/include/uapi/asm/sigframe.h | 68 ++++++ compel/arch/riscv64/src/lib/infect.c | 222 ++++++++++++++++++ compel/src/main.c | 3 + scripts/nmk/scripts/include.mk | 1 + 31 files changed, 1141 insertions(+), 3 deletions(-) create mode 100644 compel/arch/riscv64/plugins/include/asm/prologue.h create mode 100644 compel/arch/riscv64/plugins/include/asm/syscall-types.h create mode 100644 compel/arch/riscv64/plugins/include/features.h create mode 100644 compel/arch/riscv64/plugins/std/parasite-head.S create mode 100644 compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls create mode 100755 compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl create mode 100755 compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall-common.S create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall.def create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscalls.S create mode 100644 compel/arch/riscv64/scripts/compel-pack.lds.S create mode 100644 compel/arch/riscv64/src/lib/cpu.c create mode 120000 compel/arch/riscv64/src/lib/handle-elf-host.c create mode 100644 compel/arch/riscv64/src/lib/handle-elf.c create mode 100644 compel/arch/riscv64/src/lib/include/cpu.h create mode 100644 compel/arch/riscv64/src/lib/include/handle-elf.h create mode 100644 compel/arch/riscv64/src/lib/include/syscall.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h 
create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h create mode 100644 compel/arch/riscv64/src/lib/infect.c diff --git a/Makefile b/Makefile index 46d9adef3..60b78a074 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endif # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64 riscv64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif @@ -84,6 +84,10 @@ ifeq ($(ARCH),loongarch64) DEFINES := -DCONFIG_LOONGARCH64 endif +ifeq ($(ARCH),riscv64) + DEFINES := -DCONFIG_RISCV64 +endif + # # CFLAGS_PIE: # diff --git a/compel/Makefile b/compel/Makefile index 78ec4826a..c0b8a82a0 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -32,8 +32,8 @@ ifeq ($(ARCH),x86) lib-y += arch/$(ARCH)/src/lib/thread_area.o endif -# handle_elf() has no support of ELF relocations on ARM (yet?) -ifneq ($(filter arm aarch64 loongarch64,$(ARCH)),) +# handle_elf() has no support of ELF relocations on ARM and RISCV64 (yet?) 
+ifneq ($(filter arm aarch64 loongarch64 riscv64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/riscv64/plugins/include/asm/prologue.h b/compel/arch/riscv64/plugins/include/asm/prologue.h new file mode 100644 index 000000000..5c22b7b06 --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME: This should rather be taken from the sigframe header. + */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/asm/syscall-types.h b/compel/arch/riscv64/plugins/include/asm/syscall-types.h new file mode 100644 index 000000000..b9740a9ee --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/syscall-types.h @@ -0,0 +1,28 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000 + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 // number of signals +#define _NSIG_BPW 64 // number of signals per word + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; +
+#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/features.h b/compel/arch/riscv64/plugins/include/features.h new file mode 100644 index 000000000..274cee52a --- /dev/null +++ b/compel/arch/riscv64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/parasite-head.S b/compel/arch/riscv64/plugins/std/parasite-head.S new file mode 100644 index 000000000..3e9d272e3 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/parasite-head.S @@ -0,0 +1,7 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + jal parasite_service + ebreak +END(__export_parasite_head_start) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 000000000..5af35bcb4 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,59 @@ +ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ +asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ + +sys-types := $(obj)/include/uapi/std/syscall-types.h +sys-codes := $(obj)/include/uapi/std/syscall-codes.h +sys-proto := $(obj)/include/uapi/std/syscall.h + +sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall.def +sys-asm-common-name := std/syscalls/syscall-common.S +sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c + +sys-gen := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-syscalls.pl +sys-gen-tbl := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-sys-exec-tbl.pl + +sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S +std-lib-y += $(sys-asm:.S=).o + +ifeq ($(ARCH),arm) +arch_bits := 32 +else +arch_bits := 
64 +endif + +sys-exec-tbl := sys-exec-tbl.c + +$(sys-asm) $(sys-types) $(sys-codes) $(sys-proto): $(sys-gen) $(sys-def) $(sys-asm-common) $(sys-asm-types) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen) \ + $(sys-def) \ + $(sys-codes) \ + $(sys-proto) \ + $(sys-asm) \ + $(sys-asm-common-name) \ + $(sys-types) \ + $(arch_bits) + +$(sys-asm:.S=).o: $(sys-asm) + +$(sys-exec-tbl): $(sys-gen-tbl) $(sys-def) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen-tbl) \ + $(sys-def) \ + $(sys-exec-tbl) \ + $(arch_bits) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.S $(obj)/include/uapi/std/syscall-aux.S + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.h $(obj)/include/uapi/std/syscall-aux.h + +std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) $(sys-codes) +mrproper-y += $(std-headers-deps) +mrproper-y += $(obj)/include/uapi/std/syscall-aux.S +mrproper-y += $(obj)/include/uapi/std/syscall-aux.h \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl new file mode 100755 index 000000000..61a807eb6 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $tblout = $ARGV[1]; +my $bits = $ARGV[2]; + +my $code = "code$bits"; + +open TBLOUT, ">", $tblout or die $!; +open IN, "<", $in or die $!; + +print TBLOUT "/* Autogenerated, don't edit */\n"; +print TBLOUT "static struct syscall_exec_desc sc_exec_table[] = {\n"; + +for (<IN>) { + if ($_ =~ /\#/) { + next; + } + + my $sys_name; + my $sys_num; + + if (/(?<name>\S+)\s+(?<alias>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) { + $sys_name = $+{alias}; + } elsif
(/(?<name>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) { + $sys_name = $+{name}; + } else { + unlink $tblout; + die "Invalid syscall definition file: invalid entry $_\n"; + } + + $sys_num = $+{$code}; + + if ($sys_num ne "!") { + print TBLOUT "SYSCALL($sys_name, $sys_num)\n"; + } +} + +print TBLOUT " { }, /* terminator */"; +print TBLOUT "};" \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl new file mode 100755 index 000000000..a53f1962f --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $codesout = $ARGV[1]; +my $codes = $ARGV[1]; +$codes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $protosout = $ARGV[2]; +my $protos = $ARGV[2]; +$protos =~ s/.*include\/uapi\//compel\/plugins\//g; +my $asmout = $ARGV[3]; +my $asmcommon = $ARGV[4]; +my $prototypes = $ARGV[5]; +$prototypes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $bits = $ARGV[6]; + +my $codesdef = $codes; +$codesdef =~ tr/.\-\//_/; +my $protosdef = $protos; +$protosdef =~ tr/.\-\//_/; +my $code = "code$bits"; +my $need_aux = 0; + +unlink $codesout; +unlink $protosout; +unlink $asmout; + +open CODESOUT, ">", $codesout or die $!; +open PROTOSOUT, ">", $protosout or die $!; +open ASMOUT, ">", $asmout or die $!; +open IN, "<", $in or die $!; + +print CODESOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $codesdef +#define $codesdef +END + +print PROTOSOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $protosdef +#define $protosdef +#include <$prototypes> +#include <$codes> +END + +print ASMOUT <<"END"; +/* Autogenerated, don't edit */ +#include <$codes> +#include "$asmcommon" +END + + +for (<IN>) { + if ($_ =~ /\#/) { + next; + } + + my $code_macro; + my $sys_macro; + my $sys_name; + + if (/(?<name>\S+)\s+(?<alias>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) { + $code_macro = "__NR_$+{name}"; +
$sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{alias}"; + } elsif (/(?<name>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) { + $code_macro = "__NR_$+{name}"; + $sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{name}"; + } else { + unlink $codesout; + unlink $protosout; + unlink $asmout; + + die "Invalid syscall definition file: invalid entry $_\n"; + } + + if ($+{$code} ne "!") { + print CODESOUT "#ifndef $code_macro\n#define $code_macro $+{$code}\n#endif\n"; + print CODESOUT "#ifndef $sys_macro\n#define $sys_macro $code_macro\n#endif\n"; + print ASMOUT "syscall $sys_name, $code_macro\n"; + + } else { + $need_aux = 1; + } + + print PROTOSOUT "extern long $sys_name($+{args});\n"; +} + +if ($need_aux == 1) { + print ASMOUT "#include <compel/plugins/std/syscall-aux.S>\n"; + print CODESOUT "#include <compel/plugins/std/syscall-aux.h>\n"; +} + +print CODESOUT "#endif /* $codesdef */"; +print PROTOSOUT "#endif /* $protosdef */"; \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S new file mode 100644 index 000000000..04160b7ac --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S @@ -0,0 +1,37 @@ +/** + * This source contains emulation of syscalls + * that are not implemented in the riscv64 Linux kernel + */ + +ENTRY(sys_open) + add a3, x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_openat +END(sys_open) + + +ENTRY(sys_mkdir) + add a3,x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_mkdirat +END(sys_mkdir) + + +ENTRY(sys_rmdir) + addi a2, x0, 0x200 // flags = AT_REMOVEDIR + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_rmdir) + + +ENTRY(sys_unlink) + addi a2, x0, 0 // flags = 0 + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_unlink) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h new file mode 100644 index 000000000..881765bbb --- /dev/null +++
b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h @@ -0,0 +1,3 @@ +#ifndef __NR_openat +#define __NR_openat 56 +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S new file mode 100644 index 000000000..fdef3b47a --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S @@ -0,0 +1,17 @@ +#include "common/asm/linkage.h" + +syscall_common: + ecall + ret + +.macro syscall name, nr + ENTRY(\name) + li a7, \nr + j syscall_common + END(\name) +.endm + +ENTRY(__cr_restore_rt) + li a7, __NR_rt_sigreturn + ecall +END(__cr_restore_rt) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall.def b/compel/arch/riscv64/plugins/std/syscalls/syscall.def new file mode 100644 index 000000000..17f763e90 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall.def @@ -0,0 +1,125 @@ +# +# System calls table, please make sure the table consists of only the syscalls +# really used somewhere in the project. +# +# The template is (name and arguments are optional if you need only __NR_x +# defined, but no real entry point in syscalls lib). +# +# name/alias code64 code32 arguments +# ----------------------------------------------------------------------- +# +read 63 3 (int fd, void *buf, unsigned long count) +write 64 4 (int fd, const void *buf, unsigned long count) +open ! 5 (const char *filename, unsigned long flags, unsigned long mode) +close 57 6 (int fd) +lseek 62 19 (int fd, unsigned long offset, unsigned long origin) +mmap 222 ! 
(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot) +munmap 215 91 (void *addr, unsigned long len) +brk 214 45 (void *addr) +rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +rt_sigreturn 139 173 (void) +ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg) +pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos) +ptrace 117 26 (long request, pid_t pid, void *addr, void *data) +mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr) +mincore 232 219 (void *addr, unsigned long size, unsigned char *vec) +madvise 233 220 (unsigned long start, size_t len, int behavior) +shmat 196 305 (int shmid, void *shmaddr, int shmflag) +pause 1061 29 (void) +nanosleep 101 162 (struct timespec *req, struct timespec *rem) +getitimer 102 105 (int which, const struct itimerval *val) +setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old) +getpid 172 20 (void) +socket 198 281 (int domain, int type, int protocol) +connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen) +sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) +recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) +shutdown 210 293 (int sockfd, int how) +bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +getsockopt 209 295 (int sockfd, int level, 
int optname, const void *optval, socklen_t *optlen) +clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +exit 93 1 (unsigned long error_code) +wait4 260 114 (int pid, int *status, int options, struct rusage *ru) +waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +kill 129 37 (long pid, int sig) +fcntl 25 55 (int fd, int type, long arg) +flock 32 143 (int fd, unsigned long cmd) +mkdir ! 39 (const char *name, int mode) +rmdir ! 40 (const char *name) +unlink ! 10 (char *pathname) +readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize) +umask 166 60 (int mask) +getgroups 158 205 (int gsize, unsigned int *groups) +setgroups 159 206 (int gsize, unsigned int *groups) +setresuid 147 164 (int uid, int euid, int suid) +getresuid 148 165 (int *uid, int *euid, int *suid) +setresgid 149 170 (int gid, int egid, int sgid) +getresgid 150 171 (int *gid, int *egid, int *sgid) +getpgid 155 132 (pid_t pid) +setfsuid 151 138 (int fsuid) +setfsgid 152 139 (int fsgid) +getsid 156 147 (void) +capget 90 184 (struct cap_header *h, struct cap_data *d) +capset 91 185 (struct cap_header *h, struct cap_data *d) +rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info) +setpriority 140 97 (int which, int who, int nice) +sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p) +sigaltstack 132 186 (const void *uss, void *uoss) +personality 92 136 (unsigned int personality) +prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +arch_prctl ! 
17 (int option, unsigned long addr) +setrlimit 164 75 (int resource, struct krlimit *rlim) +mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +umount2 39 52 (char *name, int flags) +gettid 178 224 (void) +futex 98 240 (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +set_tid_address 96 256 (int *tid_addr) +restart_syscall 128 0 (void) +timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) +timer_getoverrun 109 260 (int timer_id) +timer_delete 111 261 (kernel_timer_t timer_id) +clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +exit_group 94 248 (int error_code) +set_robust_list 99 338 (struct robust_list_head *head, size_t len) +get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags) +fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags) +setns 268 375 (int fd, int nstype) +kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode) +mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode) +unlinkat 35 328 (int 
dirfd, const char *pathname, int flags) +memfd_create 279 385 (const char *name, unsigned int flags) +io_setup 0 243 (unsigned nr_events, aio_context_t *ctx) +io_submit 2 246 (aio_context_t ctx_id, long nr, struct iocb **iocbpp) +io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) +gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) +preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +userfaultfd 282 388 (int flags) +fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) +cacheflush ! 983042 (void *start, void *end, int flags) +ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +fsopen 430 430 (char *fsname, unsigned int flags) +fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) +fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) +clone3 435 435 (struct clone_args *uargs, size_t size) +pidfd_open 434 434 (pid_t pid, unsigned int flags) +pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) +rseq 293 293 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +move_mount 429 429 (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +open_tree 428 428 (int dirfd, const char *pathname, unsigned int flags) +openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) +membarrier 283 283 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscalls.S b/compel/arch/riscv64/plugins/std/syscalls/syscalls.S new file mode 100644 index 000000000..715da4612 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscalls.S @@ -0,0 +1,112 @@ +/* Autogenerated, don't edit */ +#include +#include "std/syscalls/syscall-common.S" +syscall 
sys_read, __NR_read +syscall sys_write, __NR_write +syscall sys_close, __NR_close +syscall sys_lseek, __NR_lseek +syscall sys_mmap, __NR_mmap +syscall sys_mprotect, __NR_mprotect +syscall sys_munmap, __NR_munmap +syscall sys_brk, __NR_brk +syscall sys_sigaction, __NR_rt_sigaction +syscall sys_sigprocmask, __NR_rt_sigprocmask +syscall sys_rt_sigreturn, __NR_rt_sigreturn +syscall sys_ioctl, __NR_ioctl +syscall sys_pread64, __NR_pread64 +syscall sys_ptrace, __NR_ptrace +syscall sys_mremap, __NR_mremap +syscall sys_mincore, __NR_mincore +syscall sys_madvise, __NR_madvise +syscall sys_shmat, __NR_shmat +syscall sys_pause, __NR_pause +syscall sys_nanosleep, __NR_nanosleep +syscall sys_getitimer, __NR_getitimer +syscall sys_setitimer, __NR_setitimer +syscall sys_getpid, __NR_getpid +syscall sys_socket, __NR_socket +syscall sys_connect, __NR_connect +syscall sys_sendto, __NR_sendto +syscall sys_recvfrom, __NR_recvfrom +syscall sys_sendmsg, __NR_sendmsg +syscall sys_recvmsg, __NR_recvmsg +syscall sys_shutdown, __NR_shutdown +syscall sys_bind, __NR_bind +syscall sys_setsockopt, __NR_setsockopt +syscall sys_getsockopt, __NR_getsockopt +syscall sys_clone, __NR_clone +syscall sys_exit, __NR_exit +syscall sys_wait4, __NR_wait4 +syscall sys_waitid, __NR_waitid +syscall sys_kill, __NR_kill +syscall sys_fcntl, __NR_fcntl +syscall sys_flock, __NR_flock +syscall sys_readlinkat, __NR_readlinkat +syscall sys_umask, __NR_umask +syscall sys_getgroups, __NR_getgroups +syscall sys_setgroups, __NR_setgroups +syscall sys_setresuid, __NR_setresuid +syscall sys_getresuid, __NR_getresuid +syscall sys_setresgid, __NR_setresgid +syscall sys_getresgid, __NR_getresgid +syscall sys_getpgid, __NR_getpgid +syscall sys_setfsuid, __NR_setfsuid +syscall sys_setfsgid, __NR_setfsgid +syscall sys_getsid, __NR_getsid +syscall sys_capget, __NR_capget +syscall sys_capset, __NR_capset +syscall sys_rt_sigqueueinfo, __NR_rt_sigqueueinfo +syscall sys_setpriority, __NR_setpriority +syscall sys_sched_setscheduler, 
__NR_sched_setscheduler +syscall sys_sigaltstack, __NR_sigaltstack +syscall sys_personality, __NR_personality +syscall sys_prctl, __NR_prctl +syscall sys_setrlimit, __NR_setrlimit +syscall sys_mount, __NR_mount +syscall sys_umount2, __NR_umount2 +syscall sys_gettid, __NR_gettid +syscall sys_futex, __NR_futex +syscall sys_set_tid_address, __NR_set_tid_address +syscall sys_restart_syscall, __NR_restart_syscall +syscall sys_timer_create, __NR_timer_create +syscall sys_timer_settime, __NR_timer_settime +syscall sys_timer_gettime, __NR_timer_gettime +syscall sys_timer_getoverrun, __NR_timer_getoverrun +syscall sys_timer_delete, __NR_timer_delete +syscall sys_clock_gettime, __NR_clock_gettime +syscall sys_exit_group, __NR_exit_group +syscall sys_set_robust_list, __NR_set_robust_list +syscall sys_get_robust_list, __NR_get_robust_list +syscall sys_signalfd4, __NR_signalfd4 +syscall sys_rt_tgsigqueueinfo, __NR_rt_tgsigqueueinfo +syscall sys_vmsplice, __NR_vmsplice +syscall sys_timerfd_settime, __NR_timerfd_settime +syscall sys_fanotify_init, __NR_fanotify_init +syscall sys_fanotify_mark, __NR_fanotify_mark +syscall sys_open_by_handle_at, __NR_open_by_handle_at +syscall sys_setns, __NR_setns +syscall sys_kcmp, __NR_kcmp +syscall sys_openat, __NR_openat +syscall sys_mkdirat, __NR_mkdirat +syscall sys_unlinkat, __NR_unlinkat +syscall sys_memfd_create, __NR_memfd_create +syscall sys_io_setup, __NR_io_setup +syscall sys_io_submit, __NR_io_submit +syscall sys_io_getevents, __NR_io_getevents +syscall sys_seccomp, __NR_seccomp +syscall sys_gettimeofday, __NR_gettimeofday +syscall sys_preadv_raw, __NR_preadv_raw +syscall sys_userfaultfd, __NR_userfaultfd +syscall sys_fallocate, __NR_fallocate +syscall sys_ppoll, __NR_ppoll +syscall sys_fsopen, __NR_fsopen +syscall sys_fsconfig, __NR_fsconfig +syscall sys_fsmount, __NR_fsmount +syscall sys_clone3, __NR_clone3 +syscall sys_pidfd_open, __NR_pidfd_open +syscall sys_pidfd_getfd, __NR_pidfd_getfd +syscall sys_rseq, __NR_rseq +syscall 
sys_move_mount, __NR_move_mount +syscall sys_open_tree, __NR_open_tree +syscall sys_openat2, __NR_openat2 +#include diff --git a/compel/arch/riscv64/scripts/compel-pack.lds.S b/compel/arch/riscv64/scripts/compel-pack.lds.S new file mode 100644 index 000000000..a61235b44 --- /dev/null +++ b/compel/arch/riscv64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(riscv) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/cpu.c b/compel/arch/riscv64/src/lib/cpu.c new file mode 100644 index 000000000..9a0291f70 --- /dev/null +++ b/compel/arch/riscv64/src/lib/cpu.c @@ -0,0 +1,78 @@ +#include +#include + +#include "compel-cpu.h" + +#include "common/bitops.h" + +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; + +static void fetch_rt_cpuinfo(void) +{ + static bool rt_info_done = false; + + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } +} + +void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_cpuid(compel_cpuinfo_t *info) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_cpu_cap(&rt_info, feature); +} + +bool compel_fpu_has_feature(unsigned int 
feature) +{ + fetch_rt_cpuinfo(); + return compel_test_fpu_cap(&rt_info, feature); +} + +uint32_t compel_fpu_feature_size(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +uint32_t compel_fpu_feature_offset(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +void compel_cpu_clear_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_clear_cpu_cap(&rt_info, feature); +} + +void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) +{ + fetch_rt_cpuinfo(); + memcpy(c, &rt_info, sizeof(rt_info)); +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf-host.c b/compel/arch/riscv64/src/lib/handle-elf-host.c new file mode 120000 index 000000000..fe4611886 --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf.c b/compel/arch/riscv64/src/lib/handle-elf.c new file mode 100644 index 000000000..22420bc78 --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf.c @@ -0,0 +1,32 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) +{ + const unsigned char *elf_ident = +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + elf_ident_64_le; +#else + elf_ident_64_be; +#endif + + if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) + return handle_elf_riscv64(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/cpu.h 
b/compel/arch/riscv64/src/lib/include/cpu.h new file mode 100644 index 000000000..e69de29bb diff --git a/compel/arch/riscv64/src/lib/include/handle-elf.h b/compel/arch/riscv64/src/lib/include/handle-elf.h new file mode 100644 index 000000000..582770583 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/handle-elf.h @@ -0,0 +1,12 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define __handle_elf handle_elf_riscv64 +#define ELF_RISCV +#define arch_is_machine_supported(e_machine) (e_machine == EM_RISCV) + +extern int handle_elf_riscv64(void *mem, size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/syscall.h b/compel/arch/riscv64/src/lib/include/syscall.h new file mode 100644 index 000000000..53f10525d --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..f2ba799cb --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 000000000..ac58567e3 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,7 @@ +#ifndef UAPI_COMPEL_ASM_CPU_H__ +#define UAPI_COMPEL_ASM_CPU_H__ + 
+typedef struct { +} compel_cpuinfo_t; + +#endif /* UAPI_COMPEL_ASM_CPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 000000000..a74decc23 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..192810cac --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,52 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/riscv/include/uapi/asm/ptrace.h + * + * A thread RISC-V CPU context + */ +typedef struct user_regs_struct user_regs_struct_t; +typedef struct __riscv_d_ext_state user_fpregs_struct_t; + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(registers) ((uint64_t)(registers).a0) +#define REG_IP(registers) ((uint64_t)(registers).pc) +#define SET_REG_IP(registers, val) ((registers).pc = (val)) + +/* + * REG_SP is also defined in riscv64-linux-gnu/include/sys/ucontext.h + * with a different meaning, and it's not used in CRIU. So we have to + * undefine it here. 
+ */ +#ifdef REG_SP +#undef REG_SP +#endif + +#define REG_SP(registers) ((uint64_t)((registers).sp)) + +#define REG_SYSCALL_NR(registers) ((uint64_t)(registers).a7) + +#define user_regs_native(pregs) true + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h new file mode 100644 index 000000000..e231d0465 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h @@ -0,0 +1,26 @@ +#ifndef COMPEL_RELOCATIONS_H__ +#define COMPEL_RELOCATIONS_H__ + +#include + +static inline uint32_t riscv_b_imm(uint32_t val) +{ + return (val & 0x00001000) << 19 | (val & 0x000007e0) << 20 | (val & 0x0000001e) << 7 | (val & 0x00000800) >> 4; +} + +static inline uint32_t riscv_i_imm(uint32_t val) +{ + return val << 20; +} + +static inline uint32_t riscv_u_imm(uint32_t val) +{ + return val & 0xfffff000; +} + +static inline uint32_t riscv_j_imm(uint32_t val) +{ + return (val & 0x00100000) << 11 | (val & 0x000007fe) << 20 | (val & 0x00000800) << 9 | (val & 0x000ff000); +} + +#endif /* COMPEL_RELOCATIONS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h new file mode 100644 index 000000000..e40fb6fce --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h @@ -0,0 +1,4 @@ +#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ +#define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ + +#endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 000000000..761a08f62 --- /dev/null +++ 
b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,68 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include + +#include + +#include + +/* Copied from the kernel header arch/riscv/include/uapi/asm/sigcontext.h */ +/* + * Signal context structure + * + * This contains the context saved before a signal handler is invoked; + * it is restored by sys_sigreturn / sys_rt_sigreturn. + */ +// struct sigcontext { +// struct user_regs_struct sc_regs; +// union __riscv_fp_state sc_fpregs; +// /* +// * 4K + 128 reserved for vector state and future expansion. +// * This space is enough to store the vector context whose VLENB +// * is less or equal to 128. +// * (The size of the vector context is 4144 byte as VLENB is 128) +// */ +// __u8 __reserved[4224] __attribute__((__aligned__(16))); +// }; + +#define rt_sigcontext sigcontext + +#include + +/* Copied from the kernel source arch/riscv/kernel/signal.c */ +struct rt_sigframe { + siginfo_t info; + ucontext_t uc; //ucontext_t structure holds the user context, e.g., the signal mask, GP regs +}; + +/* + generates inline assembly code for triggering the rt_sigreturn system call. + used to return from a signal handler back to the normal execution flow of the process. 
+*/ +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "mv sp, %0\n" \ + "li a7, "__stringify(__NR_rt_sigreturn)" \n" \ + "ecall\n" \ + : \ + : "r"(new_sp) \ + : "a7", "memory") +/* clang-format on */ + +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.__gregs[REG_PC]) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +// #define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) +// #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct sigcontext *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) +// #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) + +#define rt_sigframe_erase_sigset(sigframe) \ + memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) // erase the signal mask +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) // copy the signal mask + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c new file mode 100644 index 000000000..01395a205 --- /dev/null +++ b/compel/arch/riscv64/src/lib/infect.c @@ -0,0 +1,222 @@ +#include +#include +#include +#include +#include +#include +#include "common/page.h" +#include "uapi/compel/asm/infect-types.h" +#include "log.h" +#include "errno.h" +#include "infect.h" +#include "infect-priv.h" + +unsigned __page_size = 0; +unsigned __page_shift = 0; + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x73, 0x00, 0x00, 0x00, /* ecall */ + 0x73, 0x00, 0x10, 0x00 /* ebreak */ +}; + +static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline void __always_unused __check_code_syscall(void) +{ + BUILD_BUG_ON(code_syscall_aligned != 
BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigframe->uc.uc_mcontext.__gregs[0] = regs->pc; + sigframe->uc.uc_mcontext.__gregs[1] = regs->ra; + sigframe->uc.uc_mcontext.__gregs[2] = regs->sp; + sigframe->uc.uc_mcontext.__gregs[3] = regs->gp; + sigframe->uc.uc_mcontext.__gregs[4] = regs->tp; + sigframe->uc.uc_mcontext.__gregs[5] = regs->t0; + sigframe->uc.uc_mcontext.__gregs[6] = regs->t1; + sigframe->uc.uc_mcontext.__gregs[7] = regs->t2; + sigframe->uc.uc_mcontext.__gregs[8] = regs->s0; + sigframe->uc.uc_mcontext.__gregs[9] = regs->s1; + sigframe->uc.uc_mcontext.__gregs[10] = regs->a0; + sigframe->uc.uc_mcontext.__gregs[11] = regs->a1; + sigframe->uc.uc_mcontext.__gregs[12] = regs->a2; + sigframe->uc.uc_mcontext.__gregs[13] = regs->a3; + sigframe->uc.uc_mcontext.__gregs[14] = regs->a4; + sigframe->uc.uc_mcontext.__gregs[15] = regs->a5; + sigframe->uc.uc_mcontext.__gregs[16] = regs->a6; + sigframe->uc.uc_mcontext.__gregs[17] = regs->a7; + sigframe->uc.uc_mcontext.__gregs[18] = regs->s2; + sigframe->uc.uc_mcontext.__gregs[19] = regs->s3; + sigframe->uc.uc_mcontext.__gregs[20] = regs->s4; + sigframe->uc.uc_mcontext.__gregs[21] = regs->s5; + sigframe->uc.uc_mcontext.__gregs[22] = regs->s6; + sigframe->uc.uc_mcontext.__gregs[23] = regs->s7; + sigframe->uc.uc_mcontext.__gregs[24] = regs->s8; + sigframe->uc.uc_mcontext.__gregs[25] = regs->s9; + sigframe->uc.uc_mcontext.__gregs[26] = regs->s10; + sigframe->uc.uc_mcontext.__gregs[27] = regs->s11; + sigframe->uc.uc_mcontext.__gregs[28] = regs->t3; + sigframe->uc.uc_mcontext.__gregs[29] = regs->t4; + sigframe->uc.uc_mcontext.__gregs[30] = regs->t5; + sigframe->uc.uc_mcontext.__gregs[31] = regs->t6; + + memcpy(sigframe->uc.uc_mcontext.__fpregs.__d.__f, fpregs->f, sizeof(fpregs->f)); + sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpregs->fcsr; + + return 0; +} + +int 
sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpsimd = ext_regs ? ext_regs : &tmp; + struct iovec iov; + int ret = -1; + + pr_info("Dumping FPU registers for %d\n", pid); + + iov.iov_base = fpsimd; + iov.iov_len = sizeof(*fpsimd); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + return -1; + } + + ret = save(arg, regs, fpsimd); + return ret; +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.a7 = (unsigned long)nr; + regs.a0 = arg1; + regs.a1 = arg2; + regs.a2 = arg3; + regs.a3 = arg4; + regs.a4 = arg5; + regs.a5 = arg6; + regs.a6 = 0; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.a0; + return err; +} + +/* + * Calling the mmap system call in the context of the target (victim) process using the compel_syscall function. + * Used during the infection process to allocate memory for the parasite code.
+*/ +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0 || (long)map < 0) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->sp = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: Add proper check here. + */ + return true; +} + +/* + * Fetch the signal alternate stack (sigaltstack), + * sas is a separate memory area for the signal handler to run on, + * avoiding potential issues with the main process stack +*/ +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); + return err ? err : ret; +} + +/* + * Task size is the maximum virtual address space size that a process can occupy in the memory + * Refer to linux kernel arch/riscv/include/asm/pgtable.h, + * task size is: + * - 0x9fc00000 (~2.5GB) for RV32. + * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * + * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V + * Instruction Set Manual Volume II: Privileged Architecture" states that + * "load and store effective addresses, which are 64bits, must have bits + * 63–48 all equal to bit 47, or else a page-fault exception will occur." 
+*/ +#define TASK_SIZE 0x800000000000UL // hardcoded for SV48 MMU + +unsigned long compel_task_size(void) +{ + return TASK_SIZE; +} + +/* + * Get task registers (overwrites weak function) + */ +int ptrace_get_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); +} + +/* + * Set task registers (overwrites weak function) + */ +int ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} diff --git a/compel/src/main.c b/compel/src/main.c index bc16c0ab4..21e06d7dd 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -60,6 +60,9 @@ static const flags_t flags = { #elif defined CONFIG_LOONGARCH64 .arch = "loongarch64", .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_RISCV64 + .arch = "riscv64", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif diff --git a/scripts/nmk/scripts/include.mk b/scripts/nmk/scripts/include.mk index 55c5be307..603c322cf 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -21,6 +21,7 @@ ARCH ?= $(shell echo $(SUBARCH) | sed \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ -e s/aarch64.*/aarch64/ \ + -e s/riscv64.*/riscv64/ \ -e s/loongarch64.*/loongarch64/) export SUBARCH ARCH From 1d028ef44e9de6f8ec9c86eb43753f9156edd1f2 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 11:59:13 -0700 Subject: [PATCH 018/257] images: add riscv64 core image Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- images/Makefile | 1 + images/core-riscv64.proto | 53 +++++++++++++++++++++++++++++++++++++++ images/core.proto | 3 +++ 3 files changed, 57 insertions(+) create mode 100644 images/core-riscv64.proto diff --git a/images/Makefile b/images/Makefile index 
855d894da..1e40b8a8f 100644 --- a/images/Makefile +++ b/images/Makefile @@ -7,6 +7,7 @@ proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o proto-obj-y += core-s390.o +proto-obj-y += core-riscv64.o proto-obj-y += cpuinfo.o proto-obj-y += inventory.o proto-obj-y += fdinfo.o diff --git a/images/core-riscv64.proto b/images/core-riscv64.proto new file mode 100644 index 000000000..1ddfdd8bd --- /dev/null +++ b/images/core-riscv64.proto @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; + +// Refer to riscv-gnu-toolchain/linux-headers/include/asm/ptrace.h +message user_riscv64_regs_entry { + required uint64 pc = 1; + required uint64 ra = 2; + required uint64 sp = 3; + required uint64 gp = 4; + required uint64 tp = 5; + required uint64 t0 = 6; + required uint64 t1 = 7; + required uint64 t2 = 8; + required uint64 s0 = 9; + required uint64 s1 = 10; + required uint64 a0 = 11; + required uint64 a1 = 12; + required uint64 a2 = 13; + required uint64 a3 = 14; + required uint64 a4 = 15; + required uint64 a5 = 16; + required uint64 a6 = 17; + required uint64 a7 = 18; + required uint64 s2 = 19; + required uint64 s3 = 20; + required uint64 s4 = 21; + required uint64 s5 = 22; + required uint64 s6 = 23; + required uint64 s7 = 24; + required uint64 s8 = 25; + required uint64 s9 = 26; + required uint64 s10 = 27; + required uint64 s11 = 28; + required uint64 t3 = 29; + required uint64 t4 = 30; + required uint64 t5 = 31; + required uint64 t6 = 32; +} + +message user_riscv64_d_ext_entry { + repeated uint64 f = 1; + required uint32 fcsr = 2; +} + +message thread_info_riscv64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_riscv64_regs_entry gpregs = 3[(criu).hex = true]; + required user_riscv64_d_ext_entry fpsimd = 4; +} diff --git a/images/core.proto b/images/core.proto index 5b07b5c44..1fa23868b 100644 --- a/images/core.proto +++ b/images/core.proto @@ -9,6 
+9,7 @@ import "core-ppc64.proto"; import "core-s390.proto"; import "core-mips.proto"; import "core-loongarch64.proto"; +import "core-riscv64.proto"; import "rlimit.proto"; import "timer.proto"; @@ -126,6 +127,7 @@ message core_entry { S390 = 5; MIPS = 6; LOONGARCH64 = 7; + RISCV64 = 8; } required march mtype = 1; @@ -136,6 +138,7 @@ message core_entry { optional thread_info_s390 ti_s390 = 10; optional thread_info_mips ti_mips = 11; optional thread_info_loongarch64 ti_loongarch64 = 12; + optional thread_info_riscv64 ti_riscv64 = 13; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; From 6d970ed047592a2dacb51d81338ca7e9ecc21005 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 12:06:00 -0700 Subject: [PATCH 019/257] criu: add riscv64 support to parasite and restorer Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- criu/arch/riscv64/Makefile | 8 + criu/arch/riscv64/cpu.c | 40 ++++ criu/arch/riscv64/crtools.c | 171 ++++++++++++++++++ criu/arch/riscv64/include/asm/dump.h | 15 ++ criu/arch/riscv64/include/asm/int.h | 6 + criu/arch/riscv64/include/asm/kerndat.h | 7 + .../riscv64/include/asm/parasite-syscall.h | 6 + criu/arch/riscv64/include/asm/parasite.h | 16 ++ criu/arch/riscv64/include/asm/restore.h | 29 +++ criu/arch/riscv64/include/asm/restorer.h | 150 +++++++++++++++ .../arch/riscv64/include/asm/thread_pointer.h | 27 +++ criu/arch/riscv64/include/asm/types.h | 40 ++++ criu/arch/riscv64/include/asm/vdso.h | 28 +++ criu/arch/riscv64/restorer.c | 14 ++ criu/arch/riscv64/sigframe.c | 8 + criu/arch/riscv64/vdso-lookup.S | 15 ++ criu/arch/riscv64/vdso-pie.c | 159 ++++++++++++++++ criu/pie/Makefile | 8 + criu/pie/Makefile.library | 4 + 19 files changed, 751 insertions(+) create mode 100644 criu/arch/riscv64/Makefile create mode 100644 criu/arch/riscv64/cpu.c create mode 100644 criu/arch/riscv64/crtools.c create mode 100644 criu/arch/riscv64/include/asm/dump.h create mode 100644 
criu/arch/riscv64/include/asm/int.h create mode 100644 criu/arch/riscv64/include/asm/kerndat.h create mode 100644 criu/arch/riscv64/include/asm/parasite-syscall.h create mode 100644 criu/arch/riscv64/include/asm/parasite.h create mode 100644 criu/arch/riscv64/include/asm/restore.h create mode 100644 criu/arch/riscv64/include/asm/restorer.h create mode 100644 criu/arch/riscv64/include/asm/thread_pointer.h create mode 100644 criu/arch/riscv64/include/asm/types.h create mode 100644 criu/arch/riscv64/include/asm/vdso.h create mode 100644 criu/arch/riscv64/restorer.c create mode 100644 criu/arch/riscv64/sigframe.c create mode 100644 criu/arch/riscv64/vdso-lookup.S create mode 100644 criu/arch/riscv64/vdso-pie.c diff --git a/criu/arch/riscv64/Makefile b/criu/arch/riscv64/Makefile new file mode 100644 index 000000000..d19895471 --- /dev/null +++ b/criu/arch/riscv64/Makefile @@ -0,0 +1,8 @@ +builtin-name := crtools.built-in.o + +ldflags-y += -r + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o +obj-y += vdso-lookup.o \ No newline at end of file diff --git a/criu/arch/riscv64/cpu.c b/criu/arch/riscv64/cpu.c new file mode 100644 index 000000000..97a883b8c --- /dev/null +++ b/criu/arch/riscv64/cpu.c @@ -0,0 +1,40 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +#include +#include "cpu.h" + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpu_dump_cpuinfo_single(void) +{ + return -ENOTSUP; +} + +int cpu_validate_image_cpuinfo_single(void) +{ + return -ENOTSUP; +} + +int cpuinfo_dump(void) +{ + return -ENOTSUP; +} + +int cpuinfo_check(void) +{ + return -ENOTSUP; +} diff --git a/criu/arch/riscv64/crtools.c b/criu/arch/riscv64/crtools.c new file mode 100644 index 000000000..b2d6d2951 --- /dev/null +++ b/criu/arch/riscv64/crtools.c @@ -0,0 +1,171 @@ +#include +#include + +#include + +#include "types.h" +#include + +#include +#include "asm/restorer.h" +#include 
"common/compiler.h" +#include +#include "asm/dump.h" +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" +#include "parasite-syscall.h" +#include "log.h" +#include "util.h" +#include "cpu.h" +#include "restorer.h" +#include "compel/infect.h" + +#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +{ + int i; + CoreEntry *core = x; + + // Save riscv64 gprs + assign_reg(core->ti_riscv64->gpregs, regs, pc); + assign_reg(core->ti_riscv64->gpregs, regs, ra); + assign_reg(core->ti_riscv64->gpregs, regs, sp); + assign_reg(core->ti_riscv64->gpregs, regs, gp); + assign_reg(core->ti_riscv64->gpregs, regs, tp); + assign_reg(core->ti_riscv64->gpregs, regs, t0); + assign_reg(core->ti_riscv64->gpregs, regs, t1); + assign_reg(core->ti_riscv64->gpregs, regs, t2); + assign_reg(core->ti_riscv64->gpregs, regs, s0); + assign_reg(core->ti_riscv64->gpregs, regs, s1); + assign_reg(core->ti_riscv64->gpregs, regs, a0); + assign_reg(core->ti_riscv64->gpregs, regs, a1); + assign_reg(core->ti_riscv64->gpregs, regs, a2); + assign_reg(core->ti_riscv64->gpregs, regs, a3); + assign_reg(core->ti_riscv64->gpregs, regs, a4); + assign_reg(core->ti_riscv64->gpregs, regs, a5); + assign_reg(core->ti_riscv64->gpregs, regs, a6); + assign_reg(core->ti_riscv64->gpregs, regs, a7); + assign_reg(core->ti_riscv64->gpregs, regs, s2); + assign_reg(core->ti_riscv64->gpregs, regs, s3); + assign_reg(core->ti_riscv64->gpregs, regs, s4); + assign_reg(core->ti_riscv64->gpregs, regs, s5); + assign_reg(core->ti_riscv64->gpregs, regs, s6); + assign_reg(core->ti_riscv64->gpregs, regs, s7); + assign_reg(core->ti_riscv64->gpregs, regs, s8); + assign_reg(core->ti_riscv64->gpregs, regs, s9); + assign_reg(core->ti_riscv64->gpregs, regs, s10); + assign_reg(core->ti_riscv64->gpregs, regs, s11); + assign_reg(core->ti_riscv64->gpregs, regs, t3); + assign_reg(core->ti_riscv64->gpregs, regs, t4); + 
assign_reg(core->ti_riscv64->gpregs, regs, t5); + assign_reg(core->ti_riscv64->gpregs, regs, t6); + + // Save riscv64 fprs + for (i = 0; i < 32; ++i) + assign_reg(core->ti_riscv64->fpsimd, fpsimd, f[i]); + assign_reg(core->ti_riscv64->fpsimd, fpsimd, fcsr); + + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoRiscv64 *ti_riscv64; + UserRiscv64RegsEntry *gpregs; + UserRiscv64DExtEntry *fpsimd; + + ti_riscv64 = xmalloc(sizeof(*ti_riscv64)); + if (!ti_riscv64) + goto err; + thread_info_riscv64__init(ti_riscv64); + core->ti_riscv64 = ti_riscv64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_riscv64_regs_entry__init(gpregs); + + ti_riscv64->gpregs = gpregs; + + fpsimd = xmalloc(sizeof(*fpsimd)); + if (!fpsimd) + goto err; + user_riscv64_d_ext_entry__init(fpsimd); + ti_riscv64->fpsimd = fpsimd; + fpsimd->f = xmalloc(32 * sizeof(fpsimd->f[0])); + fpsimd->n_f = 32; + if (!fpsimd->f) + goto err; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (core->ti_riscv64) { + if (core->ti_riscv64->fpsimd) { + xfree(core->ti_riscv64->fpsimd->f); + xfree(core->ti_riscv64->fpsimd); + } + xfree(core->ti_riscv64->gpregs); + xfree(core->ti_riscv64); + core->ti_riscv64 = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + int i; + UserRiscv64DExtEntry *fpsimd = core->ti_riscv64->fpsimd; + + if (fpsimd->n_f != 32) + return 1; + + for (i = 0; i < 32; ++i) + sigframe->uc.uc_mcontext.__fpregs.__d.__f[i] = fpsimd->f[i]; + sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpsimd->fcsr; + + return 0; +} + +int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r) +{ + f->uc.uc_mcontext.__gregs[0] = r->pc; + f->uc.uc_mcontext.__gregs[1] = r->ra; + f->uc.uc_mcontext.__gregs[2] = r->sp; + f->uc.uc_mcontext.__gregs[3] = r->gp; + f->uc.uc_mcontext.__gregs[4] = r->tp; + f->uc.uc_mcontext.__gregs[5] = r->t0; + f->uc.uc_mcontext.__gregs[6] = r->t1; + 
f->uc.uc_mcontext.__gregs[7] = r->t2; + f->uc.uc_mcontext.__gregs[8] = r->s0; + f->uc.uc_mcontext.__gregs[9] = r->s1; + f->uc.uc_mcontext.__gregs[10] = r->a0; + f->uc.uc_mcontext.__gregs[11] = r->a1; + f->uc.uc_mcontext.__gregs[12] = r->a2; + f->uc.uc_mcontext.__gregs[13] = r->a3; + f->uc.uc_mcontext.__gregs[14] = r->a4; + f->uc.uc_mcontext.__gregs[15] = r->a5; + f->uc.uc_mcontext.__gregs[16] = r->a6; + f->uc.uc_mcontext.__gregs[17] = r->a7; + f->uc.uc_mcontext.__gregs[18] = r->s2; + f->uc.uc_mcontext.__gregs[19] = r->s3; + f->uc.uc_mcontext.__gregs[20] = r->s4; + f->uc.uc_mcontext.__gregs[21] = r->s5; + f->uc.uc_mcontext.__gregs[22] = r->s6; + f->uc.uc_mcontext.__gregs[23] = r->s7; + f->uc.uc_mcontext.__gregs[24] = r->s8; + f->uc.uc_mcontext.__gregs[25] = r->s9; + f->uc.uc_mcontext.__gregs[26] = r->s10; + f->uc.uc_mcontext.__gregs[27] = r->s11; + f->uc.uc_mcontext.__gregs[28] = r->t3; + f->uc.uc_mcontext.__gregs[29] = r->t4; + f->uc.uc_mcontext.__gregs[30] = r->t5; + f->uc.uc_mcontext.__gregs[31] = r->t6; + + return 0; +} diff --git a/criu/arch/riscv64/include/asm/dump.h b/criu/arch/riscv64/include/asm/dump.h new file mode 100644 index 000000000..c2988f9bf --- /dev/null +++ b/criu/arch/riscv64/include/asm/dump.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_riscv64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/criu/arch/riscv64/include/asm/int.h b/criu/arch/riscv64/include/asm/int.h new file mode 100644 index 000000000..642804e9b --- /dev/null +++ b/criu/arch/riscv64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git 
a/criu/arch/riscv64/include/asm/kerndat.h b/criu/arch/riscv64/include/asm/kerndat.h new file mode 100644 index 000000000..bb70cf6cf --- /dev/null +++ b/criu/arch/riscv64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/riscv64/include/asm/parasite-syscall.h b/criu/arch/riscv64/include/asm/parasite-syscall.h new file mode 100644 index 000000000..6008c3792 --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/riscv64/include/asm/parasite.h b/criu/arch/riscv64/include/asm/parasite.h new file mode 100644 index 000000000..4798cfd8a --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite.h @@ -0,0 +1,16 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +/* + * This function is used to retrieve the value of the thread pointer (tp) + * in RISC-V architecture, which is typically used for thread-local storage (TLS). + * The value is then stored in the provided tls_t pointer. 
+ */ +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm("mv %0, tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/riscv64/include/asm/restore.h b/criu/arch/riscv64/include/asm/restore.h new file mode 100644 index 000000000..e4f25a57b --- /dev/null +++ b/criu/arch/riscv64/include/asm/restore.h @@ -0,0 +1,29 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" + +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ + task_args) \ + asm volatile( \ + "and sp, %0, ~15 \n" \ + "mv a0, %2 \n" \ + "jr %1 \n" \ + : \ + : "r"(new_sp), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "a0", "memory") +/* clang-format on */ + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_riscv64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/riscv64/include/asm/restorer.h b/criu/arch/riscv64/include/asm/restorer.h new file mode 100644 index 000000000..45fe847a9 --- /dev/null +++ b/criu/arch/riscv64/include/asm/restorer.h @@ -0,0 +1,150 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include + +#include "asm/types.h" +#include "images/core.pb-c.h" + +#include + +// kernel arg order for clone +// unsigned long clone_flags, +// unsigned long newsp, +// int __user * parent_tidptr, +// unsigned long tls, +// int __user * child_tidptr +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld a1, %2 \n" \ + "andi a1, a1, ~15 \n" \ + "addi a1, a1, -16 \n" \ + "sd %5, 0(a1) \n" \ + "sd %6, 8(a1) \n" \ + "mv a0, %1 \n" \ + "mv a2, %3 \n" \ + "mv a3, %4 \n" \ + "li a7, "__stringify(__NR_clone)" \n" \ + "ecall \n" \ + \ + "beqz a0, thread_run \n" \ + \ + "mv %0, a0 \n" \ + "j clone_end \n" \ + \ + "thread_run: \n" \ + 
"ld a1, 0(sp) \n" \ + "ld a0, 8(sp) \n" \ + "jr a1 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "m"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "a0", "a1", "a2", "a3", "a7", "memory") + +/* + * Based on sysdeps/unix/sysv/linux/riscv/clone.S + * + * int clone(int (*fn)(void *arg), x0 + * void *child_stack, x1 + * int flags, x2 + * void *arg, x3 + * pid_t *ptid, x4 + * struct user_desc *tls, x5 + * pid_t *ctid); x6 + * + * int clone3(struct clone_args *args, x0 + * size_t size); x1 + * + * Always consult the CLONE3 wrappers for other architectures + * for additional details. + * + */ +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + /* In contrast to the clone() wrapper above this does not put + * the thread function and its arguments on the child stack, + * but uses registers to pass these parameters to the child process. + * Based on the glibc clone() wrapper at + * sysdeps/unix/sysv/linux/riscv/clone.S. + */ \ + "clone3_emul: \n" \ + /* + * Based on the glibc clone() wrapper, which uses x10 and x11 + * to save the arguments for the child process, this does the same. + * x10 for the thread function and x11 for the thread arguments. 
+ */ \ + "mv t0, %3 /* clone_restore_fn */ \n" \ + "mv t1, %4 /* args */ \n" \ + "mv a0, %1 /* &clone_args */ \n" \ + "mv a1, %2 /* size */ \n" \ + /* Load syscall number */ \ + "li a7, "__stringify(__NR_clone3)" \n" \ + /* Do the syscall */ \ + "ecall \n" \ + \ + "beqz a0, clone3_thread_run \n" \ + \ + "mv %0, a0 \n" \ + "j clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + /* Move args to a0 */ \ + "mv a0, t1 \n" \ + /* Jump to clone_restore_fn */ \ + "jr t0 \n" \ + \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "a0", "a1", "a7", "t0", "t1", "memory") + +#define ARCH_FAIL_CORE_RESTORE \ + asm volatile( \ + "mv sp, %0 \n" \ + "li a0, 0 \n" \ + "jr x0 \n" \ + : \ + : "r"(ret) \ + : "sp", "a0", "memory") +/* clang-format on */ + +#define arch_map_vdso(map, compat) -1 + +int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r); +int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r); + +static inline void restore_tls(tls_t *ptls) +{ + asm("mv tp, %0" : : "r"(*ptls)); +} + +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} + +#endif \ No newline at end of file diff --git a/criu/arch/riscv64/include/asm/thread_pointer.h b/criu/arch/riscv64/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/riscv64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/riscv64/include/asm/types.h b/criu/arch/riscv64/include/asm/types.h new file mode 100644 index 000000000..83bb5f65f --- /dev/null +++ b/criu/arch/riscv64/include/asm/types.h @@ -0,0 +1,40 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include +#include +#include "images/core.pb-c.h" + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" + +#include + +#define core_is_compat(core) false + +typedef UserRiscv64RegsEntry UserRegsEntry; + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__RISCV64 + +#define CORE_THREAD_ARCH_INFO(core) core->ti_riscv64 + +#define TI_SP(core) ((core)->ti_riscv64->gpregs->sp) + +#define TI_IP(core) ((core)->ti_riscv64->gpregs->pc) + +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} + +#define AT_VECTOR_SIZE 64 +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/riscv64/include/asm/vdso.h b/criu/arch/riscv64/include/asm/vdso.h new file mode 100644 index 000000000..322149c6e --- /dev/null +++ 
b/criu/arch/riscv64/include/asm/vdso.h @@ -0,0 +1,28 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "common/compiler.h" +#include "asm-generic/vdso.h" + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 6 +#define VDSO_SYMBOL_GTOD 2 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *rv64_vdso_symbol1 = "__vdso_clock_getres"; \ + const char *rv64_vdso_symbol2 = "__vdso_clock_gettime"; \ + const char *rv64_vdso_symbol3 = "__vdso_gettimeofday"; \ + const char *rv64_vdso_symbol4 = "__vdso_getcpu"; \ + const char *rv64_vdso_symbol5 = "__vdso_flush_icache"; \ + const char *rv64_vdso_symbol6 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + rv64_vdso_symbol1, rv64_vdso_symbol2, rv64_vdso_symbol3, rv64_vdso_symbol4, rv64_vdso_symbol5, rv64_vdso_symbol6 + +extern void write_intraprocedure_branch(unsigned long to, unsigned long from); + +#endif /* __CR_ASM_VDSO_H__ */ \ No newline at end of file diff --git a/criu/arch/riscv64/restorer.c b/criu/arch/riscv64/restorer.c new file mode 100644 index 000000000..d605f048d --- /dev/null +++ b/criu/arch/riscv64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" + +#include +#include "log.h" +#include +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r) +{ + return 0; +} diff --git a/criu/arch/riscv64/sigframe.c b/criu/arch/riscv64/sigframe.c new file mode 100644 index 000000000..8096fab66 --- /dev/null +++ b/criu/arch/riscv64/sigframe.c @@ -0,0 +1,8 @@ +#include "asm/types.h" +#include +#include "asm/sigframe.h" + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/riscv64/vdso-lookup.S b/criu/arch/riscv64/vdso-lookup.S new file mode 100644 index 000000000..50d4ecf08 --- /dev/null +++ b/criu/arch/riscv64/vdso-lookup.S @@ -0,0 +1,15 @@ +#include "common/asm/linkage.h" + +.section .text 
+
+/* Expects t0 to hold the index into the lookup table. */
+GLOBAL(riscv_vdso_lookup)
+	/* Get the beginning of the lookup table */
+	la t1, riscv_vdso_lookup_end
+	/* Scale the index */
+	slli t0, t0, 3
+	add t1, t0, t1
+	ld t2, 0(t1)
+	jr t2
+
+GLOBAL(riscv_vdso_lookup_end)
\ No newline at end of file
diff --git a/criu/arch/riscv64/vdso-pie.c b/criu/arch/riscv64/vdso-pie.c
new file mode 100644
index 000000000..aa9272fb5
--- /dev/null
+++ b/criu/arch/riscv64/vdso-pie.c
@@ -0,0 +1,191 @@
+#include
+
+#include "asm/types.h"
+
+#include
+#include
+#include
+#include
+#include "atomic.h"
+#include "parasite-vdso.h"
+#include "log.h"
+#include "common/bug.h"
+
+#ifdef LOG_PREFIX
+#undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+/* These symbols are defined in vdso-lookup.S */
+extern char *riscv_vdso_lookup, *riscv_vdso_lookup_end;
+
+/*
+ * li t0, INDEX
+ * jal x0, riscv_vdso_lookup
+ */
+#define TRAMP_CALL_SIZE (2 * sizeof(uint32_t))
+
+/*
+ * Ask the kernel to flush the instruction cache after vDSO text has been
+ * patched. The syscall result is ignored (best effort).
+ */
+static inline void invalidate_caches(void)
+{
+	// We're supposed to use the VDSO as the officially sanctioned ABI. But oh well.
+	__smp_mb();
+	/*
+	 * a0-a2 and a7 are written below and a0 receives the syscall result, so
+	 * all of them must be listed as clobbers.
+	 */
+	asm volatile("li a0, 0\n"
+		     "li a1, 0\n"
+		     "li a2, 1\n" /* SYS_RISCV_FLUSH_ICACHE_ALL */
+		     "li a7, 259\n" /* __NR_arch_specific_syscall + 15: riscv_flush_icache */
+		     "ecall\n"
+		     :
+		     :
+		     : "a0", "a1", "a2", "a7", "memory");
+}
+
+/* Size in bytes of the lookup stub, i.e. the distance between the two vdso-lookup.S symbols. */
+static inline size_t vdso_trampoline_size(void)
+{
+	return (size_t)&riscv_vdso_lookup_end - (size_t)&riscv_vdso_lookup;
+}
+
+/*
+ * Find a gap after one of the vDSO symbols that is large enough for the
+ * lookup stub plus a VDSO_SYMBOL_MAX-entry address table, and copy the stub
+ * there.
+ *
+ * Returns the address of the copied stub, or 0 if no symbol has enough room
+ * or a symbol is too small to hold the two-instruction trampoline call.
+ */
+static uint64_t put_trampoline(uint64_t at, struct vdso_symtable *sym)
+{
+	int i, j;
+	uint64_t total_size, trampoline_size;
+	uint64_t trampoline = 0;
+
+	/* First of all we have to find a place where to put the trampoline
+	 * code.
+	 */
+	trampoline_size = vdso_trampoline_size();
+	total_size = trampoline_size + VDSO_SYMBOL_MAX * sizeof(uint64_t);
+
+	for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) {
+		if (vdso_symbol_empty(&sym->symbols[i]))
+			continue;
+
+		pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name, sym->symbols[i].offset);
+
+		/* find the nearest following symbol we are interested in */
+		for (j = 0; j < ARRAY_SIZE(sym->symbols); j++) {
+			if (i == j || vdso_symbol_empty(&sym->symbols[j]))
+				continue;
+
+			if (sym->symbols[j].offset <= sym->symbols[i].offset)
+				/* this symbol is above the current one */
+				continue;
+
+			if ((sym->symbols[i].offset + TRAMP_CALL_SIZE) > sym->symbols[j].offset) {
+				/* we have a major issue here since we cannot
+				 * even put the trampoline call for this symbol
+				 */
+				pr_err("Can't handle small vDSO symbol %s\n", sym->symbols[i].name);
+				return 0;
+			}
+
+			if (trampoline)
+				/* no need to put it twice */
+				continue;
+
+			if ((sym->symbols[j].offset - (sym->symbols[i].offset + TRAMP_CALL_SIZE)) <= total_size)
+				/* not enough place */
+				continue;
+
+			/* We can put the trampoline there */
+			trampoline = at + sym->symbols[i].offset;
+			trampoline += TRAMP_CALL_SIZE;
+
+			pr_debug("Putting vDSO trampoline in %s at %lx\n", sym->symbols[i].name, trampoline);
+			memcpy((void *)trampoline, &riscv_vdso_lookup, trampoline_size);
+			invalidate_caches();
+			return trampoline;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Overwrite the first two instructions of the symbol at 'from' with a call
+ * into the lookup stub and record 'to' in slot 'idx' of the address table
+ * that follows the stub.
+ */
+static inline void put_trampoline_call(uint64_t from, uint64_t to, uint64_t trampoline, unsigned int idx)
+{
+	size_t trampoline_size = vdso_trampoline_size();
+	uint64_t *lookup_table = NULL;
+	/*
+	 * li t0, INDEX
+	 * addi t0, x0 INDEX
+	 * jal x0, riscv_vdso_lookup
+	 */
+	uint32_t trampoline_call[2] = {
+		0x00000293,
+		0x0000006f,
+	};
+	const size_t insts_len = ARRAY_SIZE(trampoline_call);
+	uint32_t *call_addr = (uint32_t *)from;
+	// Offset from the jal instruction to the lookup trampoline.
+	ssize_t trampoline_offset = trampoline - (from + sizeof(uint32_t));
+
+	/*
+	 * The 12-bit immediate of addi (I-type) lives in bits 31:20. The stub
+	 * scales t0 by 8 itself, so t0 must hold the plain table index.
+	 */
+	trampoline_call[0] = trampoline_call[0] | (idx << 20);
+	trampoline_call[1] = trampoline_call[1] | riscv_j_imm(trampoline_offset);
+
+	for (unsigned int i = 0; i < insts_len; i++) {
+		call_addr[i] = trampoline_call[i];
+	}
+
+	// Set the lookup table pointer for this vdso symbol.
+	lookup_table = (uint64_t *)(trampoline + trampoline_size);
+	lookup_table[idx] = to;
+}
+
+/*
+ * Redirect every non-empty symbol of the 'from' vDSO to the symbol with the
+ * same index in the 'to' vDSO, going through a lookup stub placed inside
+ * 'from'.
+ *
+ * Returns 0 on success, 1 if no place for the stub was found.
+ */
+int vdso_redirect_calls(uint64_t base_to, uint64_t base_from, struct vdso_symtable *to, struct vdso_symtable *from,
+			bool __always_unused compat_vdso)
+{
+	unsigned int i, valid_idx = 0;
+
+	uint64_t trampoline = (uint64_t)put_trampoline(base_from, from);
+	if (!trampoline)
+		return 1;
+
+	for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
+		if (vdso_symbol_empty(&from->symbols[i]))
+			continue;
+
+		pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n", base_from, from->symbols[i].offset, base_to,
+			 to->symbols[i].offset, i, from->symbols[i].name);
+
+		put_trampoline_call(base_from + from->symbols[i].offset, base_to + to->symbols[i].offset, trampoline,
+				    valid_idx);
+		valid_idx++;
+	}
+
+	invalidate_caches();
+
+	return 0;
+}
\ No newline at end of file
diff --git a/criu/pie/Makefile b/criu/pie/Makefile index 912fab24b..60c7f1e94 100644 --- a/criu/pie/Makefile +++ b/criu/pie/Makefile @@ -23,6 +23,10 @@ ifeq ($(ARCH),x86) ccflags-y += -mshstk endif +ifeq ($(ARCH),riscv64) + ccflags-y += -fno-stack-protector +endif + LDS := compel/arch/$(ARCH)/scripts/compel-pack.lds.S restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o @@ -43,6 +47,10 @@ ifeq ($(ARCH),ppc64) restorer-obj-y += ./$(ARCH_DIR)/vdso-trampoline.o endif +ifeq ($(ARCH),riscv64) + restorer-obj-y += ./$(ARCH_DIR)/vdso-lookup.o +endif + define gen-pie-rules $(1)-obj-y += $(1).o $(1)-obj-e += pie.lib.a diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library index da2a2fab3..d96a7ac32 100644 --- a/criu/pie/Makefile.library +++ b/criu/pie/Makefile.library @@
-27,3 +27,7 @@ CFLAGS += $(CFLAGS_PIE) ifeq ($(ARCH),mips) CFLAGS += -fno-stack-protector -DCR_NOGLIBC -mno-abicalls -fno-pic endif + +ifeq ($(ARCH),riscv64) + ccflags-y += -fno-stack-protector +endif \ No newline at end of file From bb29067de9ee853fd132f88b4b3e62dbd87aa915 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 12:09:16 -0700 Subject: [PATCH 020/257] zdtm: add riscv64 support Signed-off-by: Haorong Lu --- .../lib/arch/riscv64/include/asm/atomic.h | 107 ++++++++++++++++++ test/zdtm/lib/test.c | 2 +- test/zdtm/static/fanotify00.c | 2 +- test/zdtm/static/netns-nf.desc | 2 +- test/zdtm/static/netns-nft-ipt.desc | 2 +- .../static/socket-tcp-closed-last-ack.desc | 4 +- test/zdtm/static/socket-tcp-reseted.desc | 6 +- test/zdtm/static/socket-tcp-syn-sent.desc | 4 +- 8 files changed, 118 insertions(+), 11 deletions(-) create mode 100644 test/zdtm/lib/arch/riscv64/include/asm/atomic.h diff --git a/test/zdtm/lib/arch/riscv64/include/asm/atomic.h b/test/zdtm/lib/arch/riscv64/include/asm/atomic.h new file mode 100644 index 000000000..a4faf1322 --- /dev/null +++ b/test/zdtm/lib/arch/riscv64/include/asm/atomic.h @@ -0,0 +1,107 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + +/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ + +#define nop() __asm__ __volatile__("nop") + +#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") + +/* These barriers need to enforce ordering on both devices or memory. */ +#define mb() RISCV_FENCE(iorw, iorw) +#define rmb() RISCV_FENCE(ir, ir) +#define wmb() RISCV_FENCE(ow, ow) + +/* These barriers do not need to enforce ordering on devices, just memory. 
*/ +#define __smp_mb() RISCV_FENCE(rw, rw) +#define __smp_rmb() RISCV_FENCE(r, r) +#define __smp_wmb() RISCV_FENCE(w, w) + +#define __smp_store_release(p, v) \ + do { \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(rw, w); \ + WRITE_ONCE(*p, v); \ + } while (0) + +#define __smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(r, rw); \ + ___p1; \ + }) + +/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)v); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + *v = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + int result; + + asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(*v), "=r"(result) : "r"(i) : "memory"); + __smp_mb(); + return result + i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +static inline int atomic_inc(atomic_t *v) +{ + return atomic_add_return(1, v) - 1; +} + +static inline int atomic_add(int val, atomic_t *v) +{ + return atomic_add_return(val, v) - val; +} + +static inline int atomic_dec(atomic_t *v) +{ + return atomic_sub_return(1, v) + 1; +} + +/* true if the result is 0, or false for all other cases. 
*/ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + unsigned long tmp; + int oldval; + + __smp_mb(); + + asm volatile("1:\n" + " lr.w %1, %2\n" + " bne %1, %3, 2f\n" + " sc.w %0, %4, %2\n" + " bnez %0, 1b\n" + "2:" + : "=&r"(tmp), "=&r"(oldval), "+A"(*ptr) + : "r"(old), "r"(new) + : "memory"); + + __smp_mb(); + return oldval; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index a5ba38b2d..95017e42e 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -406,7 +406,7 @@ pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid { #ifdef __x86_64__ return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); -#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64) +#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64 || __riscv) return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); #elif __s390x__ return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); diff --git a/test/zdtm/static/fanotify00.c b/test/zdtm/static/fanotify00.c index 69ead43e7..0400cc74b 100644 --- a/test/zdtm/static/fanotify00.c +++ b/test/zdtm/static/fanotify00.c @@ -22,7 +22,7 @@ #elif defined(__PPC64__) #define __NR_fanotify_init 323 #define __NR_fanotify_mark 324 -#elif __aarch64__ +#elif (__aarch64__ || __riscv) #define __NR_fanotify_init 262 #define __NR_fanotify_mark 263 #elif __s390x__ diff --git a/test/zdtm/static/netns-nf.desc b/test/zdtm/static/netns-nf.desc index e7e73b1ae..c99696d1c 100644 --- a/test/zdtm/static/netns-nf.desc +++ b/test/zdtm/static/netns-nf.desc @@ -1,6 +1,6 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - 
'/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns'} diff --git a/test/zdtm/static/netns-nft-ipt.desc b/test/zdtm/static/netns-nft-ipt.desc index 4120f74d6..6d04589b3 100644 --- a/test/zdtm/static/netns-nft-ipt.desc +++ b/test/zdtm/static/netns-nft-ipt.desc @@ -2,7 +2,7 @@ 'deps': [ '/bin/sh', '/usr/sbin/nft', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns'} diff --git a/test/zdtm/static/socket-tcp-closed-last-ack.desc 
b/test/zdtm/static/socket-tcp-closed-last-ack.desc index d4cfe5064..309854fa5 100644 --- a/test/zdtm/static/socket-tcp-closed-last-ack.desc +++ b/test/zdtm/static/socket-tcp-closed-last-ack.desc @@ -1,7 +1,7 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', diff --git a/test/zdtm/static/socket-tcp-reseted.desc b/test/zdtm/static/socket-tcp-reseted.desc index 3ebdfeef8..4aa48ad87 100644 --- a/test/zdtm/static/socket-tcp-reseted.desc +++ b/test/zdtm/static/socket-tcp-reseted.desc @@ -1,8 +1,8 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - 
'/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', - '/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/riscv64-linux-gnu/xtables/libipt_REJECT.so', ], 
'opts': '--tcp-established', 'flags': 'suid nouser samens', diff --git a/test/zdtm/static/socket-tcp-syn-sent.desc b/test/zdtm/static/socket-tcp-syn-sent.desc index 4cc23c8fc..71cd26d72 100644 --- a/test/zdtm/static/socket-tcp-syn-sent.desc +++ b/test/zdtm/static/socket-tcp-syn-sent.desc @@ -1,7 +1,7 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', From da6b1807ef76fee6b744f0d3f8f50af26c492baa Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 12:10:46 -0700 Subject: [PATCH 021/257] ci: add workflow for riscv64 Signed-off-by: Haorong Lu --- .github/workflows/cross-compile-daily.yml | 2 
+- .github/workflows/cross-compile.yml | 1 + .../build/Dockerfile.riscv64-stable-cross.hdr | 5 ++ .../Dockerfile.riscv64-stable-cross.tmpl | 57 +++++++++++++++++++ scripts/build/Makefile | 2 +- scripts/ci/riscv64-cross/amd64-sources.list | 10 ++++ scripts/ci/riscv64-cross/riscv64-sources.list | 42 ++++++++++++++ 7 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 scripts/build/Dockerfile.riscv64-stable-cross.hdr create mode 100644 scripts/build/Dockerfile.riscv64-stable-cross.tmpl create mode 100644 scripts/ci/riscv64-cross/amd64-sources.list create mode 100644 scripts/ci/riscv64-cross/riscv64-sources.list diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml index b8c8c86d4..c709cca00 100644 --- a/.github/workflows/cross-compile-daily.yml +++ b/.github/workflows/cross-compile-daily.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross] + target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, riscv64-stable-cross] branches: [criu-dev, master] steps: diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index 06b812823..96672b294 100644 --- a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -21,6 +21,7 @@ jobs: aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, + riscv64-stable-cross, ] include: - experimental: true diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.hdr b/scripts/build/Dockerfile.riscv64-stable-cross.hdr new file mode 100644 index 000000000..d4c414023 --- /dev/null +++ b/scripts/build/Dockerfile.riscv64-stable-cross.hdr @@ -0,0 +1,5 @@ +FROM ubuntu:jammy + +ENV ARCH=riscv64 +ENV DEBIAN_ARCH=riscv64 +ENV CROSS_TRIPLET=riscv64-linux-gnu diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl new 
file mode 100644 index 000000000..39a0c33c6 --- /dev/null +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -0,0 +1,57 @@ +COPY scripts/ci/apt-install /bin/apt-install + +# Add the cross compiler sources +RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 + +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 871920D1991BC93C 8D69674688B6CB36 B523E5F3FC4E5F2C + +COPY scripts/ci/riscv64-cross/amd64-sources.list /etc/apt/sources.list + +COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ + +RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ + apt-get update -y + +# Install required packages +RUN apt-get install -y --no-install-recommends \ + build-essential \ + pkg-config \ + git \ + crossbuild-essential-${DEBIAN_ARCH} \ + libc6-dev-${DEBIAN_ARCH}-cross \ + libc6-${DEBIAN_ARCH}-cross \ + libbz2-dev:${DEBIAN_ARCH} \ + libexpat1-dev:${DEBIAN_ARCH} \ + ncurses-dev:${DEBIAN_ARCH} \ + libssl-dev:${DEBIAN_ARCH} \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf \ + libnl-3-dev:${DEBIAN_ARCH} \ + libprotobuf-dev:${DEBIAN_ARCH} \ + libnet-dev:${DEBIAN_ARCH} \ + libprotobuf-c-dev:${DEBIAN_ARCH} \ + libcap-dev:${DEBIAN_ARCH} \ + libaio-dev:${DEBIAN_ARCH} \ + libnl-route-3-dev:${DEBIAN_ARCH} \ + libnftables-dev:${DEBIAN_ARCH} \ + libgnutls28-dev:${DEBIAN_ARCH} \ + iproute2:${DEBIAN_ARCH} + +ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ + CROSS_ROOT=/usr/${CROSS_TRIPLET} \ + AS=/usr/bin/${CROSS_TRIPLET}-as \ + AR=/usr/bin/${CROSS_TRIPLET}-ar \ + CC=/usr/bin/${CROSS_TRIPLET}-gcc \ + CPP=/usr/bin/${CROSS_TRIPLET}-cpp \ + CXX=/usr/bin/${CROSS_TRIPLET}-g++ \ + LD=/usr/bin/${CROSS_TRIPLET}-ld \ + FC=/usr/bin/${CROSS_TRIPLET}-gfortran + +ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ + PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLET}/pkgconfig + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Makefile b/scripts/build/Makefile index bc4a59db1..389315227 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,5 +1,5 @@ ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 -STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross +STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) CREATE_DOCKERFILES := $(ARCHES) $(NON_CLANG) diff --git a/scripts/ci/riscv64-cross/amd64-sources.list b/scripts/ci/riscv64-cross/amd64-sources.list new file mode 100644 index 000000000..72dad920c --- /dev/null +++ b/scripts/ci/riscv64-cross/amd64-sources.list @@ -0,0 +1,10 @@ +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy main restricted +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy universe +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates universe +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy multiverse +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates multiverse +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security main restricted +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security universe +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security multiverse \ No newline at end of file diff --git a/scripts/ci/riscv64-cross/riscv64-sources.list b/scripts/ci/riscv64-cross/riscv64-sources.list new file mode 100644 index 000000000..67b8067b6 --- /dev/null +++ 
b/scripts/ci/riscv64-cross/riscv64-sources.list @@ -0,0 +1,42 @@ +# See http://help.ubuntu.com/community/UpgradeNotes for how to upgrade to +# newer versions of the distribution. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted + +## Major bug fix updates produced after the final release of the +## distribution. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted + +## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu +## team. Also, please note that software in universe WILL NOT receive any +## review or updates from the Ubuntu security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy universe +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe + +## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu +## team, and may not be under a free licence. Please satisfy yourself as to +## your rights to use the software. Also, please note that software in +## multiverse WILL NOT receive any review or updates from the Ubuntu +## security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse + +## N.B. software from this repository may not have been tested as +## extensively as that contained in the main release, although it includes +## newer versions of some applications which may provide useful features. 
+## Also, please note that software in backports WILL NOT receive any review +## or updates from the Ubuntu security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse + +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse \ No newline at end of file From 2be958d22ea4d78f5ba718688025ff0509a47ac2 Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Mon, 14 Oct 2024 01:35:44 +0800 Subject: [PATCH 022/257] include: don't use GCC's __builtin_ffs on riscv64 Link: https://github.com/SerenityOS/serenity/commit/e300da4db42e2484d98f4982d03150d83436304e Signed-off-by: PukNgae Cryolitia --- - cherry-picked Signed-off-by: Alexander Mikhalitsyn --- include/common/arch/riscv64/asm/bitops.h | 111 ++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/include/common/arch/riscv64/asm/bitops.h b/include/common/arch/riscv64/asm/bitops.h index 400cc3e15..eabab27c7 100644 --- a/include/common/arch/riscv64/asm/bitops.h +++ b/include/common/arch/riscv64/asm/bitops.h @@ -2,7 +2,116 @@ #define __CR_ASM_BITOPS_H__ #include "common/compiler.h" -#include "common/asm-generic/bitops.h" +#include "common/asm/bitsperlong.h" + +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) + +#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) + +#if __GNUC__ < 
4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. */ +#define BITOP_ADDR(x) "=m"(*(volatile long *)(x)) +#else +#define BITOP_ADDR(x) "+m"(*(volatile long *)(x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +static inline void set_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr |= (1UL << (nr % BITS_PER_LONG)); +} + +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr ^= (1UL << (nr % BITS_PER_LONG)); +} + +static inline int test_bit(int nr, volatile const unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + return (*addr & (1UL << (nr % BITS_PER_LONG))) ? -1 : 0; +} + +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr &= ~(1UL << (nr % BITS_PER_LONG)); +} + +/** + * __ffs - find first set bit in word + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + int p = 0; + + for (; p < 8*sizeof(word); ++p) { + if (word & 1) { + break; + } + + word >>= 1; + } + + return p; +} + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/* + * Find the next set bit in a memory region. 
+ */ +static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG - 1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) + #define BITS_PER_LONG 64 From 9052ef93c79f92634eb163080292d79af496617a Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Sat, 12 Oct 2024 15:29:40 +0800 Subject: [PATCH 023/257] uffd: Disable image deduplication after fork After a fork, both the child and parent processes may trigger a page fault (#PF) at the same virtual address, referencing the same position in the page image. If deduplication is enabled, the last process to trigger the page fault will fail. Therefore, deduplication should be disabled after a fork to prevent this issue. 
Signed-off-by: Liu Hua --- criu/include/pagemap.h | 5 +++++ criu/pagemap.c | 11 ++++++++++- criu/uffd.c | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 8c7180559..3ae15deb9 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -58,6 +58,9 @@ struct page_read { /* Whether or not pages can be read in PIE code */ bool pieok; + /* Whether or not disable image deduplication*/ + bool disable_dedup; + /* Private data of reader */ struct cr_img *pmi; struct cr_img *pi; @@ -112,6 +115,8 @@ int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta); */ extern void dup_page_read(struct page_read *src, struct page_read *dst); +extern void page_read_disable_dedup(struct page_read *pr); + extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned long len); static inline unsigned long pagemap_len(PagemapEntry *pe) diff --git a/criu/pagemap.c b/criu/pagemap.c index 83f69bba3..85bb92259 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -261,7 +261,7 @@ static int read_local_page(struct page_read *pr, unsigned long vaddr, unsigned l break; } - if (opts.auto_dedup) { + if (opts.auto_dedup && !pr->disable_dedup) { ret = punch_hole(pr, pr->pi_off, len, false); if (ret == -1) return -1; @@ -792,6 +792,7 @@ int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int p pr->bunch.iov_base = NULL; pr->pmes = NULL; pr->pieok = false; + pr->disable_dedup = false; pr->pmi = open_image_at(dfd, i_typ, O_RSTR, img_id); if (!pr->pmi) @@ -852,6 +853,14 @@ int open_page_read(unsigned long img_id, struct page_read *pr, int pr_flags) #define DUP_IDS_BASE 1000 +void page_read_disable_dedup(struct page_read *pr) +{ + pr_debug("disable dedup, id: %d\n", pr->id); + pr->disable_dedup = true; + if (pr->parent) + page_read_disable_dedup(pr->parent); +} + void dup_page_read(struct page_read *src, struct page_read *dst) { static int dup_ids = 1; diff --git 
a/criu/uffd.c b/criu/uffd.c index e07b21b69..98c2b7e07 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -1098,6 +1098,8 @@ static int handle_fork(struct lazy_pages_info *parent_lpi, struct uffd_msg *msg) lpi_get(lpi->parent); + page_read_disable_dedup(&parent_lpi->pr); + page_read_disable_dedup(&lpi->pr); return 1; out: From 622b43392fcb330343243a0f2842dd2919f977cc Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Fri, 18 Oct 2024 18:51:18 +0200 Subject: [PATCH 024/257] criu: Initialize util before service worker starts When restoring dumps in new mount + pid namespaces where multiple dumps share the same network namespace, CRIU may fail due to conflicting unix socket names. This happens because the service worker creates sockets using a pattern that includes criu_run_id, but util_init() is called after cr_service_work() starts. The socket naming pattern "crtools-fd-%d-%d" uses the restore PID and criu_run_id, however criu_run_id is always 0 when not initialized, leading to conflicts when multiple restores run simultaneously either in the same CRIU process or because of multiple CRIU processes doing the same operation in different PID namespaces. Fix this by: - Moving util_init() before cr_service_work() starts - Adding a second util_init() call in the service worker fork to ensure unique IDs across multiple worker runs - Making sure that dump and restore operations have util_init() called early to generate unique socket names With this fix, socket names always include the namespace ID, preventing conflicts when multiple processes with the same pid share a network namespace. 
Fixes #2499 [ avagin: minor code changes ] Signed-off-by: Lorenzo Fontana Signed-off-by: Andrei Vagin --- criu/cr-service.c | 8 ++++++++ criu/crtools.c | 10 +++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 61a04c5ff..b9d11ced2 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -1310,6 +1310,14 @@ int cr_service_work(int sk) int ret = -1; CriuReq *msg = 0; + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with an unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace. + */ + util_init(); + more: opts.mode = CR_SWRK; diff --git a/criu/crtools.c b/criu/crtools.c index 94657f418..6f493850b 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -169,7 +169,13 @@ int main(int argc, char *argv[], char *envp[]) pr_err("unknown command: %s\n", argv[optind]); goto usage; } - + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with an unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace. + */ + util_init(); if (opts.mode == CR_SWRK) { if (argc != optind + 2) { fprintf(stderr, "Usage: criu swrk \n"); @@ -254,8 +260,6 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - util_init(); - if (log_init(opts.output)) return 1; From ff9dbef902361bfdda8e30e46c8f6b0df710de9f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 4 Nov 2024 19:30:26 +0000 Subject: [PATCH 025/257] seize: fix error handling for check_freezer_cgroup When `check_freezer_cgroup()` has non-zero return value, `goto err` calls `return ret`. However, the value of `ret` has been set to `0` in the lines above and CRIU does not handle the error properly.
This problem is related to https://github.com/checkpoint-restore/criu/issues/2508 Signed-off-by: Radostin Stoyanov --- criu/seize.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index edeb57cc8..ab394f9ca 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1009,7 +1009,7 @@ static int cgroup_version(void) int collect_pstree(void) { pid_t pid = root_item->pid->real; - int ret = -1; + int ret, exit_code = -1; struct proc_status_creds creds; struct pstree_item *iter; @@ -1069,7 +1069,6 @@ int collect_pstree(void) if (opts.freeze_cgroup && !freeze_cgroup_disabled && freezer_wait_processes()) { - ret = -1; goto err; } @@ -1081,12 +1080,12 @@ int collect_pstree(void) goto err; } - ret = 0; + exit_code = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); err: /* Freezing stage finished in time - disable timer. */ alarm(0); - return ret; + return exit_code; } From 4196268eef099833cd77c0fb93d367bcd8ce1463 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 4 Nov 2024 19:57:30 +0000 Subject: [PATCH 026/257] seize: enable support for frozen containers Container runtimes like CRI-O and containerd utilize the freezer cgroup to create a consistent snapshot of container root filesystem (rootfs) changes. In this case, the container is frozen before invoking CRIU. After CRIU successfully completes, a copy of the container rootfs diff is saved, and the container is then unfrozen. However, the `cuda-checkpoint` tool is not able to perform a 'lock' action on frozen threads. To support GPU checkpointing with these container runtimes, we need to unfreeze the cgroup and return it to its original state once the checkpointing is complete. 
To reflect this new behavior, the following changes are applied: - `dont_use_freeze_cgroup(void)` -> `set_compel_interrupt_only_mode(void)` - `bool freeze_cgroup_disabled` -> `bool compel_interrupt_only_mode` - `check_freezer_cgroup(void)` -> `prepare_freezer_for_interrupt_only_mode(void)` Note that when `compel_interrupt_only_mode` is set to `true`, `compel_interrupt_task()` is used instead of `freeze_processes()` to prevent tasks from running during `criu dump`. Fixes: #2508 Signed-off-by: Radostin Stoyanov --- criu/fault-injection.c | 4 +-- criu/include/fault-injection.h | 2 +- criu/include/seize.h | 2 +- criu/seize.c | 46 +++++++++++++++++++--------------- plugins/cuda/cuda_plugin.c | 2 +- test/jenkins/criu-fault.sh | 2 +- 6 files changed, 32 insertions(+), 26 deletions(-) diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 2272e6d84..5dd9acf60 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -24,8 +24,8 @@ int fault_injection_init(void) fi_strategy = start; switch (fi_strategy) { - case FI_DISABLE_FREEZE_CGROUP: - dont_use_freeze_cgroup(); + case FI_COMPEL_INTERRUPT_ONLY_MODE: + set_compel_interrupt_only_mode(); break; default: break; diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 59adf05b9..e987c18ce 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -21,7 +21,7 @@ enum faults { FI_CORRUPT_EXTREGS = 134, FI_DONT_USE_PAGEMAP_SCAN = 135, FI_DUMP_CRASH = 136, - FI_DISABLE_FREEZE_CGROUP = 137, + FI_COMPEL_INTERRUPT_ONLY_MODE = 137, FI_PLUGIN_CUDA_FORCE_ENABLE = 138, FI_MAX, }; diff --git a/criu/include/seize.h b/criu/include/seize.h index f5ea76b16..64e8d2d12 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -9,6 +9,6 @@ extern bool alarm_timeouted(void); extern char *task_comm_info(pid_t pid, char *comm, size_t size); extern char *__task_comm_info(pid_t pid); -extern void dont_use_freeze_cgroup(void); +extern void 
set_compel_interrupt_only_mode(void); #endif diff --git a/criu/seize.c b/criu/seize.c index ab394f9ca..9bd1832d9 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -25,17 +25,17 @@ #include "xmalloc.h" #include "util.h" -static bool freeze_cgroup_disabled; +static bool compel_interrupt_only_mode; /* * Disables the use of freeze cgroups for process seizing, even if explicitly - * requested via the --freeze-cgroup option. This is necessary for plugins - * (e.g., CUDA) that do not function correctly when processes are frozen using - * cgroups. + * requested via the --freeze-cgroup option or already set in a frozen state. + * This is necessary for plugins (e.g., CUDA) that do not function correctly + * when processes are frozen using cgroups. */ -void __attribute__((used)) dont_use_freeze_cgroup(void) +void __attribute__((used)) set_compel_interrupt_only_mode(void) { - freeze_cgroup_disabled = true; + compel_interrupt_only_mode = true; } char *task_comm_info(pid_t pid, char *comm, size_t size) @@ -410,7 +410,7 @@ static int freezer_detach(void) { int i; - if (!opts.freeze_cgroup || freeze_cgroup_disabled) + if (!opts.freeze_cgroup || compel_interrupt_only_mode) return 0; for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { @@ -505,29 +505,35 @@ static int log_unfrozen_stacks(char *root) return 0; } -static int check_freezer_cgroup(void) +static int prepare_freezer_for_interrupt_only_mode(void) { enum freezer_state state = THAWED; int fd; + int exit_code = -1; - BUG_ON(!freeze_cgroup_disabled); + BUG_ON(!compel_interrupt_only_mode); fd = freezer_open(); if (fd < 0) return -1; state = get_freezer_state(fd); - close(fd); if (state == FREEZER_ERROR) { - return -1; + goto err; } + origin_freezer_state = state == FREEZING ? 
FROZEN : state; + if (state != THAWED) { - pr_err("One or more plugins are incompatible with the freezer cgroup in the FROZEN state.\n"); - return -1; + pr_warn("unfreezing cgroup for plugin compatibility\n"); + if (freezer_write_state(fd, THAWED)) + goto err; } - return 0; + exit_code = 0; +err: + close(fd); + return exit_code; } static int freeze_processes(void) @@ -681,7 +687,7 @@ static int collect_children(struct pstree_item *item) goto free; } - if (!opts.freeze_cgroup || freeze_cgroup_disabled) + if (!opts.freeze_cgroup || compel_interrupt_only_mode) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -869,7 +875,7 @@ static int collect_threads(struct pstree_item *item) pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid); - if ((!opts.freeze_cgroup || freeze_cgroup_disabled) && + if ((!opts.freeze_cgroup || compel_interrupt_only_mode) && compel_interrupt_task(pid)) continue; @@ -926,7 +932,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i { int attempts = NR_ATTEMPTS, nr_inprogress = 1; - if (opts.freeze_cgroup && !freeze_cgroup_disabled) + if (opts.freeze_cgroup && !compel_interrupt_only_mode) attempts = 1; /* @@ -1032,11 +1038,11 @@ int collect_pstree(void) pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 
2 : 1); - if (opts.freeze_cgroup && !freeze_cgroup_disabled) { + if (opts.freeze_cgroup && !compel_interrupt_only_mode) { if (freeze_processes()) goto err; } else { - if (opts.freeze_cgroup && check_freezer_cgroup()) + if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) goto err; if (compel_interrupt_task(pid)) { set_cr_errno(ESRCH); @@ -1067,7 +1073,7 @@ int collect_pstree(void) if (ret < 0) goto err; - if (opts.freeze_cgroup && !freeze_cgroup_disabled && + if (opts.freeze_cgroup && !compel_interrupt_only_mode && freezer_wait_processes()) { goto err; } diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index c4fc67fa9..3d624750e 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -509,7 +509,7 @@ int cuda_plugin_init(int stage) INIT_LIST_HEAD(&cuda_pids); } - dont_use_freeze_cgroup(); + set_compel_interrupt_only_mode(); return 0; } diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index fc0eddc2b..8cb71d8ca 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -40,7 +40,7 @@ fi # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail -# check dont_use_freeze_cgroup +# check set_compel_interrupt_only_mode ./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 ./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst From 36a53fe23c4092ee1ad68144e4e216ab8979b3ab Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 8 Nov 2024 13:41:20 +0000 Subject: [PATCH 027/257] ci: test interrupt-only mode with frozen cgroup Signed-off-by: Radostin Stoyanov --- test/jenkins/criu-fault.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 8cb71d8ca..6ee7ce33a 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -43,6 +43,8 @@ fi # check set_compel_interrupt_only_mode ./test/zdtm.py run -t 
zdtm/static/env00 --freezecg zdtm:t --fault 137 ./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst +# check set_compel_interrupt_only_mode when test cgroup is frozen +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:f --fault 137 if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then fail From b1cac7a8e580bb023d84d07a9c6f738f9eaf602d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 2 Nov 2024 08:29:43 +0000 Subject: [PATCH 028/257] cuda: fix check for GPU device availability The check for `/dev/nvidiactl` to determine if the CUDA plugin can be used is unreliable because in some cases the default path for driver installation is different [1]. This patch changes the logic to check if a GPU device is available in `/proc/driver/nvidia/gpus/`. This approach is similar to `torch.cuda.is_available()` and it is a more accurate indicator. The subsequent check for support of the `cuda-checkpoint --action` option would confirm if the driver supports checkpoint/restore. 
[1] https://github.com/NVIDIA/gpu-operator Fixes: #2509 Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 3d624750e..718db3025 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -470,6 +470,20 @@ int cuda_plugin_resume_devices_late(int pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) +/** + * Check if a CUDA device is available on the system + */ +static bool is_cuda_device_available(void) +{ + const char *gpu_path = "/proc/driver/nvidia/gpus/"; + struct stat sb; + + if (stat(gpu_path, &sb) != 0) + return false; + + return S_ISDIR(sb.st_mode); +} + int cuda_plugin_init(int stage) { int ret; @@ -481,8 +495,8 @@ int cuda_plugin_init(int stage) } } - if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { - pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n"); + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) { + pr_info("No GPU device found; CUDA plugin is disabled\n"); plugin_disabled = true; return 0; } From 7125bfc69579a93e2df9720a615b7ad29d79120b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 6 Nov 2024 22:08:24 +0530 Subject: [PATCH 029/257] pidfd: one process creates a helper and opens all fds to it Currently, the `waitpid()` call on the tmp process can be made by a process which is not its parent. This causes restore to fail. This patch instead selects one process to create the tmp process and open all the fds that point to it. These fds are sent to the correct process(es). 
Fixes: #2496 Signed-off-by: Andrei Vagin Signed-off-by: Bhavik Sachdev --- criu/files.c | 7 +-- criu/include/pidfd.h | 2 +- criu/pidfd.c | 128 +++++++++++++++++++++---------------------- 3 files changed, 64 insertions(+), 73 deletions(-) diff --git a/criu/files.c b/criu/files.c index a57fb860f..31e705bcc 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1811,11 +1811,6 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); - - if (init_dead_pidfd_hash()) { - pr_err("Could not initialise hash map for dead pidfds\n"); - return -1; - } - + init_dead_pidfd_hash(); return collect_image(&files_cinfo); } diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h index 4d2d71700..bcc0fb45a 100644 --- a/criu/include/pidfd.h +++ b/criu/include/pidfd.h @@ -7,7 +7,7 @@ extern const struct fdtype_ops pidfd_dump_ops; extern struct collect_image_info pidfd_cinfo; extern int is_pidfd_link(char *link); -extern int init_dead_pidfd_hash(void); +extern void init_dead_pidfd_hash(void); struct pidfd_dump_info { PidfdEntry pidfe; pid_t pid; diff --git a/criu/pidfd.c b/criu/pidfd.c index 3ea3c9309..53b9bcf71 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -21,32 +21,26 @@ struct pidfd_info { PidfdEntry *pidfe; struct file_desc d; + + struct dead_pidfd *dead; + struct pidfd_info *next; }; struct dead_pidfd { unsigned int ino; - int pid; - size_t count; - mutex_t pidfd_lock; + int creator_id; + struct hlist_node hash; + struct pidfd_info *list; }; #define DEAD_PIDFD_HASH_SIZE 32 static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; -static mutex_t *dead_pidfd_hash_lock; -int init_dead_pidfd_hash(void) +void init_dead_pidfd_hash(void) { for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) INIT_HLIST_HEAD(&dead_pidfd_hash[i]); - - dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock)); - if (!dead_pidfd_hash_lock) - return -1; - - mutex_init(dead_pidfd_hash_lock); - - return 0; } static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) @@ -54,15 +48,12 @@ static 
struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) struct dead_pidfd *dead; struct hlist_head *chain; - mutex_lock(dead_pidfd_hash_lock); chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; hlist_for_each_entry(dead, chain, hash) { if (dead->ino == ino) { - mutex_unlock(dead_pidfd_hash_lock); return dead; } } - mutex_unlock(dead_pidfd_hash_lock); return NULL; } @@ -142,7 +133,7 @@ static int create_tmp_process(void) return tmp_process; } -static int free_dead_pidfd(struct dead_pidfd *dead) +static int kill_helper(pid_t pid) { int status; sigset_t blockmask, oldmask; @@ -160,15 +151,13 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } - if (kill(dead->pid, SIGKILL) < 0) { - pr_perror("Could not kill temporary process with pid: %d", - dead->pid); + if (kill(pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", pid); goto err; } - if (waitpid(dead->pid, &status, 0) != dead->pid) { - pr_perror("Could not wait on temporary process with pid: %d", - dead->pid); + if (waitpid(pid, &status, 0) != pid) { + pr_perror("Could not wait on temporary process with pid: %d", pid); goto err; } @@ -188,9 +177,6 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } - mutex_lock(dead_pidfd_hash_lock); - hlist_del(&dead->hash); - mutex_unlock(dead_pidfd_hash_lock); return 0; err: return -1; @@ -198,8 +184,9 @@ err: static int open_one_pidfd(struct file_desc *d, int *new_fd) { - struct pidfd_info *info; + struct pidfd_info *info, *child; struct dead_pidfd *dead = NULL; + pid_t pid; int pidfd; info = container_of(d, struct pidfd_info, d); @@ -215,34 +202,44 @@ static int open_one_pidfd(struct file_desc *d, int *new_fd) dead = lookup_dead_pidfd(info->pidfe->ino); BUG_ON(!dead); - mutex_lock(&dead->pidfd_lock); - BUG_ON(dead->count == 0); - dead->count--; - if (dead->pid == -1) { - dead->pid = create_tmp_process(); - if (dead->pid < 0) { - mutex_unlock(&dead->pidfd_lock); - goto err_close; + if (info->dead && 
info->dead->creator_id != info->pidfe->id) { + int ret = recv_desc_from_peer(&info->d, &pidfd); + if (ret != 0) { + if (ret != 1) + pr_err("Can't get fd\n"); + return ret; } + goto out; } - pidfd = pidfd_open(dead->pid, info->pidfe->flags); + pid = create_tmp_process(); + if (pid < 0) + goto err_close; + + for (child = dead->list; child; child = child->next) { + if (child == info) + continue; + pidfd = pidfd_open(pid, child->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", child->pidfe->nspid); + goto err_close; + } + + if (send_desc_to_peer(pidfd, &child->d)) { + pr_perror("Can't send file descriptor"); + close(pidfd); + return -1; + } + close(pidfd); + } + + pidfd = pidfd_open(pid, info->pidfe->flags); if (pidfd < 0) { pr_perror("Could not open pidfd for %d", info->pidfe->nspid); - mutex_unlock(&dead->pidfd_lock); goto err_close; } - - if (dead->count == 0) { - if (free_dead_pidfd(dead)) { - pr_err("Failed to delete dead_pidfd struct\n"); - mutex_unlock(&dead->pidfd_lock); - close(pidfd); - goto err_close; - } - } - mutex_unlock(&dead->pidfd_lock); - + if (kill_helper(pid)) + goto err_close; out: if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { goto err_close; @@ -269,32 +266,31 @@ static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) info->pidfe = pb_msg(msg, PidfdEntry); pr_info_pidfd("Collected ", info->pidfe); + info->dead = NULL; if (info->pidfe->nspid != -1) goto out; dead = lookup_dead_pidfd(info->pidfe->ino); - if (dead) { - mutex_lock(&dead->pidfd_lock); - dead->count++; - mutex_unlock(&dead->pidfd_lock); - goto out; - } - - dead = shmalloc(sizeof(*dead)); if (!dead) { - pr_err("Could not allocate shared memory..\n"); - return -1; + dead = xmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->list = NULL; + dead->ino = info->pidfe->ino; + dead->creator_id = info->pidfe->id; + 
hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); } - INIT_HLIST_NODE(&dead->hash); - dead->ino = info->pidfe->ino; - dead->count = 1; - dead->pid = -1; - mutex_init(&dead->pidfd_lock); + info->dead = dead; + info->next = dead->list; + dead->list = info; + if (dead->creator_id > info->pidfe->id) + dead->creator_id = info->pidfe->id; - mutex_lock(dead_pidfd_hash_lock); - hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); - mutex_unlock(dead_pidfd_hash_lock); out: return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); } From 498bcf28067624b1fca1b939000f8314574a5e80 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Wed, 6 Nov 2024 22:10:08 +0530 Subject: [PATCH 030/257] zdtm: Check many processes with common dead pidfd We have multiple processes open a pidfd to a common dead process. After C/R we check that the inode numbers for these pidfds are equal or not. Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_diffdead.c | 228 ++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 test/zdtm/static/pidfd_diffdead.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 44ac64fe5..71a1b6a53 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -56,6 +56,7 @@ TST_NOFILE := \ pidfd_self \ pidfd_of_thread \ pidfd_dead \ + pidfd_diffdead \ pidfd_child \ pidfd_kill \ fd_from_pidfd \ diff --git a/test/zdtm/static/pidfd_diffdead.c b/test/zdtm/static/pidfd_diffdead.c new file mode 100644 index 000000000..5bc1911a5 --- /dev/null +++ b/test/zdtm/static/pidfd_diffdead.c @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of processes that point to a common dead pidfd\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- 
grandchild + * + * main and child open a pidfd for grandchild. + * Before C/R we kill grandchild. + * We end up with two pidfds in two diff processes that point to the same dead process. + */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char *argv[]) +{ +#define READ 0 +#define WRITE 1 + + int child, ret, gchild, status; + struct statx stat; + task_waiter_t t; + unsigned long long ino; + + /* + * We use the inop pipe to send the inode number of the + * pidfd opened in the child to the main process for + * comparison. + */ + int p[2]; + int pidfd; + + test_init(argc, argv); + task_waiter_init(&t); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. 
skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild; + gchild = test_fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[READ]); + close(p[WRITE]); + while (1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + task_waiter_complete(&t, 1); + + test_waitsig(); + + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &ino, sizeof(ino)) != sizeof(ino)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + close(pidfd); + + /* ino number should be same because both pidfds were for the same process */ + if (ino != stat.stx_ino) { + exit(1); + } + exit(0); + } + } + + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + /* + * We kill grandchild process only after opening pidfd. 
+ */ + if (pidfd_send_signal(pidfd, SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + return 1; + } + + /* Wait for child to waitpid on gchild */ + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + close(p[READ]); + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + goto err; + } + + /* Send inode number of pidfd to child for comparison */ + if (write(p[WRITE], &stat.stx_ino, sizeof(stat.stx_ino)) != sizeof(stat.stx_ino)) { + pr_perror("write"); + goto err; + } + close(p[WRITE]); + + if (kill(child, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto err; + } + + if (!WIFEXITED(status)) { + fail("Expected child to terminate normally"); + goto err; + } + + if (WEXITSTATUS(status) != 0) { + fail("Child failed"); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +} From 28c2cb3fd6121f3280484665915d1ef5d8b9df14 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 12 Nov 2024 13:04:31 +0000 Subject: [PATCH 031/257] cuda: enable checkpoint support for paused tasks If a CUDA process is already in a "locked" or "checkpointed" state during criu dump, the CUDA plugin currently fails with an error because it attempts an unnecessary "lock" action using the cuda-checkpoint tool. This patch extends the CUDA plugin to handle such cases by first verifying the initial state of the CUDA processes and skipping unnecessary "lock" and "checkpoint" actions when a process has been locked or checkpointed before CRIU is invoked. In particular, CUDA tasks may already be in a "locked" or "checkpointed" state to ensure consistent checkpoint/restore for distributed workloads, such as model training, where multiple containers run across different cluster nodes. 
Another use case for this functionality is optimizing resource utilization, where CUDA tasks with low-priority are preempted immediately to release GPU resources needed by high-priority tasks, and the paused workloads are later resumed or migrated to another node. Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 155 +++++++++++++++++++++++++++---------- 1 file changed, 116 insertions(+), 39 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 718db3025..7764cf3c7 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -26,6 +26,13 @@ #define ACTION_RESTORE "restore" #define ACTION_UNLOCK "unlock" +typedef enum { + CUDA_TASK_RUNNING = 0, + CUDA_TASK_LOCKED, + CUDA_TASK_CHECKPOINTED, + CUDA_TASK_UNKNOWN = -1 +} cuda_task_state_t; + #define CUDA_CKPT_BUF_SIZE (128) #ifdef LOG_PREFIX @@ -43,6 +50,7 @@ bool plugin_added_to_inventory = false; struct pid_info { int pid; char checkpointed; + cuda_task_state_t initial_task_state; struct list_head list; }; @@ -62,7 +70,7 @@ static void dealloc_pid_buffer(struct list_head *pid_buf) } } -static int add_pid_to_buf(struct list_head *pid_buf, int pid) +static int add_pid_to_buf(struct list_head *pid_buf, int pid, cuda_task_state_t state) { struct pid_info *new = xmalloc(sizeof(*new)); @@ -72,25 +80,12 @@ static int add_pid_to_buf(struct list_head *pid_buf, int pid) new->pid = pid; new->checkpointed = 0; + new->initial_task_state = state; list_add_tail(&new->list, pid_buf); return 0; } -static int update_checkpointed_pid(struct list_head *pid_buf, int pid) -{ - struct pid_info *info; - - list_for_each_entry(info, pid_buf, list) { - if (info->pid == pid) { - info->checkpointed = 1; - return 0; - } - } - - return -1; -} - static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) { #define READ 0 @@ -231,6 +226,37 @@ static int get_cuda_restore_tid(int root_pid) return atoi(pid_out); } +static cuda_task_state_t get_task_state_enum(const char 
*state_str) +{ + if (strncmp(state_str, "running", 7) == 0) + return CUDA_TASK_RUNNING; + + if (strncmp(state_str, "locked", 6) == 0) + return CUDA_TASK_LOCKED; + + if (strncmp(state_str, "checkpointed", 12) == 0) + return CUDA_TASK_CHECKPOINTED; + + pr_err("Unknown CUDA state: %s\n", state_str); + return CUDA_TASK_UNKNOWN; +} + +static cuda_task_state_t get_cuda_state(pid_t pid) +{ + char pid_buf[16]; + char state_str[CUDA_CKPT_BUF_SIZE]; + const char *args[] = { CUDA_CHECKPOINT, "--get-state", "--pid", pid_buf, NULL }; + + snprintf(pid_buf, sizeof(pid_buf), "%d", pid); + + if (launch_cuda_checkpoint(args, state_str, sizeof(state_str))) { + pr_err("Failed to launch cuda-checkpoint to retrieve state: %s\n", state_str); + return CUDA_TASK_UNKNOWN; + } + + return get_task_state_enum(state_str); +} + static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf, int buf_size) { @@ -319,6 +345,8 @@ int cuda_plugin_checkpoint_devices(int pid) int int_ret; int status; k_rtsigset_t save_sigset; + struct pid_info *task_info; + bool pid_found = false; if (plugin_disabled) { return -ENOTSUP; @@ -336,6 +364,26 @@ int cuda_plugin_checkpoint_devices(int pid) return 0; } + /* Check if the process is already in a checkpointed state */ + list_for_each_entry(task_info, &cuda_pids, list) { + if (task_info->pid == pid) { + if (task_info->initial_task_state == CUDA_TASK_CHECKPOINTED) { + pr_info("pid %d already in a checkpointed state\n", pid); + return 0; + } + pid_found = true; + break; + } + } + + if (pid_found == false) { + /* We return an error here. The task should be restored + * to its original state at cuda_plugin_fini(). 
+ */ + pr_err("Failed to track pid %d\n", pid); + return -1; + } + pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid); /* We need to resume the checkpoint thread to prepare the mappings for * checkpointing @@ -348,22 +396,8 @@ int cuda_plugin_checkpoint_devices(int pid) pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); goto interrupt; } - status = update_checkpointed_pid(&cuda_pids, pid); - if (status) { - pr_err("Failed to track checkpointed pid %d\n", pid); - status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); - } - } - if (!status && !plugin_added_to_inventory) { - status = add_inventory_plugin(CR_PLUGIN_DESC.name); - if (status) - pr_err("Failed to add CUDA plugin to inventory image\n"); - else - plugin_added_to_inventory = true; - } + task_info->checkpointed = 1; interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); @@ -376,6 +410,7 @@ int cuda_plugin_pause_devices(int pid) { int restore_tid; char msg_buf[CUDA_CKPT_BUF_SIZE]; + cuda_task_state_t task_state; if (plugin_disabled) { return -ENOTSUP; @@ -388,6 +423,34 @@ int cuda_plugin_pause_devices(int pid) return 0; } + task_state = get_cuda_state(restore_tid); + if (task_state == CUDA_TASK_UNKNOWN) { + pr_err("Failed to get CUDA state for PID %d\n", restore_tid); + return -1; + } + + if (!plugin_added_to_inventory) { + if (add_inventory_plugin(CR_PLUGIN_DESC.name)) { + pr_err("Failed to add CUDA plugin to inventory image\n"); + return -1; + } + plugin_added_to_inventory = true; + } + + if (task_state == CUDA_TASK_LOCKED) { + pr_info("pid %d already in a locked state\n", pid); + /* Leave this PID in a "locked" state at resume_device() */ + add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_LOCKED); + return 0; + } + + if (task_state == CUDA_TASK_CHECKPOINTED) { + /* We need to skip this PID in cuda_plugin_checkpoint_devices(), + * 
and leave it in a "checkpointed" state at resume_device(). */ + add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_CHECKPOINTED); + return 0; + } + pr_info("pausing devices on pid %d\n", pid); int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); if (status) { @@ -397,7 +460,7 @@ int cuda_plugin_pause_devices(int pid) return -1; } - if (add_pid_to_buf(&cuda_pids, pid)) { + if (add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_RUNNING)) { pr_err("unable to track paused pid %d\n", pid); goto unlock; } @@ -412,7 +475,7 @@ unlock: } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) -int resume_device(int pid, int checkpointed) +int resume_device(int pid, int checkpointed, cuda_task_state_t initial_task_state) { char msg_buf[CUDA_CKPT_BUF_SIZE]; int status; @@ -420,6 +483,11 @@ int resume_device(int pid, int checkpointed) int int_ret; k_rtsigset_t save_sigset; + if (initial_task_state == CUDA_TASK_UNKNOWN) { + pr_info("skip resume for PID %d (unknown state)\n", pid); + return 0; + } + int restore_tid = get_cuda_restore_tid(pid); if (restore_tid == -1) { pr_info("No need to resume devices on pid %d\n", pid); @@ -439,7 +507,8 @@ int resume_device(int pid, int checkpointed) return -1; } - if (checkpointed) { + if (checkpointed && (initial_task_state == CUDA_TASK_RUNNING || initial_task_state == CUDA_TASK_LOCKED)) { + /* If the process was "locked" or "running" before checkpointing it, we need to restore it */ status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf); @@ -448,10 +517,13 @@ int resume_device(int pid, int checkpointed) } } - status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); - ret = -1; + if (initial_task_state == CUDA_TASK_RUNNING) { + /* If the process was "running"
before we paused it, we need to unlock it */ + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); + ret = -1; + } } interrupt: @@ -466,7 +538,12 @@ int cuda_plugin_resume_devices_late(int pid) return -ENOTSUP; } - return resume_device(pid, 1); + /* RESUME_DEVICES_LATE is used during `criu restore`. + * Here, we assume that users expect the target process + * to be in a "running" state after restore, even if it was + * in a "locked" or "checkpointed" state during `criu dump`. + */ + return resume_device(pid, 1, CUDA_TASK_RUNNING); } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) @@ -542,7 +619,7 @@ void cuda_plugin_fini(int stage, int ret) if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) { struct pid_info *info; list_for_each_entry(info, &cuda_pids, list) { - resume_device(info->pid, info->checkpointed); + resume_device(info->pid, info->checkpointed, info->initial_task_state); } } if (stage == CR_PLUGIN_STAGE__DUMP) { From 21e5f4cfd55b8d6837d0f01441d9772c3f09f707 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 12 Nov 2024 15:14:51 +0000 Subject: [PATCH 032/257] test: add get-state to mocked cuda-checkpoint tool Signed-off-by: Radostin Stoyanov --- test/cuda-checkpoint/cuda-checkpoint.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/cuda-checkpoint/cuda-checkpoint.c b/test/cuda-checkpoint/cuda-checkpoint.c index f35a4b41d..3b7ce8b9f 100644 --- a/test/cuda-checkpoint/cuda-checkpoint.c +++ b/test/cuda-checkpoint/cuda-checkpoint.c @@ -11,6 +11,7 @@ int main(int argc, char *argv[]) int option_index = 0; static struct option long_options[] = { { "pid", required_argument, 0, 'p' }, + { "get-state", no_argument, 0, 's' }, { "get-restore-tid", no_argument, 0, 'g' }, { "action", required_argument, 0, 'a' }, { "timeout", required_argument, 0, 't' }, @@ -31,6 
+32,9 @@ int main(int argc, char *argv[]) case 'a': case 't': break; + case 's': + printf("running\n"); + break; case 'h': printf("--action - execute an action"); break; From 399d7bdcbb94bdcbbca2ec7bef881cdfd6c9f404 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 17 Nov 2024 16:10:20 +0000 Subject: [PATCH 033/257] compel: fix gitignore and remove autogenerated code We don't need to have compel/arch/riscv64/plugins/std/syscalls/syscalls.S tracked in git. It is autogenerated. We also need to update our .gitignore to ignore autogenerated files with syscall tables. Signed-off-by: Alexander Mikhalitsyn --- compel/.gitignore | 3 + .../riscv64/plugins/std/syscalls/syscalls.S | 112 ------------------ 2 files changed, 3 insertions(+), 112 deletions(-) delete mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscalls.S diff --git a/compel/.gitignore b/compel/.gitignore index eab3337d6..5e770a86c 100644 --- a/compel/.gitignore +++ b/compel/.gitignore @@ -4,6 +4,9 @@ arch/arm/plugins/std/syscalls/syscalls.S arch/aarch64/plugins/std/syscalls/syscalls.S arch/s390/plugins/std/syscalls/syscalls.S arch/ppc64/plugins/std/syscalls/syscalls.S +arch/mips/plugins/std/syscalls/syscalls-64.S +arch/loongarch64/plugins/std/syscalls/syscalls-64.S +arch/riscv64/plugins/std/syscalls/syscalls.S include/version.h plugins/include/uapi/std/asm/syscall-types.h plugins/include/uapi/std/syscall-64.h diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscalls.S b/compel/arch/riscv64/plugins/std/syscalls/syscalls.S deleted file mode 100644 index 715da4612..000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/syscalls.S +++ /dev/null @@ -1,112 +0,0 @@ -/* Autogenerated, don't edit */ -#include -#include "std/syscalls/syscall-common.S" -syscall sys_read, __NR_read -syscall sys_write, __NR_write -syscall sys_close, __NR_close -syscall sys_lseek, __NR_lseek -syscall sys_mmap, __NR_mmap -syscall sys_mprotect, __NR_mprotect -syscall sys_munmap, __NR_munmap -syscall sys_brk, __NR_brk 
-syscall sys_sigaction, __NR_rt_sigaction -syscall sys_sigprocmask, __NR_rt_sigprocmask -syscall sys_rt_sigreturn, __NR_rt_sigreturn -syscall sys_ioctl, __NR_ioctl -syscall sys_pread64, __NR_pread64 -syscall sys_ptrace, __NR_ptrace -syscall sys_mremap, __NR_mremap -syscall sys_mincore, __NR_mincore -syscall sys_madvise, __NR_madvise -syscall sys_shmat, __NR_shmat -syscall sys_pause, __NR_pause -syscall sys_nanosleep, __NR_nanosleep -syscall sys_getitimer, __NR_getitimer -syscall sys_setitimer, __NR_setitimer -syscall sys_getpid, __NR_getpid -syscall sys_socket, __NR_socket -syscall sys_connect, __NR_connect -syscall sys_sendto, __NR_sendto -syscall sys_recvfrom, __NR_recvfrom -syscall sys_sendmsg, __NR_sendmsg -syscall sys_recvmsg, __NR_recvmsg -syscall sys_shutdown, __NR_shutdown -syscall sys_bind, __NR_bind -syscall sys_setsockopt, __NR_setsockopt -syscall sys_getsockopt, __NR_getsockopt -syscall sys_clone, __NR_clone -syscall sys_exit, __NR_exit -syscall sys_wait4, __NR_wait4 -syscall sys_waitid, __NR_waitid -syscall sys_kill, __NR_kill -syscall sys_fcntl, __NR_fcntl -syscall sys_flock, __NR_flock -syscall sys_readlinkat, __NR_readlinkat -syscall sys_umask, __NR_umask -syscall sys_getgroups, __NR_getgroups -syscall sys_setgroups, __NR_setgroups -syscall sys_setresuid, __NR_setresuid -syscall sys_getresuid, __NR_getresuid -syscall sys_setresgid, __NR_setresgid -syscall sys_getresgid, __NR_getresgid -syscall sys_getpgid, __NR_getpgid -syscall sys_setfsuid, __NR_setfsuid -syscall sys_setfsgid, __NR_setfsgid -syscall sys_getsid, __NR_getsid -syscall sys_capget, __NR_capget -syscall sys_capset, __NR_capset -syscall sys_rt_sigqueueinfo, __NR_rt_sigqueueinfo -syscall sys_setpriority, __NR_setpriority -syscall sys_sched_setscheduler, __NR_sched_setscheduler -syscall sys_sigaltstack, __NR_sigaltstack -syscall sys_personality, __NR_personality -syscall sys_prctl, __NR_prctl -syscall sys_setrlimit, __NR_setrlimit -syscall sys_mount, __NR_mount -syscall sys_umount2, 
__NR_umount2 -syscall sys_gettid, __NR_gettid -syscall sys_futex, __NR_futex -syscall sys_set_tid_address, __NR_set_tid_address -syscall sys_restart_syscall, __NR_restart_syscall -syscall sys_timer_create, __NR_timer_create -syscall sys_timer_settime, __NR_timer_settime -syscall sys_timer_gettime, __NR_timer_gettime -syscall sys_timer_getoverrun, __NR_timer_getoverrun -syscall sys_timer_delete, __NR_timer_delete -syscall sys_clock_gettime, __NR_clock_gettime -syscall sys_exit_group, __NR_exit_group -syscall sys_set_robust_list, __NR_set_robust_list -syscall sys_get_robust_list, __NR_get_robust_list -syscall sys_signalfd4, __NR_signalfd4 -syscall sys_rt_tgsigqueueinfo, __NR_rt_tgsigqueueinfo -syscall sys_vmsplice, __NR_vmsplice -syscall sys_timerfd_settime, __NR_timerfd_settime -syscall sys_fanotify_init, __NR_fanotify_init -syscall sys_fanotify_mark, __NR_fanotify_mark -syscall sys_open_by_handle_at, __NR_open_by_handle_at -syscall sys_setns, __NR_setns -syscall sys_kcmp, __NR_kcmp -syscall sys_openat, __NR_openat -syscall sys_mkdirat, __NR_mkdirat -syscall sys_unlinkat, __NR_unlinkat -syscall sys_memfd_create, __NR_memfd_create -syscall sys_io_setup, __NR_io_setup -syscall sys_io_submit, __NR_io_submit -syscall sys_io_getevents, __NR_io_getevents -syscall sys_seccomp, __NR_seccomp -syscall sys_gettimeofday, __NR_gettimeofday -syscall sys_preadv_raw, __NR_preadv_raw -syscall sys_userfaultfd, __NR_userfaultfd -syscall sys_fallocate, __NR_fallocate -syscall sys_ppoll, __NR_ppoll -syscall sys_fsopen, __NR_fsopen -syscall sys_fsconfig, __NR_fsconfig -syscall sys_fsmount, __NR_fsmount -syscall sys_clone3, __NR_clone3 -syscall sys_pidfd_open, __NR_pidfd_open -syscall sys_pidfd_getfd, __NR_pidfd_getfd -syscall sys_rseq, __NR_rseq -syscall sys_move_mount, __NR_move_mount -syscall sys_open_tree, __NR_open_tree -syscall sys_openat2, __NR_openat2 -#include From 40b7f04b7c0475813d3e4809cfee2b918715f9c9 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 17 Nov 2024 
18:32:03 +0000 Subject: [PATCH 034/257] compel/arch/riscv64: properly implement compel_task_size() We need to dynamically calculate TASK_SIZE depending on the MMU on RISC-V system. [We are using analogical approach on aarch64/ppc64le.] This change was tested on physical machine: StarFive VisionFive 2 isa : rv64imafdc_zicntr_zicsr_zifencei_zihpm_zca_zcd_zba_zbb mmu : sv39 uarch : sifive,u74-mc mvendorid : 0x489 marchid : 0x8000000000000007 mimpid : 0x4210427 hart isa : rv64imafdc_zicntr_zicsr_zifencei_zihpm_zca_zcd_zba_zbb Signed-off-by: Alexander Mikhalitsyn --- compel/arch/riscv64/src/lib/infect.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c index 01395a205..861fe3b2f 100644 --- a/compel/arch/riscv64/src/lib/infect.c +++ b/compel/arch/riscv64/src/lib/infect.c @@ -181,20 +181,22 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) * Task size is the maximum virtual address space size that a process can occupy in the memory * Refer to linux kernel arch/riscv/include/asm/pgtable.h, * task size is: - * - 0x9fc00000 (~2.5GB) for RV32. - * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu - * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu - * - * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V - * Instruction Set Manual Volume II: Privileged Architecture" states that - * "load and store effective addresses, which are 64bits, must have bits - * 63–48 all equal to bit 47, or else a page-fault exception will occur." -*/ -#define TASK_SIZE 0x800000000000UL // hardcoded for SV48 MMU + * - 0x9fc00000 (~2.5GB) for RV32. 
+ * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * - 0x100000000000000 ( 64PB) for RV64 using SV57 mmu + */ +#define TASK_SIZE_MIN (1UL << 38) +#define TASK_SIZE_MAX (1UL << 56) unsigned long compel_task_size(void) { - return TASK_SIZE; + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; } /* From ed560a3491079157b9044d3e14aa522159e9450b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 23 Nov 2024 22:29:45 +0000 Subject: [PATCH 035/257] pidfd: add missing include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix for the following error when building CRIU on Rocky Linux 8 criu/pidfd.c: In function ‘pidfd_open’: criu/pidfd.c:119:17: error: ‘__NR_pidfd_open’ undeclared (first use in this function); did you mean ‘pidfd_open’? return syscall(__NR_pidfd_open, pid, flags); ^~~~~~~~~~~~~~~ pidfd_open criu/pidfd.c:119:17: note: each undeclared identifier is reported only once for each function it appears in criu/pidfd.c:120:1: error: control reaches end of non-void function [-Werror=return-type] } ^ criu/pidfd.c: At top level: cc1: error: unrecognized command line option ‘-Wno-unknown-warning-option’ [-Werror] cc1: error: unrecognized command line option ‘-Wno-dangling-pointer’ [-Werror] cc1: all warnings being treated as errors Signed-off-by: Radostin Stoyanov --- criu/pidfd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/pidfd.c b/criu/pidfd.c index 53b9bcf71..ae32025b0 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -11,6 +11,8 @@ #include "common/bug.h" #include "rst-malloc.h" +#include "compel/plugins/std/syscall-codes.h" + #undef LOG_PREFIX #define LOG_PREFIX "pidfd: " From 8ee2eba47c0c540026311b24af7a74784e370750 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 5 Dec 2024 22:17:38 +0000 Subject: [PATCH 036/257] 
vdso: handle vvar_vclock vma-s The vvar_vclock was introduced by [1]. Basically, the old vvar vma has been split into two parts. In terms of C/R, these two vma-s can still be treated as one. [1] e93d2521b27f ("x86/vdso: Split virtual clock pages into dedicated mapping") Signed-off-by: Andrei Vagin --- criu/include/util-vdso.h | 1 + criu/pie/parasite-vdso.c | 19 ++++++++++++++++++- criu/proc_parse.c | 23 +++++++++++++++++++---- criu/vdso.c | 28 +++++++++++++++++++++------- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/criu/include/util-vdso.h b/criu/include/util-vdso.h index c4386cf8e..9fd9a6de4 100644 --- a/criu/include/util-vdso.h +++ b/criu/include/util-vdso.h @@ -30,6 +30,7 @@ struct vdso_symbol { struct vdso_symtable { unsigned long vdso_size; unsigned long vvar_size; + unsigned long vvar_vclock_size; struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; bool vdso_before_vvar; /* order of vdso/vvar pair */ }; diff --git a/criu/pie/parasite-vdso.c b/criu/pie/parasite-vdso.c index 355007fa9..f3ad3107f 100644 --- a/criu/pie/parasite-vdso.c +++ b/criu/pie/parasite-vdso.c @@ -45,6 +45,7 @@ static int remap_one(char *who, unsigned long *from, unsigned long to, size_t si static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) { unsigned long vvar_size = rt->sym.vvar_size; + unsigned long vvar_vclock_size = rt->sym.vvar_vclock_size; unsigned long vdso_size = rt->sym.vdso_size; int ret; @@ -54,8 +55,24 @@ static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) std_log_set_gettimeofday(NULL); /* stop using vdso for timings */ - if (vvar) + if (vvar) { + /* + * v6.13-rc1~172^2~9 splits the vvar vma in two parts vvar and + * vvar_vclock. The last one is mapped right after the first + * one.
+ */ + if (vvar_vclock_size) { + unsigned long from; + + vvar_size -= vvar_vclock_size; + from = rt->vvar_start + vvar_size; + + ret = remap_one("rt-vvar", &from, vvar + vvar_size, vvar_vclock_size); + if (ret) + return ret; + } ret = remap_one("rt-vvar", &rt->vvar_start, vvar, vvar_size); + } if (!ret) vdso_update_gtod_addr(rt); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index eb869dbbd..be0c3d531 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -579,7 +579,8 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat } else if (!strcmp(file_path, "[vdso]")) { if (handle_vdso_vma(vma_area)) goto err; - } else if (!strcmp(file_path, "[vvar]")) { + } else if (!strcmp(file_path, "[vvar]") || + !strcmp(file_path, "[vvar_vclock]")) { if (handle_vvar_vma(vma_area)) goto err; } else if (!strcmp(file_path, "[heap]")) { @@ -771,7 +772,7 @@ static int task_size_check(pid_t pid, VmaEntry *entry) int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_filemap) { - struct vma_area *vma_area = NULL; + struct vma_area *vma_area = NULL, *prev_vma_area = NULL; unsigned long start, end, pgoff, prev_end = 0; char r, w, x, s; int ret = -1, vm_file_fd = -1; @@ -813,8 +814,22 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du continue; } - if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) - goto err; + if (vma_area && vma_area_is(vma_area, VMA_AREA_VVAR) && + prev_vma_area && vma_area_is(prev_vma_area, VMA_AREA_VVAR)) { + if (prev_vma_area->e->end != vma_area->e->start) { + pr_err("two nonconsecutive vvar vma-s: " + "%" PRIx64 "-%" PRIx64 " %" PRIx64 "-%" PRIx64 "\n", + prev_vma_area->e->start, prev_vma_area->e->end, + vma_area->e->start, vma_area->e->end); + goto err; + } + /* Merge all vvar vma-s into one. 
*/ + prev_vma_area->e->end = vma_area->e->end; + } else { + if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) + goto err; + prev_vma_area = vma_area; + } if (eof) break; diff --git a/criu/vdso.c b/criu/vdso.c index 7de2fae78..d4d351131 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -310,7 +310,7 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) while (1) { unsigned long start, end; - char *has_vdso, *has_vvar; + char *has_vdso, *has_vvar, *has_vvar_vclock; buf = breadline(&f); if (buf == NULL) @@ -318,13 +318,19 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) if (IS_ERR(buf)) goto err; - has_vdso = strstr(buf, "[vdso]"); - if (!has_vdso) + has_vvar = NULL; + has_vvar_vclock = NULL; + do { + has_vdso = strstr(buf, "[vdso]"); + if (has_vdso) + break; has_vvar = strstr(buf, "[vvar]"); - else - has_vvar = NULL; + if (has_vvar) + break; + has_vvar_vclock = strstr(buf, "[vvar_vclock]"); + } while (0); - if (!has_vdso && !has_vvar) + if (!has_vdso && !has_vvar && !has_vvar_vclock) continue; if (sscanf(buf, "%lx-%lx", &start, &end) != 2) { @@ -339,13 +345,21 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) } s->vdso_start = start; s->sym.vdso_size = end - start; - } else { + } else if (has_vvar) { if (s->vvar_start != VVAR_BAD_ADDR) { pr_err("Got second VVAR entry\n"); goto err; } s->vvar_start = start; s->sym.vvar_size = end - start; + } else { + if (s->vvar_start == VDSO_BAD_ADDR || + s->vvar_start + s->sym.vvar_size != start) { + pr_err("VVAR and VVAR_VCLOCK entries are not subsequent\n"); + goto err; + } + s->sym.vvar_vclock_size = end - start; + s->sym.vvar_size += s->sym.vvar_vclock_size; } } From dc6cef0b4cb1c2de60ee0300fa9705835dff0f45 Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Tue, 10 Dec 2024 12:11:57 -0800 Subject: [PATCH 037/257] cuda: Fix return value from CHECKPOINT_DEVICES hook so that dump's fail properly cuda-checkpoint returns the positive CUDA error code when it runs into an 
issue, and passing that along as the return value would cause errors to get ignored. Signed-off-by: Jesus Ramos --- plugins/cuda/cuda_plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 7764cf3c7..e78828b18 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -402,7 +402,7 @@ int cuda_plugin_checkpoint_devices(int pid) interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - return status != 0 ? status : int_ret; + return status != 0 ? -1 : int_ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); From 15c81c12629c516a1e58097de0cee157515e2401 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 7 Dec 2024 00:08:59 +0000 Subject: [PATCH 038/257] test/java: increase the ghost file limit Right now, this test fails with this error: Error (criu/files-reg.c:1031): Can't dump ghost file /criu/test/javaTests/omrvmem_000000626_Mlm48x of 2097152 size, increase limit Signed-off-by: Andrei Vagin --- scripts/build/Dockerfile.openj9-ubuntu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index c2cf20a36..e190c2792 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -24,9 +24,10 @@ RUN apt-install protobuf-c-compiler \ gcc \ maven +RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf COPY .
/criu WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT mvn -f test/javaTests/pom.xml test From a8754905c05a08f560194d90bb53ba86b27577d2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 13 Dec 2024 09:03:42 -0800 Subject: [PATCH 039/257] test: run scm06 in the ns and uns flavors The kernel releases a test socket asynchronously, so the restore can fail if it is executed before the kernel actually destroys the socket. Fixes #2537 Signed-off-by: Andrei Vagin --- test/zdtm/static/scm06.desc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/scm06.desc b/test/zdtm/static/scm06.desc index 2eac7e654..38cc3be51 100644 --- a/test/zdtm/static/scm06.desc +++ b/test/zdtm/static/scm06.desc @@ -1 +1,4 @@ -{'flags': 'suid'} +# This test isn't executed in the host flavor (in the same network namespace, +# because the kernel releases a test socket asynchronously, so the restore +# can fail if it is executed before the kernel actually destroys the socket. +{'flags': 'suid', 'flavor': 'ns uns'} From 99e1fbd8a2cfd6eaf3193062c2925ac80a9893b6 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 16 Dec 2024 16:38:31 -0800 Subject: [PATCH 040/257] criu/seize.c: clang-format it Done using clang-format 19.1.5 with .clang-format obtained via scripts/fetch-clang-format.sh. Signed-off-by: Kir Kolyshkin --- criu/seize.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/criu/seize.c b/criu/seize.c index 9bd1832d9..529fff562 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -87,7 +87,10 @@ static const char frozen[] = "FROZEN"; static const char freezing[] = "FREEZING"; static const char thawed[] = "THAWED"; -enum freezer_state { FREEZER_ERROR = -1, THAWED, FROZEN, FREEZING }; +enum freezer_state { FREEZER_ERROR = -1, + THAWED, + FROZEN, + FREEZING }; /* Track if we are running on cgroup v2 system. 
*/ static bool cgroup_v2 = false; From 82f4ecda6922020346a6a544b53476e0d527d366 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 12 Dec 2024 17:29:34 -0800 Subject: [PATCH 041/257] freeze_processes: fix logic There are a few issues with the freeze_processes logic: 1. Commit 9fae23fbe2 grossly (by 1000x) miscalculated the number of attempts required, as a result, we are seeing something like this: > (00.000340) freezing processes: 100000 attempts with 100 ms steps > (00.000351) freezer.state=THAWED > (00.000358) freezer.state=FREEZING > (00.100446) freezer.state=FREEZING > ...close to 100 lines skipped... > (09.915110) freezer.state=FREEZING > (10.000432) Error (criu/cr-dump.c:1467): Timeout reached. Try to interrupt: 0 > (10.000563) freezer.state=FREEZING For 10s with 100ms steps we only need 100 attempts, not 100000. 2. When the timeout is hit, the "failed to freeze cgroup" error is not printed, and the log_unfrozen_stacks is not called either. 3. The nanosleep at the last iteration is useless (this was hidden by issue 1 above, as the timeout was hit first). Fix all these. While at it, 4. Amend the error message with the number of attempts, sleep duration, and timeout. 5. Modify the "freezing cgroup" debug message to be in sync with the above error. Was: > freezing processes: 100000 attempts with 100 ms steps Now: > freezing cgroup some/name: 100 x 100ms attempts, timeout: 10s Signed-off-by: Kir Kolyshkin --- criu/seize.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index 529fff562..6701446ae 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -545,7 +545,8 @@ static int freeze_processes(void) enum freezer_state state = THAWED; static const unsigned long step_ms = 100; - unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms; + /* Since opts.timeout is in seconds, multiply it by 1000 to convert to milliseconds. 
*/ + unsigned long nr_attempts = (opts.timeout * 1000) / step_ms; unsigned long i = 0; const struct timespec req = { @@ -554,14 +555,12 @@ static int freeze_processes(void) }; if (unlikely(!nr_attempts)) { - /* - * If timeout is turned off, lets - * wait for at least 10 seconds. - */ - nr_attempts = (10 * 1000000) / step_ms; + /* If the timeout is 0, wait for at least 10 seconds. */ + nr_attempts = (10 * 1000) / step_ms; } - pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms); + pr_debug("freezing cgroup %s: %lu x %lums attempts, timeout: %us\n", + opts.freeze_cgroup, nr_attempts, step_ms, opts.timeout); fd = freezer_open(); if (fd < 0) @@ -588,22 +587,22 @@ static int freeze_processes(void) * not read @tasks pids while freezer in * transition stage. */ - for (; i <= nr_attempts; i++) { + while (1) { state = get_freezer_state(fd); if (state == FREEZER_ERROR) { close(fd); return -1; } - if (state == FROZEN) + if (state == FROZEN || i++ == nr_attempts || alarm_timeouted()) break; - if (alarm_timeouted()) - goto err; + nanosleep(&req, NULL); } - if (i > nr_attempts) { - pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup); + if (state != FROZEN) { + pr_err("Unable to freeze cgroup %s (%lu x %lums attempts, timeout: %us)\n", + opts.freeze_cgroup, i, step_ms, opts.timeout); if (!pr_quelled(LOG_DEBUG)) log_unfrozen_stacks(opts.freeze_cgroup); goto err; From 94b9b3c5daf1237493f75d2e538d72d81013c2a2 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 12 Dec 2024 17:34:17 -0800 Subject: [PATCH 042/257] freeze_processes: implement kludges for cgroup v1 Cgroup v1 freezer has always been problematic, failing to freeze a cgroup. In runc, we have implemented a few kludges to increase the chance of succeeding, but those are used when runc freezes a cgroup for its own purposes (for "runc pause" and to modify device properties for cgroup v1). When criu is used, it fails to freeze a cgroup from time to time (see [1], [2]). 
Let's try adding kludges similar to ones in runc. Alas, I have absolutely no way to test this, so please review carefully. [1]: https://github.com/opencontainers/runc/issues/4273 [2]: https://github.com/opencontainers/runc/issues/4457 Signed-off-by: Kir Kolyshkin --- criu/seize.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/criu/seize.c b/criu/seize.c index 6701446ae..829d7c278 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -539,6 +539,34 @@ err: return exit_code; } +static void cgroupv1_freezer_kludges(int fd, int iter, const struct timespec *req) { + /* As per older kernel docs (freezer-subsystem.txt before + * the kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + * userspace should either retry or thaw. While current + * kernel cgroup v1 docs no longer mention a need to retry, + * even recent kernels can't reliably freeze a cgroup v1. + * + * Let's keep asking the kernel to freeze from time to time. + * In addition, do occasional thaw/sleep/freeze. + * + * This is still a game of chances (the real fix belongs to the kernel) + * but these kludges might improve the probability of success. + * + * Cgroup v2 does not have this problem. 
+ */ + switch (iter % 32) { + case 9: + case 20: + freezer_write_state(fd, FROZEN); + break; + case 31: + freezer_write_state(fd, THAWED); + nanosleep(req, NULL); + freezer_write_state(fd, FROZEN); + break; + } +} + static int freeze_processes(void) { int fd, exit_code = -1; @@ -597,6 +625,9 @@ static int freeze_processes(void) if (state == FROZEN || i++ == nr_attempts || alarm_timeouted()) break; + if (!cgroup_v2) + cgroupv1_freezer_kludges(fd, i, &req); + nanosleep(&req, NULL); } From 6f8efad304a4a65bc5b45ec5985fc7ac3763b5ff Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Thu, 19 Dec 2024 08:16:36 +0000 Subject: [PATCH 043/257] cr: Task CapAmb support Signed-off-by: Liu Chao --- criu/cr-restore.c | 2 ++ criu/include/parasite.h | 1 + criu/include/prctl.h | 9 +++++++++ criu/include/proc_parse.h | 1 + criu/include/restorer.h | 1 + criu/parasite-syscall.c | 3 +++ criu/pie/parasite.c | 13 +++++++++++++ criu/pie/restorer.c | 16 ++++++++++++++++ criu/proc_parse.c | 11 +++++++++-- criu/pstree.c | 3 +++ images/creds.proto | 2 ++ 11 files changed, 60 insertions(+), 2 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 646300bdb..ddca6b8ec 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2992,6 +2992,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args->creds.cap_eff = NULL; args->creds.cap_prm = NULL; args->creds.cap_bnd = NULL; + args->creds.cap_amb = NULL; args->creds.groups = NULL; args->creds.lsm_profile = NULL; @@ -2999,6 +3000,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); + copy_caps(args->cap_amb, ce->cap_amb, ce->n_cap_amb); if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 1244220f6..b33d6710f 100644 
--- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -148,6 +148,7 @@ struct parasite_dump_creds { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; int uids[4]; int gids[4]; diff --git a/criu/include/prctl.h b/criu/include/prctl.h index 4c2a548b1..f5f23c969 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -36,6 +36,15 @@ #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #endif +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +#endif +#ifndef PR_CAP_AMBIENT_IS_SET +#define PR_CAP_AMBIENT_IS_SET 1 +#endif +#ifndef PR_CAP_AMBIENT_RAISE +#define PR_CAP_AMBIENT_RAISE 2 +#endif #ifndef PR_SET_MM #define PR_SET_MM 35 diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 0c334a190..0bd79bf55 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -81,6 +81,7 @@ struct proc_status_creds { u32 cap_prm[PROC_CAP_SIZE]; u32 cap_eff[PROC_CAP_SIZE]; u32 cap_bnd[PROC_CAP_SIZE]; + u32 cap_amb[PROC_CAP_SIZE]; }; #define INVALID_UID ((uid_t)-1) diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 3fb5322a4..a4fb7ea79 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -75,6 +75,7 @@ struct thread_creds_args { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; char *lsm_profile; unsigned int *groups; diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index a88f8a66f..6db9d21fe 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -103,16 +103,19 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c BUILD_BUG_ON(sizeof(ce->cap_prm[0]) != sizeof(c->cap_prm[0])); BUILD_BUG_ON(sizeof(ce->cap_eff[0]) != sizeof(c->cap_eff[0])); BUILD_BUG_ON(sizeof(ce->cap_bnd[0]) != sizeof(c->cap_bnd[0])); + BUILD_BUG_ON(sizeof(ce->cap_amb[0]) != sizeof(c->cap_amb[0])); BUG_ON(ce->n_cap_inh != CR_CAP_SIZE); BUG_ON(ce->n_cap_prm != 
CR_CAP_SIZE); BUG_ON(ce->n_cap_eff != CR_CAP_SIZE); BUG_ON(ce->n_cap_bnd != CR_CAP_SIZE); + BUG_ON(ce->n_cap_amb != CR_CAP_SIZE); memcpy(ce->cap_inh, c->cap_inh, sizeof(c->cap_inh[0]) * CR_CAP_SIZE); memcpy(ce->cap_prm, c->cap_prm, sizeof(c->cap_prm[0]) * CR_CAP_SIZE); memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); + memcpy(ce->cap_amb, c->cap_amb, sizeof(c->cap_amb[0]) * CR_CAP_SIZE); if (c->no_new_privs > 0) { ce->no_new_privs = c->no_new_privs; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index e151ed656..1bc03dc2a 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -324,6 +324,7 @@ static int dump_creds(struct parasite_dump_creds *args) args->cap_prm[i] = data[i].prm; args->cap_inh[i] = data[i].inh; args->cap_bnd[i] = 0; + args->cap_amb[i] = 0; for (j = 0; j < 32; j++) { if (j + i * 32 > args->cap_last_cap) @@ -336,6 +337,18 @@ static int dump_creds(struct parasite_dump_creds *args) if (ret) args->cap_bnd[i] |= (1 << j); } + + for (j = 0; j < 32; j++) { + if (j + i * 32 > args->cap_last_cap) + break; + ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, j + i * 32, 0, 0); + if (ret < 0) { + pr_err("Unable to read ambient capability %d: %d\n", j + i * 32, ret); + return -1; + } + if (ret) + args->cap_amb[i] |= (1 << j); + } } args->no_new_privs = sys_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 51ed6ed4c..0a6a7977c 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -347,6 +347,22 @@ skip_xids: return -1; } + for (b = 0; b < CR_CAP_SIZE; b++) { + for (i = 0; i < 32; i++) { + if (b * 32 + i > args->cap_last_cap) + break; + if ((args->cap_amb[b] & (1 << i)) == 0) + /* don't set */ + continue; + ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); + if (!ret) + continue; + pr_err("Unable to raise ambient capability %d: %d\n", i + b * 32, ret); + return -1; + } + } + 
+ if (lsm_type != LSMTYPE__SELINUX) { /* * SELinux does not support setting the process context for diff --git a/criu/proc_parse.c b/criu/proc_parse.c index be0c3d531..99dc518a5 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1071,7 +1071,7 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) if (bfdopenr(&f)) return -1; - while (done < 13) { + while (done < 14) { str = breadline(&f); if (str == NULL) break; @@ -1155,6 +1155,13 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) continue; } + if (!strncmp(str, "CapAmb:", 7)) { + if (cap_parse(str + 8, cr->cap_amb)) + goto err_parse; + done++; + continue; + } + if (!strncmp(str, "Seccomp:", 8)) { if (sscanf(str + 9, "%d", &cr->s.seccomp_mode) != 1) { goto err_parse; @@ -1198,7 +1205,7 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) } /* seccomp and nspids are optional */ - expected_done = (parsed_seccomp ? 12 : 11); + expected_done = (parsed_seccomp ? 13 : 12); if (kdat.has_nspid) expected_done++; if (done == expected_done) diff --git a/criu/pstree.c b/criu/pstree.c index 8c44e7134..41df846ed 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -63,6 +63,7 @@ CoreEntry *core_entry_alloc(int th, int tsk) sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_amb[0]); /* * @groups are dynamic and allocated * on demand. 
@@ -122,10 +123,12 @@ CoreEntry *core_entry_alloc(int th, int tsk) ce->n_cap_prm = CR_CAP_SIZE; ce->n_cap_eff = CR_CAP_SIZE; ce->n_cap_bnd = CR_CAP_SIZE; + ce->n_cap_amb = CR_CAP_SIZE; ce->cap_inh = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_inh[0])); ce->cap_prm = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_prm[0])); ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); + ce->cap_amb = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_amb[0])); if (arch_alloc_thread_info(core)) { xfree(core); diff --git a/images/creds.proto b/images/creds.proto index 220ed3858..932a40ccf 100644 --- a/images/creds.proto +++ b/images/creds.proto @@ -25,4 +25,6 @@ message creds_entry { optional string lsm_sockcreate = 16; optional bytes apparmor_data = 17; optional uint32 no_new_privs = 18; + + repeated uint32 cap_amb = 19; } From 260c08418bf5540477b55b485ddcf9c86652cb62 Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Fri, 3 Jan 2025 03:33:27 +0000 Subject: [PATCH 044/257] zdtm: Check CapAmb is restored correctly after C/R This test sets CapAmb according to CapPrm and CapInh and checks CapAmb after C/R.
Signed-off-by: Liu Chao --- test/zdtm/static/Makefile | 1 + test/zdtm/static/caps01.c | 168 +++++++++++++++++++++++++++++++++++ test/zdtm/static/caps01.desc | 1 + 3 files changed, 170 insertions(+) create mode 100644 test/zdtm/static/caps01.c create mode 100644 test/zdtm/static/caps01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 71a1b6a53..78f96430e 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -8,6 +8,7 @@ TST_NOFILE := \ sleeping00 \ pid00 \ caps00 \ + caps01 \ wait00 \ zombie00 \ zombie01 \ diff --git a/test/zdtm/static/caps01.c b/test/zdtm/static/caps01.c new file mode 100644 index 000000000..0f8a7101e --- /dev/null +++ b/test/zdtm/static/caps01.c @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that CapAmb are preserved"; +const char *test_author = "Liu Chao "; + +struct cap_hdr { + unsigned int version; + int pid; +}; + +struct cap_data { + unsigned int eff; + unsigned int prm; + unsigned int inh; +}; + +#define _LINUX_CAPABILITY_VERSION_3 0x20080522 +#define _LINUX_CAPABILITY_U32S_3 2 +#define CAP_DAC_OVERRIDE 1 +#define PR_CAP_AMBIENT 47 +#define PR_CAP_AMBIENT_IS_SET 1 +#define PR_CAP_AMBIENT_RAISE 2 +#define PR_CAP_AMBIENT_LOWER 3 + +int capget(struct cap_hdr *hdrp, struct cap_data *datap); +int capset(struct cap_hdr *hdrp, const struct cap_data *datap); + +static int cap_last_cap = 63; + +int main(int argc, char **argv) +{ + task_waiter_t t; + int pid, result_pipe[2]; + unsigned int amb[_LINUX_CAPABILITY_U32S_3]; + unsigned int amb_2[_LINUX_CAPABILITY_U32S_3]; + char res = 'x'; + FILE *f; + + test_init(argc, argv); + task_waiter_init(&t); + + f = fopen("/proc/sys/kernel/cap_last_cap", "r"); + if (f) { + if (fscanf(f, "%d", &cap_last_cap) != 1) { + pr_perror("Unable to read cal_last_cap"); + fclose(f); + return 1; + } + fclose(f); + } else + test_msg("/proc/sys/kernel/cap_last_cap is not available\n"); + + if 
(pipe(result_pipe)) { + pr_perror("Can't create pipe"); + return 1; + } + + pid = test_fork(); + if (pid == 0) { + int b, i, ret; + struct cap_hdr hdr; + struct cap_data data[_LINUX_CAPABILITY_U32S_3]; + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + if (capget(&hdr, data) < 0) { + pr_perror("capget"); + return -1; + } + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + data[0].eff &= ~((1 << CAP_CHOWN) | (1 << CAP_DAC_OVERRIDE)); + data[0].prm &= ~(1 << CAP_DAC_OVERRIDE); + data[0].inh = data[0].prm; + data[1].inh = data[1].prm; + + if (capset(&hdr, data) < 0) { + pr_perror("capset"); + return -1; + } + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + amb[b] = data[b].prm; + for (i = 0; i < 32; i++) { + if (b * 32 + i > cap_last_cap) + break; + if ((amb[b] & (1 << i)) > 0) + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); + else + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, i + b * 32, 0, 0); + if (ret) { + pr_perror("Unable to set ambient capability %d to %d: %d", i + b * 32, amb[b] & (1 << i), ret); + return -1; + } + } + } + + task_waiter_complete_current(&t); + task_waiter_wait4(&t, getppid()); + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + amb_2[b] = 0; + for (i = 0; i < 32; i++) { + if (b * 32 + i > cap_last_cap) + break; + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, i + b * 32, 0, 0); + if (ret < 0) { + pr_perror("Unable to read ambient capability %d: %d", i + b * 32, ret); + goto bad; + } + + amb_2[b] |= (ret << i); + } + } + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + if (amb[b] != amb_2[b]) { + res = '1'; + goto bad; + } + } + + res = '0'; + bad: + write(result_pipe[1], &res, 1); + + if (res != '0') { + write(result_pipe[1], amb, sizeof(amb)); + write(result_pipe[1], amb_2, sizeof(amb_2)); + } + + close(result_pipe[0]); + close(result_pipe[1]); + _exit(0); + } + + task_waiter_wait4(&t, pid); + + test_daemon(); + test_waitsig(); + + task_waiter_complete_current(&t); + + 
read(result_pipe[0], &res, 1); + + if (res == '0') + pass(); + else { + read(result_pipe[0], amb, sizeof(amb)); + read(result_pipe[0], amb_2, sizeof(amb_2)); + test_msg("amb[]=%08x, %08x\n", amb[0], amb[1]); + test_msg("amb[]=%08x, %08x\n", amb_2[0], amb_2[1]); + fail("Fail: %c", res); + } + close(result_pipe[0]); + close(result_pipe[1]); + + return 0; +} diff --git a/test/zdtm/static/caps01.desc b/test/zdtm/static/caps01.desc new file mode 100644 index 000000000..2eac7e654 --- /dev/null +++ b/test/zdtm/static/caps01.desc @@ -0,0 +1 @@ +{'flags': 'suid'} From 6dce80c533df445c7926e952387edd59209ff3ee Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 16 Jan 2025 07:52:42 +0000 Subject: [PATCH 045/257] util: added cleanup_file attribute. Signed-off-by: Adrian Reber --- criu/include/util.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/criu/include/util.h b/criu/include/util.h index ae293a68c..4793f7f20 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -406,6 +406,14 @@ static inline void cleanup_freep(void *p) free(*pp); } +#define cleanup_file __attribute__((cleanup(cleanup_filep))) +static inline void cleanup_filep(FILE **f) +{ + FILE *file = *f; + if (file) + (void)fclose(file); +} + extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args); /* From 97398068b16fd24e244deeb6f8b0b52061b46bbd Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 17 Dec 2024 08:52:46 +0100 Subject: [PATCH 046/257] net: redirect nftables stdout and stderr to CRIU's log file When using the nftables network locking backend and restoring a process a second time the network locking has already been deleted by the first restore. The second restore will print out to the console text like: Error: Could not process rule: No such file or directory delete table inet CRIU-202621 With this change CRIU's log FD is used by libnftables stdout and stderr. 
Signed-off-by: Adrian Reber --- criu/net.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/criu/net.c b/criu/net.c index eee331108..efd52db32 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3066,9 +3066,43 @@ err: return ret; } +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) +static inline FILE *redirect_nftables_output(struct nft_ctx *nft) +{ + FILE *fp; + int fd; + + fd = dup(log_get_fd()); + if (fd < 0) { + pr_perror("dup() to redirect nftables output failed"); + return NULL; + } + + fp = fdopen(fd, "w"); + if (!fp) { + pr_perror("fdopen() to redirect nftables output failed"); + return NULL; + } + + /** + * Without setvbuf() the output from libnftables will be + * somewhere in the log file, probably at the end. + * With setvbuf() potential output will be at the correct + * position. + */ + setvbuf(fp, NULL, _IONBF, 0); + + nft_ctx_set_output(nft, fp); + nft_ctx_set_error(nft, fp); + + return fp; +} +#endif + static inline int nftables_lock_network_internal(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + cleanup_file FILE *fp = NULL; struct nft_ctx *nft; int ret = 0; char table[32]; @@ -3081,6 +3115,10 @@ static inline int nftables_lock_network_internal(void) if (!nft) return -1; + fp = redirect_nftables_output(nft); + if (!fp) + goto out; + snprintf(buf, sizeof(buf), "create table %s", table); if (NFT_RUN_CMD(nft, buf)) goto err2; @@ -3168,6 +3206,7 @@ static inline int nftables_network_unlock(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) int ret = 0; + cleanup_file FILE *fp = NULL; struct nft_ctx *nft; char table[32]; char buf[128]; @@ -3179,6 +3218,10 @@ static inline int nftables_network_unlock(void) if (!nft) return -1; + fp = redirect_nftables_output(nft); + if (!fp) + return -1; + snprintf(buf, sizeof(buf), "delete table %s", table); if (NFT_RUN_CMD(nft, buf)) ret = -1; From 
6fdac508186fd645cc0a05a8bf82ea17e4662eb9 Mon Sep 17 00:00:00 2001 From: Yuanhong Peng Date: Thu, 19 Dec 2024 14:30:41 +0800 Subject: [PATCH 047/257] seize: Adjust the position of the log message Based on the code, the `ret` variable at this point does not represent the task state, so this log message should be moved to a position after the `compel_wait_task()` function. Signed-off-by: Yuanhong Peng --- criu/seize.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index 829d7c278..007e8e580 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -707,8 +707,6 @@ static int collect_children(struct pstree_item *item) goto free; } - pr_info("Seized task %d, state %d\n", pid, ret); - c = alloc_pstree_item(); if (c == NULL) { ret = -1; @@ -746,6 +744,8 @@ static int collect_children(struct pstree_item *item) if (ret == TASK_STOPPED) c->pid->stop_signo = compel_parse_stop_signo(pid); + pr_info("Seized task %d, state %d\n", pid, ret); + c->pid->real = pid; c->parent = item; c->pid->state = ret; From 2b74924805730c10d4fea7f6e332c3f09167c628 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 21 Jan 2025 15:05:42 +0100 Subject: [PATCH 048/257] files-reg: fix buffer overflow on aarch64 Running the zdtm/static/unlink_regular00 test on Ubuntu 24.04 on aarch64 results in following error: # ./zdtm.py run -t zdtm/static/unlink_regular00 -k always userns is supported === Run 1/1 ================ zdtm/static/unlink_regular00 ==================== Run zdtm/static/unlink_regular00 in ns ==================== Skipping rtc at root Start test Test is SUID ./unlink_regular00 --pidfile=unlink_regular00.pid --outfile=unlink_regular00.out --dirname=unlink_regular00.test Run criu dump *** buffer overflow detected ***: terminated ############# Test zdtm/static/unlink_regular00 FAIL at CRIU dump ############## Test output: ================================ <<< ================================ Send the 9 signal to 47 Wait for 
zdtm/static/unlink_regular00(47) to die for 0.100000 ##################################### FAIL ##################################### According to the backtrace: #0 __pthread_kill_implementation (threadid=281473158467616, signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44 #1 0x0000ffff93477690 in __pthread_kill_internal (signo=6, threadid=) at ./nptl/pthread_kill.c:78 #2 0x0000ffff9342cb3c in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26 #3 0x0000ffff93417e00 in __GI_abort () at ./stdlib/abort.c:79 #4 0x0000ffff9346abf0 in __libc_message_impl (fmt=fmt@entry=0xffff93552a78 "*** %s ***: terminated\n") at ../sysdeps/posix/libc_fatal.c:132 #5 0x0000ffff934e81a8 in __GI___fortify_fail (msg=msg@entry=0xffff93552a28 "buffer overflow detected") at ./debug/fortify_fail.c:24 #6 0x0000ffff934e79e4 in __GI___chk_fail () at ./debug/chk_fail.c:28 #7 0x0000ffff934e9070 in ___snprintf_chk (s=s@entry=0xffffc6ed04a3 "testfile", maxlen=maxlen@entry=4056, flag=flag@entry=2, slen=slen@entry=4053, format=format@entry=0xaaaacffe3888 "link_remap.%d") at ./debug/snprintf_chk.c:29 #8 0x0000aaaacff4b8b8 in snprintf (__fmt=0xaaaacffe3888 "link_remap.%d", __n=4056, __s=0xffffc6ed04a3 "testfile") at /usr/include/aarch64-linux-gnu/bits/stdio2.h:54 #9 create_link_remap (path=path@entry=0xffffc6ed2901 "/zdtm/static/unlink_regular00.test/subdir/testfile", len=len@entry=60, lfd=lfd@entry=20, idp=idp@entry=0xffffc6ed14ec, nsid=nsid@entry=0xaaaada2bac00, parms=parms@entry=0xffffc6ed2808, fallback=0xaaaacff4c6c0 , fallback@entry=0xffffc6ed2797) at criu/files-reg.c:1164 #10 0x0000aaaacff4c6c0 in dump_linked_remap (path=path@entry=0xffffc6ed2901 "/zdtm/static/unlink_regular00.test/subdir/testfile", len=len@entry=60, parms=parms@entry=0xffffc6ed2808, lfd=lfd@entry=20, id=id@entry=12, nsid=nsid@entry=0xaaaada2bac00, fallback=fallback@entry=0xffffc6ed2797) at criu/files-reg.c:1198 #11 0x0000aaaacff4d8b0 in check_path_remap (nsid=0xaaaada2bac00, id=12, lfd=20, 
parms=0xffffc6ed2808, link=) at criu/files-reg.c:1426 #12 dump_one_reg_file (lfd=20, id=12, p=0xffffc6ed2808) at criu/files-reg.c:1827 #13 0x0000aaaacff51078 in dump_one_file (pid=, fd=4, lfd=20, opts=opts@entry=0xaaaada2ba2c0, ctl=ctl@entry=0xaaaada2c4d50, e=e@entry=0xffffc6ed39c8, dfds=dfds@entry=0xaaaada2c3d40) at criu/files.c:581 #14 0x0000aaaacff5176c in dump_task_files_seized (ctl=ctl@entry=0xaaaada2c4d50, item=item@entry=0xaaaada2b8f80, dfds=dfds@entry=0xaaaada2c3d40) at criu/files.c:657 #15 0x0000aaaacff3d3c0 in dump_one_task (parent_ie=0x0, item=0xaaaada2b8f80) at criu/cr-dump.c:1679 #16 cr_dump_tasks (pid=) at criu/cr-dump.c:2224 #17 0x0000aaaacff163a0 in main (argc=, argv=0xffffc6ed40e8, envp=) at criu/crtools.c:293 This line is the problem: snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); The problem was that the `-1` was on the inside of the braces and not on the outside. This way the destination size was increase by 1 instead of being decreased by 1 which triggered the buffer overflow detection. Signed-off-by: Adrian Reber --- criu/files-reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index fc6149350..66c0e6cda 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1150,7 +1150,7 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i rfe.name = link_name + 1; /* Any 'unique' name works here actually. Remap works by reg-file ids. 
*/ - snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); + snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name) - 1, "link_remap.%d", rfe.id); mntns_root = mntns_get_root_fd(nsid); From aad66a4f7c7affb59d4d59823c35849e1b3421f5 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 20 Jan 2025 19:00:16 +0100 Subject: [PATCH 049/257] test: fix cmdlinenv00 on aarch64 On aarch64 the test cmdlinenv00 was failing with: FAIL: cmdlinenv00.c:120: auxv corrupted on restore (errno = 11 (Resource temporarily unavailable)) Starting with Linux kernel version 6.3 the size of AUXV was changed: commit 28c8e088427ad30b4260953f3b6f908972b77c2d Author: Mathieu Desnoyers Date: Wed Jan 4 14:20:54 2023 -0500 rseq: Increase AT_VECTOR_SIZE_BASE to match rseq auxvec entries Two new auxiliary vector entries are introduced for rseq without matching increment of the AT_VECTOR_SIZE_BASE, which causes failures with CONFIG_HARDENED_USERCOPY=y. Fixes: 317c8194e6ae ("rseq: Introduce feature size and alignment ELF auxiliary vector entries") With this change AT_VECTOR_SIZE increases from 40 to 50 on aarch64. CRIU uses AT_VECTOR_SIZE to read the content of /proc/PID/auxv auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; ret = read(fd, mm_saved_auxv, sizeof(mm_saved_auxv)); Now the test works again on aarch64.
Signed-off-by: Adrian Reber --- criu/arch/aarch64/include/asm/types.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h index 363c1cae2..db118cafd 100644 --- a/criu/arch/aarch64/include/asm/types.h +++ b/criu/arch/aarch64/include/asm/types.h @@ -33,7 +33,16 @@ static inline uint64_t encode_pointer(void *p) return (uint64_t)p; } -#define AT_VECTOR_SIZE 40 +/** + * See also: + * * arch/arm64/include/uapi/asm/auxvec.h + * * include/linux/auxvec.h + * * include/linux/mm_types.h + */ +#define AT_VECTOR_SIZE_BASE 22 +#define AT_VECTOR_SIZE_ARCH 2 +#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) + typedef uint64_t auxv_t; typedef uint64_t tls_t; From 09dc2e9584b70e345fd12402dc3467a806ceb9e8 Mon Sep 17 00:00:00 2001 From: Austin Kuo <104871462+hckuo@users.noreply.github.com> Date: Tue, 7 Jan 2025 04:31:05 +0000 Subject: [PATCH 050/257] timer: Refine itimer_armed logic and improve timer value handling Right now, CRIU skips non-periodic timers. This change addresses this issue. Signed-off-by: Austin Kuo --- criu/pie/restorer.c | 2 +- criu/timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0a6a7977c..6d048c3f1 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -2226,7 +2226,7 @@ __visible long __export_restore_task(struct task_restore_args *args) * code below doesn't fail due to bad timing values.
*/ -#define itimer_armed(args, i) (args->itimers[i].it_interval.tv_sec || args->itimers[i].it_interval.tv_usec) +#define itimer_armed(args, i) (args->itimers[i].it_value.tv_sec || args->itimers[i].it_value.tv_usec) if (itimer_armed(args, 0)) sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL); diff --git a/criu/timer.c b/criu/timer.c index e94cf0280..0413e2a72 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -16,7 +16,7 @@ static inline int timeval_valid(struct timeval *tv) static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) { - if (ie->isec == 0 && ie->iusec == 0) { + if (ie->isec == 0 && ie->iusec == 0 && ie->vsec == 0 && ie->vusec == 0) { memzero_p(val); return 0; } From 061f4266e80dbfc00e5ac58d3f1432e8d833b000 Mon Sep 17 00:00:00 2001 From: Austin Kuo Date: Tue, 21 Jan 2025 12:04:33 -0800 Subject: [PATCH 051/257] test/zdtm: add a new test to check non-periodic timers It creates a few timers with long expiration intervals, waits for C/R and checks that timers are armed and their intervals have been restored.
Signed-off-by: Austin Kuo --- test/zdtm/static/Makefile | 1 + test/zdtm/static/timers01.c | 74 +++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 test/zdtm/static/timers01.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 78f96430e..f72fb2a77 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -24,6 +24,7 @@ TST_NOFILE := \ sse20 \ mprotect00 \ timers \ + timers01 \ timerfd \ unbound_sock \ sched_prio00 \ diff --git a/test/zdtm/static/timers01.c b/test/zdtm/static/timers01.c new file mode 100644 index 000000000..10ecc3481 --- /dev/null +++ b/test/zdtm/static/timers01.c @@ -0,0 +1,74 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks non-periodic timers\n"; +const char *test_author = "Andrei Vagin "; + +static struct { + const int timer_type; + const int signal; + volatile sig_atomic_t count; +} timer_tests[] = { + /* from slowest to fastest */ + { ITIMER_VIRTUAL, SIGVTALRM }, + { ITIMER_PROF, SIGPROF }, + { ITIMER_REAL, SIGALRM }, +}; + +#define NUM_TIMERS (sizeof(timer_tests) / sizeof(timer_tests[0])) +#define TIMER_TIMEOUT 3600 +#define TIMER_ALLOWED_DELTA 300 + +static void setup_timers(void) +{ + int i; + struct itimerval tv = { + .it_interval = { .tv_sec = 0, .tv_usec = 0 }, + .it_value = { .tv_sec = TIMER_TIMEOUT, .tv_usec = 0 }, + }; + + for (i = 0; i < NUM_TIMERS; i++) { + if (setitimer(timer_tests[i].timer_type, &tv, NULL) < 0) { + pr_perror("can't set timer %d", i); + exit(1); + } + } +} + +static void check_timers(void) +{ + int i; + + for (i = 0; i < NUM_TIMERS; i++) { + struct itimerval tv = {}; + + if (getitimer(timer_tests[i].timer_type, &tv)) { + pr_perror("gettimer"); + exit(1); + } + if (tv.it_value.tv_sec > TIMER_TIMEOUT || + tv.it_value.tv_sec < TIMER_TIMEOUT - TIMER_ALLOWED_DELTA) { + fail("%ld isn't in [%d, %d]", (long)tv.it_value.tv_sec, + TIMER_TIMEOUT, + TIMER_TIMEOUT - TIMER_ALLOWED_DELTA); + exit(1); + } + } + 
pass(); +} + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + setup_timers(); + + test_daemon(); + test_waitsig(); + + check_timers(); + return 0; +} From 815ef68848ad79642273026e68761e6680c047a1 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jan 2025 09:27:16 +0100 Subject: [PATCH 052/257] ci: two check-commits.yml changes * Switch to v4 actions/checkout (from v3) * Use our apt wrapper to gracefully handle temporary repository errors Signed-off-by: Adrian Reber --- .github/workflows/check-commits.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index be2fbd285..94861ab52 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -12,14 +12,14 @@ jobs: # Check if pull request does not have label "not-selfcontained-ok" if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Needed to rebase against the base branch fetch-depth: 0 # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo apt-get install -y libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" From 54795f174b606cbe2c134e7be2094ea53be3559d Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 23 Jan 2025 09:26:15 +0000 Subject: [PATCH 053/257] criu: use libuuid for criu_run_id generation criu_run_id will be used in upcoming changes to create and remove network rules for network locking. 
Instead of trying to come up with a way to create unique IDs, just use an existing library. libuuid should be installed on most systems as it is indirectly required by systemd (via libmount). Signed-off-by: Adrian Reber --- .cirrus.yml | 2 +- .github/workflows/check-commits.yml | 2 +- compel/include/uapi/infect-util.h | 11 ++++++++++- compel/src/lib/infect-util.c | 2 +- compel/src/lib/infect.c | 2 +- criu/Makefile.packages | 4 +++- criu/fdstore.c | 2 +- criu/files.c | 2 +- criu/include/util.h | 4 +++- criu/pidfd-store.c | 2 +- criu/unittest/mock.c | 4 +++- criu/util.c | 17 +++++++---------- scripts/build/Dockerfile.alpine | 3 ++- scripts/build/Dockerfile.amd-rocm | 1 + scripts/build/Dockerfile.archlinux | 1 + scripts/build/Dockerfile.hotspot-alpine | 1 + scripts/build/Dockerfile.hotspot-ubuntu | 1 + scripts/build/Dockerfile.linux32.tmpl | 1 + scripts/build/Dockerfile.openj9-ubuntu | 1 + .../build/Dockerfile.riscv64-stable-cross.tmpl | 1 + scripts/build/Dockerfile.stable-cross.tmpl | 1 + scripts/build/Dockerfile.tmpl | 1 + scripts/build/Dockerfile.unstable-cross.tmpl | 1 + scripts/ci/prepare-for-fedora-rawhide.sh | 1 + scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 2 +- 26 files changed, 48 insertions(+), 24 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 5e30ca2c2..a4b53a54b 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel 
libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. dnf -y upgrade nettle nettle-devel diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index 94861ab52..354873909 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -19,7 +19,7 @@ jobs: # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" diff --git a/compel/include/uapi/infect-util.h b/compel/include/uapi/infect-util.h index ace6f6b6b..658df9393 100644 --- a/compel/include/uapi/infect-util.h +++ b/compel/include/uapi/infect-util.h @@ -3,11 +3,20 @@ #include "common/compiler.h" +/** + * The length of the hash is based on what libuuid provides. + * According to the manpage this is: + * + * The uuid_unparse() function converts the supplied UUID uu from the binary + * representation into a 36-byte string (plus trailing '\0') + */ +#define RUN_ID_HASH_LENGTH 37 + /* * compel_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other processes. 
*/ -extern uint64_t compel_run_id; +extern char compel_run_id[RUN_ID_HASH_LENGTH]; struct parasite_ctl; extern int __must_check compel_util_send_fd(struct parasite_ctl *ctl, int fd); diff --git a/compel/src/lib/infect-util.c b/compel/src/lib/infect-util.c index 00a7c83f7..dc57e28f7 100644 --- a/compel/src/lib/infect-util.c +++ b/compel/src/lib/infect-util.c @@ -7,7 +7,7 @@ #include "infect-rpc.h" #include "infect-util.h" -uint64_t compel_run_id; +char compel_run_id[RUN_ID_HASH_LENGTH]; int compel_util_send_fd(struct parasite_ctl *ctl, int fd) { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 1e3ffb967..caf54e03f 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -427,7 +427,7 @@ static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) int sun_len; saddr->sun_family = AF_UNIX; - snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%" PRIx64, key, compel_run_id); + snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%s", key, compel_run_id); sun_len = SUN_LEN(saddr); *saddr->sun_path = '\0'; diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 7f6113c8f..3e2e6efd1 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,6 +6,7 @@ REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel +REQ-RPM-PKG-NAMES += libuuid-devel REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -16,6 +17,7 @@ REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev +REQ-DEB-PKG-NAMES += uuid-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev @@ -25,7 +27,7 @@ REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML -export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet +export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet -luuid check-packages-failed: $(warning Can not find 
some of the required libraries) diff --git a/criu/fdstore.c b/criu/fdstore.c index d615ad15d..6ac639c55 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -58,7 +58,7 @@ int fdstore_init(void) } addr.sun_family = AF_UNIX; - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%" PRIx64, st.st_ino, + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%s", st.st_ino, criu_run_id); addrlen += sizeof(addr.sun_family); diff --git a/criu/files.c b/criu/files.c index 31e705bcc..f16ec32a2 100644 --- a/criu/files.c +++ b/criu/files.c @@ -978,7 +978,7 @@ static int receive_fd(struct fdinfo_list_entry *fle); static void transport_name_gen(struct sockaddr_un *addr, int *len, int pid) { addr->sun_family = AF_UNIX; - snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%" PRIx64, pid, criu_run_id); + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%s", pid, criu_run_id); *len = SUN_LEN(addr); *addr->sun_path = '\0'; } diff --git a/criu/include/util.h b/criu/include/util.h index 4793f7f20..194e94dee 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -21,6 +21,8 @@ #include "log.h" #include "common/err.h" +#include "compel/infect-util.h" + #define PREF_SHIFT_OP(pref, op, size) ((size)op(pref##BYTES_SHIFT)) #define KBYTES_SHIFT 10 #define MBYTES_SHIFT 20 @@ -420,7 +422,7 @@ extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void * criu_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other CRIU processes. 
*/ -extern uint64_t criu_run_id; +extern char criu_run_id[RUN_ID_HASH_LENGTH]; extern void util_init(void); extern char *resolve_mountpoint(char *path); diff --git a/criu/pidfd-store.c b/criu/pidfd-store.c index 9fdc74cb7..110f7802a 100644 --- a/criu/pidfd-store.c +++ b/criu/pidfd-store.c @@ -99,7 +99,7 @@ int init_pidfd_store_sk(pid_t pid, int sk) goto err; } - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d-%" PRIx64, pid, sk, + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d-%s", pid, sk, criu_run_id); addrlen += sizeof(addr.sun_family); diff --git a/criu/unittest/mock.c b/criu/unittest/mock.c index e517720e4..b2d507278 100644 --- a/criu/unittest/mock.c +++ b/criu/unittest/mock.c @@ -5,6 +5,8 @@ #include #include +#include "compel/infect-util.h" + int add_external(char *key) { return 0; @@ -141,4 +143,4 @@ int check_mount_v2(void) return 0; } -uint64_t compel_run_id; +char compel_run_id[RUN_ID_HASH_LENGTH]; diff --git a/criu/util.c b/criu/util.c index d2bc9a865..58c18e20b 100644 --- a/criu/util.c +++ b/criu/util.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "linux/mount.h" @@ -2026,20 +2027,16 @@ int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) return fret; } -uint64_t criu_run_id; +char criu_run_id[RUN_ID_HASH_LENGTH]; void util_init(void) { - struct stat statbuf; + uuid_t uuid; - criu_run_id = getpid(); - if (!stat("/proc/self/ns/pid", &statbuf)) - criu_run_id |= (uint64_t)statbuf.st_ino << 32; - else if (errno != ENOENT) - pr_perror("Can't stat /proc/self/ns/pid - CRIU run id might not be unique"); - - compel_run_id = criu_run_id; - pr_info("CRIU run id = %#" PRIx64 "\n", criu_run_id); + uuid_generate(uuid); + uuid_unparse(uuid, criu_run_id); + pr_info("CRIU run id = %s\n", criu_run_id); + memcpy(compel_run_id, criu_run_id, sizeof(criu_run_id)); } /* diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 
329d7791d..d843793ea 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -24,7 +24,8 @@ RUN apk update && apk add \ sudo \ libcap-utils \ libdrm-dev \ - util-linux + util-linux \ + util-linux-dev COPY . /criu WORKDIR /criu diff --git a/scripts/build/Dockerfile.amd-rocm b/scripts/build/Dockerfile.amd-rocm index c466a73d2..ed66ae4fe 100644 --- a/scripts/build/Dockerfile.amd-rocm +++ b/scripts/build/Dockerfile.amd-rocm @@ -56,6 +56,7 @@ RUN apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-insta python-protobuf \ python3-minimal \ python-ipaddress \ + uuid-dev \ curl \ wget \ vim \ diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index 405651489..9d11194bb 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -35,6 +35,7 @@ RUN pacman -Syu --noconfirm \ python-junit-xml \ python-importlib-metadata \ libdrm \ + util-linux-libs \ diffutils COPY . /criu diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index cb9332fd0..6caf9d0b1 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -19,6 +19,7 @@ RUN apk update && apk add \ maven \ ip6tables \ iptables \ + util-linux-dev \ bash COPY . /criu diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 0318f650f..67de916ac 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -22,6 +22,7 @@ RUN apt-install protobuf-c-compiler \ pkg-config \ iptables \ gcc \ + uuid-dev \ maven COPY . 
/criu diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index 13e992642..d218e0641 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -21,6 +21,7 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ + uuid-dev \ python3-minimal COPY . /criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index e190c2792..0ae4727d2 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -22,6 +22,7 @@ RUN apt-install protobuf-c-compiler \ pkg-config \ iptables \ gcc \ + uuid-dev \ maven RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl index 39a0c33c6..e95a43306 100644 --- a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -33,6 +33,7 @@ RUN apt-get install -y --no-install-recommends \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ libaio-dev:${DEBIAN_ARCH} \ + uuid-dev:${DEBIAN_ARCH} \ libnl-route-3-dev:${DEBIAN_ARCH} \ libnftables-dev:${DEBIAN_ARCH} \ libgnutls28-dev:${DEBIAN_ARCH} \ diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 078372c38..65ae55833 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -18,6 +18,7 @@ RUN apt-install \ libnl-3-dev:${DEBIAN_ARCH} \ libprotobuf-dev:${DEBIAN_ARCH} \ libnet-dev:${DEBIAN_ARCH} \ + uuid-dev:${DEBIAN_ARCH} \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ libaio-dev:${DEBIAN_ARCH} \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 9b53a76aa..3d6de1044 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -29,6 +29,7 @@ RUN apt-install \ 
protobuf-compiler \ python3-minimal \ python3-protobuf \ + uuid-dev \ python3-yaml COPY . /criu diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl index dacfd96ef..3504b0433 100644 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -17,6 +17,7 @@ RUN apt-install \ python3-protobuf \ libnl-3-dev:${DEBIAN_ARCH} \ libprotobuf-dev:${DEBIAN_ARCH} \ + uuid-dev:${DEBIAN_ARCH} \ libnet-dev:${DEBIAN_ARCH} \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 09085c403..42252c93c 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -36,6 +36,7 @@ dnf install -y \ e2fsprogs \ rubygem-asciidoctor \ libdrm-devel \ + libuuid-devel \ kmod # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index b472e954c..611ff7803 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -4,7 +4,7 @@ set -x -e CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time libbsd-dev python3-yaml + libnl-route-3-dev time libbsd-dev python3-yaml uuid-dev libperl-dev pkg-config python3-protobuf python3-pip python3-importlib-metadata python3-junit.xml libdrm-dev) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 3904c51d2..ed5a01178 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -39,7 +39,7 @@ setup() { ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ protobuf-devel python3-protobuf python3-importlib-metadata 
python3-junit_xml \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml + rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default cat /proc/cmdline From f83931542afc3eff5bc02344a522bb8662425d4f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 23 Jan 2025 17:42:45 +0000 Subject: [PATCH 054/257] net: remember the name of the lock chain (nftables) Using libnftables the chain to lock the network is composed of ("CRIU-%d", real_pid). This leads to around 40 zdtm tests failing with errors like this: Error: No such file or directory; did you mean table 'CRIU-62' in family inet? delete table inet CRIU-86 The reason is that as soon as a process is running in a namespace the real PID can be anything and only the PID in the namespace is restored correctly. Relying on the real PID does not work for the chain name. Using the PID of the innermost namespace would lead to the chain being called 'CRIU-1' most of the time which is also not really unique. With this commit the chain is now named using the already existing CRIU run ID. To be able to correctly restore the process and delete the locking table, the CRIU run id during checkpointing is now stored in the inventory as dump_criu_run_id.
Signed-off-by: Adrian Reber --- criu/image.c | 30 ++++++++++++++++++++++++++++++ criu/include/util.h | 2 ++ criu/netfilter.c | 20 +++++++++++++++++++- images/inventory.proto | 4 ++++ 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/criu/image.c b/criu/image.c index 9589167fb..f3747d6ff 100644 --- a/criu/image.c +++ b/criu/image.c @@ -25,6 +25,7 @@ bool img_common_magic = true; TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +char dump_criu_run_id[RUN_ID_HASH_LENGTH]; struct inventory_plugin { struct list_head node; @@ -120,6 +121,24 @@ int check_img_inventory(bool restore) goto out_err; } } + + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. + */ + if (he->dump_criu_run_id) { + strncpy(dump_criu_run_id, he->dump_criu_run_id, sizeof(dump_criu_run_id) - 1); + pr_info("Dump CRIU run id = %s\n", dump_criu_run_id); + } else { + /** + * If restoring from an old image this is a marker + * that no dump_criu_run_id exists. + */ + dump_criu_run_id[0] = NO_DUMP_CRIU_RUN_ID; + } + } ret = 0; @@ -367,6 +386,17 @@ int prepare_inventory(InventoryEntry *he) he->has_network_lock_method = true; he->network_lock_method = opts.network_lock_method; + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. 
+ */ + he->dump_criu_run_id = xstrdup(criu_run_id); + + if (!he->dump_criu_run_id) + return -1; + return 0; } diff --git a/criu/include/util.h b/criu/include/util.h index 194e94dee..55ad5b63c 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -424,6 +424,8 @@ extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void */ extern char criu_run_id[RUN_ID_HASH_LENGTH]; extern void util_init(void); +#define NO_DUMP_CRIU_RUN_ID 0x7f +extern char dump_criu_run_id[RUN_ID_HASH_LENGTH]; extern char *resolve_mountpoint(char *path); diff --git a/criu/netfilter.c b/criu/netfilter.c index 9e78dc4b0..e2c82764f 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -299,7 +299,25 @@ int nftables_lock_connection(struct inet_sk_desc *sk) int nftables_get_table(char *table, int n) { - if (snprintf(table, n, "inet CRIU-%d", root_item->pid->real) < 0) { + int ret; + + switch(dump_criu_run_id[0]) { + case 0: + /* This is not a restore.*/ + ret = snprintf(table, n, "inet CRIU-%s", criu_run_id); + break; + case NO_DUMP_CRIU_RUN_ID: + /** + * This is a restore from an older image with no + * dump_criu_run_id available. Let's use the old ID. + */ + ret = snprintf(table, n, "inet CRIU-%d", root_item->pid->real); + break; + default: + ret = snprintf(table, n, "inet CRIU-%s", dump_criu_run_id); + } + + if (ret < 0) { pr_err("Cannot generate CRIU's nftables table name\n"); return -1; } diff --git a/images/inventory.proto b/images/inventory.proto index 7f655031b..1e18815bb 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -29,4 +29,8 @@ message inventory_entry { optional bool tcp_close = 10; optional uint32 network_lock_method = 11; optional plugins_entry plugins_entry = 12; + // Remember the criu_run_id when CRIU dumped the process. + // This is currently used to delete the correct nftables + // network locking rule. 
+ optional string dump_criu_run_id = 13; } From 02056bf41aaef1522a6d9fae18cd45c3f119ca83 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 14 Dec 2024 21:14:58 +0000 Subject: [PATCH 055/257] cuda: prevent task lockup on timeout error When creating a checkpoint of large models, the `checkpoint` action of `cuda-checkpoint` can exceed the CRIU timeout. This causes CRIU to fail with the following error, leaving the CUDA task in a locked state: cuda_plugin: Checkpointing CUDA devices on pid 84145 restore_tid 84202 Error (criu/cr-dump.c:1791): Timeout reached. Try to interrupt: 0 Error (cuda_plugin.c:139): cuda_plugin: Unable to read output of cuda-checkpoint: Interrupted system call Error (cuda_plugin.c:396): cuda_plugin: CHECKPOINT_DEVICES failed with net: Unlock network cuda_plugin: finished cuda_plugin stage 0 err -1 cuda_plugin: resuming devices on pid 84145 cuda_plugin: Restore thread pid 84202 found for real pid 84145 Unfreezing tasks into 1 Unseizing 84145 into 1 Error (criu/cr-dump.c:2111): Dumping FAILED. To fix this, we set `task_info->checkpointed` before invoking the `checkpoint` action to ensure that the CUDA task is resumed even if CRIU times out. 
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index e78828b18..976ce824c 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -391,14 +391,14 @@ int cuda_plugin_checkpoint_devices(int pid) if (resume_restore_thread(restore_tid, &save_sigset)) { return -1; } + + task_info->checkpointed = 1; status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); goto interrupt; } - task_info->checkpointed = 1; - interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); From 7f0d107fe576e7e0b521c2df6e96fe1501a8e1f6 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 21 Dec 2024 14:17:35 +0000 Subject: [PATCH 056/257] seize: use separate checkpoint_devices function Move `run_plugins(CHECKPOINT_DEVICES)` out of `collect_pstree()` to ensure that the function's sole responsibility is to use the cgroup freezer for the process tree. This allows us to avoid a time-out error when checkpointing applications with large GPU state. v2: This patch calls `checkpoint_devices()` only for `criu dump`. Support for GPU checkpointing with `pre-dump` will be introduced in a separate patch. 
Suggested-by: Andrei Vagin Suggested-by: Jesus Ramos Signed-off-by: Radostin Stoyanov --- criu/cr-dump.c | 3 +++ criu/include/seize.h | 1 + criu/seize.c | 27 ++++++++++++++++++--------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 1bc5d934f..302078caa 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2192,6 +2192,9 @@ int cr_dump_tasks(pid_t pid) if (collect_pstree()) goto err; + if (checkpoint_devices()) + goto err; + if (collect_pstree_ids()) goto err; diff --git a/criu/include/seize.h b/criu/include/seize.h index 64e8d2d12..fc7facad3 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,6 +2,7 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); +extern int checkpoint_devices(void); struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); diff --git a/criu/seize.c b/criu/seize.c index 007e8e580..f56357ac7 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1050,7 +1050,6 @@ int collect_pstree(void) pid_t pid = root_item->pid->real; int ret, exit_code = -1; struct proc_status_creds creds; - struct pstree_item *iter; timing_start(TIME_FREEZING); @@ -1111,14 +1110,6 @@ int collect_pstree(void) goto err; } - for_each_pstree_item(iter) { - if (!task_alive(iter)) - continue; - ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); - if (ret < 0 && ret != -ENOTSUP) - goto err; - } - exit_code = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); @@ -1128,3 +1119,21 @@ err: alarm(0); return exit_code; } + +int checkpoint_devices(void) +{ + struct pstree_item *iter; + int ret, exit_code = -1; + + for_each_pstree_item(iter) { + if (!task_alive(iter)) + continue; + ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } + + exit_code = 0; +err: + return exit_code; +} \ No newline at end of file From 82b03429b71d2334f7444cec80a827b2604cd981 Mon Sep 17 00:00:00 
2001 From: Radostin Stoyanov Date: Wed, 15 Jan 2025 20:54:10 +0000 Subject: [PATCH 057/257] cuda: disable CUDA plugin for pre-dump Temporarily disable CUDA plugin for `criu pre-dump`. pre-dump currently fails with the following error: Handling VMA with the following smaps entry: 1822c000-18da5000 rw-p 00000000 00:00 0 [heap] Handling VMA with the following smaps entry: 200000000-200200000 ---p 00000000 00:00 0 Handling VMA with the following smaps entry: 200200000-200400000 rw-s 00000000 00:06 895 /dev/nvidia0 Error (criu/proc_parse.c:116): handle_device_vma plugin failed: No such file or directory Error (criu/proc_parse.c:632): Can't handle non-regular mapping on 705693's map 200200000 Error (criu/cr-dump.c:1486): Collect mappings (pid: 705693) failed with -1 We plan to enable support for pre-dump by skipping nvidia mappings in a separate patch. Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 976ce824c..99e4caf74 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -565,6 +565,12 @@ int cuda_plugin_init(int stage) { int ret; + /* Disable CUDA checkpointing with pre-dump */ + if (stage == CR_PLUGIN_STAGE__PRE_DUMP) { + plugin_disabled = true; + return 0; + } + if (stage == CR_PLUGIN_STAGE__RESTORE) { if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { plugin_disabled = true; From e2dffcbc8e717c8a837f476c8b9f552821a58753 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 19 Dec 2024 10:33:54 +0000 Subject: [PATCH 058/257] lib: do not set protobuf has_* field too early For two cases libcriu was setting the RPC protobuf field `has_*` before checking if the given parameter is valid. 
This can lead to situations, if the caller doesn't check the return value, that we pass an RPC struct to CRIU which has the `has_*` protobuf field set to true, but does not have a verified value (or none at all) set for the actual RPC entry. Signed-off-by: Adrian Reber --- lib/c/criu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/c/criu.c b/lib/c/criu.c index 7f766db85..c16fe5dcd 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -352,8 +352,8 @@ int criu_set_parent_images(const char *path) int criu_local_set_pre_dump_mode(criu_opts *opts, enum criu_pre_dump_mode mode) { - opts->rpc->has_pre_dump_mode = true; if (mode == CRIU_PRE_DUMP_SPLICE || mode == CRIU_PRE_DUMP_READ) { + opts->rpc->has_pre_dump_mode = true; opts->rpc->pre_dump_mode = (CriuPreDumpMode)mode; return 0; } @@ -1867,8 +1867,8 @@ void criu_set_pidfd_store_sk(int sk) int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method) { - opts->rpc->has_network_lock = true; if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES || method == CRIU_NETWORK_LOCK_SKIP) { + opts->rpc->has_network_lock = true; opts->rpc->network_lock = (CriuNetworkLockMethod)method; return 0; } From d226bd4f670b6a001d1b6809c90495e8710e387a Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 29 Jan 2025 15:19:16 +0000 Subject: [PATCH 059/257] ci: handle results from latest codespell CI pulls in a newer version of codespell. This fixes complaints from that codespell version. 
Signed-off-by: Adrian Reber --- .codespellrc | 2 +- criu/include/rbtree.h | 2 +- criu/include/rst_info.h | 2 +- criu/page-xfer.c | 4 ++-- test/zdtm/static/packet_sock.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.codespellrc b/.codespellrc index dd31dd851..15e6fc7bc 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] skip = ./.git,./test/pki -ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen +ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems diff --git a/criu/include/rbtree.h b/criu/include/rbtree.h index ba0a8100e..6981aa8f9 100644 --- a/criu/include/rbtree.h +++ b/criu/include/rbtree.h @@ -14,7 +14,7 @@ #define RB_MASK 3 struct rb_node { - unsigned long rb_parent_color; /* Keeps both parent anc color */ + unsigned long rb_parent_color; /* Keeps both parent and color */ struct rb_node *rb_right; struct rb_node *rb_left; } __aligned(sizeof(long)); diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 59b891fa2..df9f9de01 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -22,7 +22,7 @@ struct fdt { pid_t pid; /* Who should restore this fd table */ /* * The fd table is ready for restoing, if fdt_lock is equal to nr - * The fdt table was restrored, if fdt_lock is equal to nr + 1 + * The fdt table was restored, if fdt_lock is equal to nr + 1 */ futex_t fdt_lock; }; diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 94f477414..0314963e6 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -1421,7 +1421,7 @@ int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd) if (opts.ps_socket != -1) { ask = opts.ps_socket; - pr_info("Re-using ps socket %d\n", ask); + pr_info("Reusing ps socket %d\n", ask); goto no_server; } @@ -1467,7 +1467,7 @@ static int connect_to_page_server(void) if (opts.ps_socket != -1) { page_server_sk = opts.ps_socket; - pr_info("Re-using ps socket %d\n", page_server_sk); + 
pr_info("Reusing ps socket %d\n", page_server_sk); goto out; } diff --git a/test/zdtm/static/packet_sock.c b/test/zdtm/static/packet_sock.c index 4a9078f81..c1c94ac21 100644 --- a/test/zdtm/static/packet_sock.c +++ b/test/zdtm/static/packet_sock.c @@ -5,7 +5,7 @@ const char *test_author = "Pavel Emelyanov "; /* * Description: - * Create and bind several packet sockets, check thet getname + * Create and bind several packet sockets, check that getname * reports same result before and after c/r cycle. This is enough * for _basic_ packet functionality only, but still. */ From 9c40781c2674ff99fb56b339fabe62301a0f6ea4 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Jan 2025 10:25:24 +0800 Subject: [PATCH 060/257] net/sysctl: put common multiplier outside the brackets Also add an explanation of the logic behind this calculation. Signed-off-by: Pavel Tikhomirov --- criu/net.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/criu/net.c b/criu/net.c index efd52db32..97c53f84f 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2149,10 +2149,16 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) list_for_each_entry(p, &ns->net.ids, node) i++; + /* + * Here we allocate one single big buffer for storing multiple arrays + * of protobuf entries and pointers to entries in it and we later use + * xptr_pull_s to claim a part of this buffer of proper size for each + * particular array. Next we read data from sysctl files to those + * arrays and then finally save them into images. 
+ */ o_buf = buf = xmalloc(i * (sizeof(NetnsId *) + sizeof(NetnsId)) + - size4 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + - size6 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + - sizex * (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); + (size4 * 2 + size6 * 2 + sizex) * + (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); if (!buf) goto out; From 4ca74b9aff301b1db3696077b5bea7e32d466152 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Jan 2025 11:00:28 +0800 Subject: [PATCH 061/257] net/sysctl: c/r ipv4/ping_group_range value It is per net namespace, we need it to allow creation of unprivileged ICMP sockets. Note: in case this sysctl was disabled after unprivileged ICMP socket was created we still need to somehow handle it on restore. Signed-off-by: Pavel Tikhomirov --- criu/net.c | 103 +++++++++++++++++++++++++++++++++++++++++++- images/netdev.proto | 1 + 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 97c53f84f..ee46f1c49 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2128,6 +2128,79 @@ nft_ctx_free_out: } #endif +static const char *ipv4_sysctl_entries[] = { + "ping_group_range", +}; + +#define IPV4_SYSCTL_BASE "net/ipv4" +#define IPV4_SYSCTL_FMT IPV4_SYSCTL_BASE"/%s" +#define MAX_IPV4_SYSCTL_OPT 32 +#define MAX_IPV4_SYSCTL_PATH (sizeof(IPV4_SYSCTL_FMT) + MAX_IPV4_SYSCTL_OPT - 2) +#define MAX_STR_IPV4_SYSCTL_LEN 200 + +static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) +{ + int i, ret = -1, flags = 0; + char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; + struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; + SysctlEntry **sysctl = *rsysctl; + size_t n = *pn; + + if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { + pr_err("unix: Unexpected entries in sysctlig (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + return -EINVAL; + } + + if (opts.weak_sysctls || op == CTL_READ) + flags = CTL_FLAGS_OPTIONAL; + + for (i = 0; i < n; i++) { + snprintf(path[i], 
MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); + req[i].name = path[i]; + req[i].flags = flags; + + switch (sysctl[i]->type) { + case SYSCTL_TYPE__CTL_STR: + req[i].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); + + /* skip write if have no value */ + if (op == CTL_WRITE && !sysctl[i]->sarg) + continue; + + req[i].arg = sysctl[i]->sarg; + break; + default: + pr_err("ipv4: Unknown sysctl type %d\n", sysctl[i]->type); + return -1; + } + } + + ret = sysctl_op(req, n, op, CLONE_NEWNET); + if (ret < 0) { + pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); + return -1; + } + + if (op == CTL_READ) { + bool has_entries = false; + + for (i = 0; i < n; i++) { + if (req[i].flags & CTL_FLAGS_HAS) { + sysctl[i]->has_iarg = true; + if (!has_entries) + has_entries = true; + } + } + + if (!has_entries) { + *pn = 0; + *rsysctl = NULL; + } + } + + return 0; +} + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2142,6 +2215,9 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) int size6 = ARRAY_SIZE(devconfs6); char def_stable_secret[MAX_STR_CONF_LEN + 1] = {}; char all_stable_secret[MAX_STR_CONF_LEN + 1] = {}; + SysctlEntry *ipv4_sysctls = NULL; + size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); + char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; NetnsId *ids; struct netns_id *p; @@ -2157,7 +2233,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) * arrays and then finally save them into images. 
*/ o_buf = buf = xmalloc(i * (sizeof(NetnsId *) + sizeof(NetnsId)) + - (size4 * 2 + size6 * 2 + sizex) * + (2 * size4 + 2 * size6 + sizex + ipv4_sysctl_size) * (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); if (!buf) goto out; @@ -2223,6 +2299,21 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) netns.unix_conf[i]->type = SYSCTL_TYPE__CTL_32; } + netns.n_ipv4_sysctl = ipv4_sysctl_size; + netns.ipv4_sysctl = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry *)); + ipv4_sysctls = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry)); + for (i = 0; i < ipv4_sysctl_size; i++) { + sysctl_entry__init(&ipv4_sysctls[i]); + netns.ipv4_sysctl[i] = &ipv4_sysctls[i]; + if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { + netns.ipv4_sysctl[i]->type = SYSCTL_TYPE__CTL_STR; + netns.ipv4_sysctl[i]->sarg = ping_group_range; + } else { + /* Need to handle this case when we have more sysctls */ + BUG(); + } + } + ret = ipv4_conf_op("default", netns.def_conf4, size4, CTL_READ, NULL); if (ret < 0) goto err_free; @@ -2241,6 +2332,10 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + ret = ipv4_sysctls_op(&netns.ipv4_sysctl, &netns.n_ipv4_sysctl, CTL_READ); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); @@ -2593,6 +2688,12 @@ static int restore_netns_conf(struct ns_id *ns) goto out; } + if ((netns)->ipv4_sysctl) { + ret = ipv4_sysctls_op(&(netns)->ipv4_sysctl, &(netns)->n_ipv4_sysctl, CTL_WRITE); + if (ret) + goto out; + } + ns->net.netns = netns; out: return ret; diff --git a/images/netdev.proto b/images/netdev.proto index 748fd0200..42e2bc7d7 100644 --- a/images/netdev.proto +++ b/images/netdev.proto @@ -74,4 +74,5 @@ message netns_entry { repeated netns_id nsids = 7; optional string ext_key = 8; repeated sysctl_entry unix_conf = 9; + repeated sysctl_entry ipv4_sysctl = 10; } From 
6710cfce10d32a4ebabae61420fa495e42385966 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Jan 2025 11:56:00 +0800 Subject: [PATCH 062/257] zdtm/netns_sub_sysctl: add ipv4/ping_group_range sysctl check Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/sysctl.c | 43 +++++++++++++++++++++ test/zdtm/lib/sysctl.h | 2 + test/zdtm/static/netns_sub_sysctl.c | 58 +++++++++++++++++++++++------ 3 files changed, 91 insertions(+), 12 deletions(-) diff --git a/test/zdtm/lib/sysctl.c b/test/zdtm/lib/sysctl.c index 9583ec3df..3b1ebc168 100644 --- a/test/zdtm/lib/sysctl.c +++ b/test/zdtm/lib/sysctl.c @@ -3,6 +3,49 @@ #include "zdtmtst.h" #include "sysctl.h" +int sysctl_read_str(const char *name, char *data, size_t size) +{ + int fd, ret; + + fd = open(name, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return -1; + } + + ret = read(fd, data, size - 1); + if (ret < 0) { + pr_perror("Can't read %s", name); + close(fd); + return -1; + } + data[ret] = '\0'; + close(fd); + + return 0; +} + +int sysctl_write_str(const char *name, char *data) +{ + int fd, ret; + + fd = open(name, O_WRONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return -1; + } + + ret = write(fd, data, strlen(data)); + if (ret < 0) { + pr_perror("Can't write %s into %s", data, name); + close(fd); + return -1; + } + close(fd); + + return 0; +} + int sysctl_read_int(const char *name, int *data) { int fd; diff --git a/test/zdtm/lib/sysctl.h b/test/zdtm/lib/sysctl.h index 67129102f..d435bd7e9 100644 --- a/test/zdtm/lib/sysctl.h +++ b/test/zdtm/lib/sysctl.h @@ -3,5 +3,7 @@ extern int sysctl_read_int(const char *name, int *data); extern int sysctl_write_int(const char *name, int val); +extern int sysctl_read_str(const char *name, char *data, size_t size); +extern int sysctl_write_str(const char *name, char *data); #endif diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 545a17308..0f94c40a7 100644 --- a/test/zdtm/static/netns_sub_sysctl.c 
+++ b/test/zdtm/static/netns_sub_sysctl.c @@ -3,18 +3,33 @@ #include "zdtmtst.h" #include "sysctl.h" -const char *test_doc = "Check dump and restore a net.unix.max_dgram_qlen sysctl parameter in subns"; +const char *test_doc = "Check dump and restore of sysctls in subns"; const char *test_author = "Alexander Mikhalitsyn "; +#define MAX_STR_SYSCTL_LEN 200 + +enum { + SYSCTL_INT, + SYSCTL_STR, +}; + typedef struct { const char *path; + int type; int old; int new; + char s_old[MAX_STR_SYSCTL_LEN]; + char s_new[MAX_STR_SYSCTL_LEN]; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" +#define IPV4_SYSCTL_BASE "/proc/sys/net/ipv4" -static sysctl_opt_t net_unix_params[] = { { CONF_UNIX_BASE "/max_dgram_qlen", 0, 0 }, { NULL, 0, 0 } }; +static sysctl_opt_t net_unix_params[] = { + {CONF_UNIX_BASE "/max_dgram_qlen", SYSCTL_INT}, + {IPV4_SYSCTL_BASE "/ping_group_range", SYSCTL_STR, 0, 0, "40000\t50000\n"}, + {NULL, 0, 0} +}; int main(int argc, char **argv) { @@ -23,10 +38,17 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { - p->old = (((unsigned)lrand48()) % 1023) + 1; - if (sysctl_write_int(p->path, p->old)) { - pr_perror("Can't change %s", p->path); - return -1; + if (p->type == SYSCTL_INT) { + p->old = (((unsigned)lrand48()) % 1023) + 1; + if (sysctl_write_int(p->path, p->old)) { + pr_perror("Can't change %s", p->path); + return -1; + } + } else if (p->type == SYSCTL_STR) { + if (sysctl_write_str(p->path, p->s_old)) { + pr_perror("Can't change %s", p->path); + return -1; + } } } @@ -34,13 +56,25 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { - if (sysctl_read_int(p->path, &p->new)) - ret = 1; + if (p->type == SYSCTL_INT) { + if (sysctl_read_int(p->path, &p->new)) + ret = 1; - if (p->old != p->new) { - errno = EINVAL; - pr_perror("%s changed: %d ---> %d", p->path, p->old, p->new); - ret = 1; + if (p->old != p->new) { + errno = EINVAL; + pr_perror("%s 
changed: %d ---> %d", p->path, p->old, p->new); + ret = 1; + } + } else if (p->type == SYSCTL_STR) { + if (sysctl_read_str(p->path, p->s_new, MAX_STR_SYSCTL_LEN)) { + ret = 1; + } else { + if (strcmp(p->s_old, p->s_new)) { + errno = EINVAL; + pr_perror("%s changed: %s ---> %s", p->path, p->s_old, p->s_new); + ret = 1; + } + } } } From 8a06ca27cc9ac711faf818b8ac4d061be4d810a8 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 22 Jan 2025 14:35:26 +0100 Subject: [PATCH 063/257] vdso: switch from DT_HASH to DT_GNU_HASH (aarch64) Trying to run latest CRIU on CentOS Stream 10 or Ubuntu 24.04 (aarch64) fails like this: # criu/criu check -v4 [...] (00.096460) vdso: Parsing at ffffb2e2a000 ffffb2e2c000 (00.096539) vdso: PT_LOAD p_vaddr: 0 (00.096567) vdso: DT_STRTAB: 1d0 (00.096592) vdso: DT_SYMTAB: 128 (00.096616) vdso: DT_STRSZ: 8a (00.096640) vdso: DT_SYMENT: 18 (00.096663) Error (criu/pie-util-vdso.c:193): vdso: Not all dynamic entries are present (00.096688) Error (criu/vdso.c:627): vdso: Failed to fill self vdso symtable (00.096713) Error (criu/kerndat.c:1906): kerndat_vdso_fill_symtable failed when initializing kerndat. (00.096812) Found mmap_min_addr 0x10000 (00.096881) files stat: fs/nr_open 1073741816 (00.096908) Error (criu/crtools.c:267): Could not initialize kernel features detection. This seems to be related to the kernel (6.12.0-41.el10.aarch64). The Ubuntu user-space is running in a container on the same kernel. Looking at the kernel this seems to be related to: commit 48f6430505c0b0498ee9020ce3cf9558b1caaaeb Author: Fangrui Song Date: Thu Jul 18 10:34:23 2024 -0700 arm64/vdso: Remove --hash-style=sysv glibc added support for .gnu.hash in 2006 and .hash has been obsoleted for more than one decade in many Linux distributions. Using --hash-style=sysv might imply unaddressed issues and confuse readers. Just drop the option and rely on the linker default, which is likely "both", or "gnu" when the distribution really wants to eliminate sysv hash overhead. 
Similar to commit 6b7e26547fad ("x86/vdso: Emit a GNU hash"). The commit basically does: -ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ +ldflags-y := -shared -soname=linux-vdso.so.1 \ Which results in only a GNU hash being added to the ELF header. This change has been merged with 6.11. Looking at the referenced x86 commit: commit 6b7e26547fad7ace3dcb27a5babd2317fb9d1e12 Author: Andy Lutomirski Date: Thu Aug 6 14:45:45 2015 -0700 x86/vdso: Emit a GNU hash Some dynamic loaders may be slightly faster if a GNU hash is available. Strangely, this seems to have no effect at all on the vdso size. This is unlikely to have any measurable effect on the time it takes to resolve vdso symbols (since there are so few of them). In some contexts, it can be a win for a different reason: if every DSO has a GNU hash section, then libc can avoid calculating SysV hashes at all. Both musl and glibc appear to have this optimization. It's plausible that this breaks some ancient glibc version. If so, then, depending on what glibc versions break, we could either require COMPAT_VDSO for them or consider reverting. Which is also a really simple change: -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \ The big difference here is that for x86 both hash sections are generated. For aarch64 only the newer GNU hash is generated. That is why we only see this error on kernel >= 6.11 and aarch64. Changing from DT_HASH to DT_GNU_HASH seems to work on aarch64. The test suite runs without any errors. Unfortunately I am not aware of all implications of this change and whether a successful test suite run means that it still works. 
Looking at the kernel I see following hash styles for the VDSO: aarch64: not specified (only GNU hash style) arm: --hash-style=sysv loongarch: --hash-style=sysv mips: --hash-style=sysv powerpc: --hash-style=both riscv: --hash-style=both s390: --hash-style=both x86: --hash-style=both Only aarch64 on kernels >= 6.11 is a problem right now, because all other platforms provide the old style hashing. Signed-off-by: Adrian Reber Co-developed-by: Dmitry Safonov Co-authored-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- criu/pie/util-vdso.c | 243 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 197 insertions(+), 46 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index f1e3239ff..9819335d8 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -48,10 +49,25 @@ static bool __ptr_struct_oob(uintptr_t ptr, size_t struct_size, uintptr_t start, return __ptr_oob(ptr, start, size) || __ptr_struct_end_oob(ptr, struct_size, start, size); } +/* Local strlen implementation */ +static size_t __strlen(const char *str) +{ + const char *ptr; + + if (!str) + return 0; + + ptr = str; + while (*ptr != '\0') + ptr++; + + return ptr - str; +} + /* * Elf hash, see format specification. */ -static unsigned long elf_hash(const unsigned char *name) +static unsigned long elf_sysv_hash(const unsigned char *name) { unsigned long h = 0, g; @@ -65,6 +81,15 @@ static unsigned long elf_hash(const unsigned char *name) return h; } +/* * The GNU hash format. Taken from glibc. 
*/ +static unsigned long elf_gnu_hash(const unsigned char *name) +{ + unsigned long h = 5381; + for (unsigned char c = *name; c != '\0'; c = *++name) + h = h * 33 + c; + return h; +} + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define BORD ELFDATA2MSB /* 0x02 */ #else @@ -149,11 +174,14 @@ err_oob: * Output parameters are: * @dyn_strtab - address of the symbol table * @dyn_symtab - address of the string table section - * @dyn_hash - address of the symbol hash table + * @dyn_hash - address of the symbol hash table + * @use_gnu_hash - the format of hash DT_HASH or DT_GNU_HASH */ -static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, - Dyn_t **dyn_hash) +static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, + Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, + Dyn_t **dyn_hash, bool *use_gnu_hash) { + Dyn_t *dyn_gnu_hash = NULL, *dyn_sysv_hash = NULL; Dyn_t *dyn_syment = NULL; Dyn_t *dyn_strsz = NULL; uintptr_t addr; @@ -184,16 +212,52 @@ static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t dyn_syment = d; pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val); } else if (d->d_tag == DT_HASH) { - *dyn_hash = d; + dyn_sysv_hash = d; pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); + } else if (d->d_tag == DT_GNU_HASH) { + /* + * This is complicated. + * + * Looking at the Linux kernel source, the following can be seen + * regarding which hashing style the VDSO uses on each arch: + * + * aarch64: not specified (depends on linker, can be + * only GNU hash style) + * arm: --hash-style=sysv + * loongarch: --hash-style=sysv + * mips: --hash-style=sysv + * powerpc: --hash-style=both + * riscv: --hash-style=both + * s390: --hash-style=both + * x86: --hash-style=both + * + * Some architectures are using both hash-styles, that + * is the easiest for CRIU. Some architectures are only + * using the old style (sysv), that is what CRIU supports. 
+ * + * Starting with Linux 6.11, aarch64 unfortunately decided + * to switch from '--hash-style=sysv' to ''. Specifying + * nothing unfortunately may mean GNU hash style only and not + * 'both' (depending on the linker). + */ + dyn_gnu_hash = d; + pr_debug("DT_GNU_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); } } - if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || !*dyn_hash) { + if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || + (!dyn_gnu_hash && !dyn_sysv_hash)) { pr_err("Not all dynamic entries are present\n"); return -EINVAL; } + /* + * Prefer DT_HASH over DT_GNU_HASH as it's been more tested and + * as a result more stable. + */ + *use_gnu_hash = !dyn_sysv_hash; + *dyn_hash = dyn_sysv_hash ?: dyn_gnu_hash; + return 0; err_oob: @@ -208,60 +272,141 @@ typedef unsigned long Hash_t; typedef Word_t Hash_t; #endif -static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, - uintptr_t dynsymbol_names, Hash_t *hash, Dyn_t *dyn_symtab) +static bool elf_symbol_match(uintptr_t mem, size_t size, + uintptr_t dynsymbol_names, Sym_t *sym, + const char *symbol, const size_t vdso_symbol_length) +{ + uintptr_t addr = (uintptr_t)sym; + char *name; + + if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) + return false; + + if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) + return false; + + addr = dynsymbol_names + sym->st_name; + if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) + return false; + name = (void *)addr; + + return !std_strncmp(name, symbol, vdso_symbol_length); +} + + +static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, + const char *symbol, uint32_t symbol_hash, unsigned int sym_off, + uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, + Hash_t nbucket, Hash_t nchain, Hash_t *bucket, Hash_t *chain, + const size_t vdso_symbol_length, bool use_gnu_hash) +{ + unsigned int j; + uintptr_t addr; + + j = bucket[symbol_hash % 
nbucket]; + if (j == STN_UNDEF) + return 0; + + addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; + + if (use_gnu_hash) { + uint32_t *h = bucket + nbucket + (j - sym_off); + uint32_t hash_val; + + symbol_hash |= 1; + do { + Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; + + hash_val = *h++; + if ((hash_val | 1) == symbol_hash && + elf_symbol_match(mem, size, dynsymbol_names, sym, + symbol, vdso_symbol_length)) + return sym->st_value; + j++; + } while (!(hash_val & 1)); + } else { + for (; j < nchain && j != STN_UNDEF; j = chain[j]) { + Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; + + if (elf_symbol_match(mem, size, dynsymbol_names, sym, + symbol, vdso_symbol_length)) + return sym->st_value; + } + } + return 0; +} + +static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, + struct vdso_symtable *t, uintptr_t dynsymbol_names, + Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash) { ARCH_VDSO_SYMBOLS_LIST const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1; - Hash_t nbucket, nchain; - Hash_t *bucket, *chain; + Hash_t *bucket = NULL; + Hash_t *chain = NULL; + Hash_t nbucket = 0; + Hash_t nchain = 0; - unsigned int i, j, k; - uintptr_t addr; + unsigned int sym_off = 0; + unsigned int i = 0; - nbucket = hash[0]; - nchain = hash[1]; - bucket = &hash[2]; - chain = &hash[nbucket + 2]; + unsigned long (*elf_hash)(const unsigned char *); + + if (use_gnu_hash) { + uint32_t *gnu_hash = (uint32_t *)hash; + uint32_t bloom_sz; + size_t *bloom; + + nbucket = gnu_hash[0]; + sym_off = gnu_hash[1]; + bloom_sz = gnu_hash[2]; + bloom = (size_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + elf_hash = &elf_gnu_hash; + pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bloom %lx bucket %lx\n", + (unsigned long)nbucket, (unsigned long)sym_off, + (unsigned long)bloom_sz, (unsigned long)bloom, + (unsigned long)bucket); + } else { + nbucket = hash[0]; + nchain = hash[1]; + bucket = 
&hash[2]; + chain = &hash[nbucket + 2]; + elf_hash = &elf_sysv_hash; + pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", + (unsigned long)nbucket, (unsigned long)nchain, + (unsigned long)bucket, (unsigned long)chain); + } - pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", (long)nbucket, (long)nchain, (unsigned long)bucket, - (unsigned long)chain); for (i = 0; i < VDSO_SYMBOL_MAX; i++) { const char *symbol = vdso_symbols[i]; - k = elf_hash((const unsigned char *)symbol); + unsigned long addr, symbol_hash; + const size_t symbol_length = __strlen(symbol); - for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) { - Sym_t *sym; - char *name; + symbol_hash = elf_hash((const unsigned char *)symbol); + addr = elf_symbol_lookup(mem, size, symbol, symbol_hash, + sym_off, dynsymbol_names, dyn_symtab, load, + nbucket, nchain, bucket, chain, + vdso_symbol_length, use_gnu_hash); + pr_debug("symbol %s at address %lx\n", symbol, addr); + if (!addr) + continue; - addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; - - addr += sizeof(Sym_t) * j; - if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) - continue; - sym = (void *)addr; - - if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) - continue; - - addr = dynsymbol_names + sym->st_name; - if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) - continue; - name = (void *)addr; - - if (std_strncmp(name, symbol, vdso_symbol_length)) - continue; - - /* XXX: provide strncpy() implementation for PIE */ - memcpy(t->symbols[i].name, name, vdso_symbol_length); - t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr; - break; + /* XXX: provide strncpy() implementation for PIE */ + if (symbol_length > vdso_symbol_length) { + pr_err("strlen(%s) %zd, only %zd bytes available\n", + symbol, symbol_length, vdso_symbol_length); + return -EINVAL; } + memcpy(t->symbols[i].name, symbol, symbol_length); + t->symbols[i].offset = addr - load->p_vaddr; } + + 
return 0; } int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) @@ -271,6 +416,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) Dyn_t *dyn_symtab = NULL; Dyn_t *dyn_hash = NULL; Hash_t *hash = NULL; + bool use_gnu_hash; uintptr_t dynsymbol_names; uintptr_t addr; @@ -296,7 +442,8 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) * needed. Note that we're interested in a small set of tags. */ - ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, &dyn_hash); + ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, + &dyn_hash, &use_gnu_hash); if (ret < 0) return ret; @@ -310,7 +457,11 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) goto err_oob; hash = (void *)addr; - parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab); + ret = parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab, + use_gnu_hash); + + if (ret <0) + return ret; return 0; From d66bc349957137113cdeab0ca679898f0b379395 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 31 Jan 2025 14:45:03 -0800 Subject: [PATCH 064/257] Makefile: move codespell options to .codespellrc This way, - Makefile is less cluttered; - one can run codespell from the command line. 
Fixes: fd7e97fcf ("lint: exclude tags file from codespell") Signed-off-by: Kir Kolyshkin --- .codespellrc | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.codespellrc b/.codespellrc index 15e6fc7bc..e91a6d2eb 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -skip = ./.git,./test/pki +skip = ./.git,./test/pki,./tags ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems diff --git a/Makefile b/Makefile index 60b78a074..90908de83 100644 --- a/Makefile +++ b/Makefile @@ -466,7 +466,7 @@ shellcheck: shellcheck -x test/others/action-script/*.sh codespell: - codespell -S tags + codespell lint: ruff shellcheck codespell # Do not append \n to pr_perror, pr_pwarn or fail From 528c94c48b3d25b27a22bd672e700ccf413b5945 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 17:32:48 +0100 Subject: [PATCH 065/257] ci: install gawk for Fedora based tests Currently Fedora rawhide based CI runs fail with: /bin/sh: line 1: awk: command not found Let's install it. Signed-off-by: Adrian Reber --- scripts/ci/prepare-for-fedora-rawhide.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 42252c93c..f8ad9cf97 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -4,6 +4,7 @@ set -e -x dnf install -y \ diffutils \ findutils \ + gawk \ gcc \ git \ gnutls-devel \ From b7fa7d304c12860405f1aacb012bdc5fc23f7636 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 20 Feb 2025 04:31:12 +0000 Subject: [PATCH 066/257] kerndat: run iptables with -n to not resolve service names Resolving service names can be slow and it isn't needed here. 
Fixes #2032 Signed-off-by: Andrei Vagin --- criu/kerndat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index fa1ed21fa..5939005a4 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -646,7 +646,7 @@ static int kerndat_loginuid(void) static int kerndat_iptables_has_xtlocks(void) { int fd; - char *argv[4] = { "sh", "-c", "iptables -w -L", NULL }; + char *argv[4] = { "sh", "-c", "iptables -n -w -L", NULL }; fd = open("/dev/null", O_RDWR); if (fd < 0) { From 030fa4affda75402bf9c2819c7050de27da1a3b0 Mon Sep 17 00:00:00 2001 From: dschervov Date: Wed, 5 Feb 2025 20:04:37 +0300 Subject: [PATCH 067/257] criu: fix internal representation of cgroups hierarchical structure The strstartswith() function is an incorrect choice for finding the parent directory, so change it to the issubpath() function. Signed-off-by: Dmitrii Chervov --- criu/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index fcaed0708..9246be639 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -248,7 +248,7 @@ static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir return EXACT_MATCH; } - if (strstartswith(path, d->path)) { + if (issubpath(path, d->path)) { int ret = find_dir(path, &d->children, rdir); if (ret == NO_MATCH) { *rdir = d; From da90b33a42a071ee1702a84e076a03d733037632 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Thu, 23 Jan 2025 04:07:42 +0530 Subject: [PATCH 068/257] coredump: enable coredump generation on aarch64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add relevant elf header constants and notes for the aarch64 platform to enable coredump generation.
Signed-off-by: समीर सिंह Sameer Singh --- coredump/coredump | 6 +- coredump/criu_coredump/coredump.py | 164 +++++++++++++++++++++-------- coredump/criu_coredump/elf.py | 55 +++++++++- test/others/criu-coredump/test.sh | 5 +- 4 files changed, 178 insertions(+), 52 deletions(-) diff --git a/coredump/coredump b/coredump/coredump index 3fbdafe81..f1027773d 100755 --- a/coredump/coredump +++ b/coredump/coredump @@ -6,6 +6,8 @@ import sys import criu_coredump +PLATFORMS = ["aarch64", "x86_64"] + def coredump(opts): generator = criu_coredump.coredump_generator() @@ -37,8 +39,8 @@ def main(): opts = vars(parser.parse_args()) - if platform.machine() != 'x86_64': - print('ERROR: %s only supported on x86_64' % sys.argv[0]) + if platform.machine() not in PLATFORMS: + print("ERROR: %s is only supported on: %s" % (sys.argv[0], ', '.join(PLATFORMS))) sys.exit(1) try: diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 20ec8e5dc..6bfc462f2 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -31,6 +31,7 @@ import io import sys import ctypes +import platform from pycriu import images from . import elf @@ -130,6 +131,11 @@ class coredump_generator: reg_files = None # reg-files; pagemaps = {} # pagemap by pid; + # thread info key based on the current arch + thread_info_key = {"aarch64": "ti_aarch64", "x86_64": "thread_info"} + + machine = platform.machine() # current arch + def _img_open_and_strip(self, name, single=False, pid=None): """ Load criu image and strip it from magic and redundant list. 
@@ -213,7 +219,7 @@ class coredump_generator: ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT ehdr.e_type = elf.ET_CORE - ehdr.e_machine = elf.EM_X86_64 + ehdr.e_machine = self._get_e_machine() ehdr.e_version = elf.EV_CURRENT ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) @@ -224,6 +230,13 @@ class coredump_generator: return ehdr + def _get_e_machine(self): + """ + Get the e_machine field based on the current architecture. + """ + e_machine_dict = {"aarch64": elf.EM_AARCH64, "x86_64": elf.EM_X86_64} + return e_machine_dict[self.machine] + def _gen_phdrs(self, pid, notes, vmas): """ Generate program headers for process pid. @@ -332,7 +345,7 @@ class coredump_generator: Generate NT_PRSTATUS note for thread tid of process pid. """ core = self.cores[tid] - regs = core["thread_info"]["gpregs"] + regs = self._get_gpregs(core) pstree = self.pstree[pid] prstatus = elf.elf_prstatus() @@ -345,33 +358,7 @@ class coredump_generator: prstatus.pr_pgrp = pstree["pgid"] prstatus.pr_sid = pstree["sid"] - prstatus.pr_reg.r15 = regs["r15"] - prstatus.pr_reg.r14 = regs["r14"] - prstatus.pr_reg.r13 = regs["r13"] - prstatus.pr_reg.r12 = regs["r12"] - prstatus.pr_reg.rbp = regs["bp"] - prstatus.pr_reg.rbx = regs["bx"] - prstatus.pr_reg.r11 = regs["r11"] - prstatus.pr_reg.r10 = regs["r10"] - prstatus.pr_reg.r9 = regs["r9"] - prstatus.pr_reg.r8 = regs["r8"] - prstatus.pr_reg.rax = regs["ax"] - prstatus.pr_reg.rcx = regs["cx"] - prstatus.pr_reg.rdx = regs["dx"] - prstatus.pr_reg.rsi = regs["si"] - prstatus.pr_reg.rdi = regs["di"] - prstatus.pr_reg.orig_rax = regs["orig_ax"] - prstatus.pr_reg.rip = regs["ip"] - prstatus.pr_reg.cs = regs["cs"] - prstatus.pr_reg.eflags = regs["flags"] - prstatus.pr_reg.rsp = regs["sp"] - prstatus.pr_reg.ss = regs["ss"] - prstatus.pr_reg.fs_base = regs["fs_base"] - prstatus.pr_reg.gs_base = regs["gs_base"] - prstatus.pr_reg.ds = regs["ds"] - prstatus.pr_reg.es = regs["es"] - prstatus.pr_reg.fs = regs["fs"] - 
prstatus.pr_reg.gs = regs["gs"] + self._set_pr_regset(prstatus.pr_reg, regs) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -385,28 +372,64 @@ class coredump_generator: return note + def _get_gpregs(self, core): + """ + Get the general purpose registers based on the current architecture. + """ + thread_info_key = self.thread_info_key[self.machine] + thread_info = core[thread_info_key] + + return thread_info["gpregs"] + + def _set_pr_regset(self, pr_reg, regs): + """ + Set the pr_reg struct based on the current architecture. + """ + if self.machine == "aarch64": + pr_reg.regs = (ctypes.c_ulonglong * len(regs["regs"]))(*regs["regs"]) + pr_reg.sp = regs["sp"] + pr_reg.pc = regs["pc"] + pr_reg.pstate = regs["pstate"] + elif self.machine == "x86_64": + pr_reg.r15 = regs["r15"] + pr_reg.r14 = regs["r14"] + pr_reg.r13 = regs["r13"] + pr_reg.r12 = regs["r12"] + pr_reg.rbp = regs["bp"] + pr_reg.rbx = regs["bx"] + pr_reg.r11 = regs["r11"] + pr_reg.r10 = regs["r10"] + pr_reg.r9 = regs["r9"] + pr_reg.r8 = regs["r8"] + pr_reg.rax = regs["ax"] + pr_reg.rcx = regs["cx"] + pr_reg.rdx = regs["dx"] + pr_reg.rsi = regs["si"] + pr_reg.rdi = regs["di"] + pr_reg.orig_rax = regs["orig_ax"] + pr_reg.rip = regs["ip"] + pr_reg.cs = regs["cs"] + pr_reg.eflags = regs["flags"] + pr_reg.rsp = regs["sp"] + pr_reg.ss = regs["ss"] + pr_reg.fs_base = regs["fs_base"] + pr_reg.gs_base = regs["gs_base"] + pr_reg.ds = regs["ds"] + pr_reg.es = regs["es"] + pr_reg.fs = regs["fs"] + pr_reg.gs = regs["gs"] + def _gen_fpregset(self, pid, tid): """ Generate NT_FPREGSET note for thread tid of process pid. 
""" core = self.cores[tid] - regs = core["thread_info"]["fpregs"] + regs = self._get_fpregs(core) fpregset = elf.elf_fpregset_t() ctypes.memset(ctypes.addressof(fpregset), 0, ctypes.sizeof(fpregset)) - fpregset.cwd = regs["cwd"] - fpregset.swd = regs["swd"] - fpregset.ftw = regs["twd"] - fpregset.fop = regs["fop"] - fpregset.rip = regs["rip"] - fpregset.rdp = regs["rdp"] - fpregset.mxcsr = regs["mxcsr"] - fpregset.mxcr_mask = regs["mxcsr_mask"] - fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( - *regs["st_space"]) - fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( - *regs["xmm_space"]) + self._set_fpregset(fpregset, regs) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -420,6 +443,58 @@ class coredump_generator: return note + def _get_fpregs(self, core): + """ + Get the floating point register dictionary based on the current architecture. + """ + fpregs_key_dict = {"aarch64": "fpsimd", "x86_64": "fpregs"} + fpregs_key = fpregs_key_dict[self.machine] + + thread_info_key = self.thread_info_key[self.machine] + + return core[thread_info_key][fpregs_key] + + def _set_fpregset(self, fpregset, regs): + """ + Set the fpregset struct based on the current architecture. + """ + if self.machine == "aarch64": + fpregset.vregs = (ctypes.c_ulonglong * len(regs["vregs"]))(*regs["vregs"]) + fpregset.fpsr = regs["fpsr"] + fpregset.fpcr = regs["fpcr"] + elif self.machine == "x86_64": + fpregset.cwd = regs["cwd"] + fpregset.swd = regs["swd"] + fpregset.ftw = regs["twd"] + fpregset.fop = regs["fop"] + fpregset.rip = regs["rip"] + fpregset.rdp = regs["rdp"] + fpregset.mxcsr = regs["mxcsr"] + fpregset.mxcr_mask = regs["mxcsr_mask"] + fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( + *regs["st_space"]) + fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( + *regs["xmm_space"]) + + def _gen_arm_tls(self, tid): + """ + Generate NT_ARM_TLS note for thread tid of process pid. 
+ """ + core = self.cores[tid] + tls = ctypes.c_ulonglong(core["ti_aarch64"]["tls"]) + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(ctypes.c_ulonglong) + nhdr.n_type = elf.NT_ARM_TLS + + note = elf_note() + note.data = tls + note.owner = b"LINUX" + note.nhdr = nhdr + + return note + def _gen_x86_xstate(self, pid, tid): """ Generate NT_X86_XSTATE note for thread tid of process pid. @@ -593,8 +668,11 @@ class coredump_generator: notes.append(self._gen_prstatus(pid, tid)) notes.append(self._gen_fpregset(pid, tid)) - notes.append(self._gen_x86_xstate(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) + if self.machine == "aarch64": + notes.append(self._gen_arm_tls(tid)) + elif self.machine == "x86_64": + notes.append(self._gen_x86_xstate(pid, tid)) return notes diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index 092b47857..2697fad07 100644 --- a/coredump/criu_coredump/elf.py +++ b/coredump/criu_coredump/elf.py @@ -1,5 +1,8 @@ # Define structures and constants for generating elf file. import ctypes +import platform + +MACHINE = platform.machine() Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; @@ -39,6 +42,7 @@ ET_CORE = 4 # #define ET_CORE 4 /* Core file */ # Legal values for e_machine (architecture). EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ +EM_AARCH64 = 183 # #define EM_AARCH64 183 /* ARM AARCH64 */ # Legal values for e_version (version). 
EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ @@ -119,6 +123,7 @@ NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_ARM_TLS = 0x401 # #define NT_ARM_TLS 0x401 /* ARM TLS register */ class Elf64_Nhdr(ctypes.Structure): # typedef struct @@ -218,7 +223,7 @@ class timeval(ctypes.Structure): # struct timeval ] -class user_regs_struct(ctypes.Structure): # struct user_regs_struct +class x86_64_user_regs_struct(ctypes.Structure): # struct x86_64_user_regs_struct _fields_ = [ ("r15", ctypes.c_ulonglong), # __extension__ unsigned long long int r15; @@ -277,10 +282,31 @@ class user_regs_struct(ctypes.Structure): # struct user_regs_struct ] +class aarch64_user_regs_struct(ctypes.Structure): # struct aarch64_user_regs_struct + _fields_ = [ + ("regs", + ctypes.c_ulonglong * 31), # unsigned long long int regs[31]; + ("sp", + ctypes.c_ulonglong), # unsigned long long int sp; + ("pc", + ctypes.c_ulonglong), # unsigned long long int pc; + ("pstate", + ctypes.c_ulonglong), # unsigned long long int pstate; + ] + + # elf_greg_t = ctypes.c_ulonglong # ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) # elf_gregset_t = elf_greg_t*ELF_NGREG -elf_gregset_t = user_regs_struct +user_regs_dict = { + "aarch64": aarch64_user_regs_struct, + "x86_64": x86_64_user_regs_struct, +} + +try: + elf_gregset_t = user_regs_dict[MACHINE] +except KeyError: + raise ValueError("Current architecture %s is not supported." 
% MACHINE) class elf_prstatus(ctypes.Structure): # struct elf_prstatus @@ -420,7 +446,7 @@ class elf_prpsinfo(ctypes.Structure): # struct elf_prpsinfo ] -class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct +class x86_64_user_fpregs_struct(ctypes.Structure): # struct x86_64_user_fpregs_struct _fields_ = [ # unsigned short int cwd; ("cwd", ctypes.c_ushort), @@ -447,7 +473,28 @@ class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct ] -elf_fpregset_t = user_fpregs_struct +class aarch64_user_fpregs_struct(ctypes.Structure): # struct aarch64_user_fpregs_struct + _fields_ = [ + # unsigned long long int vregs[64]; + ("vregs", ctypes.c_ulonglong * 64), + # unsigned int fpsr; + ("fpsr", ctypes.c_uint), + # unsigned int fpcr; + ("fpcr", ctypes.c_uint), + # unsigned int padding[2]; + ("padding", ctypes.c_uint * 2), + ] + + +user_fpregs_dict = { + "aarch64": aarch64_user_fpregs_struct, + "x86_64": x86_64_user_fpregs_struct, +} + +try: + elf_fpregset_t = user_fpregs_dict[MACHINE] +except KeyError: + raise ValueError("Current architecture %s is not supported." % MACHINE) # siginfo_t related constants. diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index 4399044d7..e0ddce58d 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -45,9 +45,8 @@ function run_test { UNAME_M=$(uname -m) -if [ "$UNAME_M" != "x86_64" ]; then - # the criu-coredump script is only x86_64 aware - echo "criu-coredump only support x86_64. skipping." +if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "x86_64" ]]; then + echo "criu-coredump only supports aarch64 and x86_64. skipping." 
exit 0 fi From 38b9807cd5ef74fa1cd5359b32861e7d94c1897c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Mon, 17 Feb 2025 18:06:10 +0530 Subject: [PATCH 069/257] coredump: enable coredump generation on arm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add relevant elf header constants and notes for the arm platform to enable coredump generation. Signed-off-by: समीर सिंह Sameer Singh --- coredump/coredump | 2 +- coredump/criu_coredump/coredump.py | 124 +++++++++++++++---- coredump/criu_coredump/elf.py | 188 ++++++++++++++++++++++++++++- test/others/criu-coredump/test.sh | 4 +- 4 files changed, 288 insertions(+), 30 deletions(-) diff --git a/coredump/coredump b/coredump/coredump index f1027773d..5b3e6f366 100755 --- a/coredump/coredump +++ b/coredump/coredump @@ -6,7 +6,7 @@ import sys import criu_coredump -PLATFORMS = ["aarch64", "x86_64"] +PLATFORMS = ["aarch64", "armv7l", "x86_64"] def coredump(opts): diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 6bfc462f2..c6a758c8a 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -95,8 +95,13 @@ class coredump: buf.write(b"\0" * (8 - len(note.owner))) buf.write(note.data) - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(self.vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} + + offset = ctypes.sizeof(ehdr[bits]()) + offset += (len(self.vmas) + 1) * ctypes.sizeof(phdr[bits]()) filesz = 0 for note in self.notes: @@ -132,9 +137,18 @@ class coredump_generator: pagemaps = {} # pagemap by pid; # thread info key based on the current arch - thread_info_key = {"aarch64": "ti_aarch64", "x86_64": "thread_info"} + thread_info_key 
= { + "aarch64": "ti_aarch64", + "armv7l": "ti_arm", + "x86_64": "thread_info", + } machine = platform.machine() # current arch + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} # 32 or 64 bits Ehdr + nhdr = {"32bit": elf.Elf32_Nhdr, "64bit": elf.Elf64_Nhdr} # 32 or 64 bits Nhdr + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} # 32 or 64 bits Phdr def _img_open_and_strip(self, name, single=False, pid=None): """ @@ -207,23 +221,30 @@ class coredump_generator: """ Generate elf header for process pid with program headers phdrs. """ - ehdr = elf.Elf64_Ehdr() + ei_class = {"32bit": elf.ELFCLASS32, "64bit": elf.ELFCLASS64} + + ehdr = self.ehdr[self.bits]() ctypes.memset(ctypes.addressof(ehdr), 0, ctypes.sizeof(ehdr)) ehdr.e_ident[elf.EI_MAG0] = elf.ELFMAG0 ehdr.e_ident[elf.EI_MAG1] = elf.ELFMAG1 ehdr.e_ident[elf.EI_MAG2] = elf.ELFMAG2 ehdr.e_ident[elf.EI_MAG3] = elf.ELFMAG3 - ehdr.e_ident[elf.EI_CLASS] = elf.ELFCLASS64 + ehdr.e_ident[elf.EI_CLASS] = ei_class[self.bits] ehdr.e_ident[elf.EI_DATA] = elf.ELFDATA2LSB ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT + if self.machine == "armv7l": + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_ARM + else: + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_NONE + ehdr.e_type = elf.ET_CORE ehdr.e_machine = self._get_e_machine() ehdr.e_version = elf.EV_CURRENT - ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_phentsize = ctypes.sizeof(elf.Elf64_Phdr()) + ehdr.e_phoff = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_ehsize = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_phentsize = ctypes.sizeof(self.phdr[self.bits]()) # FIXME Case len(phdrs) > PN_XNUM should be handled properly. # See fs/binfmt_elf.c from linux kernel. ehdr.e_phnum = len(phdrs) @@ -234,7 +255,11 @@ class coredump_generator: """ Get the e_machine field based on the current architecture. 
""" - e_machine_dict = {"aarch64": elf.EM_AARCH64, "x86_64": elf.EM_X86_64} + e_machine_dict = { + "aarch64": elf.EM_AARCH64, + "armv7l": elf.EM_ARM, + "x86_64": elf.EM_X86_64, + } return e_machine_dict[self.machine] def _gen_phdrs(self, pid, notes, vmas): @@ -243,15 +268,15 @@ class coredump_generator: """ phdrs = [] - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + offset = ctypes.sizeof(self.ehdr[self.bits]()) + offset += (len(vmas) + 1) * ctypes.sizeof(self.phdr[self.bits]()) filesz = 0 for note in notes: filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 # PT_NOTE - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_NOTE phdr.p_offset = offset @@ -271,7 +296,7 @@ class coredump_generator: for vma in vmas: offset += filesz filesz = vma.filesz - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_LOAD phdr.p_align = PAGESIZE @@ -328,7 +353,7 @@ class coredump_generator: prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] prpsinfo.pr_fname = core["tc"]["comm"].encode() - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prpsinfo()) nhdr.n_type = elf.NT_PRPSINFO @@ -360,7 +385,7 @@ class coredump_generator: self._set_pr_regset(prstatus.pr_reg, regs) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prstatus()) nhdr.n_type = elf.NT_PRSTATUS @@ -390,6 +415,25 @@ class coredump_generator: pr_reg.sp = regs["sp"] pr_reg.pc = regs["pc"] pr_reg.pstate = regs["pstate"] + elif self.machine == "armv7l": + pr_reg.r0 = regs["r0"] + pr_reg.r1 = regs["r1"] + pr_reg.r2 = regs["r2"] + pr_reg.r3 = regs["r3"] + pr_reg.r4 = regs["r4"] + pr_reg.r5 = regs["r5"] + pr_reg.r6 = regs["r6"] + pr_reg.r7 = regs["r7"] + pr_reg.r8 
= regs["r8"] + pr_reg.r9 = regs["r9"] + pr_reg.r10 = regs["r10"] + pr_reg.fp = regs["fp"] + pr_reg.ip = regs["ip"] + pr_reg.sp = regs["sp"] + pr_reg.lr = regs["lr"] + pr_reg.pc = regs["pc"] + pr_reg.cpsr = regs["cpsr"] + pr_reg.orig_r0 = regs["orig_r0"] elif self.machine == "x86_64": pr_reg.r15 = regs["r15"] pr_reg.r14 = regs["r14"] @@ -495,6 +539,34 @@ class coredump_generator: return note + def _gen_arm_vfp(self, tid): + """ + Generate NT_ARM_VFP note for thread tid of process pid. + """ + core = self.cores[tid] + fpstate = core["ti_arm"]["fpstate"] + + data = elf.vfp_hard_struct() + ctypes.memset(ctypes.addressof(data), 0, ctypes.sizeof(data)) + + data.vfp_regs = (ctypes.c_uint64 * len(fpstate["vfp_regs"]))(*fpstate["vfp_regs"]) + data.fpexc = fpstate["fpexc"] + data.fpscr = fpstate["fpscr"] + data.fpinst = fpstate["fpinst"] + data.fpinst2 = fpstate["fpinst2"] + + nhdr = elf.Elf32_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(data) + nhdr.n_type = elf.NT_ARM_VFP + + note = elf_note() + note.data = data + note.owner = b"LINUX" + note.nhdr = nhdr + + return note + def _gen_x86_xstate(self, pid, tid): """ Generate NT_X86_XSTATE note for thread tid of process pid. 
@@ -544,7 +616,7 @@ class coredump_generator: # FIXME zeroify everything for now ctypes.memset(ctypes.addressof(siginfo), 0, ctypes.sizeof(siginfo)) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.siginfo_t()) nhdr.n_type = elf.NT_SIGINFO @@ -563,17 +635,22 @@ class coredump_generator: mm = self.mms[pid] num_auxv = len(mm["mm_saved_auxv"]) // 2 - class elf_auxv(ctypes.Structure): + class elf32_auxv(ctypes.Structure): + _fields_ = [("auxv", elf.Elf32_auxv_t * num_auxv)] + + class elf64_auxv(ctypes.Structure): _fields_ = [("auxv", elf.Elf64_auxv_t * num_auxv)] - auxv = elf_auxv() + elf_auxv = {"32bit": elf32_auxv(), "64bit": elf64_auxv()} + + auxv = elf_auxv[self.bits] for i in range(num_auxv): auxv.auxv[i].a_type = mm["mm_saved_auxv"][i] auxv.auxv[i].a_val = mm["mm_saved_auxv"][i + 1] - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 - nhdr.n_descsz = ctypes.sizeof(elf_auxv()) + nhdr.n_descsz = ctypes.sizeof(elf_auxv[self.bits]) nhdr.n_type = elf.NT_AUXV note = elf_note() @@ -650,7 +727,7 @@ class coredump_generator: setattr(data, "file_ofs" + str(i), info.file_ofs) setattr(data, "name" + str(i), info.name.encode()) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 # strlen + 1 nhdr.n_descsz = ctypes.sizeof(elf_files()) @@ -667,10 +744,13 @@ class coredump_generator: notes = [] notes.append(self._gen_prstatus(pid, tid)) - notes.append(self._gen_fpregset(pid, tid)) + if self.machine != "armv7l": + notes.append(self._gen_fpregset(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) if self.machine == "aarch64": notes.append(self._gen_arm_tls(tid)) + elif self.machine == "armv7l": + notes.append(self._gen_arm_vfp(tid)) elif self.machine == "x86_64": notes.append(self._gen_x86_xstate(pid, tid)) diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index 2697fad07..2911f491e 100644 --- a/coredump/criu_coredump/elf.py +++ 
b/coredump/criu_coredump/elf.py @@ -4,13 +4,19 @@ import platform MACHINE = platform.machine() +Elf32_Half = ctypes.c_uint16 # typedef uint16_t Elf32_Half; +Elf32_Word = ctypes.c_uint32 # typedef uint32_t Elf32_Word; +Elf32_Addr = ctypes.c_uint32 # typedef uint32_t Elf32_Addr; +Elf32_Off = ctypes.c_uint32 # typedef uint32_t Elf32_Off; +Elf32_Xword = ctypes.c_uint64 # typedef uint64_t Elf32_Xword; + Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; Elf64_Addr = ctypes.c_uint64 # typedef uint64_t Elf64_Addr; Elf64_Off = ctypes.c_uint64 # typedef uint64_t Elf64_Off; Elf64_Xword = ctypes.c_uint64 # typedef uint64_t Elf64_Xword; -# Elf64_Ehdr related constants. +# Elf_Ehdr related constants. # e_ident size. EI_NIDENT = 16 # #define EI_NIDENT (16) @@ -31,22 +37,50 @@ EI_CLASS = 4 # #define EI_CLASS 4 /* File class byte index EI_DATA = 5 # #define EI_DATA 5 /* Data encoding byte index */ +EI_OSABI = 7 # #define EI_OSABI 7 /* OS ABI identification */ + EI_VERSION = 6 # #define EI_VERSION 6 /* File version byte index */ ELFDATA2LSB = 1 # #define ELFDATA2LSB 1 /* 2's complement, little endian */ +ELFCLASS32 = 1 # #define ELFCLASS32 1 /* 32-bit objects */ ELFCLASS64 = 2 # #define ELFCLASS64 2 /* 64-bit objects */ # Legal values for e_type (object file type). ET_CORE = 4 # #define ET_CORE 4 /* Core file */ # Legal values for e_machine (architecture). +EM_ARM = 40 # #define EM_ARM 40 /* ARM */ EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ EM_AARCH64 = 183 # #define EM_AARCH64 183 /* ARM AARCH64 */ # Legal values for e_version (version). 
EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ +# Legal values for e_osabi +ELFOSABI_NONE = 0 # #define ELFOSABI_NONE 0 /* UNIX System V ABI */ +ELFOSABI_ARM = 97 # #define ELFOSABI_ARM 97 /* ARM */ + + +class Elf32_Ehdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("e_ident", + ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; + ("e_type", Elf32_Half), # Elf32_Half e_type; + ("e_machine", Elf32_Half), # Elf32_Half e_machine; + ("e_version", Elf32_Word), # Elf32_Word e_version; + ("e_entry", Elf32_Addr), # Elf32_Addr e_entry; + ("e_phoff", Elf32_Off), # Elf32_Off e_phoff; + ("e_shoff", Elf32_Off), # Elf32_Off e_shoff; + ("e_flags", Elf32_Word), # Elf32_Word e_flags; + ("e_ehsize", Elf32_Half), # Elf32_Half e_ehsize; + ("e_phentsize", Elf32_Half), # Elf32_Half e_phentsize; + ("e_phnum", Elf32_Half), # Elf32_Half e_phnum; + ("e_shentsize", Elf32_Half), # Elf32_Half e_shentsize; + ("e_shnum", Elf32_Half), # Elf32_Half e_shnum; + ("e_shstrndx", Elf32_Half) # Elf32_Half e_shstrndx; + ] # } Elf32_Ehdr; + class Elf64_Ehdr(ctypes.Structure): # typedef struct _fields_ = [ @@ -68,7 +102,7 @@ class Elf64_Ehdr(ctypes.Structure): # typedef struct ] # } Elf64_Ehdr; -# Elf64_Phdr related constants. +# Elf_Phdr related constants. # Legal values for p_type (segment type). 
PT_LOAD = 1 # #define PT_LOAD 1 /* Loadable program segment */ @@ -80,6 +114,19 @@ PF_W = 1 << 1 # #define PF_W (1 << 1) /* Segment is writable PF_R = 1 << 2 # #define PF_R (1 << 2) /* Segment is readable */ +class Elf32_Phdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("p_type", Elf32_Word), # Elf32_Word p_type; + ("p_offset", Elf32_Off), # Elf32_Off p_offset; + ("p_vaddr", Elf32_Addr), # Elf32_Addr p_vaddr; + ("p_paddr", Elf32_Addr), # Elf32_Addr p_paddr; + ("p_filesz", Elf32_Word), # Elf32_Word p_filesz; + ("p_memsz", Elf32_Word), # Elf32_Word p_memsz; + ("p_flags", Elf32_Word), # Elf32_Word p_flags; + ("p_align", Elf32_Word), # Elf32_Word p_align; + ] # } Elf32_Phdr; + + class Elf64_Phdr(ctypes.Structure): # typedef struct _fields_ = [ ("p_type", Elf64_Word), # Elf64_Word p_type; @@ -93,7 +140,25 @@ class Elf64_Phdr(ctypes.Structure): # typedef struct ] # } Elf64_Phdr; -# Elf64_auxv_t related constants. +# Elf_auxv_t related constants. + + +class _Elf32_auxv_t_U(ctypes.Union): + _fields_ = [("a_val", ctypes.c_uint32)] + + +class Elf32_auxv_t(ctypes.Structure): # typedef struct + _fields_ = [ + ("a_type", + ctypes.c_uint32), # uint32_t a_type; /* Entry type */ + ("a_un", _Elf32_auxv_t_U) # union + + # uint32_t a_val; /* Integer value */ + # /* We use to have pointer elements added here. We cannot do that, + # though, since it does not work when using 32-bit definitions + # on 64-bit platforms and vice versa. */ + # } a_un; + ] # } Elf32_auxv_t; class _Elf64_auxv_t_U(ctypes.Union): @@ -114,7 +179,7 @@ class Elf64_auxv_t(ctypes.Structure): # typedef struct ] # } Elf64_auxv_t; -# Elf64_Nhdr related constants. +# Elf_Nhdr related constants. 
NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ @@ -123,9 +188,24 @@ NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_ARM_VFP = 0x400 # #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ NT_ARM_TLS = 0x401 # #define NT_ARM_TLS 0x401 /* ARM TLS register */ +class Elf32_Nhdr(ctypes.Structure): # typedef struct + _fields_ = [ + ( + "n_namesz", Elf32_Word + ), # Elf32_Word n_namesz; /* Length of the note's name. */ + ( + "n_descsz", Elf32_Word + ), # Elf32_Word n_descsz; /* Length of the note's descriptor. */ + ( + "n_type", Elf32_Word + ), # Elf32_Word n_type; /* Type of the note. */ + ] # } Elf32_Nhdr; + + class Elf64_Nhdr(ctypes.Structure): # typedef struct _fields_ = [ ( @@ -139,7 +219,52 @@ class Elf64_Nhdr(ctypes.Structure): # typedef struct ] # } Elf64_Nhdr; -# Elf64_Shdr related constants. +# Elf_Shdr related constants. 
+ + +class Elf32_Shdr(ctypes.Structure): + _fields_ = [ + ( + # Section name (string tbl index) + "sh_name", Elf32_Word + ), + ( + # Section type + "sh_type", Elf32_Word + ), + ( + # Section flags + "sh_flags", Elf32_Word + ), + ( + # Section virtual addr at execution + "sh_addr", Elf32_Addr + ), + ( + # Section file offset + "sh_offset", Elf32_Off + ), + ( + # Section size in bytes + "sh_size", Elf32_Word + ), + ( + # Link to another section + "sh_link", Elf32_Word + ), + ( + # Additional section information + "sh_info", Elf32_Word + ), + ( + # Section alignment + "sh_addralign", Elf32_Word + ), + ( + # Entry size if section holds table + "sh_entsize", Elf32_Word + ) + ] class Elf64_Shdr(ctypes.Structure): @@ -295,11 +420,53 @@ class aarch64_user_regs_struct(ctypes.Structure): # struct aarch64_user_regs_st ] +class arm_user_regs_struct(ctypes.Structure): # struct arm_user_regs_struct + _fields_ = [ + ("r0", + ctypes.c_ulong), # unsigned ulong int r0; + ("r1", + ctypes.c_ulong), # unsigned ulong int r1; + ("r2", + ctypes.c_ulong), # unsigned ulong int r2; + ("r3", + ctypes.c_ulong), # unsigned ulong int r3; + ("r4", + ctypes.c_ulong), # unsigned ulong int r4; + ("r5", + ctypes.c_ulong), # unsigned ulong int r5; + ("r6", + ctypes.c_ulong), # unsigned ulong int r6; + ("r7", + ctypes.c_ulong), # unsigned ulong int r7; + ("r8", + ctypes.c_ulong), # unsigned ulong int r8; + ("r9", + ctypes.c_ulong), # unsigned ulong int r9; + ("r10", + ctypes.c_ulong), # unsigned ulong int r10; + ("fp", + ctypes.c_ulong), # unsigned ulong int fp; + ("ip", + ctypes.c_ulong), # unsigned ulong int ip; + ("sp", + ctypes.c_ulong), # unsigned ulong int sp; + ("lr", + ctypes.c_ulong), # unsigned ulong int lr; + ("pc", + ctypes.c_ulong), # unsigned ulong int pc; + ("cpsr", + ctypes.c_ulong), # unsigned ulong int cpsr; + ("orig_r0", + ctypes.c_ulong), # unsigned ulong int orig_r0; + ] + + # elf_greg_t = ctypes.c_ulonglong # ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) # 
elf_gregset_t = elf_greg_t*ELF_NGREG user_regs_dict = { "aarch64": aarch64_user_regs_struct, + "armv7l": arm_user_regs_struct, "x86_64": x86_64_user_regs_struct, } @@ -488,6 +655,7 @@ class aarch64_user_fpregs_struct(ctypes.Structure): # struct aarch64_user_fpreg user_fpregs_dict = { "aarch64": aarch64_user_fpregs_struct, + "armv7l": None, "x86_64": x86_64_user_fpregs_struct, } @@ -889,3 +1057,13 @@ class elf_xsave_struct(ctypes.Structure): # struct xsave_struct { # struct ymmh_struct ymmh; ("ymmh", ymmh_struct) ] # } __aligned(FP_MIN_ALIGN_BYTES) __packed; + + +class vfp_hard_struct(ctypes.Structure): # struct vfp_hard_struct { + _fields_ = [ + ("vfp_regs", ctypes.c_ulonglong * 32), # __u64 fpregs[32]; + ("fpexc", ctypes.c_ulong), # __u32 fpexc; + ("fpscr", ctypes.c_ulong), # __u32 fpscr; + ("fpinst", ctypes.c_ulong), # __u32 fpinst; + ("fpinst2", ctypes.c_ulong), # __u32 fpinst2; + ] # }; diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index e0ddce58d..2be82e64c 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -45,8 +45,8 @@ function run_test { UNAME_M=$(uname -m) -if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "x86_64" ]]; then - echo "criu-coredump only supports aarch64 and x86_64. skipping." +if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "armv7l" &&"$UNAME_M" != "x86_64" ]]; then + echo "criu-coredump only supports aarch64 armv7l, and x86_64. skipping." exit 0 fi From c298b51a6989a26611fe3cbccd1f3da64a23eb50 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 3 Mar 2025 15:03:51 +0000 Subject: [PATCH 070/257] scripts/uninstall_module: import signal module With Python 3.13, the `subprocess` module now uses the `posix_spawn()` function [1], which requires the `signal` module to be imported. 
Fixes: #2607 [1] https://docs.python.org/3/whatsnew/3.13.html#subprocess Signed-off-by: Radostin Stoyanov --- scripts/uninstall_module.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/uninstall_module.py b/scripts/uninstall_module.py index 8a9b70892..2da63c800 100755 --- a/scripts/uninstall_module.py +++ b/scripts/uninstall_module.py @@ -10,6 +10,16 @@ import site import subprocess import sys +# With Python 3.13 the subprocess module now uses the `posix_spawn()` +# function which requires loading the `signal` module: +# https://docs.python.org/3/whatsnew/3.13.html#subprocess +# +# We need to load this module here, before PYTHONPATH and sys.path +# have been modified to use the path specified with `--prefix`. +# +# flake8: noqa: F401 +import signal + import importlib_metadata From d35808f5eec67810df70e34b99ec9064c40cbc13 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 16:38:33 +0100 Subject: [PATCH 071/257] ci: update to latest actions for codeql CI job Signed-off-by: Adrian Reber --- .github/workflows/codeql.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 518d9b8ae..88e21d3d1 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -29,22 +29,22 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Packages (cpp) if: ${{ matrix.language == 'cpp' }} run: | sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} queries: +security-and-quality - name: 
Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{ matrix.language }}" From ed6374b48c5923bca53d760ac6f04a2817236407 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 11:07:17 +0100 Subject: [PATCH 072/257] lsm: use the user provided lsm label Currently CRIU has the possibility to specify an LSM label during restore. Unfortunately the information is completely ignored in the case of SELinux. This change selects the lsm label from the user if one is provided; otherwise the label from the checkpoint image is used. Signed-off-by: Adrian Reber --- criu/lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/lsm.c b/criu/lsm.c index d1b73cc79..70b66d42e 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -370,7 +370,7 @@ int render_lsm_profile(char *profile, char **val) case LSMTYPE__APPARMOR: return render_aa_profile(val, profile); case LSMTYPE__SELINUX: - if (asprintf(val, "%s", profile) < 0) { + if (asprintf(val, "%s", opts.lsm_supplied ? opts.lsm_profile : profile) < 0) { *val = NULL; return -1; } From d8555015759724c1e90462105ec21c77f89127ec Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 7 Feb 2025 09:24:19 +0100 Subject: [PATCH 073/257] vdso: Fixes in DT_GNU_HASH handling * DT_GNU_HASH buckets are an array of 32-bit words, whereas DT_HASH entries are 32-bit on most platforms except s390 (where they are 64-bit). * The bloom filter word size differs between 32-bit and 64-bit ELF files. This commit adjusts the code to handle both cases.
Signed-off-by: Andrei Vagin --- criu/pie/util-vdso.c | 57 +++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 9819335d8..af3c08985 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -121,7 +121,8 @@ static int has_elf_identity(Ehdr_t *ehdr) return true; } -static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t **load) +static int parse_elf_phdr(uintptr_t mem, size_t size, + Phdr_t **dynamic, Phdr_t **load, bool *is_32bit) { Ehdr_t *ehdr = (void *)mem; uintptr_t addr; @@ -136,6 +137,8 @@ static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t * if (!has_elf_identity(ehdr)) return -EINVAL; + *is_32bit = ehdr->e_ident[EI_CLASS] != ELFCLASS64; + addr = mem + ehdr->e_phoff; if (__ptr_oob(addr, mem, size)) goto err_oob; @@ -272,6 +275,8 @@ typedef unsigned long Hash_t; typedef Word_t Hash_t; #endif +typedef uint32_t Hash32_t; + static bool elf_symbol_match(uintptr_t mem, size_t size, uintptr_t dynsymbol_names, Sym_t *sym, const char *symbol, const size_t vdso_symbol_length) @@ -297,21 +302,22 @@ static bool elf_symbol_match(uintptr_t mem, size_t size, static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, const char *symbol, uint32_t symbol_hash, unsigned int sym_off, uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, - Hash_t nbucket, Hash_t nchain, Hash_t *bucket, Hash_t *chain, + uint64_t nbucket, uint64_t nchain, void *_bucket, Hash_t *chain, const size_t vdso_symbol_length, bool use_gnu_hash) { unsigned int j; uintptr_t addr; - j = bucket[symbol_hash % nbucket]; - if (j == STN_UNDEF) - return 0; - addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; if (use_gnu_hash) { - uint32_t *h = bucket + nbucket + (j - sym_off); - uint32_t hash_val; + Hash32_t *h, hash_val, *bucket = _bucket; + + j = bucket[symbol_hash % nbucket]; + if (j == STN_UNDEF) + return 0; + + h = bucket + nbucket + (j 
- sym_off); symbol_hash |= 1; do { @@ -325,6 +331,12 @@ static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, j++; } while (!(hash_val & 1)); } else { + Hash_t *bucket = _bucket; + + j = bucket[symbol_hash % nbucket]; + if (j == STN_UNDEF) + return 0; + for (; j < nchain && j != STN_UNDEF; j = chain[j]) { Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; @@ -338,17 +350,17 @@ static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, uintptr_t dynsymbol_names, - Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash) + Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash, + bool is_32bit) { ARCH_VDSO_SYMBOLS_LIST const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1; - Hash_t *bucket = NULL; + void *bucket = NULL; Hash_t *chain = NULL; - Hash_t nbucket = 0; - Hash_t nchain = 0; + uint64_t nbucket, nchain = 0; unsigned int sym_off = 0; unsigned int i = 0; @@ -358,17 +370,23 @@ static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, if (use_gnu_hash) { uint32_t *gnu_hash = (uint32_t *)hash; uint32_t bloom_sz; - size_t *bloom; nbucket = gnu_hash[0]; sym_off = gnu_hash[1]; bloom_sz = gnu_hash[2]; - bloom = (size_t *)&gnu_hash[4]; - bucket = (Hash_t *)(&bloom[bloom_sz]); + if (is_32bit) { + uint32_t *bloom; + bloom = (uint32_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + } else { + uint64_t *bloom; + bloom = (uint64_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + } elf_hash = &elf_gnu_hash; - pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bloom %lx bucket %lx\n", + pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bucket %lx\n", (unsigned long)nbucket, (unsigned long)sym_off, - (unsigned long)bloom_sz, (unsigned long)bloom, + (unsigned long)bloom_sz, (unsigned long)bucket); } else { nbucket = hash[0]; @@ -417,6 +435,7 @@ int 
vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) Dyn_t *dyn_hash = NULL; Hash_t *hash = NULL; bool use_gnu_hash; + bool is_32bit; uintptr_t dynsymbol_names; uintptr_t addr; @@ -427,7 +446,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) /* * We need PT_LOAD and PT_DYNAMIC here. Each once. */ - ret = parse_elf_phdr(mem, size, &dynamic, &load); + ret = parse_elf_phdr(mem, size, &dynamic, &load, &is_32bit); if (ret < 0) return ret; if (!load || !dynamic) { @@ -458,7 +477,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) hash = (void *)addr; ret = parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab, - use_gnu_hash); + use_gnu_hash, is_32bit); if (ret <0) return ret; From 7748b3fe7326f6f987fc9fd0d3fa267800420264 Mon Sep 17 00:00:00 2001 From: Han-Wen Nienhuys Date: Mon, 10 Mar 2025 14:43:24 +0100 Subject: [PATCH 074/257] pstree: print clone flags in error message Signed-off-by: Han-Wen Nienhuys --- criu/pstree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/pstree.c b/criu/pstree.c index 41df846ed..660f1b9d9 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -962,7 +962,7 @@ static int prepare_pstree_kobj_ids(void) * this namespace is either inherited from the * criu or is created for the init task (only) */ - pr_err("Can't restore sub-task in NS\n"); + pr_err("Can't restore sub-task in NS (cflags %lx)\n", cflags); return -1; } } From c5d46d86a8b07b063bca7e2de762f3c3b1f7b364 Mon Sep 17 00:00:00 2001 From: Han-Wen Nienhuys Date: Thu, 13 Mar 2025 08:46:16 +0100 Subject: [PATCH 075/257] restorer: Add a lock around cgroupd communication. Threads are put into cgroups through the cgroupd thread, which communicates with other threads using a socketpair. Previously, each thread received a dup'd copy of the socket, and did the following sendmsg(socket_dup_fd, my_cgroup_set); // wait for ack. 
while (1) { recvmsg(socket_dup_fd, &h, MSG_PEEK); if (h.pid != my_pid) continue; recvmsg(socket_dup_fd, &h, 0); } close(socket_dup_fd); When restoring many threads, many threads would be spinning in the above loop waiting for their PID to appear. In my test-case, restoring a process with a 11.5G heap and 491 threads could take anywhere between 10 seconds and 60 seconds to complete. To avoid the spinning, we drop the loop and MSG_PEEK, and add a lock around the above code. This does not decrease parallelism, as the cgroupd daemon uses a single thread anyway. With the lock in place, the same restore consistently takes around 10 seconds on my machine (Thinkpad P14s, AMD Ryzen 8840HS). There is a similar "daemon" thread for user namespaces. That already is protected with a similar userns_sync_lock in __userns_call(). Fixes #2614 Signed-off-by: Han-Wen Nienhuys --- criu/cr-restore.c | 1 + criu/include/rst_info.h | 1 + criu/pie/restorer.c | 61 ++++++++++++++++++++--------------------- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index ddca6b8ec..e906da0ce 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2329,6 +2329,7 @@ int prepare_task_entries(void) task_entries->nr_helpers = 0; futex_set(&task_entries->start, CR_STATE_FAIL); mutex_init(&task_entries->userns_sync_lock); + mutex_init(&task_entries->cgroupd_sync_lock); mutex_init(&task_entries->last_pid_mutex); return 0; diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index df9f9de01..4c9335a73 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -14,6 +14,7 @@ struct task_entries { futex_t start; atomic_t cr_err; mutex_t userns_sync_lock; + mutex_t cgroupd_sync_lock; mutex_t last_pid_mutex; }; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 6d048c3f1..348ce6659 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -704,9 +704,8 @@ static int send_cg_set(int sk, int cg_set) } /* - * As this socket is 
shared among threads, recvmsg(MSG_PEEK) - * from the socket until getting its own thread id as an - * acknowledge of successful threaded cgroup fixup + * As the cgroupd socket is shared among threads and processes, this + * should be called with task_entries->cgroupd_sync_lock held. */ static int recv_cg_set_restore_ack(int sk) { @@ -719,33 +718,22 @@ static int recv_cg_set_restore_ack(int sk) h.msg_control = cmsg; h.msg_controllen = sizeof(cmsg); - while (1) { - ret = sys_recvmsg(sk, &h, MSG_PEEK); - if (ret < 0) { - pr_err("Unable to peek from cgroupd %d\n", ret); - return -1; - } + ret = sys_recvmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to receive from cgroupd %d\n", ret); + return -1; + } - if (h.msg_controllen != sizeof(cmsg)) { - pr_err("The message from cgroupd is truncated\n"); - return -1; - } + if (h.msg_controllen != sizeof(cmsg)) { + pr_err("The message from cgroupd is truncated\n"); + return -1; + } - ch = CMSG_FIRSTHDR(&h); - cred = (struct ucred *)CMSG_DATA(ch); - if (cred->pid != sys_gettid()) - continue; - - /* - * Actual remove message from recv queue of socket - */ - ret = sys_recvmsg(sk, &h, 0); - if (ret < 0) { - pr_err("Unable to receive from cgroupd %d\n", ret); - return -1; - } - - break; + ch = CMSG_FIRSTHDR(&h); + cred = (struct ucred *)CMSG_DATA(ch); + if (cred->pid != sys_gettid()) { + pr_err("cred pid %d != gettid\n", cred->pid); + return -1; } return 0; } @@ -782,12 +770,21 @@ __visible long __export_restore_thread(struct thread_restore_args *args) rt_sigframe = (void *)&args->mz->rt_sigframe; if (args->cg_set != -1) { + int err = 0; + + mutex_lock(&task_entries_local->cgroupd_sync_lock); + pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set); - if (send_cg_set(args->cgroupd_sk, args->cg_set)) - goto core_restore_end; - if (recv_cg_set_restore_ack(args->cgroupd_sk)) - goto core_restore_end; + + err = send_cg_set(args->cgroupd_sk, args->cg_set); + if (!err) + err = recv_cg_set_restore_ack(args->cgroupd_sk); + + 
mutex_unlock(&task_entries_local->cgroupd_sync_lock); sys_close(args->cgroupd_sk); + + if (err) + goto core_restore_end; } if (restore_thread_common(args)) From 8ae5db37bb01f405ece0a08160a35cd92034e26a Mon Sep 17 00:00:00 2001 From: AV Date: Mon, 3 Mar 2025 19:14:54 +0000 Subject: [PATCH 076/257] arm64: C/R PAC keys PAC stands for Pointer Authentication Code. Each process has 5 PAC keys and a mask of enabled keys. All these properties have to be C/R-ed. As they are per-process properties, we can save/restore them just for one thread. Signed-off-by: Andrei Vagin --- compel/arch/aarch64/src/lib/infect.c | 2 +- compel/arch/arm/src/lib/infect.c | 2 +- compel/arch/loongarch64/src/lib/infect.c | 2 +- compel/arch/mips/src/lib/infect.c | 2 +- compel/arch/ppc64/src/lib/infect.c | 2 +- compel/arch/riscv64/src/lib/infect.c | 2 +- compel/arch/s390/src/lib/infect.c | 2 +- compel/arch/x86/src/lib/infect.c | 2 +- compel/include/uapi/infect.h | 2 +- compel/src/lib/infect.c | 2 +- criu/arch/aarch64/crtools.c | 167 ++++++++++++++++++++++- criu/arch/aarch64/include/asm/dump.h | 2 +- criu/arch/aarch64/include/asm/restore.h | 10 ++ criu/arch/arm/crtools.c | 2 +- criu/arch/arm/include/asm/dump.h | 2 +- criu/arch/loongarch64/crtools.c | 2 +- criu/arch/loongarch64/include/asm/dump.h | 2 +- criu/arch/mips/crtools.c | 2 +- criu/arch/mips/include/asm/dump.h | 2 +- criu/arch/ppc64/crtools.c | 2 +- criu/arch/ppc64/include/asm/dump.h | 2 +- criu/arch/riscv64/crtools.c | 2 +- criu/arch/riscv64/include/asm/dump.h | 2 +- criu/arch/s390/crtools.c | 2 +- criu/arch/s390/include/asm/dump.h | 2 +- criu/arch/x86/crtools.c | 2 +- criu/arch/x86/include/asm/compat.h | 2 + criu/arch/x86/include/asm/dump.h | 2 +- criu/cr-restore.c | 10 ++ criu/include/rst_info.h | 8 ++ images/core-aarch64.proto | 23 ++++ 31 files changed, 244 insertions(+), 26 deletions(-) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 812ba34a3..ec1d0d59e 100644 ---
a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -81,7 +81,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - ret = save(arg, regs, fpsimd); + ret = save(pid, arg, regs, fpsimd); err: return ret; } diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index 8b810a88f..a9fb639e2 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -94,7 +94,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } - ret = save(arg, regs, vfp); + ret = save(pid, arg, regs, vfp); err: return ret; } diff --git a/compel/arch/loongarch64/src/lib/infect.c b/compel/arch/loongarch64/src/lib/infect.c index 8e3c19aff..190c39227 100644 --- a/compel/arch/loongarch64/src/lib/infect.c +++ b/compel/arch/loongarch64/src/lib/infect.c @@ -91,7 +91,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - ret = save(arg, regs, fpregs); + ret = save(pid, arg, regs, fpregs); err: return 0; } diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c index 0e98aaee3..a1d4865cc 100644 --- a/compel/arch/mips/src/lib/infect.c +++ b/compel/arch/mips/src/lib/infect.c @@ -149,7 +149,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct regs->regs[0] = 0; } - ret = save(arg, regs, xs); + ret = save(pid, arg, regs, xs); return ret; } diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 84c2b1d7c..54abd48a4 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -400,7 +400,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct if (ret) return ret; - return save(arg, regs, fpregs); + return save(pid, arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) diff --git 
a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c index 861fe3b2f..3f3a4b7ec 100644 --- a/compel/arch/riscv64/src/lib/infect.c +++ b/compel/arch/riscv64/src/lib/infect.c @@ -92,7 +92,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct return -1; } - ret = save(arg, regs, fpsimd); + ret = save(pid, arg, regs, fpsimd); return ret; } diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 85dfc3a4d..a77b38917 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -348,7 +348,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } /* Call save_task_regs() */ - return save(arg, regs, fpregs); + return save(pid, arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index a07b1c9f3..644c483b4 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -453,7 +453,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; out: - ret = save(arg, regs, xs); + ret = save(pid, arg, regs, xs); err: return ret; } diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 7e6134f4b..ed97d64dd 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -97,7 +97,7 @@ extern k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl); struct rt_sigframe; typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) 
__attribute__((__format__(__printf__, 3, 4))); -typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); +typedef int (*save_regs_t)(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); typedef int (*make_sigframe_t)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); struct infect_ctx { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index caf54e03f..a9bbd6400 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1300,7 +1300,7 @@ struct plain_regs_struct { user_fpregs_struct_t fpregs; }; -static int save_regs_plain(void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) +static int save_regs_plain(pid_t pid, void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) { struct plain_regs_struct *prs = to; diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index e87b8629a..6cde03ee3 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -1,5 +1,6 @@ #include #include +#include #include @@ -20,10 +21,86 @@ #include "cpu.h" #include "restorer.h" #include "compel/infect.h" +#include "pstree.h" + +extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +static int save_pac_keys(int pid, CoreEntry *core) +{ + struct user_pac_address_keys paca; + struct user_pac_generic_keys pacg; + PacKeys *pac_entry; + long pac_enabled_key; + struct iovec iov; + int ret; + + unsigned long hwcaps = getauxval(AT_HWCAP); + + pac_entry = xmalloc(sizeof(PacKeys)); + if (!pac_entry) + return -1; + core->ti_aarch64->pac_keys = pac_entry; + pac_keys__init(pac_entry); + + if (hwcaps & HWCAP_PACA) { + PacAddressKeys *pac_address_keys; + + pr_debug("%d: Dumping address authentication keys\n", pid); + iov.iov_base = &paca; + iov.iov_len = sizeof(paca); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { 
+ pr_perror("Failed to get address authentication key for %d", pid); + return -1; + } + pac_address_keys = xmalloc(sizeof(PacAddressKeys)); + if (!pac_address_keys) + return -1; + pac_address_keys__init(pac_address_keys); + pac_entry->pac_address_keys = pac_address_keys; + pac_address_keys->apiakey_lo = paca.apiakey; + pac_address_keys->apiakey_hi = paca.apiakey >> 64; + pac_address_keys->apibkey_lo = paca.apibkey; + pac_address_keys->apibkey_hi = paca.apibkey >> 64; + pac_address_keys->apdakey_lo = paca.apdakey; + pac_address_keys->apdakey_hi = paca.apdakey >> 64; + pac_address_keys->apdbkey_lo = paca.apdbkey; + pac_address_keys->apdbkey_hi = paca.apdbkey >> 64; + + iov.iov_base = &pac_enabled_key; + iov.iov_len = sizeof(pac_enabled_key); + ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); + if (ret) { + pr_perror("Failed to get authentication key mask for %d", pid); + return -1; + } + + pac_address_keys->pac_enabled_key = pac_enabled_key; + + } + if (hwcaps & HWCAP_PACG) { + PacGenericKeys *pac_generic_keys; + + pr_debug("%d: Dumping generic authentication keys\n", pid); + iov.iov_base = &pacg; + iov.iov_len = sizeof(pacg); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { + pr_perror("Failed to get a generic authantication key for %d", pid); + return -1; + } + pac_generic_keys = xmalloc(sizeof(PacGenericKeys)); + if (!pac_generic_keys) + return -1; + pac_generic_keys__init(pac_generic_keys); + pac_entry->pac_generic_keys = pac_generic_keys; + pac_generic_keys->apgakey_lo = pacg.apgakey; + pac_generic_keys->apgakey_hi = pacg.apgakey >> 64; + } + return 0; +} + +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; CoreEntry *core = x; @@ -43,6 +120,8 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsi assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); + if (save_pac_keys(pid, core)) + return 
-1; return 0; } @@ -92,6 +171,12 @@ void arch_free_thread_info(CoreEntry *core) xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd); } + if (CORE_THREAD_ARCH_INFO(core)->pac_keys) { + PacKeys *pac_entry = CORE_THREAD_ARCH_INFO(core)->pac_keys; + xfree(pac_entry->pac_address_keys); + xfree(pac_entry->pac_generic_keys); + xfree(pac_entry); + } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); @@ -135,3 +220,83 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) return 0; } + +int arch_ptrace_restore(int pid, struct pstree_item *item) +{ + unsigned long hwcaps = getauxval(AT_HWCAP); + struct user_pac_address_keys upaca; + struct user_pac_generic_keys upacg; + PacAddressKeys *paca; + PacGenericKeys *pacg; + long pac_enabled_keys; + struct iovec iov; + int ret; + + + pr_debug("%d: Restoring PAC keys\n", pid); + + paca = &rsti(item)->arch_info.pac_address_keys; + pacg = &rsti(item)->arch_info.pac_generic_keys; + if (rsti(item)->arch_info.has_paca) { + if (!(hwcaps & HWCAP_PACA)) { + pr_err("PACG support is required from the source system.\n"); + return 1; + } + pac_enabled_keys = rsti(item)->arch_info.pac_address_keys.pac_enabled_key; + + upaca.apiakey = paca->apiakey_lo + ((__uint128_t)paca->apiakey_hi << 64); + upaca.apibkey = paca->apibkey_lo + ((__uint128_t)paca->apibkey_hi << 64); + upaca.apdakey = paca->apdakey_lo + ((__uint128_t)paca->apdakey_hi << 64); + upaca.apdbkey = paca->apdbkey_lo + ((__uint128_t)paca->apdbkey_hi << 64); + + iov.iov_base = &upaca; + iov.iov_len = sizeof(upaca); + + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { + pr_perror("Failed to set address authentication keys for %d", pid); + return 1; + } + iov.iov_base = &pac_enabled_keys; + iov.iov_len = sizeof(pac_enabled_keys); + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { + pr_perror("Failed to set enabled key 
mask for %d", pid); + return 1; + } + } + + if (rsti(item)->arch_info.has_pacg) { + if (!(hwcaps & HWCAP_PACG)) { + pr_err("PACG support is required from the source system.\n"); + return 1; + } + upacg.apgakey = pacg->apgakey_lo + ((__uint128_t)pacg->apgakey_hi << 64); + iov.iov_base = &upacg; + iov.iov_len = sizeof(upacg); + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { + pr_perror("Failed to set the generic authentication key for %d", pid); + return 1; + } + } + + return 0; +} + +void arch_rsti_init(struct pstree_item *p) +{ + PacKeys *pac_keys = p->core[0]->ti_aarch64->pac_keys; + + rsti(p)->arch_info.has_paca = false; + rsti(p)->arch_info.has_pacg = false; + + if (!pac_keys) + return; + + if (pac_keys->pac_address_keys) { + rsti(p)->arch_info.has_paca = true; + rsti(p)->arch_info.pac_address_keys = *pac_keys->pac_address_keys; + } + if (pac_keys->pac_generic_keys) { + rsti(p)->arch_info.has_pacg = true; + rsti(p)->arch_info.pac_generic_keys = *pac_keys->pac_generic_keys; + } +} diff --git a/criu/arch/aarch64/include/asm/dump.h b/criu/arch/aarch64/include/asm/dump.h index 90cd8bca8..ecab061c3 100644 --- a/criu/arch/aarch64/include/asm/dump.h +++ b/criu/arch/aarch64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/aarch64/include/asm/restore.h b/criu/arch/aarch64/include/asm/restore.h index 75e87996a..c79605c40 100644 --- a/criu/arch/aarch64/include/asm/restore.h +++ b/criu/arch/aarch64/include/asm/restore.h @@ -26,4 +26,14 @@ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); +#define ARCH_RST_INFO y +struct rst_arch_info { + bool 
has_paca, has_pacg; + PacAddressKeys pac_address_keys; + PacGenericKeys pac_generic_keys; +}; + +int arch_ptrace_restore(int pid, struct pstree_item *item); +void arch_rsti_init(struct pstree_item *current); + #endif diff --git a/criu/arch/arm/crtools.c b/criu/arch/arm/crtools.c index 26b94e157..6a5e4c89a 100644 --- a/criu/arch/arm/crtools.c +++ b/criu/arch/arm/crtools.c @@ -22,7 +22,7 @@ #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/arm/include/asm/dump.h b/criu/arch/arm/include/asm/dump.h index 485986065..b0ac5715d 100644 --- a/criu/arch/arm/include/asm/dump.h +++ b/criu/arch/arm/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c index eeb0731ca..783951b5b 100644 --- a/criu/arch/loongarch64/crtools.c +++ b/criu/arch/loongarch64/crtools.c @@ -29,7 +29,7 @@ #define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { int i; CoreEntry *core = x; diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h index 04347155c..a1c0c4c58 100644 --- a/criu/arch/loongarch64/include/asm/dump.h +++ b/criu/arch/loongarch64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ 
-extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/mips/crtools.c b/criu/arch/mips/crtools.c index ed4da9b7e..eabbd85f4 100644 --- a/criu/arch/mips/crtools.c +++ b/criu/arch/mips/crtools.c @@ -27,7 +27,7 @@ #include "images/core.pb-c.h" #include "images/creds.pb-c.h" -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/mips/include/asm/dump.h b/criu/arch/mips/include/asm/dump.h index 58015833d..ec59b051b 100644 --- a/criu/arch/mips/include/asm/dump.h +++ b/criu/arch/mips/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); diff --git a/criu/arch/ppc64/crtools.c b/criu/arch/ppc64/crtools.c index a08a2ca5b..d57040008 100644 --- a/criu/arch/ppc64/crtools.c +++ b/criu/arch/ppc64/crtools.c @@ -404,7 +404,7 @@ static int __copy_task_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpre return 0; } -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { return __copy_task_regs(u, f, (CoreEntry *)arg); } diff --git a/criu/arch/ppc64/include/asm/dump.h b/criu/arch/ppc64/include/asm/dump.h index eb488900a..7393654fa 100644 --- 
a/criu/arch/ppc64/include/asm/dump.h +++ b/criu/arch/ppc64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/riscv64/crtools.c b/criu/arch/riscv64/crtools.c index b2d6d2951..eea98d6de 100644 --- a/criu/arch/riscv64/crtools.c +++ b/criu/arch/riscv64/crtools.c @@ -23,7 +23,7 @@ #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; CoreEntry *core = x; diff --git a/criu/arch/riscv64/include/asm/dump.h b/criu/arch/riscv64/include/asm/dump.h index c2988f9bf..4f0a2d209 100644 --- a/criu/arch/riscv64/include/asm/dump.h +++ b/criu/arch/riscv64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 5cf160d82..96cef819e 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -282,7 +282,7 @@ static void free_ri_cb(UserS390RiEntry *ri_cb) /* * Copy internal structures into Google Protocol Buffers */ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { UserS390VxrsHighEntry *vxrs_high = NULL; UserS390VxrsLowEntry *vxrs_low = NULL; diff 
--git a/criu/arch/s390/include/asm/dump.h b/criu/arch/s390/include/asm/dump.h index c200724d7..5a24c5b3d 100644 --- a/criu/arch/s390/include/asm/dump.h +++ b/criu/arch/s390/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); int arch_alloc_thread_info(CoreEntry *core); void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index e068a9a02..1f4d0736b 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -15,7 +15,7 @@ #define XSAVE_PB_NELEMS(__s, __obj, __member) (sizeof(__s) / sizeof(*(__obj)->__member)) -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; UserX86RegsEntry *gpregs = core->thread_info->gpregs; diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h index 867357fa2..4ca704fd7 100644 --- a/criu/arch/x86/include/asm/compat.h +++ b/criu/arch/x86/include/asm/compat.h @@ -11,6 +11,8 @@ #include +#include "log.h" + static inline void *alloc_compat_syscall_stack(void) { void *mem = (void *)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, diff --git a/criu/arch/x86/include/asm/dump.h b/criu/arch/x86/include/asm/dump.h index 192f6bd02..925ea91ff 100644 --- a/criu/arch/x86/include/asm/dump.h +++ b/criu/arch/x86/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); 
extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index e906da0ce..1f4881dab 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "types.h" #include @@ -1707,6 +1708,9 @@ static int restore_task_with_children(void *_arg) arg); } +int __attribute((weak)) arch_ptrace_restore(int pid, struct pstree_item *item); +int arch_ptrace_restore(int pid, struct pstree_item *item) { return 0; } + static int attach_to_tasks(bool root_seized) { struct pstree_item *item; @@ -1747,6 +1751,8 @@ static int attach_to_tasks(bool root_seized) pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); return -1; } + if (arch_ptrace_restore(pid, item)) + return -1; /* * Suspend seccomp if necessary. We need to do this because * although seccomp is restored at the very end of the @@ -3104,6 +3110,9 @@ static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob) return restorer_sym(restorer_blob, arch_export_unmap); } +void arch_rsti_init(struct pstree_item *p) __attribute__((weak)); +void arch_rsti_init(struct pstree_item *p) {} + static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core) { void *mem = MAP_FAILED; @@ -3323,6 +3332,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns */ creds_pos_next = creds_pos; siginfo_n = task_args->siginfo_n; + arch_rsti_init(current); for (i = 0; i < current->nr_threads; i++) { CoreEntry *tcore; struct rt_sigframe *sigframe; diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 4c9335a73..deb297e5f 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -1,6 +1,7 @@ #ifndef __CR_RST_INFO_H__ #define __CR_RST_INFO_H__ +#include "asm/restore.h" #include "common/lock.h" #include "common/list.h" #include "vma.h" @@ -33,6 +34,11 @@ struct rst_rseq { uint64_t rseq_cs_pointer; }; 
+#ifndef ARCH_RST_INFO +struct rst_arch_info { +}; +#endif + struct rst_info { struct list_head fds; @@ -80,6 +86,8 @@ struct rst_info { futex_t shstk_unlock; void *breakpoint; + + struct rst_arch_info arch_info; }; extern struct task_entries *task_entries; diff --git a/images/core-aarch64.proto b/images/core-aarch64.proto index 3356e6b75..64b0ee9fb 100644 --- a/images/core-aarch64.proto +++ b/images/core-aarch64.proto @@ -17,9 +17,32 @@ message user_aarch64_fpsimd_context_entry { required uint32 fpcr = 3; } +message pac_address_keys { + required uint64 apiakey_lo = 1; + required uint64 apiakey_hi = 2; + required uint64 apibkey_lo = 3; + required uint64 apibkey_hi = 4; + required uint64 apdakey_lo = 5; + required uint64 apdakey_hi = 6; + required uint64 apdbkey_lo = 7; + required uint64 apdbkey_hi = 8; + required uint64 pac_enabled_key = 9; +} + +message pac_generic_keys { + required uint64 apgakey_lo = 1; + required uint64 apgakey_hi = 2; +} + +message pac_keys { + optional pac_address_keys pac_address_keys = 6; + optional pac_generic_keys pac_generic_keys = 7; +} + message thread_info_aarch64 { required uint64 clear_tid_addr = 1[(criu).hex = true]; required uint64 tls = 2; required user_aarch64_regs_entry gpregs = 3[(criu).hex = true]; required user_aarch64_fpsimd_context_entry fpsimd = 4; + optional pac_keys pac_keys = 5; } From b8553d19edc1d5278c619420844b24aad2bdd415 Mon Sep 17 00:00:00 2001 From: AV Date: Mon, 3 Mar 2025 20:09:05 +0000 Subject: [PATCH 077/257] test/zdtm: check that PAC keys are C/R-ed Add another variation of pthread00 compiled with branch protection enabled. 
Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 9 +++++++++ test/zdtm/static/pthread00-pac.c | 1 + 2 files changed, 10 insertions(+) create mode 120000 test/zdtm/static/pthread00-pac.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index f72fb2a77..6a19cad3c 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -66,6 +66,7 @@ TST_NOFILE := \ pipe01 \ pipe02 \ pthread00 \ + pthread00-pac \ pthread01 \ pthread02 \ pthread_timers \ @@ -497,6 +498,12 @@ STATE_OUT = $(TST_STATE:%=%.out) include ../Makefile.inc +ifeq ($(ARCH),aarch64) + PAC_CFLAGS := -mbranch-protection=standard +else + PAC_CFLAGS := +endif + all: $(TST) criu-rtc.so install: all .PHONY: all install @@ -588,6 +595,8 @@ uptime_grow: LDLIBS += -lrt -pthread unlink_largefile: CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE inotify_system_nodel: CFLAGS += -DNO_DEL pthread00: LDLIBS += -pthread +pthread00-pac: CFLAGS += ${PAC_CFLAGS} +pthread00-pac: LDLIBS += -pthread pthread01: LDLIBS += -pthread pthread02: LDLIBS += -pthread pthread_timers: LDLIBS += -lrt -pthread diff --git a/test/zdtm/static/pthread00-pac.c b/test/zdtm/static/pthread00-pac.c new file mode 120000 index 000000000..3ee8dc1f1 --- /dev/null +++ b/test/zdtm/static/pthread00-pac.c @@ -0,0 +1 @@ +pthread00.c \ No newline at end of file From 62a4a5874b4b0bd462f28d659b93c73c5c06a900 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 12 Mar 2025 23:46:05 +0000 Subject: [PATCH 078/257] vdso: correct data types for ELF hash table sizes Let's change the data types of `nbucket` and `nchain` to uint32. 
This should fix the following compile-time error on arm32: /criu/criu/pie/util-vdso.c:336: undefined reference to `__aeabi_uldivmod' Signed-off-by: Andrei Vagin --- criu/pie/util-vdso.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index af3c08985..8daf5c71f 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -302,7 +302,7 @@ static bool elf_symbol_match(uintptr_t mem, size_t size, static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, const char *symbol, uint32_t symbol_hash, unsigned int sym_off, uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, - uint64_t nbucket, uint64_t nchain, void *_bucket, Hash_t *chain, + uint32_t nbucket, uint32_t nchain, void *_bucket, Hash_t *chain, const size_t vdso_symbol_length, bool use_gnu_hash) { unsigned int j; @@ -360,7 +360,7 @@ static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, void *bucket = NULL; Hash_t *chain = NULL; - uint64_t nbucket, nchain = 0; + uint32_t nbucket, nchain = 0; unsigned int sym_off = 0; unsigned int i = 0; From 720bf67e065525133f4b0209baa7142192fbb667 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 16 Mar 2025 22:23:14 +0000 Subject: [PATCH 079/257] zdtm/vdso02: unmap vvar_vclock mappings It is a part of vvar and this test intends to unmap vdso and all vvar mappings. 
Fixes #2622 Signed-off-by: Andrei Vagin --- test/zdtm/static/vdso02.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/test/zdtm/static/vdso02.c b/test/zdtm/static/vdso02.c index 2050bca71..5779b7fd6 100644 --- a/test/zdtm/static/vdso02.c +++ b/test/zdtm/static/vdso02.c @@ -29,7 +29,8 @@ static int parse_vm_area(char *buf, struct vm_area *vma) return -1; } -static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) +static int find_blobs(pid_t pid, struct vm_area *vdso, + struct vm_area *vvar, struct vm_area *vvar_vclock) { char buf[BUF_SZ]; int ret = -1; @@ -39,6 +40,8 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) vdso->end = VDSO_BAD_ADDR; vvar->start = VVAR_BAD_ADDR; vvar->end = VVAR_BAD_ADDR; + vvar_vclock->start = VVAR_BAD_ADDR; + vvar_vclock->end = VVAR_BAD_ADDR; if (snprintf(buf, BUF_SZ, "/proc/%d/maps", pid) < 0) { pr_perror("snprintf() failure for path"); @@ -57,12 +60,18 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) if (strstr(buf, "[vvar]") && parse_vm_area(buf, vvar)) goto err; + if (strstr(buf, "[vvar_vclock]") && + parse_vm_area(buf, vvar_vclock)) + goto err; } if (vdso->start != VDSO_BAD_ADDR) test_msg("[vdso] %lx-%lx\n", vdso->start, vdso->end); if (vvar->start != VVAR_BAD_ADDR) test_msg("[vvar] %lx-%lx\n", vvar->start, vvar->end); + if (vvar_vclock->start != VVAR_BAD_ADDR) + test_msg("[vvar_vclock] %lx-%lx\n", + vvar_vclock->start, vvar_vclock->end); ret = 0; err: fclose(maps); @@ -143,10 +152,10 @@ void sys_exit(int status) static int unmap_blobs(void) { - struct vm_area vdso, vvar; + struct vm_area vdso, vvar, vvar_vclock; int ret; - if (find_blobs(getpid(), &vdso, &vvar)) + if (find_blobs(getpid(), &vdso, &vvar, &vvar_vclock)) return -1; if (vdso.start != VDSO_BAD_ADDR) { @@ -159,13 +168,19 @@ static int unmap_blobs(void) if (ret) return ret; } + if (vvar_vclock.start != VVAR_BAD_ADDR) { + ret = sys_munmap((void 
*)vvar_vclock.start, + vvar_vclock.end - vvar_vclock.start); + if (ret) + return ret; + } return 0; } int main(int argc, char *argv[]) { - struct vm_area vdso, vvar; + struct vm_area vdso, vvar, vvar_vclock; pid_t child; int status, ret = -1; @@ -201,9 +216,11 @@ int main(int argc, char *argv[]) goto out_kill; } - if (find_blobs(child, &vdso, &vvar)) + if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) goto out_kill; - if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { + if (vdso.start != VDSO_BAD_ADDR || + vvar.start != VVAR_BAD_ADDR || + vvar_vclock.start != VVAR_BAD_ADDR) { pr_err("Found vvar or vdso blob(s) in child, which should have unmapped them\n"); goto out_kill; } @@ -211,7 +228,7 @@ int main(int argc, char *argv[]) test_daemon(); test_waitsig(); - if (find_blobs(child, &vdso, &vvar)) + if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) goto out_kill; if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { pr_err("Child without vdso got it after C/R\n"); From 867c773031aef74e66cd15b55418141bcc538b95 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 15:50:02 +0000 Subject: [PATCH 080/257] make: allow setting the default network locking backend As different Linux distributions are switching away from iptables to nftables, this makes it easier to compile CRIU with a different default network locking backend. 
Instead of changing the source code it is now possible to select the nft backend like this: make NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES Signed-off-by: Adrian Reber --- Makefile | 4 ++++ criu/include/cr_options.h | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/Makefile b/Makefile index 90908de83..5d8e89ac1 100644 --- a/Makefile +++ b/Makefile @@ -140,6 +140,10 @@ ifneq ($(GCOV),) CFLAGS += $(CFLAGS-GCOV) endif +ifneq ($(NETWORK_LOCK_DEFAULT),) + CFLAGS += -DNETWORK_LOCK_DEFAULT=$(NETWORK_LOCK_DEFAULT) +endif + ifeq ($(ASAN),1) CFLAGS-ASAN := -fsanitize=address export CFLAGS-ASAN diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 60cf9437e..ab0bd8fa3 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -70,7 +70,15 @@ enum NETWORK_LOCK_METHOD { NETWORK_LOCK_SKIP, }; +/** + * CRIU currently defaults to the iptables locking backend. + * + * It is, however, possible to change this by defining + * NETWORK_LOCK_DEFAULT to a different value on the command-line. + */ +#ifndef NETWORK_LOCK_DEFAULT #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES +#endif /* * Ghost file size we allow to carry by default. From 2cd9d5ded86204e1a43f57102b86cc06e9ecf0eb Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 15:56:27 +0000 Subject: [PATCH 081/257] docs: update INSTALL.md with a section about building CRIU The building section also contains the information how to change the network locking backend without source code changes. Signed-off-by: Adrian Reber --- INSTALL.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index d786d06eb..76ace5b02 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,3 +1,23 @@ +## Building CRIU from source code + +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. 
+ +To compile CRIU, run: +``` +make +``` +This should create the `./criu/criu` executable. + +To change the default behaviour of CRIU, the following variables can be passed +to the make command: + + * **NETWORK_LOCK_DEFAULT**, can be set to one of the following + values: `NETWORK_LOCK_IPTABLES`, `NETWORK_LOCK_NFTABLES`, + `NETWORK_LOCK_SKIP`. CRIU defaults to `NETWORK_LOCK_IPTABLES` + if nothing is specified. If another network locking backend is + needed, `make` can be called like this: + `make NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES` + ## Installing CRIU from source code Once CRIU is built one can easily setup the complete CRIU package From 95729ec328a02a81824ce2b8c3ecd5eb90a170d4 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 15:57:47 +0000 Subject: [PATCH 082/257] docs: mark make commands with same format as elsewhere This uses the same formatting for the make command examples as seen in README.md. Signed-off-by: Adrian Reber --- INSTALL.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 76ace5b02..af0702518 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -23,9 +23,9 @@ to the make command: Once CRIU is built one can easily setup the complete CRIU package (which includes executable itself, CRIT tool, libraries, manual and etc) simply typing - - make install - +``` +make install +``` this command accepts the following variables: * **DESTDIR**, to specify global root where all components will be placed under (empty by default); @@ -36,17 +36,17 @@ this command accepts the following variables: * **LIBDIR**, to specify directory where to put libraries (guess the correct path by default). Thus one can type - - make DESTDIR=/some/new/place install - +``` +make DESTDIR=/some/new/place install +``` and get everything installed under `/some/new/place`. 
## Uninstalling CRIU To clean up previously installed CRIU instance one can type - - make uninstall - +``` +make uninstall +``` and everything should be removed. Note though that if some variable (**DESTDIR**, **BINDIR** and such) has been used during installation procedure, the same *must* be passed with uninstall action. From 29ccb5b625a5cf915f87d1d85952dde6b9b572ee Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 07:34:26 +0000 Subject: [PATCH 083/257] test: others/rpc do not use nftables locking backend The tests in others/rpc are running as non-root and fail silently if the nftables network locking backend is used. This switches those tests to skip the network locking. Signed-off-by: Adrian Reber --- test/others/rpc/errno.py | 2 ++ test/others/rpc/ps_test.py | 1 + test/others/rpc/run.sh | 2 +- test/others/rpc/test-c.c | 2 ++ test/others/rpc/test.py | 1 + 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index b600b6d1c..4ea6c9d44 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -67,6 +67,7 @@ class test: req = self.get_base_req() req.type = rpc.DUMP req.opts.pid = pid + req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() @@ -84,6 +85,7 @@ class test: req = self.get_base_req() req.type = rpc.DUMP req.opts.leave_running = True + req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() diff --git a/test/others/rpc/ps_test.py b/test/others/rpc/ps_test.py index daeda49bc..259f22e77 100755 --- a/test/others/rpc/ps_test.py +++ b/test/others/rpc/ps_test.py @@ -23,6 +23,7 @@ req.type = rpc.PAGE_SERVER req.opts.log_file = 'page-server.log' req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) +req.opts.network_lock = rpc.SKIP s.send(req.SerializeToString()) diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index afd4fb5e3..3d5a53ae6 100755 --- a/test/others/rpc/run.sh +++ 
b/test/others/rpc/run.sh @@ -51,7 +51,7 @@ function test_restore_loop { title_print "Dump loop process" # So theoretically '-j' (--shell-job) should not be necessary, but on alpine # this test fails without it. - ${CRIU} dump -j -v4 -o dump-loop.log -D build/imgs_loop -t ${P} + ${CRIU} dump -j -v4 -o dump-loop.log --network-lock skip -D build/imgs_loop -t ${P} title_print "Run restore-loop" ./restore-loop.py build/criu_service.socket build/imgs_loop diff --git a/test/others/rpc/test-c.c b/test/others/rpc/test-c.c index 792dbbf9c..b3507975f 100644 --- a/test/others/rpc/test-c.c +++ b/test/others/rpc/test-c.c @@ -99,6 +99,8 @@ int main(int argc, char *argv[]) req.opts->images_dir_fd = dir_fd; req.opts->has_log_level = true; req.opts->log_level = 4; + req.opts->has_network_lock = true; + req.opts->network_lock = CRIU_NETWORK_LOCK_METHOD__SKIP; /* * Connect to service socket diff --git a/test/others/rpc/test.py b/test/others/rpc/test.py index ce8411bc6..6f692f755 100755 --- a/test/others/rpc/test.py +++ b/test/others/rpc/test.py @@ -24,6 +24,7 @@ req.type = rpc.DUMP req.opts.leave_running = True req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) +req.opts.network_lock = rpc.SKIP # Send request s.send(req.SerializeToString()) From f22330ff07354fd8007a42247fb1e29bcc346033 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 09:57:52 +0000 Subject: [PATCH 084/257] test: print out logs if tests fail If the tests in others/rpc are failing no information about that error can be seen in a CI run. This change displays the log files if the test fails. Signed-off-by: Adrian Reber --- test/others/rpc/Makefile | 10 +++++++++- test/others/rpc/run.sh | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index 69537bb0d..b2f907abe 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -8,9 +8,17 @@ PYTHON ?= python3 run: all @make -C .. 
loop - mkdir -p build + mkdir -p build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} chmod a+rwx build + chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status + @# Create all log files to be accessible for anybody + @# so that they can be displayed by any user. + for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ + imgs_c/restore-c.log imgs_loop/criu.log imgs_loop/dump-loop.log \ + imgs_py/criu.log imgs_py/restore-py.log imgs_c/criu.log service.log; do \ + touch build/$$i; chmod 666 build/$$i; \ + done sudo -g '#1000' -u '#1000' mkfifo build/status @# Need to start the criu daemon here to access the pidfile. @# The script read.py is used to wait until 'criu service' diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index 3d5a53ae6..b6158dfea 100755 --- a/test/others/rpc/run.sh +++ b/test/others/rpc/run.sh @@ -3,6 +3,7 @@ set -e CRIU=./criu +FAIL=1 export PROTODIR=`readlink -f "${PWD}/../../protobuf"` @@ -19,6 +20,13 @@ function stop_server { title_print "Shutdown service server" kill -SIGTERM $(cat build/pidfile) unlink build/pidfile + if [ "${FAIL}" == "1" ]; then + for i in build/output*; do + echo "File: $i" + cat $i + done + find . -name "*.log" -print -exec cat {} \; || true + fi } function test_c { @@ -80,6 +88,8 @@ test_restore_loop test_ps test_errno +FAIL=0 + stop_server trap 'echo "Success"' EXIT From 700a8c4b5ebeef536612a95f4d697f8ff0bf9b34 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 5 Feb 2025 07:51:38 +0000 Subject: [PATCH 085/257] ci: do not run tests requiring iptables if it is missing There are a couple of tests that require the iptables binary. Instead of adding a checkskip script, which could also handle this, this change now uses CRIU's feature detection to see if the CRIU feature 'has_ipt_legacy' exists. 
Signed-off-by: Adrian Reber --- test/zdtm/static/net_lock_socket_iptables.desc | 1 + test/zdtm/static/net_lock_socket_iptables6.desc | 1 + test/zdtm/static/netns-nf.desc | 1 + test/zdtm/static/netns_lock_iptables.desc | 1 + test/zdtm/static/socket-tcp-closed-last-ack.desc | 2 +- test/zdtm/static/socket-tcp-reseted.desc | 2 +- test/zdtm/static/socket-tcp-syn-sent.desc | 2 +- 7 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/net_lock_socket_iptables.desc b/test/zdtm/static/net_lock_socket_iptables.desc index 936ff8702..cb622536f 100644 --- a/test/zdtm/static/net_lock_socket_iptables.desc +++ b/test/zdtm/static/net_lock_socket_iptables.desc @@ -1,5 +1,6 @@ { 'flavor': 'h', + 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/net_lock_socket_iptables6.desc b/test/zdtm/static/net_lock_socket_iptables6.desc index 936ff8702..cb622536f 100644 --- a/test/zdtm/static/net_lock_socket_iptables6.desc +++ b/test/zdtm/static/net_lock_socket_iptables6.desc @@ -1,5 +1,6 @@ { 'flavor': 'h', + 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/netns-nf.desc b/test/zdtm/static/netns-nf.desc index c99696d1c..58c23e8ba 100644 --- a/test/zdtm/static/netns-nf.desc +++ b/test/zdtm/static/netns-nf.desc @@ -3,4 +3,5 @@ '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', + 'feature': 'has_ipt_legacy', 'flavor': 'ns uns'} diff --git 
a/test/zdtm/static/netns_lock_iptables.desc b/test/zdtm/static/netns_lock_iptables.desc index 69020f34e..b465706b8 100644 --- a/test/zdtm/static/netns_lock_iptables.desc +++ b/test/zdtm/static/netns_lock_iptables.desc @@ -1,6 +1,7 @@ { 'flavor': 'h', 'flags': 'suid excl reqrst', + 'feature': 'has_ipt_legacy', 'opts': '--tcp-established', 'dopts': '--network-lock iptables', 'ropts': '--join-ns net:/var/run/netns/criu-net-lock-test' diff --git a/test/zdtm/static/socket-tcp-closed-last-ack.desc b/test/zdtm/static/socket-tcp-closed-last-ack.desc index 309854fa5..c77d58477 100644 --- a/test/zdtm/static/socket-tcp-closed-last-ack.desc +++ b/test/zdtm/static/socket-tcp-closed-last-ack.desc @@ -5,6 +5,6 @@ ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed', + 'feature' : 'tcp_half_closed has_ipt_legacy', 'flavor': 'ns uns', } diff --git a/test/zdtm/static/socket-tcp-reseted.desc b/test/zdtm/static/socket-tcp-reseted.desc index 4aa48ad87..ff92e9f9f 100644 --- a/test/zdtm/static/socket-tcp-reseted.desc +++ b/test/zdtm/static/socket-tcp-reseted.desc @@ -6,5 +6,5 @@ ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed' + 'feature' : 'tcp_half_closed has_ipt_legacy' } diff --git a/test/zdtm/static/socket-tcp-syn-sent.desc b/test/zdtm/static/socket-tcp-syn-sent.desc index 71cd26d72..52382414b 100644 --- a/test/zdtm/static/socket-tcp-syn-sent.desc +++ b/test/zdtm/static/socket-tcp-syn-sent.desc @@ -5,5 +5,5 @@ ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed' + 'feature' : 'tcp_half_closed has_ipt_legacy' } From 6826ac58ce842393a8a7d8cf73cd0478d4456330 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 16:10:02 +0000 Subject: [PATCH 086/257] ci: run tests on a nftables only system Signed-off-by: Adrian Reber --- .github/workflows/nftables-test.yml | 24 ++++++++++++++++++++++++ scripts/ci/run-ci-tests.sh | 13 ++++++++++++- 2 files 
changed, 36 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/nftables-test.yml diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml new file mode 100644 index 000000000..eb3d8e814 --- /dev/null +++ b/.github/workflows/nftables-test.yml @@ -0,0 +1,24 @@ +name: Nftables bases testing + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: nftables-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - name: Remove iptables + run: sudo apt remove -y iptables + - name: Install libnftables-dev + run: sudo scripts/ci/apt-install libnftables-dev + - name: chmod 755 /home/runner + # CRIU's tests are sometimes running as some random user and need + # to be able to access the test files. + run: sudo chmod 755 /home/runner + - name: Build with nftables network locking backend + run: sudo make -C scripts/ci local COMPILE_FLAGS="NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES" diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 611ff7803..0c4a08975 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -39,6 +39,10 @@ ci_prep () { # This can fail on aarch64 travis service apport stop || : + # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user + # namespaces by unprivileged users. We need this for some of our tests. 
+ sysctl kernel.apparmor_restrict_unprivileged_userns=0 || : + if [ "$CLANG" = "1" ]; then # clang support CC=clang @@ -121,8 +125,14 @@ if [ "${CD_TO_TOP}" = "1" ]; then fi export GCOV CC +if [ -z "$COMPILE_FLAGS" ]; then + LOCAL_COMPILE_FLAGS=("V=1") +else + IFS=" " read -r -a LOCAL_COMPILE_FLAGS <<< "$COMPILE_FLAGS" + LOCAL_COMPILE_FLAGS=("V=1" "${LOCAL_COMPILE_FLAGS[@]}") +fi $CC --version -time make CC="$CC" -j4 V=1 +time make CC="$CC" -j4 "${LOCAL_COMPILE_FLAGS[@]}" ./criu/criu -v4 cpuinfo dump || : ./criu/criu -v4 cpuinfo check || : @@ -150,6 +160,7 @@ ulimit -c unlimited cgid=$$ cleanup_cgroup() { ./test/zdtm_umount_cgroups $cgid + dmesg } trap cleanup_cgroup EXIT ./test/zdtm_mount_cgroups $cgid From 0f647094424811aaa83839ea10f49d94596a3d15 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Wed, 19 Mar 2025 23:19:31 +0700 Subject: [PATCH 087/257] namespace: skip cleaning up the uid/gid map in error cases free_userns_maps is called to clean up uid/gid map when the dump finishes. If we try to clean up these maps in error cases, it can lead to double free panic. So just skip cleaning up these maps and let free_userns_maps do its job. Signed-off-by: Bui Quang Minh --- criu/namespaces.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/criu/namespaces.c b/criu/namespaces.c index b7c0ab400..0c9b16a87 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -1009,36 +1009,31 @@ int dump_user_ns(pid_t pid, int ns_id) ret = parse_id_map(pid, "uid_map", &e->uid_map); if (ret < 0) - goto err; + /* + * The uid_map and gid_map is clean up in free_userns_maps + * later, so we don't need to clean these up in error cases. 
+ */ + return -1; + + e->n_uid_map = ret; ret = parse_id_map(pid, "gid_map", &e->gid_map); if (ret < 0) - goto err; + return -1; e->n_gid_map = ret; if (check_user_ns(pid)) - goto err; + return -1; img = open_image(CR_FD_USERNS, O_DUMP, ns_id); if (!img) - goto err; + return -1; ret = pb_write_one(img, e, PB_USERNS); close_image(img); if (ret < 0) - goto err; + return -1; return 0; -err: - if (e->uid_map) { - xfree(e->uid_map[0]); - xfree(e->uid_map); - } - if (e->gid_map) { - xfree(e->gid_map[0]); - xfree(e->gid_map); - } - return -1; } void free_userns_maps(void) From bc1415317379c45b08ac6f8eb98698ca2df9b78c Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Sat, 22 Mar 2025 19:31:02 -0400 Subject: [PATCH 088/257] criu: fix log_keep_err signal deadlock When using pr_err in a signal handler, locking is used in an unsafe manner. If another signal happens while holding the lock, deadlock can happen. To fix this, we can introduce mutex_trylock similar to pthread_mutex_trylock that returns immediately. Due to the fact that lock is used only for writing first_err, this change guarantees that deadlock cannot happen. Fixes: #358 Signed-off-by: Ivan Pravdin --- criu/log.c | 9 +++++---- include/common/lock.h | 6 ++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/criu/log.c b/criu/log.c index 89ae8f820..70e267fd6 100644 --- a/criu/log.c +++ b/criu/log.c @@ -132,10 +132,11 @@ static void log_note_err(char *msg) * anyway, so it doesn't make much sense to try hard * and optimize this out. 
*/ - mutex_lock(&first_err->l); - if (first_err->s[0] == '\0') - __strlcpy(first_err->s, msg, sizeof(first_err->s)); - mutex_unlock(&first_err->l); + if (mutex_trylock(&first_err->l)) { + if (first_err->s[0] == '\0') + __strlcpy(first_err->s, msg, sizeof(first_err->s)); + mutex_unlock(&first_err->l); + } } } diff --git a/include/common/lock.h b/include/common/lock.h index ccfa468b8..4733d7287 100644 --- a/include/common/lock.h +++ b/include/common/lock.h @@ -2,6 +2,7 @@ #define __CR_COMMON_LOCK_H__ #include +#include #include #include #include @@ -162,6 +163,11 @@ static inline void mutex_lock(mutex_t *m) } } +static inline bool mutex_trylock(mutex_t *m) +{ + return atomic_inc_return(&m->raw) == 1; +} + static inline void mutex_unlock(mutex_t *m) { uint32_t c = 0; From b6059ff193a9b0dff98e997134d662c3ccfd1600 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 24 Mar 2025 15:23:34 -0700 Subject: [PATCH 089/257] criu: Version 4.1 (CRISC-V) Major changes: * RISC-V Support * PIDFD Support * CUDA Enhancements * Fixes here and there The full changelog can be found here: https://criu.org/Download/criu/4.1. Signed-off-by: Andrei Vagin --- Makefile.versions | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.versions b/Makefile.versions index c5859801a..85653c217 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. 
CRIU_VERSION_MAJOR := 4 -CRIU_VERSION_MINOR := 0 +CRIU_VERSION_MINOR := 1 CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := CRIUDA +CRIU_VERSION_NAME := CRISCV CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL From 570621a48a81664a37a97f38d0ed65c1c0f56110 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 25 Jul 2025 00:05:06 +0000 Subject: [PATCH 090/257] mount-v2: enter the mount namespace to restore propagation properties A kernel change (commit 12f147ddd6de, "do_change_type(): refuse to operate on unmounted/not ours mounts") modified how mount propagation properties can be changed. Previously, these properties could be changed from any mount namespace. Now, they can only be modified from the specific mount namespace where the target mount is actually mounted. This commit addresses this new restriction by ensuring that CRIU enters the correct mount namespace before attempting to restore mount propagation properties (MS_SLAVE or MS_SHARED) for a mount. 
Signed-off-by: Andrei Vagin --- criu/mount-v2.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/criu/mount-v2.c b/criu/mount-v2.c index 5d53e9a22..cdebc8318 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -927,8 +927,12 @@ static int move_mount_set_group(int src_id, char *source, int dst_id) static int restore_one_sharing(struct sharing_group *sg, struct mount_info *target) { + int nsfd = -1, orig_nsfd = -1, exit_code = -1; char target_path[PATH_MAX]; - int target_fd; + int target_fd = -1; + + if (!sg->master_id && !sg->shared_id) + return 0; target_fd = fdstore_get(target->mnt_fd_id); BUG_ON(target_fd < 0); @@ -943,8 +947,7 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ first = get_first_mount(sg->parent); if (move_mount_set_group(first->mnt_fd_id, NULL, target->mnt_fd_id)) { pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, target->mnt_id); - close(target_fd); - return -1; + goto err; } } else { /* @@ -956,16 +959,23 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ */ if (move_mount_set_group(-1, sg->source, target->mnt_fd_id)) { pr_err("Failed to copy sharing from source %s to %d\n", sg->source, target->mnt_id); - close(target_fd); - return -1; + goto err; } } + } + nsfd = fdstore_get(target->nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) + goto err; + + if (sg->master_id) { /* Convert shared_id to master_id */ if (mount(NULL, target_path, NULL, MS_SLAVE, NULL)) { pr_perror("Failed to make mount %d slave", target->mnt_id); - close(target_fd); - return -1; + goto err; } } @@ -973,13 +983,16 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ if (sg->shared_id) { if (mount(NULL, target_path, NULL, MS_SHARED, NULL)) { pr_perror("Failed to make mount %d shared", target->mnt_id); - close(target_fd); - return -1; + goto err; } } - 
close(target_fd); - - return 0; + exit_code = 0; +err: + close_safe(&target_fd); + close_safe(&nsfd); + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + return exit_code; } static int restore_one_sharing_group(struct sharing_group *sg) From ced15c302b3f5f11f529e335d4b54ad88b45075e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 8 Jun 2025 17:19:52 -0700 Subject: [PATCH 091/257] test/zdtm: remove unused compiler argument Fixes a clang compile-time error: "argument unused during compilation: '-c'". Signed-off-by: Andrei Vagin --- test/zdtm/Makefile.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 24f32c606..c19888da3 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -76,7 +76,7 @@ endef %.d: %.c $(E) " DEP " $@ - $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP -c $< -o $@ + $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP $< -o $@ %.o: %.c | %.d $(E) " CC " $@ From a44aa6d985472d995d04fef7eae22d63c7500f8c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 28 Jul 2025 21:32:02 +0000 Subject: [PATCH 092/257] criu: Version 4.1.1 This release of CRIU (4.1.1) addresses a critical compatibility issue introduced in the Linux kernel and back-ported to all stable releases. The kernel commit (12f147ddd6de "do_change_type(): refuse to operate on unmounted/not ours mounts") addressed the security issue introduced almost 20 years ago. Unfortunately, this change inadvertently broke the restore functionality of mount namespaces within CRIU. Users attempting to restore a container on updated kernels would encounter the error: "mnt-v2: Failed to make mount 476 slave: Invalid argument." This release contains the necessary adjustments to CRIU, allowing it to work seamlessly with kernels incorporating this security change. 
Signed-off-by: Andrei Vagin --- Makefile.versions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.versions b/Makefile.versions index 85653c217..0b1a46a16 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -2,7 +2,7 @@ # CRIU version. CRIU_VERSION_MAJOR := 4 CRIU_VERSION_MINOR := 1 -CRIU_VERSION_SUBLEVEL := +CRIU_VERSION_SUBLEVEL := 1 CRIU_VERSION_EXTRA := CRIU_VERSION_NAME := CRISCV CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) From 34226fd243b599b8c02dad3ef1530cef2016dabe Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sat, 18 Jan 2025 13:43:15 +0000 Subject: [PATCH 093/257] ci: try GitHub arm runners Signed-off-by: Adrian Reber --- .github/workflows/actuated-aarch64-test.yaml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml index 8b0a63fc7..567746a5f 100644 --- a/.github/workflows/actuated-aarch64-test.yaml +++ b/.github/workflows/actuated-aarch64-test.yaml @@ -1,4 +1,4 @@ -name: Actuated aarch64 test +name: aarch64 test on: [push, pull_request] @@ -11,32 +11,38 @@ jobs: build: # Actuated runners are not available in all repositories. if: ${{ github.repository == 'checkpoint-restore/criu' }} - # The memory size and the number of CPUs can be freely selected. - # 3GB and 4 CPUs seems to be enough according to the result from 'vmmeter'. - runs-on: actuated-arm64-4cpu-3gb + # The memory size and the number of CPUs can be freely selected for + # the actuated runners. 3GB and 4 CPUs seems to be enough according to the + # result from 'vmmeter'. 
+ runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: + os: [actuated-arm64-4cpu-3gb, ubuntu-24.04-arm] target: [GCC=1, CLANG=1] steps: # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md # vmmeter start - name: Prepare arkade + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} uses: alexellis/arkade-get@master with: crane: latest print-summary: false - name: Install vmmeter + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} run: | crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin - name: Run vmmeter + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} uses: self-actuated/vmmeter-action@master # vmmeter end - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} + - name: Run Tests ${{ matrix.target }}/${{ matrix.os }} # Following tests are failing on the actuated VMs: # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) From daa548bbfb189beb3c2b632a39081f8713b5222f Mon Sep 17 00:00:00 2001 From: Yuanhong Peng Date: Wed, 2 Apr 2025 18:48:12 +0800 Subject: [PATCH 094/257] criu: Do not print failed message when there is no late stage hook This is highly confusing, and it seems that the ret variable is not handled in the subsequent process. Signed-off-by: Yuanhong Peng --- criu/cr-restore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1f4881dab..583b446e0 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2258,7 +2258,7 @@ skip_ns_bouncing: * might actually be a true error code but that would be also * captured in the plugin so no need to print the error here. 
*/ - if (ret < 0) + if (ret < 0 && ret != -ENOTSUP) pr_debug("restore late stage hook for external plugin failed\n"); } From 9a1e979666275f2b94aa42f83bb4bd86ef00b7ea Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 2 Apr 2025 21:13:16 +0000 Subject: [PATCH 095/257] compel: fix the stack test The stack test incorrectly assumed the page immediately following the stack pointer could never be changed. This doesn't work, because this page can be a part of another mapping. This commit introduces a dedicated "stack redzone," a small guard region directly after the stack. The stack test is modified to specifically check for corruption within this redzone. Signed-off-by: Andrei Vagin --- compel/include/uapi/infect.h | 9 +++ compel/src/lib/infect.c | 6 +- compel/test/stack/spy.c | 113 +---------------------------------- 3 files changed, 12 insertions(+), 116 deletions(-) diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index ed97d64dd..1f61876ff 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -13,6 +13,15 @@ #define PARASITE_START_AREA_MIN (4096) +#define PARASITE_STACK_SIZE (16 << 10) +/* + * A stack redzone is a small, protected region of memory located immediately + * after a parasite stack. It is intended to remain unchanged. While it can be + * implemented as a guard page, we want to avoid the overhead of additional + * remote system calls. 
+ */ +#define PARASITE_STACK_REDZONE 128 + extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index a9bbd6400..4ea27bc63 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -38,8 +38,6 @@ #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif -#define PARASITE_STACK_SIZE (16 << 10) - #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif @@ -1064,7 +1062,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, p += RESTORE_STACK_SIGFRAME; p += PARASITE_STACK_SIZE; - ctl->rstack = ctl->remote_map + p; + ctl->rstack = ctl->remote_map + p - PARASITE_STACK_REDZONE; /* * x86-64 ABI requires a 16 bytes aligned stack. @@ -1078,7 +1076,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, if (nr_threads > 1) { p += PARASITE_STACK_SIZE; - ctl->r_thread_stack = ctl->remote_map + p; + ctl->r_thread_stack = ctl->remote_map + p - PARASITE_STACK_REDZONE; } ret = arch_fetch_sas(ctl, ctl->rsigframe); diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c index 9b7c9a7f0..184c8ab31 100644 --- a/compel/test/stack/spy.c +++ b/compel/test/stack/spy.c @@ -50,70 +50,6 @@ static void *get_parasite_rstack_start(struct parasite_ctl *ctl) return rstack_start; } -static int page_writable(struct parasite_ctl *ctl, int pid, void *page) -{ - FILE *maps; - size_t maps_line_len = 0; - char *maps_line = NULL; - char victim_maps_path[6 + 11 + 5 + 1]; - int written; - int ret = 0; - - if (((uintptr_t)page & (page_size() - 1)) != 0) { - fprintf(stderr, "Page address not aligned\n"); - ret = -1; - goto done; - } - - written = snprintf(victim_maps_path, sizeof(victim_maps_path), "/proc/%d/maps", pid); - if (written < 0 || written >= sizeof(victim_maps_path)) { - fprintf(stderr, "Failed to create path string to victim's /proc/%d/maps file\n", pid); - ret = -1; - 
goto done; - } - - maps = fopen(victim_maps_path, "r"); - if (maps == NULL) { - perror("Can't open victim's /proc/$pid/maps"); - ret = -1; - goto done; - } - - while (getline(&maps_line, &maps_line_len, maps) != -1) { - unsigned long vmstart, vmend; - char r, w; - - if (sscanf(maps_line, "%lx-%lx %c%c", &vmstart, &vmend, &r, &w) < 4) { - fprintf(stderr, "Can't parse victim's /proc/%d/maps; line: %s\n", pid, maps_line); - ret = -1; - goto free_linebuf; - } - - if (page >= (void *)vmstart && page < (void *)vmend) { - if (w == 'w') { - if (r != 'r') { - fprintf(stderr, "Expecting writable memory to also be readable"); - ret = -1; - goto free_linebuf; - } - ret = 1; - } - break; - } - } - - if (errno) { - perror("Can't read victim's /proc/$pid/maps"); - ret = -1; - } - -free_linebuf: - free(maps_line); - fclose(maps); -done: - return ret; -} - static void *read_proc_mem(int pid, void *offset, size_t len) { char victim_mem_path[6 + 11 + 4 + 1]; @@ -153,51 +89,6 @@ freebuf: return NULL; } -static int save_data_near_stack(struct parasite_ctl *ctl, int pid, void *stack, void **saved_data, - size_t *saved_data_size) -{ - size_t page_mask = page_size() - 1; - size_t saved_size = 0; - size_t stack_size_last_page = (uintptr_t)stack & page_mask; - void *next_page = stack; - - if (stack_size_last_page != 0) { - size_t empty_space_last_page = page_size() - stack_size_last_page; - saved_size = min(empty_space_last_page, (size_t)SAVED_DATA_MAX); - next_page += page_size() - stack_size_last_page; - } - - while (saved_size < SAVED_DATA_MAX && next_page != NULL) { - switch (page_writable(ctl, pid, next_page)) { - case 1: - saved_size = min((size_t)(saved_size + page_size()), (size_t)SAVED_DATA_MAX); - next_page += page_size(); - break; - case 0: - next_page = NULL; - break; - default: - return -1; - } - } - - if (saved_size > 0) { - void *sd; - - sd = read_proc_mem(pid, stack, saved_size); - if (sd == NULL) - return -1; - - *saved_data = sd; - } else { - *saved_data = NULL; - } - - 
*saved_data_size = saved_size; - - return 0; -} - static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) { if (saved_data != NULL) { @@ -221,7 +112,7 @@ static int do_infection(int pid) struct infect_ctx *ictx; int *arg; void *stack; - size_t saved_data_size; + size_t saved_data_size = PARASITE_STACK_REDZONE; int saved_data_check; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); @@ -257,8 +148,6 @@ static int do_infection(int pid) err_and_ret("Can't register cleanup function with atexit\n"); stack = get_parasite_rstack_start(ctl); - if (save_data_near_stack(ctl, pid, stack, &saved_data, &saved_data_size)) - err_and_ret("Can't save data above stack\n"); if (compel_start_daemon(ctl)) err_and_ret("Can't start daemon in victim\n"); From 5ff52326e15b90dc59ed8ae317735201277a2377 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 27 Mar 2025 14:21:03 +0000 Subject: [PATCH 096/257] restore: use the new kernel interface to restore timers Thomas Gleixner introduced the new interface to create posix timers with specified timer IDs: https://github.com/torvalds/linux/commit/ec2d0c04624b3c8a7eb1682e006717fa20cfbe24 Previously, CRIU recreated timers by repeatedly creating and deleting them until the desired ID was reached. This approach isn't fast, especially for timers with large IDs. For example, restoring two timers with IDs 1000000 and 2000000 took approximately 1.5 seconds. The new `prctl()` based interface allows direct creation of timers with specified IDs, reducing the restoration time to around 3 microseconds for the same example. 
Signed-off-by: Andrei Vagin --- criu/cr-check.c | 10 ++++++++ criu/include/kerndat.h | 1 + criu/include/prctl.h | 7 ++++++ criu/include/restorer.h | 1 + criu/kerndat.c | 20 +++++++++++++++ criu/pie/restorer.c | 54 +++++++++++++++++++++++++++++++++++++---- criu/timer.c | 2 ++ 7 files changed, 90 insertions(+), 5 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 0388cbe7f..7b4a6415a 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1392,6 +1392,14 @@ static int check_pagemap_scan(void) return 0; } +static int check_timer_cr_ids(void) +{ + if (!kdat.has_timer_cr_ids) + return -1; + + return 0; +} + /* musl doesn't have a statx wrapper... */ struct staty { __u32 stx_dev_major; @@ -1703,6 +1711,7 @@ int cr_check(void) ret |= check_ipv6_freebind(); ret |= check_pagemap_scan(); ret |= check_overlayfs_maps(); + ret |= check_timer_cr_ids(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1825,6 +1834,7 @@ static struct feature_list feature_list[] = { { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, { "pagemap_scan", check_pagemap_scan }, + { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index e03a57341..bd8744d62 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -89,6 +89,7 @@ struct kerndat_s { bool has_pagemap_scan; bool has_shstk; bool has_close_range; + bool has_timer_cr_ids; }; extern struct kerndat_s kdat; diff --git a/criu/include/prctl.h b/criu/include/prctl.h index f5f23c969..2966659da 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -97,4 +97,11 @@ struct prctl_mm_map { #define PR_GET_THP_DISABLE 42 #endif +#ifndef PR_TIMER_CREATE_RESTORE_IDS +#define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 +#endif + #endif /* 
__CR_PRCTL_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index a4fb7ea79..56bea0fcc 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -170,6 +170,7 @@ struct task_restore_args { struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; + bool posix_timer_cr_ids; struct restore_timerfd *timerfd; unsigned int timerfd_n; diff --git a/criu/kerndat.c b/criu/kerndat.c index 5939005a4..930117b0a 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1720,6 +1720,22 @@ static int kerndat_has_close_range(void) return 0; } +static int kerndat_has_timer_cr_ids(void) +{ + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) == -1) { + if (errno == EINVAL) { + pr_debug("PR_TIMER_CREATE_RESTORE_IDS isn't supported\n"); + return 0; + } + pr_perror("prctl returned unexpected error code"); + return -1; + } + + kdat.has_timer_cr_ids = true; + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1981,6 +1997,10 @@ int kerndat_init(void) pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_timer_cr_ids()) { + pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 348ce6659..9867a3ddd 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1235,9 +1235,23 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { - int ret, i; + int ret, i, exit_code = -1; kernel_timer_t next_id = 0, timer_id; struct sigevent sev; + bool create_restore_ids = false; + + if (!args->posix_timers_n) + return 0; + + /* prctl returns EINVAL if PR_TIMER_CREATE_RESTORE_IDS isn't supported. 
*/ + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0); + if (ret == 0) { + create_restore_ids = true; + } else if (ret != -EINVAL) { + pr_err("Can't enabled PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + return -1; + } for (i = 0; i < args->posix_timers_n; i++) { sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; @@ -1249,16 +1263,36 @@ static int create_posix_timers(struct task_restore_args *args) #endif sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; + if (create_restore_ids) { + /* + * With enabled PR_TIMER_CREATE_RESTORE_IDS, the + * timer_create syscall creates a new timer with the + * specified ID. + */ + timer_id = args->posix_timers[i].spt.it_id; + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); + if (ret < 0) { + pr_err("Can't create posix timer - %d: %d\n", i, ret); + goto out; + } + if (timer_id != args->posix_timers[i].spt.it_id) { + pr_err("Unexpected timer id %u (expected %lu)\n", + timer_id, args->posix_timers[i].spt.it_id); + goto out; + } + continue; + } + while (1) { ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); - return ret; + goto out; } if (timer_id != next_id) { pr_err("Can't create timers, kernel don't give them consequently\n"); - return -1; + goto out; } next_id++; @@ -1268,12 +1302,22 @@ static int create_posix_timers(struct task_restore_args *args) ret = sys_timer_delete(timer_id); if (ret < 0) { pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); - return ret; + goto out; } } } - return 0; + exit_code = 0; +out: + if (create_restore_ids) { + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0); + if (ret != 0) { + pr_err("Can't disable PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + exit_code = -1; + } + } + return exit_code; } static void restore_posix_timers(struct task_restore_args *args) diff --git 
a/criu/timer.c b/criu/timer.c index 0413e2a72..856501be6 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -195,6 +195,7 @@ int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) if (!img) return -1; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; ta->posix_timers_n = 0; while (1) { PosixTimerEntry *pte; @@ -234,6 +235,7 @@ int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) return prepare_posix_timers_from_fd(pid, ta); ta->posix_timers_n = tte->n_posix; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; for (i = 0; i < ta->posix_timers_n; i++) { t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); if (!t) From e7aee3c5c723e95e1c0e787f4c57919c2fc58c60 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 10 Apr 2025 20:56:23 +0100 Subject: [PATCH 097/257] cuda: use pr_perror for libc function errors When handling errors for functions such as `ptrace()`, `pipe()`, and `fork()` it would be better to use `pr_perror` instead of `pr_err` as it would include a message describing the encountered error. 
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 99e4caf74..1aaad6842 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -93,7 +93,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int fd[2], buf_off; if (pipe(fd) != 0) { - pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); + pr_perror("Couldn't create pipes for reading cuda-checkpoint output"); return -1; } @@ -101,7 +101,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int child_pid = fork(); if (child_pid == -1) { - pr_err("Failed to fork to exec cuda-checkpoint\n"); + pr_perror("Failed to fork to exec cuda-checkpoint"); close(fd[READ]); close(fd[WRITE]); return -1; @@ -166,7 +166,6 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) } if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); - pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); } else if (WIFEXITED(status)) { exit_code = WEXITSTATUS(status); @@ -283,8 +282,8 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse * a compel_interrupt_task() */ if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { - pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n", - restore_tid); + pr_perror("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state", + restore_tid); return -1; } @@ -295,12 +294,12 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse } if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { - pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid); + pr_perror("Failed to set ptrace options on 
interrupt for restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { - pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid); + pr_perror("Unable to restore original sigmask to restore tid %d", restore_tid); return -1; } @@ -312,7 +311,7 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) k_rtsigset_t block; if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { - pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid); + pr_perror("Failed to get current sigmask for restore tid %d", restore_tid); return -1; } @@ -320,18 +319,18 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { - pr_err("Failed to block signals on restore tid %d\n", restore_tid); + pr_perror("Failed to block signals on restore tid %d", restore_tid); return -1; } // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) { - pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid); + pr_perror("Could not clear ptrace options on restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { - pr_err("Could not resume cuda restore tid %d\n", restore_tid); + pr_perror("Could not resume cuda restore tid %d", restore_tid); return -1; } From 6805841660e741eda203ef8339a895281f2095e9 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 10 Apr 2025 21:14:05 +0100 Subject: [PATCH 098/257] cuda: remove redundant goto label The `goto interrupt` label is unnecessary as the code directly returns after `cuda_process_checkpoint_action()`. 
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 1aaad6842..9ccb04224 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -395,12 +395,9 @@ int cuda_plugin_checkpoint_devices(int pid) status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); - goto interrupt; } -interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - return status != 0 ? -1 : int_ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); From 74799ae023f82d99efac8d67974705087f208567 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 9 Apr 2025 13:25:44 +0000 Subject: [PATCH 099/257] aarch64: fix build with missing NT_ARM_PAC_ENABLED_KEYS On a RHEL 8 based system building CRIU fails with: criu/arch/aarch64/crtools.c: In function 'save_pac_keys': criu/arch/aarch64/crtools.c:73:39: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_PACA_KEYS'? ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); ^~~~~~~~~~~~~~~~~~~~~~~ NT_ARM_PACA_KEYS criu/arch/aarch64/crtools.c:73:39: note: each undeclared identifier is reported only once for each function it appears in criu/arch/aarch64/crtools.c: In function 'arch_ptrace_restore': criu/arch/aarch64/crtools.c:261:44: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_PACA_KEYS'? if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { ^~~~~~~~~~~~~~~~~~~~~~~ NT_ARM_PACA_KEYS This adds the missing define if it is undefined. 
Signed-off-by: Adrian Reber --- criu/arch/aarch64/crtools.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 6cde03ee3..c077dd06b 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -23,6 +23,10 @@ #include "compel/infect.h" #include "pstree.h" +#ifndef NT_ARM_PAC_ENABLED_KEYS +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. */ +#endif + extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e From b9da95b0b2c5f42b24725d673bf287b3c00bbc40 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Tue, 23 Jan 2024 08:22:07 -0800 Subject: [PATCH 100/257] s390: Fix FP reg restore after parasite code runs Currently we save FP regs before parasite code runs, and restore after for --leave-running, --check-only, and in case of errors. In case of errors the error may have happened before FP regs were saved, so we should only restore them if they were actually saved. 
Signed-off-by: Younes Manton --- criu/arch/s390/crtools.c | 90 +++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 96cef819e..e08c83878 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -142,6 +142,29 @@ static void print_core_fp_regs(const char *msg, CoreEntry *core) print_core_ri_cb(core); } +/* + * Allocate floating point registers + */ +static UserS390FpregsEntry *allocate_fp_regs(void) +{ + UserS390FpregsEntry *fpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + return NULL; + user_s390_fpregs_entry__init(fpregs); + + fpregs->n_fprs = 16; + fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); + if (!fpregs->fprs) + goto fail_free_fpregs; + return fpregs; + +fail_free_fpregs: + xfree(fpregs); + return NULL; +} + /* * Allocate VxrsLow registers */ @@ -294,7 +317,13 @@ int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_stru CoreEntry *core = arg; gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; - fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + /* + * We delay allocating this until now because checkpointing can fail earlier. + * When it fails we need to know if we reached here or not so that the cleanup + * code doesn't restore FPRs that were never saved in the first place. 
+ */ + fpregs = allocate_fp_regs(); + CORE_THREAD_ARCH_INFO(core)->fpregs = fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { @@ -399,36 +428,15 @@ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) return 0; } -/* - * Allocate floating point registers - */ -static UserS390FpregsEntry *allocate_fp_regs(void) -{ - UserS390FpregsEntry *fpregs; - - fpregs = xmalloc(sizeof(*fpregs)); - if (!fpregs) - return NULL; - user_s390_fpregs_entry__init(fpregs); - - fpregs->n_fprs = 16; - fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); - if (!fpregs->fprs) - goto fail_free_fpregs; - return fpregs; - -fail_free_fpregs: - xfree(fpregs); - return NULL; -} - /* * Free floating point registers */ static void free_fp_regs(UserS390FpregsEntry *fpregs) { - xfree(fpregs->fprs); - xfree(fpregs); + if (fpregs) { + xfree(fpregs->fprs); + xfree(fpregs); + } } /* @@ -487,15 +495,17 @@ int arch_alloc_thread_info(CoreEntry *core) ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; - ti_s390->fpregs = allocate_fp_regs(); - if (!ti_s390->fpregs) - goto fail_free_gp_regs; + + /* + * Delay allocating space until needed. Checkpointing can fail before that + * and the cleanup code needs to be able to tell if FPRs were saved or not + * before trying to restore the register state. + */ + ti_s390->fpregs = NULL; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; -fail_free_gp_regs: - free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; @@ -678,14 +688,18 @@ static int set_task_regs(pid_t pid, CoreEntry *core) user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); - /* Floating point registers */ + /* + * Floating point registers + * Optional on checkpoint; checkpoint may have failed and we may reach here as part of cleanup + * so there's no guarantee that we saved FPRs for this thread. 
+ */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; - if (!cfpregs) - return -1; - fpregs.prfpreg.fpc = cfpregs->fpc; - memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); - if (set_fp_regs(pid, &fpregs) < 0) - return -1; + if (cfpregs) { + fpregs.prfpreg.fpc = cfpregs->fpc; + memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); + if (set_fp_regs(pid, &fpregs) < 0) + return -1; + } /* Vector registers (optional) */ cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (cvxrs_low != NULL) { From 5de61a721fbc56de68094f19ac34466d66f7374f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 21 Apr 2025 06:33:41 +0000 Subject: [PATCH 101/257] net: nftables: avoid restore failure if the CRIU nft table already exist CRIU locks the network during restore in an "empty" network namespace. However, "empty" in this context means CRIU isn't restoring the namespace. This network namespace can be the same namespace where processes have been dumped and so the network is already locked in it. Fixes #2650 Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 2 +- criu/include/net.h | 2 +- criu/net.c | 30 +++++++++++++++++------------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 583b446e0..30932f60a 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2119,7 +2119,7 @@ static int restore_root_task(struct pstree_item *init) * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. 
*/ - ret = network_lock_internal(); + ret = network_lock_internal(/* restore = */ true); if (ret) goto out_kill; } diff --git a/criu/include/net.h b/criu/include/net.h index 5e8a84862..7c5ede21e 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -31,7 +31,7 @@ extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); -extern int network_lock_internal(void); +extern int network_lock_internal(bool restore); extern struct ns_desc net_ns_desc; diff --git a/criu/net.c b/criu/net.c index ee46f1c49..300df480b 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3206,12 +3206,12 @@ static inline FILE *redirect_nftables_output(struct nft_ctx *nft) } #endif -static inline int nftables_lock_network_internal(void) +static inline int nftables_lock_network_internal(bool restore) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) cleanup_file FILE *fp = NULL; struct nft_ctx *nft; - int ret = 0; + int ret = 0, exit_code = -1; char table[32]; char buf[128]; @@ -3224,11 +3224,16 @@ static inline int nftables_lock_network_internal(void) fp = redirect_nftables_output(nft); if (!fp) - goto out; + goto err2; snprintf(buf, sizeof(buf), "create table %s", table); - if (NFT_RUN_CMD(nft, buf)) + ret = NFT_RUN_CMD(nft, buf); + if (ret) { + /* The network has been locked on dump. 
*/ + if (restore && errno == EEXIST) + return 0; goto err2; + } snprintf(buf, sizeof(buf), "add chain %s output { type filter hook output priority 0; policy drop; }", table); if (NFT_RUN_CMD(nft, buf)) @@ -3246,17 +3251,16 @@ static inline int nftables_lock_network_internal(void) if (NFT_RUN_CMD(nft, buf)) goto err1; - goto out; - + exit_code = 0; +out: + nft_ctx_free(nft); + return exit_code; err1: snprintf(buf, sizeof(buf), "delete table %s", table); NFT_RUN_CMD(nft, buf); err2: - ret = -1; pr_err("Locking network failed using nftables\n"); -out: - nft_ctx_free(nft); - return ret; + goto out; #else pr_err("CRIU was built without libnftables support\n"); return -1; @@ -3288,7 +3292,7 @@ static int iptables_network_lock_internal(void) return ret; } -int network_lock_internal(void) +int network_lock_internal(bool restore) { int ret = 0, nsret; @@ -3301,7 +3305,7 @@ int network_lock_internal(void) if (opts.network_lock_method == NETWORK_LOCK_IPTABLES) ret = iptables_network_lock_internal(); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) - ret = nftables_lock_network_internal(); + ret = nftables_lock_network_internal(restore); if (restore_ns(nsret, &net_ns_desc)) ret = -1; @@ -3427,7 +3431,7 @@ int network_lock(void) if (run_scripts(ACT_NET_LOCK)) return -1; - return network_lock_internal(); + return network_lock_internal(false); } void network_unlock(void) From b6dca31162562385cb0657af3443666990a28c01 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Apr 2025 14:12:31 +0100 Subject: [PATCH 102/257] aarch64/crtools: fix define for missing constants Building CRIU package on Debian 11 aarch64 fails with criu/arch/aarch64/crtools.c: In function 'save_pac_keys': criu/arch/aarch64/crtools.c:32:31: error: storage size of 'paca' isn't known struct user_pac_address_keys paca; ^~~~ criu/arch/aarch64/crtools.c:33:31: error: storage size of 'pacg' isn't known struct user_pac_generic_keys pacg; ^~~~ criu/arch/aarch64/crtools.c:47:15: error: 'HWCAP_PACA' 
undeclared (first use in this function); did you mean 'HWCAP_FCMA'? if (hwcaps & HWCAP_PACA) { ^~~~~~~~~~ HWCAP_FCMA criu/arch/aarch64/crtools.c:47:15: note: each undeclared identifier is reported only once for each function it appears in criu/arch/aarch64/crtools.c:53:44: error: 'NT_ARM_PACA_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:73:39: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function) ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); ^~~~~~~~~~~~~~~~~~~~~~~ criu/arch/aarch64/crtools.c:82:15: error: 'HWCAP_PACG' undeclared (first use in this function); did you mean 'HWCAP_AES'? if (hwcaps & HWCAP_PACG) { ^~~~~~~~~~ HWCAP_AES criu/arch/aarch64/crtools.c:88:44: error: 'NT_ARM_PACG_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:33:31: error: unused variable 'pacg' [-Werror=unused-variable] struct user_pac_generic_keys pacg; ^~~~ criu/arch/aarch64/crtools.c:32:31: error: unused variable 'paca' [-Werror=unused-variable] struct user_pac_address_keys paca; ^~~~ criu/arch/aarch64/crtools.c: In function 'arch_ptrace_restore': criu/arch/aarch64/crtools.c:227:31: error: storage size of 'upaca' isn't known struct user_pac_address_keys upaca; ^~~~~ criu/arch/aarch64/crtools.c:228:31: error: storage size of 'upacg' isn't known struct user_pac_generic_keys upacg; ^~~~~ criu/arch/aarch64/crtools.c:241:18: error: 'HWCAP_PACA' undeclared (first use in this function); did you mean 'HWCAP_FCMA'? if (!(hwcaps & HWCAP_PACA)) { ^~~~~~~~~~ HWCAP_FCMA criu/arch/aarch64/crtools.c:255:44: error: 'NT_ARM_PACA_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? 
if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:261:44: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function) if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { ^~~~~~~~~~~~~~~~~~~~~~~ criu/arch/aarch64/crtools.c:268:18: error: 'HWCAP_PACG' undeclared (first use in this function); did you mean 'HWCAP_AES'? if (!(hwcaps & HWCAP_PACG)) { ^~~~~~~~~~ HWCAP_AES criu/arch/aarch64/crtools.c:275:44: error: 'NT_ARM_PACG_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:233:6: error: variable 'ret' set but not used [-Werror=unused-but-set-variable] int ret; ^~~ criu/arch/aarch64/crtools.c:228:31: error: unused variable 'upacg' [-Werror=unused-variable] struct user_pac_generic_keys upacg; ^~~~~ criu/arch/aarch64/crtools.c:227:31: error: unused variable 'upaca' [-Werror=unused-variable] struct user_pac_address_keys upaca; ^~~~~ This patch adds the missing constants and structs if undefined. 
Signed-off-by: Radostin Stoyanov --- criu/arch/aarch64/crtools.c | 47 +++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index c077dd06b..3ed5c9d63 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -23,6 +23,45 @@ #include "compel/infect.h" #include "pstree.h" +/* + * cr_user_pac_* are a copy of the corresponding uapi structs + * in arch/arm64/include/uapi/asm/ptrace.h + */ +struct cr_user_pac_address_keys { + __uint128_t apiakey; + __uint128_t apibkey; + __uint128_t apdakey; + __uint128_t apdbkey; +}; + +struct cr_user_pac_generic_keys { + __uint128_t apgakey; +}; + +/* + * The following HWCAP constants are copied from + * arch/arm64/include/uapi/asm/hwcap.h + */ +#ifndef HWCAP_PACA +#define HWCAP_PACA (1 << 30) +#endif + +#ifndef HWCAP_PACG +#define HWCAP_PACG (1UL << 31) +#endif + +/* + * The following NT_ARM_PAC constants are copied from + * include/uapi/linux/elf.h + */ +#ifndef NT_ARM_PACA_KEYS +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#endif + +#ifndef NT_ARM_PACG_KEYS +#define NT_ARM_PACG_KEYS 0x408 +#endif + #ifndef NT_ARM_PAC_ENABLED_KEYS #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. 
*/ #endif @@ -33,8 +72,8 @@ extern unsigned long getauxval(unsigned long type); static int save_pac_keys(int pid, CoreEntry *core) { - struct user_pac_address_keys paca; - struct user_pac_generic_keys pacg; + struct cr_user_pac_address_keys paca; + struct cr_user_pac_generic_keys pacg; PacKeys *pac_entry; long pac_enabled_key; struct iovec iov; @@ -228,8 +267,8 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) int arch_ptrace_restore(int pid, struct pstree_item *item) { unsigned long hwcaps = getauxval(AT_HWCAP); - struct user_pac_address_keys upaca; - struct user_pac_generic_keys upacg; + struct cr_user_pac_address_keys upaca; + struct cr_user_pac_generic_keys upacg; PacAddressKeys *paca; PacGenericKeys *pacg; long pac_enabled_keys; From 88cb552f692353983aeab6478d1779566afd154e Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:02:46 +0800 Subject: [PATCH 103/257] mount: restore root mount flags Mount flags belong to mount and mount namespace of the Container, so we should preserve them, as Container user will not expect mounts switching between ro and rw over c/r. Fixes: #2632 v5: fix both mount-v1 and mount-v2 Signed-off-by: Pavel Tikhomirov --- criu/mount-v2.c | 6 ++++++ criu/mount.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/criu/mount-v2.c b/criu/mount-v2.c index cdebc8318..1e33ac12a 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -443,6 +443,7 @@ err: /* Mounts root container mount. 
*/ static int do_mount_root_v2(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); unsigned long flags = MS_BIND; int fd; @@ -477,6 +478,11 @@ static int do_mount_root_v2(struct mount_info *mi) return -1; } + if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + mi->mounted = true; return 0; diff --git a/criu/mount.c b/criu/mount.c index 82bbd52d6..06b959542 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -2690,9 +2690,16 @@ shared: static int do_mount_root(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; + if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + return fetch_rt_stat(mi, service_mountpoint(mi)); } From 6b3826a6fb384632dfbd6e4b90c43b15842f09f8 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:47:46 +0800 Subject: [PATCH 104/257] zdtm/lib: add "bind" desc option Add {'bind': 'path/to/bindmount'} zdtm descriptor option, so that in test mount namespace a directory bindmount can be created before running the test. This is useful to leave test directory writable (e.g. for logs) while the test makes root mount readonly. note: We create this bindmount early so that all test files are opened on it initially and not on the below mount. Will be used in mnt_ro_root test. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 3 +++ test/zdtm/lib/ns.c | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index 37ebe63b7..e3ddc762a 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -443,6 +443,7 @@ class zdtm_test: self._bins = [name] self._env = {'TMPDIR': os.environ.get('TMPDIR', '/tmp')} self._deps = desc.get('deps', []) + self._bind = desc.get('bind') self.auto_reap = True def __make_action(self, act, env=None, root=None): @@ -513,6 +514,8 @@ class zdtm_test: if self.__flavor.ns: env['ZDTM_NEWNS'] = "1" env['ZDTM_ROOT'] = self.__flavor.root + if self._bind: + env['ZDTM_BIND'] = self._bind env['ZDTM_DEV'] = self.__flavor.devpath env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 3c0dbdeb8..5fe81561f 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -28,8 +28,9 @@ extern int pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path, *dev_path; + char *root, *criu_path, *dev_path, *zdtm_bind; char path[PATH_MAX]; + char bind_path[PATH_MAX]; root = getenv("ZDTM_ROOT"); if (!root) { @@ -52,6 +53,18 @@ static int prepare_mntns(void) return -1; } + zdtm_bind = getenv("ZDTM_BIND"); + if (zdtm_bind) { + /* + * Bindmount the directory to itself. + */ + snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); + if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { + fprintf(stderr, "Can't bind-mount ZDTM_BIND: %m\n"); + return -1; + } + } + dev_path = getenv("ZDTM_DEV"); if (dev_path) { snprintf(path, sizeof(path), "%s/dev", root); From 5a725266ac83ab4dedbd11cc76c29a257c018fef Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:47:46 +0800 Subject: [PATCH 105/257] zdtm: add mnt_ro_root test It makes root mount readonly and checks that it is still readonly after migration. 
Make zdtm/static writable for logs via "bind" desc option. v2: explain why we don't have explicit rw/ro flag check v3: use new zdtm "bind" desc option Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/ns.c | 3 +++ test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_ro_root.c | 32 +++++++++++++++++++++++++++++++ test/zdtm/static/mnt_ro_root.desc | 6 ++++++ 4 files changed, 42 insertions(+) create mode 100644 test/zdtm/static/mnt_ro_root.c create mode 100644 test/zdtm/static/mnt_ro_root.desc diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 5fe81561f..822e09c92 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -57,6 +57,9 @@ static int prepare_mntns(void) if (zdtm_bind) { /* * Bindmount the directory to itself. + * e.g.: The mnt_ro_root test makes "/" mount readonly, but we + * still want to write logs to /zdtm/static/ so let's make it + * separate writable bind mount. */ snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 6a19cad3c..81e44de22 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -423,6 +423,7 @@ TST_DIR = \ mntns_ghost \ mntns_ghost01 \ mntns_ro_root \ + mnt_ro_root \ mntns_link_ghost \ mntns_shared_bind \ mntns_shared_bind02 \ diff --git a/test/zdtm/static/mnt_ro_root.c b/test/zdtm/static/mnt_ro_root.c new file mode 100644 index 000000000..2d8370150 --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.c @@ -0,0 +1,32 @@ +#include <sys/mount.h> + +#include "zdtmtst.h" + +const char *test_doc = "Check if root mount remains read-only after c/r"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* + * Note: In
zdtm.py:check_visible_state() we already check for all + * tests, that all mounts in the test's mount namespace remain the + * same, by comparing mountinfo before and after c/r. So rw/ro mount + * option inconsistency will be detected there and we don't need to + * check it in the test itself. + */ + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ro_root.desc b/test/zdtm/static/mnt_ro_root.desc new file mode 100644 index 000000000..c9a8e4f18 --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.desc @@ -0,0 +1,6 @@ +{ + 'flavor': 'ns uns', + 'flags': 'suid', + 'feature': 'mnt_id', + 'bind': 'zdtm/static', +} From b458a5c1ad71b1081b3e1fdbc51b4581faabc4cf Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 7 May 2025 14:06:55 +0100 Subject: [PATCH 106/257] sk-inet: add message how to disable MPTCP in Go With Go version 1.24, ListenConfig now uses MPTCP by default [1]. Checkpoint/restore for this protocol is not currently supported and adding support requires kernel changes that are not trivial to implement. As a result, checkpointing of many containers that run Go programs is likely to fail with the following error [2]: (00.026522) Error (criu/sk-inet.c:130): inet: Unsupported proto 262 for socket 2f9bc5 This patch adds a message with suggested workaround for this problem. 
[1] https://go.dev/doc/go1.24#netpkgnet [2] https://github.com/checkpoint-restore/criu/issues/2655 Signed-off-by: Radostin Stoyanov --- criu/sk-inet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 92f53e569..a191e78c4 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -128,6 +128,8 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); + if (proto == IPPROTO_MPTCP) + pr_err("For Go programs, consider using \"GODEBUG=multipathtcp=0\" to disable MPTCP\n"); return 0; } From 1eaa870ccebba1e067862e45b6f8887e07d61a5c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 6 May 2025 15:38:26 +0000 Subject: [PATCH 107/257] kerndat: check that hardware breakpoints work In some cases, they might not work in virtual machines if the hypervisor doesn't virtualize them. For example, they don't work in AMD SEV virtual machines if the Debug Virtualization extension isn't supported or isn't enabled in SEV_FEATURES. Fixes #2658 Signed-off-by: Andrei Vagin --- criu/cr-check.c | 17 +++++++++ criu/cr-restore.c | 3 +- criu/include/kerndat.h | 1 + criu/kerndat.c | 80 +++++++++++++++++++++++++++++++++++++++++ criu/parasite-syscall.c | 2 +- 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 7b4a6415a..9c4778490 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1589,6 +1589,17 @@ static int check_overlayfs_maps(void) return status == 0 ? 0 : -1; } +static int check_breakpoints(void) +{ + if (!kdat.has_breakpoints) { + pr_warn("Hardware breakpoints don't seem to work\n"); + return -1; + } + + return 0; +} + + static int (*chk_feature)(void); /* @@ -1616,6 +1627,7 @@ static int (*chk_feature)(void); return ret; \ } \ } while (0) + int cr_check(void) { struct ns_id *ns; @@ -1724,6 +1736,10 @@ int cr_check(void) ret |= check_autofs(); ret |= check_compat_cr(); } + /* + * Category 4 - optional. 
+ */ + check_breakpoints(); pr_msg("%s\n", ret ? CHECK_MAYBE : CHECK_GOOD); return ret; @@ -1836,6 +1852,7 @@ static struct feature_list feature_list[] = { { "pagemap_scan", check_pagemap_scan }, { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, + { "breakpoints", check_breakpoints }, { NULL, NULL }, }; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 30932f60a..cabe2f464 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1820,6 +1820,7 @@ static int restore_rseq_cs(void) static int catch_tasks(bool root_seized) { struct pstree_item *item; + bool nobp = fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints; for_each_pstree_item(item) { int status, i, ret; @@ -1847,7 +1848,7 @@ static int catch_tasks(bool root_seized) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, nobp); if (ret < 0) return -1; } diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index bd8744d62..c5deb3283 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -90,6 +90,7 @@ struct kerndat_s { bool has_shstk; bool has_close_range; bool has_timer_cr_ids; + bool has_breakpoints; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index 930117b0a..fa43f7d3f 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1736,6 +1736,83 @@ static int kerndat_has_timer_cr_ids(void) return 0; } +static void breakpoint_func(void) +{ + if (raise(SIGSTOP)) + pr_perror("Unable to kill itself with SIGSTOP"); + exit(1); +} + +/* + * kerndat_breakpoints checks that hardware breakpoints work as they should. + * In some cases, they might not work in virtual machines if the hypervisor + * doesn't virtualize them. For example, they don't work in AMD SEV virtual + * machines if the Debug Virtualization extension isn't supported or isn't + * enabled in SEV_FEATURES. 
+ */ +static int kerndat_breakpoints(void) +{ + int status, ret, exit_code = -1; + pid_t pid; + + pid = fork(); + if (pid == -1) { + pr_perror("fork"); + return -1; + } + if (pid == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + pr_perror("ptrace(PTRACE_TRACEME)"); + exit(1); + } + raise(SIGSTOP); + breakpoint_func(); + exit(1); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for initial stop"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + pr_err("Child didn't stop as expected: status=%x\n", status); + goto err; + } + ret = ptrace_set_breakpoint(pid, &breakpoint_func); + if (ret < 0) { + pr_err("Failed to set breakpoint\n"); + goto err; + } + if (ret == 0) { + pr_debug("Hardware breakpoints appear to be disabled\n"); + goto out; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for breakpoint trigger"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) { + pr_warn("Hardware breakpoints don't seem to work (status=%x)\n", status); + goto out; + } + kdat.has_breakpoints = true; +out: + exit_code = 0; +err: + if (kill(pid, SIGKILL)) { + pr_perror("Failed to kill the child process"); + exit_code = -1; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("Failed to wait for the child process"); + exit_code = -1; + } + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { + pr_err("The child exited with unexpected code: %x\n", status); + exit_code = -1; + } + return exit_code; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. 
There are cases that we cannot determine the @@ -1999,6 +2076,9 @@ int kerndat_init(void) } if (!ret && kerndat_has_timer_cr_ids()) { pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + } + if (!ret && kerndat_breakpoints()) { + pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); ret = -1; } diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 6db9d21fe..e19847b37 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -421,7 +421,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, ictx->flags |= INFECT_NO_MEMFD; if (fault_injected(FI_PARASITE_CONNECT)) ictx->flags |= INFECT_FAIL_CONNECT; - if (fault_injected(FI_NO_BREAKPOINTS)) + if (fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints) ictx->flags |= INFECT_NO_BREAKPOINTS; if (kdat.compat_cr) ictx->flags |= INFECT_COMPATIBLE; From 366d73a4c29033665d59a23d5e0f89323b5fc2b2 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Wed, 14 May 2025 19:02:06 +0200 Subject: [PATCH 108/257] make: remove checks and warnings for bsd strlcat and strlcpy In 0a7c5fd1bd8d1e49e273b51ff39af473d6c68cbc we swapped the BSD implementation of strlcat and strlcpy in favor of our own replacement. The checks and the predefined macros are not needed anymore. Signed-off-by: Lorenzo Fontana --- Makefile.config | 4 ++-- scripts/feature-tests.mak | 28 ---------------------------- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/Makefile.config b/Makefile.config index 5ab689d41..5cf4b8216 100644 --- a/Makefile.config +++ b/Makefile.config @@ -9,7 +9,7 @@ ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else - $(info Note: Building without setproctitle() and strlcpy() support.) + $(info Note: Building without setproctitle() support.) $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) 
endif @@ -84,7 +84,7 @@ endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) -FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ +FEATURES_LIST := TCP_REPAIR PTRACE_PEEKSIGINFO \ SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index fb5d2ef7a..727e9689e 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -35,34 +35,6 @@ int main(void) } endef -define FEATURE_TEST_STRLCPY - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcpy(NULL, NULL, 0); -} -endef - -define FEATURE_TEST_STRLCAT - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcat(NULL, NULL, 0); -} -endef - define FEATURE_TEST_PTRACE_PEEKSIGINFO #include From fddca67cc633b28a73bbb1bb272018f5a3a7ea74 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 11 May 2025 11:33:29 +0100 Subject: [PATCH 109/257] seize: fix pause devices for frozen containers The container checkpointing procedure in Kubernetes freezes running containers to create a consistent snapshot of both the runtime state and the rootfs of the container. However, when checkpointing a GPU container, the container must be unfrozen before invoking the cuda-checkpoint tool. This is achieved in prepare_freezer_for_interrupt_only_mode(), which needs to be called before the PAUSE_DEVICES hook. The patch introducing this functionality fixes this problem for containers with multiple processes. However, if the container has a single process, prepare_freezer_for_interrupt_only_mode() must be invoked immediately before the PAUSE_DEVICES hook. 
Fixes: #2514 Signed-off-by: Radostin Stoyanov --- criu/seize.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index f56357ac7..23f192d46 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1060,22 +1060,32 @@ int collect_pstree(void) */ alarm(opts.timeout); - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto err; - } - if (opts.freeze_cgroup && cgroup_version()) goto err; pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1); if (opts.freeze_cgroup && !compel_interrupt_only_mode) { + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (freeze_processes()) goto err; } else { if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) goto err; + + /* + * Call PAUSE_DEVICES after prepare_freezer_for_interrupt_only_mode() + * to be able to checkpoint containers in a frozen state. + */ + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; @@ -1136,4 +1146,4 @@ int checkpoint_devices(void) exit_code = 0; err: return exit_code; -} \ No newline at end of file +} From d57d40a5ad76eec0e6d09e3ad44e35922cb98ff2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 16 May 2025 12:43:14 +0100 Subject: [PATCH 110/257] sk-inet: add MPTCP definition Building CRIU on Ubuntu 20.04 fails with the following error: criu/sk-inet.c: In function 'can_dump_ipproto': criu/sk-inet.c:131:16: error: 'IPPROTO_MPTCP' undeclared (first use in this function); did you mean 'IPPROTO_MTP'? 131 | if (proto == IPPROTO_MPTCP) | ^~~~~~~~~~~~~ | IPPROTO_MTP Add definition for MPTCP to fix this error. 
Signed-off-by: Radostin Stoyanov --- criu/sk-inet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index a191e78c4..1238b03dc 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -44,6 +44,11 @@ #define PB_ALEN_INET 1 #define PB_ALEN_INET6 4 +/* Definition for older kernels without MPTCP support (e.g. Ubuntu 20.04) */ +#ifndef IPPROTO_MPTCP +#define IPPROTO_MPTCP 262 +#endif + static LIST_HEAD(inet_ports); struct inet_port { From 427c0dc27b473ead1367c417bee8aac2b39a2844 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 29 Nov 2024 02:07:38 +0000 Subject: [PATCH 111/257] criu: Introduce a new device plugin hook for restore Currently, in the target process, device-related restore operations and other restore operations almost run sequentially. When the target process executes the corresponding CRIU hook functions, it can't perform other restore operations. However, for GPU applications, some device restore operations have no logical dependencies on other common restore operations and can be parallelized with other operations to speed up the process. Instead of launching a thread in child processes for parallelization, this patch chooses to add a new hook, `POST_FORKING`, in the main CRIU process to handle these restore operations. This is because the restoration of memory state in the restore blob is one of the most time-consuming parts of all restore logic. The main CRIU process can easily parallelize these operations, whereas parallelizing in threads within child processes is challenging. - POST_FORKING *POST_FORKING: Hook to enable the main CRIU process to perform some restore operations of plugins. 
Signed-off-by: Yanning Yang --- criu/cr-restore.c | 3 +++ criu/include/criu-plugin.h | 4 ++++ criu/plugin.c | 1 + 3 files changed, 8 insertions(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index cabe2f464..9cc77b21f 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2132,6 +2132,9 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: + ret = run_plugins(POST_FORKING); + if (ret < 0 && ret != -ENOTSUP) + goto out_kill; ret = restore_wait_inprogress_tasks(); if (ret < 0) diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 392ea9f53..9fb21a449 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -60,6 +60,8 @@ enum { CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + CR_PLUGIN_HOOK__POST_FORKING = 12, + CR_PLUGIN_HOOK__MAX }; @@ -78,6 +80,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); enum { CR_PLUGIN_STAGE__DUMP, @@ -152,5 +155,6 @@ typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); typedef int(cr_plugin_resume_devices_late_t)(int pid); +typedef int(cr_plugin_post_forking_t)(void); #endif /* __CRIU_PLUGIN_H__ */ diff --git a/criu/plugin.c b/criu/plugin.c index 65e79a069..18da0499d 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -59,6 +59,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); __assign_hook(CHECKPOINT_DEVICES, 
"cr_plugin_checkpoint_devices"); + __assign_hook(POST_FORKING, "cr_plugin_post_forking"); #undef __assign_hook From 497109eb4e68caeb478dd3664b3ee1186c3baafd Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 29 Nov 2024 02:13:28 +0000 Subject: [PATCH 112/257] cr-restore: Move `cr_plugin_init` after `fdstore_init` Currently, when CRIU calls `cr_plugin_init`, `fdstore` is not initialized. However, during the plugin restore procedure, there may be some common file operations used in multiple hooks. This patch moves `cr_plugin_init` after `fdstore_init`, allowing `cr_plugin_init` to use `fdstore` to place these file operations. Signed-off-by: Yanning Yang --- criu/cr-restore.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9cc77b21f..c1d1f4b9d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2366,41 +2366,47 @@ int cr_restore_tasks(void) return 1; if (check_img_inventory(/* restore = */ true) < 0) - goto err; - - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) return -1; if (init_stats(RESTORE_STATS)) - goto err; + return -1; if (lsm_check_opts()) - goto err; + return -1; timing_start(TIME_RESTORE); if (cpu_init() < 0) - goto err; + return -1; if (vdso_init_restore()) - goto err; + return -1; if (tty_init_restore()) - goto err; + return -1; if (opts.cpu_cap & CPU_CAP_IMAGE) { if (cpu_validate_cpuinfo()) - goto err; + return -1; } if (prepare_task_entries() < 0) - goto err; + return -1; if (prepare_pstree() < 0) - goto err; + return -1; if (fdstore_init()) - goto err; + return -1; + + /* + * For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store + * its socket file descriptor. This allows the main process and the target process to + * communicate with each other through this file descriptor. Therefore, cr_plugin_init + * must be initialized after fdstore_init. 
+ */ + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; if (inherit_fd_move_to_fdstore()) goto err; From e257d04974d7945e6e3fad52b6dae39e1e711cfc Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:34:14 +0000 Subject: [PATCH 113/257] pstree: Add `has_children` function Currently, parallel restore only focuses on the single-process situation. Therefore, it needs an interface to know if there is only one process to restore. This patch adds a `has_children` function in `pstree.h` and replaces some existing implementations with this function. Signed-off-by: Yanning Yang --- criu/cr-dump.c | 2 +- criu/include/pstree.h | 1 + criu/pstree.c | 9 +++++++-- criu/seize.c | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 302078caa..b8cf7d64d 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1396,7 +1396,7 @@ static int dump_zombies(void) item->sid = pps_buf.sid; item->pgid = pps_buf.pgid; - BUG_ON(!list_empty(&item->children)); + BUG_ON(has_children(item)); if (!item->sid) { pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n", diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 1137046d4..b750a919e 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -104,6 +104,7 @@ extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; +extern bool has_children(struct pstree_item *item); extern struct pstree_item *pstree_item_next(struct pstree_item *item); #define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) diff --git a/criu/pstree.c b/criu/pstree.c index 660f1b9d9..75c2fc8d0 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -182,7 +182,7 @@ void free_pstree(struct pstree_item *root_item) struct pstree_item *item = root_item, *parent; while (item) { - if (!list_empty(&item->children)) { + if (has_children(item)) { item = 
list_first_entry(&item->children, struct pstree_item, sibling); continue; } @@ -244,10 +244,15 @@ int init_pstree_helper(struct pstree_item *ret) return 0; } +bool has_children(struct pstree_item *item) +{ + return !list_empty(&item->children); +} + /* Deep first search on children */ struct pstree_item *pstree_item_next(struct pstree_item *item) { - if (!list_empty(&item->children)) + if (has_children(item)) return list_first_entry(&item->children, struct pstree_item, sibling); while (item->parent) { diff --git a/criu/seize.c b/criu/seize.c index 23f192d46..d0cf7b36c 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1008,7 +1008,7 @@ static int collect_task(struct pstree_item *item) if (ret < 0) goto err_close; - if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) { + if ((item->pid->state == TASK_DEAD) && has_children(item)) { pr_err("Zombie with children?! O_o Run, run, run!\n"); goto err_close; } From 1fd1b670c4c536e908abfbc01ebb76377555c2e1 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:36:33 +0000 Subject: [PATCH 114/257] plugins/amdgpu: Add socket operations When enabling parallel restore, the target process and the main CRIU process need an IPC interface to communicate and transfer restore commands. This patch adds a Unix domain stream socket and stores this socket in `fdstore`.
Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 59 ++++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 6 +++ 2 files changed, 65 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_socket_utils.c create mode 100644 plugins/amdgpu/amdgpu_socket_utils.h diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c new file mode 100644 index 000000000..9e957ae54 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include + +#include "amdgpu_socket_utils.h" +#include "criu-log.h" +#include "common/scm.h" +#include "fdstore.h" +#include "util-pie.h" +#include "util.h" + +int parallel_socket_addr_len; +struct sockaddr_un parallel_socket_addr; +int parallel_socket_id = 0; + +static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); + *len = SUN_LEN(addr); + *addr->sun_path = '\0'; +} + +int install_parallel_sock(void) +{ + int ret = 0; + int sock_fd; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("socket creation failed"); + return -1; + } + + amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); + ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("bind failed"); + goto err; + } + + ret = listen(sock_fd, SOMAXCONN); + if (ret < 0) { + pr_perror("listen failed"); + goto err; + } + + parallel_socket_id = fdstore_add(sock_fd); + if (parallel_socket_id < 0) { + ret = -1; + goto err; + } +err: + close(sock_fd); + return ret; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h new file mode 100644 index 000000000..4e7aa2aa4 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -0,0 +1,6 @@ +#ifndef
__KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ + +int install_parallel_sock(void); + +#endif \ No newline at end of file From e8ba7c103a02c49bbb1435b9d54d1fee33e31a0c Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:38:48 +0000 Subject: [PATCH 115/257] plugins/amdgpu: Add parallel restore command Currently the restore of buffer object consumes a significant amount of time. However, this part has no logical dependencies with other restore operations. This patch introduces some structures and some helper functions for the target process to offload this task to the main CRIU process. Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 261 +++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 48 +++++ 2 files changed, 309 insertions(+) diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c index 9e957ae54..c8bf6d1ba 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.c +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "amdgpu_socket_utils.h" #include "criu-log.h" @@ -53,6 +54,266 @@ int install_parallel_sock(void) ret = -1; goto err; } +err: + close(sock_fd); + return ret; +} + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd) +{ + parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; + restore_entry->gpu_id = gpu_id; + restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; + restore_entry->write_offset = 0; + restore_entry->read_offset = offset; + restore_entry->size = size; + + restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; + + restore_cmd->cmd_head.entry_num += 1; + restore_cmd->cmd_head.fd_write_num += 1; +} + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd) +{ +
restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; + restore_cmd->cmd_head.gpu_num += 1; +} + +static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + return 0; +} + +static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Send parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Send dmabuf fds fail"); + return -1; + } + return 0; +} + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd; + int ret = 0; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + ret = send_metadata(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_gpu_ids(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_cmds(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_dmabuf_fds(sock_fd, restore_cmd); + +err: + close(sock_fd); + return ret; +} + +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd
*restore_cmd) +{ + restore_cmd->cmd_head.id = id; + restore_cmd->cmd_head.fd_write_num = 0; + restore_cmd->cmd_head.entry_num = 0; + restore_cmd->cmd_head.gpu_num = 0; + + restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + if (restore_cmd->gpu_ids) + xfree(restore_cmd->gpu_ids); + if (restore_cmd->fds_write) + xfree(restore_cmd->fds_write); + if (restore_cmd->entries) + xfree(restore_cmd->entries); +} + +static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +static int check_quit_cmd(parallel_restore_cmd *restore_cmd) +{ + return restore_cmd->cmd_head.fd_write_num == 0; +} + +static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Recv parallel restore command head fail"); + return -1; + } + return 0; +} + +static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Recv parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_gpu_ids(int 
sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Recv dmabuf fds fail"); + return -1; + } + return 0; +} + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd, client_fd; + int ret = 0; + + sock_fd = fdstore_get(parallel_socket_id); + if (sock_fd < 0) + return -1; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) { + ret = client_fd; + goto err_accept; + } + + ret = recv_metadata(client_fd, restore_cmd); + if (ret) { + goto err; + } + + // Return 1 to quit + if (check_quit_cmd(restore_cmd)) { + ret = 1; + goto err; + } + + ret = init_parallel_restore_cmd_by_head(restore_cmd); + if (ret) { + goto err; + } + + ret = recv_gpu_ids(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_cmds(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_dmabuf_fds(client_fd, restore_cmd); + +err: + close(client_fd); +err_accept: + close(sock_fd); + return ret; +} + +int close_parallel_restore_server(void) +{ + int sock_fd; + int ret = 0; + parallel_restore_cmd_head cmd_head; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); + if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + err: close(sock_fd); return ret; diff --git
a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h index 4e7aa2aa4..d7200c6bd 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -1,6 +1,54 @@ #ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ #define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +typedef struct { + int id; + int fd_write_num; /* The number of buffer objects to be restored. */ + int entry_num; /* The number of restore commands.*/ + int gpu_num; +} parallel_restore_cmd_head; + +typedef struct { + int gpu_id; + int minor; +} parallel_gpu_info; + +typedef struct { + int gpu_id; + int write_id; + uint64_t read_offset; + uint64_t write_offset; + uint64_t size; +} parallel_restore_entry; + +typedef struct { + parallel_restore_cmd_head cmd_head; + int *fds_write; + parallel_gpu_info *gpu_ids; + parallel_restore_entry *entries; +} parallel_restore_cmd; + +/* + * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU + * buffer object. However, initially, the ownership of these buffer objects and the metadata for + * restoration are all with the target process. Therefore, we introduce a series of functions to + * help the target process send these tasks to the main CRIU process. 
+ */ +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + int install_parallel_sock(void); +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd); + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); + +int close_parallel_restore_server(void); + #endif \ No newline at end of file From a61116fd934868fbefb4db4edc565d117389e511 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Wed, 15 Jan 2025 06:38:27 +0000 Subject: [PATCH 116/257] plugins/amdgpu: Implement parallel restore This patch implements the entire logic to enable the offloading of buffer object content restoration. The goal of this patch is to offload the buffer object content restoration to the main CRIU process so that this restoration can occur in parallel with other restoration logic (mainly the restoration of memory state in the restore blob, which is time-consuming) to speed up the restore phase. The restoration of buffer object content usually takes a significant amount of time for GPU applications, so parallelizing it with other operations can reduce the overall restore time. It has three parts: the first replaces the restoration of buffer objects in the target process by sending a parallel restore command to the main CRIU process; the second implements the POST_FORKING hook in the amdgpu plugin to enable buffer object content restoration in the main CRIU process; the third stops the parallel thread in the RESUME_DEVICES_LATE hook. This optimization only focuses on the single-process situation (common case). In other scenarios, it will turn to the original method. This is achieved with the new `parallel_disabled` flag. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 420 +++++++++++++++++++++--- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- plugins/amdgpu/amdgpu_plugin_topology.h | 1 + 4 files changed, 374 insertions(+), 51 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index a20d1d163..4bf5e499f 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc-c --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 96c086162..69194fbc7 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,11 +28,13 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" +#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +/* + * In the case of a single process (common case), this optimization can effectively + * reduce the restore latency with parallel restore. In the case of multiple processes, + * states are already restored in parallel within different processes. Therefore, this + * optimization does not introduce further improvement and will be disabled by default + * in this case. The flag, parallel_disabled, is used to control whether the + * optimization is enabled or disabled. 
+ */ +bool parallel_disabled = false; + +pthread_t parallel_thread = 0; +int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (has_children(root_item)) { + pr_info("Parallel restore disabled\n"); + parallel_disabled = true; + } else { + if (install_parallel_sock() < 0) { + pr_err("Failed to install parallel socket\n"); + return -1; + } + } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas; + struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } + int offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + if (!parallel_disabled) { + parallel_restore_cmd restore_cmd; + pr_info("Begin to send parallel restore cmd\n"); + ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); + if (ret) + goto exit_parallel; - if (!e->device_entries[i]->gpu_id) - continue; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + uint32_t target_gpu_id; + struct tp_node *dev; - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, 
e->device_entries[i]->gpu_id); + if (!e->device_entries[i]->gpu_id) + continue; - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit_parallel; + } + parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); + + for (int j = 0; j < e->num_of_bos; j++) { + if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) + continue; + if (bo_buckets[j].alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, + bo_buckets[j].size, offset, &restore_cmd); + offset += bo_buckets[j].size; + } + } + } + ret = send_parallel_restore_cmd(&restore_cmd); +exit_parallel: + free_parallel_restore_cmd(&restore_cmd); + } else { + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; goto exit; } - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; + if (!e->device_entries[i]->gpu_id) + continue; + + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is 
actual_gpu_id */ + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit; + } + + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; + + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; + } + + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; } - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; - } - thread_i++; - } + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; + } } } exit: @@ -1546,8 +1609,8 @@ exit: if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) close(bo_buckets[i].dmabuf_fd); } - - xfree(thread_datas); + if 
(thread_datas) + xfree(thread_datas); return ret; } @@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; + if (!parallel_disabled) { + pr_info("Close parallel restore server\n"); + if (close_parallel_restore_server()) { + pr_err("Close parallel restore server fail\n"); + return -1; + } + + exit_code = pthread_join(parallel_thread, NULL); + if (exit_code) { + pr_err("Failed to join parallel thread ret:%d\n", exit_code); + return -1; + } + if (parallel_thread_result) { + pr_err("Parallel restore fail\n"); + return parallel_thread_result; + } + } + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) + +int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +{ + return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); +} + +int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) +{ + int ret = 0; + int drm_fd = -1; + uint32_t major, minor; + + struct amdgpu_gpu_info gpu_info = { 0 }; + + drm_fd = open_drm_render_device(dev_minor); + if (drm_fd < 0) { + return drm_fd; + } + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); + if (ret) { + pr_perror("Failed to initialize device"); + goto err; + } + + ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto err; + } + *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + return 0; +err: + amdgpu_device_deinitialize(*h_dev); + return ret; +} + +FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) +{ + char img_path[PATH_MAX]; + size_t image_size = 0; + FILE *bo_contents_fp = NULL; + + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); + bo_contents_fp = open_img_file(img_path, false, &image_size); + if (!bo_contents_fp) { + pr_perror("Cannot fopen %s", img_path); + return NULL; + } + + if (tot_size != image_size) { + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); + fclose(bo_contents_fp); + return NULL; + } + return bo_contents_fp; +} + +struct parallel_thread_data { + pthread_t thread; + uint32_t gpu_id; + int minor; + parallel_restore_cmd *restore_cmd; + int ret; +}; + +void *parallel_restore_bo_contents(void *_thread_data) +{ + struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; + FILE *bo_contents_fp = NULL; + parallel_restore_entry *entry; + parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; + int ret = 0; + int offset = 0; + void *buffer = NULL; + + ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); + if (ret) { + goto err; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { + total_bo_size += restore_cmd->entries[i].size; + max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); + } + } + + buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); + if (bo_contents_fp == NULL) { + ret = -1; + goto err_sdma; + } + offset = ftell(bo_contents_fp); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto err_sdma; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) + continue; + + entry = &restore_cmd->entries[i]; + fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + goto err_sdma; + } + } + +err_sdma: + if (bo_contents_fp) + fclose(bo_contents_fp); + if (buffer) + xfree(buffer); + amdgpu_device_deinitialize(h_dev); +err: + thread_data->ret = ret; + return NULL; +} + +void *restore_device_parallel_worker(void *arg) +{ + while (1) { + parallel_restore_cmd restore_cmd = { 0 }; + struct parallel_thread_data *thread_datas = NULL; + int ret; + int error_occurred = 0, join_ret = 0, created_threads = 0; + + ret = recv_parallel_restore_cmd(&restore_cmd); + if (ret) { + if (ret == 1) { + *(int *)arg = 0; + goto exit; + } + goto err; + } + + thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); + if (!thread_datas) { + ret = -ENOMEM; + goto err; + } + + for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { + thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; + thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; + thread_datas[created_threads].restore_cmd = &restore_cmd; + + ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, + (void *)&thread_datas[created_threads]); + if (ret) { + pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); + error_occurred = 1; + break; + } + } + + for (int i = 0; i < created_threads; i++) { + join_ret = pthread_join(thread_datas[i].thread, NULL); + if (join_ret != 0) { + pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", + thread_datas[i].gpu_id, join_ret); + if (!error_occurred) { + ret = join_ret; + error_occurred = 1; + } + } + + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + /* Check thread return value */ + if (thread_datas[i].ret && !error_occurred) { + ret = thread_datas[i].ret; + error_occurred = 1; + } + } + + if (thread_datas) + xfree(thread_datas); +err: + free_parallel_restore_cmd(&restore_cmd); + + if (ret) { + *(int *)arg = ret; + return NULL; + } + } +exit: + return NULL; +} + +/* + * While the background thread is running, some processing functions (e.g., stop_cgroupd) + * in the main thread need to block SIGCHLD. To prevent interference from this background + * thread, SIGCHLD is blocked in this thread. 
+ */ +static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) +{ + int ret = 0; + sigset_t blockmask, oldmask; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + ret = pthread_create(newthread, NULL, f, arg); + if (ret) { + pr_err("Create worker thread fail: %d\n", ret); + return -1; + } + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + return 0; +} + +int amdgpu_plugin_post_forking(void) +{ + if (plugin_disabled) + return -ENOTSUP; + + if (parallel_disabled) + return 0; + + return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 5b4396a0c..730f2e028 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -static int open_drm_render_device(int minor) +int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index c890e3dda..e19f8e7ce 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); +int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); From 7a5b3d1f41cdc1f1f8960bf8037c1f646aada229 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:44:35 +0000 Subject: [PATCH
117/257] plugins/amdgpu: Update `README.md` and `criu-amdgpu-plugin.txt` Signed-off-by: Yanning Yang --- Documentation/criu-amdgpu-plugin.txt | 1 + plugins/amdgpu/README.md | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 68803f3db..fe76fc3bc 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer +Parallel Restore DESCRIPTION ----------- diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 1078eafe6..b808fbc4f 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,7 +3,8 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _ +_David Yat Sin _
+_Yanning Yang _ # Introduction @@ -224,6 +225,26 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* +## Restoring BO content in parallel + +Restoring the BO content is an important part in the restore of GPU state and +usually takes a significant amount of time. A possible location for this +procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook +blocks the target process from performing other restore operations, which +hinders further optimization of the restore process. + +Therefore, a new plugin hook that runs in the master restore process is +introduced, and it interacts with the `cr_plugin_restore_file` hook to complete +the restore of BO content. Specifically, the target process only needs to send +the relevant BOs to the master restore process, while this new hook handles all +the restore of buffer objects. Through this method, during the restore of the BO +content, the target process can perform other restore operations, thus +accelerating the restore procedure. This is an implementation of the gCROP +method proposed in the ACM SoCC'24 paper: [On-demand and Parallel +Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). + +*This optimization technique is enabled by the `__POST_FORKING` hook.* + ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to From ae1395de184976250b4ddd9f7213fd129b6f2e74 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 30 Apr 2025 11:39:18 +0800 Subject: [PATCH 118/257] zdtm.py: add an option to change pycriu import path By default zdtm expects that criu is built from source first and only then you can run zdtm tests against it. But what if you really want to run tests against a criu version installed on the system? 
Yes there is already a nice option for zdtm to change the criu binary it uses "--criu-bin", but it would still end up using the pycriu module from source and you would still have to build everything beforehand. Let's add an option to change the path where zdtm searches for pycriu module "--pycriu-search-path". This way we can run zdtm tests on the criu installed on the system directly without building criu from source, e.g. on Fedora it works like: test/zdtm.py run --criu-bin /usr/sbin/criu \ --pycriu-search-path /usr/lib/python3.13/site-packages \ -t zdtm/static/env00 Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index e3ddc762a..d5514af71 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -22,11 +22,11 @@ import sys import tempfile import time import uuid +import site from builtins import input, int, open, range, str, zip import yaml -import pycriu as crpc from zdtm.criu_config import criu_config # File to store content of streamed images @@ -1142,6 +1142,24 @@ class criu: self.__img_streamer_process = None self.__tls = self.__tls_options() if opts['tls'] else [] self.__criu_bin = opts['criu_bin'] + + global crpc + pycriu_search_path = opts.get('pycriu_search_path') + if pycriu_search_path: + sys.path.insert(0, pycriu_search_path) + + try: + import pycriu as crpc + if pycriu_search_path: + print(f"pycriu loaded from: {crpc.__file__}") + except ImportError: + if not pycriu_search_path: + print("Consider building CRIU or using '--pycriu-search-path' option.") + raise + finally: + if pycriu_search_path: + sys.path.pop(0) + self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = opts['pre_dump_mode'] self.__preload_libfault = bool(opts['preload_libfault']) @@ -2169,7 +2187,8 @@ class Launcher: 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 'crit_bin', 
'pre_dump_mode', 'mntns_compat_mode', - 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint') + 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint', + 'pycriu_search_path') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2860,6 +2879,9 @@ def get_cli_args(): rp.add_argument("--criu-bin", help="Path to criu binary", default='../criu/criu') + rp.add_argument("--pycriu-search-path", + help=f"Path to search for pycriu module first (e.g., {site.getsitepackages()[0]})", + default=None) rp.add_argument("--crit-bin", help="Path to crit binary", default='../crit/crit') From 1fdff7c7a6f12627212b2704db48929204f6a397 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 19 May 2025 11:53:18 +0800 Subject: [PATCH 119/257] zdtm: fix check for criu binary The opts['action'] contains actor function and not the action name, so we should compare it with a function. While on it let's also add a comment about --criu-bin option if CRIU binary is missing. Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index d5514af71..3339dd816 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1611,6 +1611,7 @@ class criu: def available(): if not os.access(opts['criu_bin'], os.X_OK): print("CRIU binary not found at %s" % opts['criu_bin']) + print("Consider building CRIU or using '--criu-bin' option.") sys.exit(1) def kill(self): @@ -2972,7 +2973,7 @@ if __name__ == '__main__': if opts['debug']: sys.settrace(traceit) - if opts['action'] == 'run': + if opts['action'] == run_tests: criu.available() for tst in test_classes.values(): tst.available() From 2b8951a9cf22c587d3ba397f9f2adc1863bd5dd3 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 16 May 2025 19:26:01 +0000 Subject: [PATCH 120/257] image: use `protoc` instead of `protoc-c` The new protoc 1.5.2 reports warnings: `protoc-c` is deprecated. Please use `protoc` instead! 
Signed-off-by: Andrei Vagin --- images/Makefile | 4 ++-- plugins/amdgpu/Makefile | 2 +- test/others/rpc/Makefile | 2 +- test/others/unix-callback/Makefile | 2 +- test/zdtm/static/Makefile | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/images/Makefile b/images/Makefile index 1e40b8a8f..d966fbfca 100644 --- a/images/Makefile +++ b/images/Makefile @@ -58,7 +58,7 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc-c happy and compile opts.proto +proto-obj-y += google/protobuf/descriptor.o # To make protoc happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -96,7 +96,7 @@ makefile-deps := Makefile $(obj)/Makefile define gen-proto-rules $(obj)/$(1).pb-c.c $(obj)/$(1).pb-c.h: $(obj)/$(1).proto $(addsuffix .pb-c.c,$(addprefix $(obj)/,$(2))) $(makefile-deps) $$(E) " PBCC " $$@ - $$(Q) protoc-c --proto_path=$(obj)/ --c_out=$(obj)/ $$< + $$(Q) protoc --proto_path=$(obj)/ --c_out=$(obj)/ $$< ifeq ($(PROTOUFIX),y) $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$@ $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.c,%.h,$$@) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 4bf5e499f..870a039cd 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -25,7 +25,7 @@ else endif criu-amdgpu.pb-c.c: criu-amdgpu.proto - protoc-c --proto_path=. --c_out=. criu-amdgpu.proto + protoc --proto_path=. --c_out=. 
criu-amdgpu.proto amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index b2f907abe..384eb0539 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -47,7 +47,7 @@ rpc_pb2.py: rpc.proto protoc --proto_path=. --python_out=. rpc.proto rpc.pb-c.c: rpc.proto - protoc-c --proto_path=. --c_out=. rpc.proto + protoc --proto_path=. --c_out=. rpc.proto clean: rm -rf build rpc.pb-c.o test-c.o test-c rpc.pb-c.c rpc.pb-c.h rpc_pb2.py rpc_pb2.pyc criu diff --git a/test/others/unix-callback/Makefile b/test/others/unix-callback/Makefile index 25bcf228b..984044077 100644 --- a/test/others/unix-callback/Makefile +++ b/test/others/unix-callback/Makefile @@ -4,7 +4,7 @@ run: all ./run.sh unix.pb-c.c: unix.proto - protoc-c --proto_path=. --c_out=. unix.proto + protoc --proto_path=. --c_out=. unix.proto unix-lib.so: unix-lib.c unix.pb-c.c gcc -g -Werror -Wall -shared -nostartfiles unix-lib.c unix.pb-c.c -o unix-lib.so -iquote ../../../criu/include -fPIC diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 81e44de22..61cacbb4e 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -734,7 +734,7 @@ criu-rtc.pb-c.c: criu-rtc.proto $(Q)echo $@ >> .gitignore $(Q)echo $(@:%.c=%.h) >> .gitignore $(E) " PBCC " $@ - $(Q)protoc-c --proto_path=. --c_out=. criu-rtc.proto + $(Q)protoc --proto_path=. --c_out=. 
criu-rtc.proto criu-rtc.so: criu-rtc.c criu-rtc.pb-c.c $(E) " LD " $@ From af5412a433c6456071b45d7753132b84a942891b Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 16:27:32 +0200 Subject: [PATCH 121/257] criu/proc_parse: support MADV_WIPEONFORK/VM_WIPEONFORK Support VM_WIPEONFORK [1] by detecting it from /proc/<pid>/smaps and setting a corresponding MADV_WIPEONFORK flag on vma. [1] https://github.com/torvalds/linux/commit/d2cd9ede6e193dd7d88b6d27399e96229a551b19 Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ criu/proc_parse.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index 8ca71fadf..a55356490 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -13,5 +13,8 @@ #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif #endif /* __CR_MMAN_H__ */ diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 99dc518a5..a97ee11d1 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -160,6 +160,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) From 6476488a510264f922568d94e1f2be2208c8b2be Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 16:32:01 +0200 Subject: [PATCH 122/257] test/zdtm/static/maps02: add MADV_WIPEONFORK testcase In addition to that I made small non-functional corrections. 
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/get_smaps_bits.c | 6 ++++++ test/zdtm/static/maps02.c | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index 31d0d92b2..d9ac8b1ce 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -18,6 +18,10 @@ #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) { char *tok; @@ -57,6 +61,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* * Anything else is just ignored. diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 29f1372c9..37c09dc71 100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -6,7 +6,11 @@ #define MADV_DONTDUMP 16 #endif -const char *test_doc = "Test shared memory with advises"; +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test private memory with advises"; const char *test_author = "Cyrill Gorcunov "; struct mmap_data { @@ -43,12 +47,12 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[5] = {}; + struct mmap_data m[6] = {}; size_t i; test_init(argc, argv); - test_msg("Alloc growsdown\n"); + test_msg("Alloc dontfork\n"); if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) return -1; @@ -64,10 +68,14 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) return -1; - test_msg("Alloc dontfork/random|mergeable\n"); + test_msg("Alloc mergeable\n"); if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, 
MADV_MERGEABLE)) return -1; + test_msg("Alloc wipeonfork\n"); + if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) From b90cfc1a80f69ff1fc7595c349c81d73e7f7ccc0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 17:11:28 +0200 Subject: [PATCH 123/257] criu/proc_parse: support MAP_DROPPABLE mappings Support MAP_DROPPABLE [1] by detecting it from /proc//smaps and restoring it as a normal private mapping flag on vma with only difference that instead of MAP_PRIVATE we should use MAP_DROPPABLE. [1] https://github.com/torvalds/linux/commit/9651fcedf7b92d3f7f1ab179e8ab55b85ee10fc1 Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ criu/mem.c | 12 ++++++++++++ criu/proc_parse.c | 16 ++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index a55356490..086753bcf 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -4,6 +4,9 @@ #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif diff --git a/criu/mem.c b/criu/mem.c index c9578ef44..803cb545b 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -10,6 +10,7 @@ #include "cr_options.h" #include "servicefd.h" #include "mem.h" +#include "mman.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" @@ -398,6 +399,17 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str if (vma_entry_is(vma->e, VMA_AREA_VVAR)) return 0; + /* + * 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") + * tells us that: + * Under memory pressure, mm can just drop the pages (so that they're + * zero when read back again). 
+ * + * Let's just skip the page dump logic for MAP_DROPPABLE mappings. + */ + if (vma->e->flags & MAP_DROPPABLE) + return 0; + /* * To facilitate any combination of pre-dump modes to run after * one another, we need to take extra care as discussed below. diff --git a/criu/proc_parse.c b/criu/proc_parse.c index a97ee11d1..d7eb25662 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -144,6 +144,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -206,6 +208,20 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) if (vma_area->e->madv) vma_area->e->has_madv = true; + + /* + * We set MAP_PRIVATE flag on vma_area->e->flags right after parsing + * the first line of a VMA entry in /proc/<pid>/smaps file: + * 7fa84fa70000-7fa84fa95000 rw-p 00000000 00:00 0 + * but it's too early and we can't distinguish between MAP_DROPPABLE + * and MAP_PRIVATE mappings yet, as they are both private mappings in nature + * and at this point we haven't yet read "VmFlags:" line in smaps. + * + * Let's detect this situation and drop MAP_PRIVATE flag while keeping + * MAP_DROPPABLE, otherwise restorer's restore_mapping() helper will fail. 
+ */ + if ((vma_area->e->flags & MAP_PRIVATE) && (vma_area->e->flags & MAP_DROPPABLE)) + vma_area->e->flags &= ~MAP_PRIVATE; } static inline int is_anon_shmem_map(dev_t dev) From 4f9dcfb9c8dc1d2c6bb07ffc63722c70b8b50796 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 18:55:46 +0200 Subject: [PATCH 124/257] pycriu/images/pb2dict: add MAP_DROPPABLE flag Signed-off-by: Alexander Mikhalitsyn --- lib/pycriu/images/pb2dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index e3dd95ac0..6c4f68889 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -83,6 +83,7 @@ mmap_prot_map = [ mmap_flags_map = [ ('MAP_SHARED', 0x1), ('MAP_PRIVATE', 0x2), + ('MAP_DROPPABLE', 0x08), ('MAP_ANON', 0x20), ('MAP_GROWSDOWN', 0x0100), ] From dfa0ce1808fb1e3a1439392a6aec8071643ad2c0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 17:21:23 +0200 Subject: [PATCH 125/257] test/zdtm/static/maps02: add MAP_DROPPABLE testcase Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/get_smaps_bits.c | 6 ++++++ test/zdtm/static/maps02.c | 20 +++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index d9ac8b1ce..3d952ac95 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -6,6 +6,10 @@ #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -45,6 +49,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 37c09dc71..38244f020 
100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -2,6 +2,10 @@ #include "zdtmtst.h" #include "get_smaps_bits.h" +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif @@ -27,8 +31,14 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) { m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, flags, -1, 0); if (m->start == MAP_FAILED) { - pr_perror("mmap failed"); - return -1; + if (errno == EINVAL) { + test_msg("mmap failed, no kernel support\n"); + *m = (struct mmap_data){}; + return 0; + } else { + pr_perror("mmap failed"); + return -1; + } } if (madvise(m->start, MEM_SIZE, adv)) { @@ -47,7 +57,7 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[6] = {}; + struct mmap_data m[7] = {}; size_t i; test_init(argc, argv); @@ -76,6 +86,10 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) return -1; + test_msg("Alloc droppable\n"); + if (alloc_anon_mmap(&m[6], MAP_DROPPABLE | MAP_ANONYMOUS, MADV_NORMAL)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) From 5f18ca1bbe34a287af4dc4b0e7900253c3c71d51 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 21:11:29 +0200 Subject: [PATCH 126/257] test/zdtm/static: add maps11 test for MAP_DROPPABLE/MADV_WIPEONFORK In this test we want to ensure that contents of droppable mappings and mappings with MADV_WIPEONFORK is properly restored in parent/child processes. 
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/Makefile | 1 + test/zdtm/static/maps11.c | 205 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 test/zdtm/static/maps11.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 61cacbb4e..34fc90513 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -150,6 +150,7 @@ TST_NOFILE := \ maps05 \ maps09 \ maps10 \ + maps11 \ mlock_setuid \ xids00 \ groups \ diff --git a/test/zdtm/static/maps11.c b/test/zdtm/static/maps11.c new file mode 100644 index 000000000..df309714b --- /dev/null +++ b/test/zdtm/static/maps11.c @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test MAP_DROPPABLE/MADV_WIPEONFORK mappings with 2 processes"; +const char *test_author = "Alexander Mikhalitsyn "; + +bool mem_is_zero(const uint8_t *buffer, size_t length) +{ + size_t i; + + for (i = 0; i < length; i++) + if (buffer[i] != 0) + return false; + + return true; +} + +int main(int argc, char **argv) +{ + uint8_t *p1, *p2; + pid_t pid; + int status; + const char data[] = "MADV_WIPEONFORK vma data"; + bool criu_was_there = false; + struct stat st1, st2; + + test_init(argc, argv); + + p1 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_DROPPABLE | MAP_ANONYMOUS, 0, 0); + if (p1 == MAP_FAILED) { + if (errno == EINVAL) { + skip("mmap failed, no kernel support for MAP_DROPPABLE\n"); + goto skip; + } else { + pr_perror("mmap failed"); + return -1; + } + } + + p2 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (p2 == MAP_FAILED) { + pr_perror("mmap failed"); + return 1; + } + + if (madvise(p2, sizeof(data), MADV_WIPEONFORK)) { + pr_perror("madvise failed"); + return -1; + } + + /* contents of this mapping is 
supposed to be dropped after C/R */ + memcpy(p1, data, sizeof(data)); + + /* contents of this mapping is supposed to be dropped after fork() */ + memcpy(p2, data, sizeof(data)); + + /* + * Let's spawn a process before C/R so our mappings get inherited + * then, after C/R we need to ensure that CRIU memory premapping + * machinery works properly. + * + * It is important, because we restore MADV_WIPEONFORK on a later + * stages (after vma premapping happens) and we need to ensure that + * CRIU handles everything in a right way. + */ + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + test_waitsig(); + + /* + * Both mappings have VM_WIPEONFORK flag set, + * so we expect to have it null-ified after fork(). + */ + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("1st child: memory check failed\n"); + return 1; + } + + return 0; + } + + /* + * A simple way to detect if C/R happened is to compare st_ino + * fields of stat() on the procfs files of the current task. + * + * Hopefully, this terrible hack is never used in real-world + * applications ;-) Here, we only need this to make test + * to pass with/without --nocr option. + */ + if (stat("/proc/self/status", &st1)) { + pr_perror("stat"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* signal a child process to continue */ + if (kill(pid, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("1st waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("1st process didn't exit cleanly: status=%d", status); + goto err; + } + + if (stat("/proc/self/status", &st2)) { + pr_perror("stat"); + return 1; + } + + /* detect CRIU */ + criu_was_there = st1.st_ino != st2.st_ino; + + /* + * We should mark failure if one of the following happens: + * 1. MAP_DROPPABLE memory is not zero after C/R + * 2. 
MAP_DROPPABLE memory somehow changed without C/R + * (kernel issue? memory pressure?) + * 3. MADV_WIPEONFORK memory is not preserved + * + * We care about 2nd case only because we would like test + * to pass even with --nocr zdtm.py option. + */ + if ((criu_was_there && !mem_is_zero(p1, sizeof(data))) || + (!criu_was_there && memcmp(p1, data, sizeof(data))) || + memcmp(p2, data, sizeof(data))) { + fail("Data mismatch"); + return 1; + } + + /* contents of these mappings is supposed to be dropped after fork() */ + memcpy(p1, data, sizeof(data)); + memcpy(p2, data, sizeof(data)); + + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("2nd child: memory check failed\n"); + return 1; + } + + return 0; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("2nd waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("2nd process didn't exit cleanly: status=%d", status); + goto err; + } + + pass(); + + return 0; +err: + if (waitpid(-1, NULL, WNOHANG) == 0) { + kill(pid, SIGTERM); + wait(NULL); + } + return 1; + +skip: + test_daemon(); + test_waitsig(); + pass(); + return 0; +} From fbfed312e086b79bcddd30dbc368c16f2ca43310 Mon Sep 17 00:00:00 2001 From: Prajwal S N Date: Mon, 14 Apr 2025 14:06:40 +0530 Subject: [PATCH 127/257] feat: introduce Nix flake CRIU currently requires a number of dependencies in order to build from source. The package names vary across distributions and package managers. A Nix flake allows developers to spin up a dev environment with `nix develop`, eliminating the hassle of manual dependency management. It also prevents polluting the global package set on the machine. 
Signed-off-by: Prajwal S N --- CONTRIBUTING.md | 2 +- flake.lock | 61 +++++++++++++++++++++++++++++++++++++++ flake.nix | 77 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 37965e5fb..712e7b813 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -34,7 +34,7 @@ To clone CRIU repo and switch to the proper branch, run: ### Compile -First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. Alternatively, you can use the Nix flake to set up a development environment by running `nix develop`. To compile CRIU, run: diff --git a/flake.lock b/flake.lock new file mode 100644 index 000000000..90c914452 --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1744463964, + "narHash": "sha256-LWqduOgLHCFxiTNYi3Uj5Lgz0SR+Xhw3kr/3Xd0GPTM=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "2631b0b7abcea6e640ce31cd78ea58910d31e650", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": 
"sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 000000000..dc2429ffc --- /dev/null +++ b/flake.nix @@ -0,0 +1,77 @@ +{ + description = "CRIU development environment"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + + # Dependencies for CRIU + criuDeps = with pkgs; [ + # Compiler and build essentials + gcc + gnumake + pkg-config + + # Protocol Buffers + protobuf + protobufc + python3Packages.protobuf + + # Other required libraries + libuuid + libbsd + iproute2 + nftables + libcap + libnet + libnl + libaio + gnutls + libdrm + + # ZDTM + python3Packages.pyyaml + ]; + + # Multilib support for 32-bit compatibility + # criuDeps32bit = with pkgs; [ + # glibc.dev + # glibc + # gcc-unwrapped + # ]; + + devShell = pkgs.mkShell { + buildInputs = criuDeps; # ++ (if pkgs.stdenv.isx86_64 then criuDeps32bit else []); + + shellHook = '' + echo "CRIU development environment" + echo "==============================" + echo "" + echo "Useful commands:" + echo " make - Build CRIU" + echo " make test - Run tests (requires ZDTM dependencies)" + echo "" + ''; + + # Add proper flags for multilib support + # NIX_CFLAGS_COMPILE = pkgs.lib.optional pkgs.stdenv.isx86_64 "-m32"; + + # Make sure the shell can find headers for multilib + # PKG_CONFIG_PATH = pkgs.lib.makeSearchPath "lib/pkgconfig" criuDeps; + }; + in + { + # Export the development shell + devShells.default = devShell; + + # Build CRIU package as well + packages.default = pkgs.criu; + } + ); +} From 
fcbaac0598e5be2cb87bf19341a0705fcae98259 Mon Sep 17 00:00:00 2001 From: Liana Koleva <43767763+lianakoleva@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:41:51 +0000 Subject: [PATCH 128/257] crtools: simplify check for cpuinfo subcommands The cpuinfo command requires a "dump" or "check" subcommand. Thus, we replace `CR_CPUINFO` with `CR_CPUINFO_DUMP` and `CR_CPUINFO_CHECK`. This allows us to remove unnecessary subcommand check in `image_dir_mode()` and perform all parsing in `parse_criu_mode()`. With this change the check for validating the cpuinfo subcommand is now done only once with `CR_CPUINFO_DUMP` or `CR_CPUINFO_CHECK` enum. Signed-off-by: Liana Koleva <43767763+lianakoleva@users.noreply.github.com> Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 2 +- criu/crtools.c | 57 ++++++++++++++++++++------------------- criu/include/cr_options.h | 3 ++- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index b9d11ced2..d8c5967bc 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -1261,7 +1261,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (pid == 0) { int ret = 1; - opts.mode = CR_CPUINFO; + opts.mode = (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ? 
CR_CPUINFO_DUMP : CR_CPUINFO_CHECK; if (setup_opts_from_req(sk, msg->opts)) goto cout; diff --git a/criu/crtools.c b/criu/crtools.c index 6f493850b..4734c90f2 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -54,19 +54,17 @@ void flush_early_log_to_stderr(void) flush_early_log_buffer(STDERR_FILENO); } -static int image_dir_mode(char *argv[], int optind) +static int image_dir_mode(void) { switch (opts.mode) { case CR_DUMP: /* fallthrough */ + case CR_CPUINFO_DUMP: + /* fallthrough */ case CR_PRE_DUMP: return O_DUMP; case CR_RESTORE: return O_RSTR; - case CR_CPUINFO: - if (!strcmp(argv[optind + 1], "dump")) - return O_DUMP; - /* fallthrough */ default: return -1; } @@ -76,7 +74,7 @@ static int image_dir_mode(char *argv[], int optind) return -1; } -static int parse_criu_mode(char *mode) +static int parse_criu_mode(char *mode, char *subcommand) { if (!strcmp(mode, "dump")) opts.mode = CR_DUMP; @@ -96,8 +94,12 @@ static int parse_criu_mode(char *mode) opts.mode = CR_SWRK; else if (!strcmp(mode, "dedup")) opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo")) - opts.mode = CR_CPUINFO; + else if (!strcmp(mode, "cpuinfo") && subcommand == NULL) + return -2; + else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; else if (!strcmp(mode, "exec")) opts.mode = CR_EXEC_DEPRECATED; else if (!strcmp(mode, "show")) @@ -115,6 +117,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; + char *subcommand; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -165,9 +168,15 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - if (parse_criu_mode(argv[optind])) { + has_sub_command = (argc - optind) > 1; + subcommand = has_sub_command ? 
argv[optind + 1] : NULL; + ret = parse_criu_mode(argv[optind], subcommand); + if (ret == -1) { pr_err("unknown command: %s\n", argv[optind]); goto usage; + } else if (ret == -2) { + pr_err("cpuinfo requires an action: dump or check\n"); + goto usage; } /* * util_init initializes criu_run_id and compel_run_id so that sockets @@ -223,25 +232,20 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else { + } else if (opts.mode != CR_CPUINFO_DUMP && opts.mode != CR_CPUINFO_CHECK && has_sub_command) { /* No subcommands except for cpuinfo and restore --exec-cmd */ - if (opts.mode != CR_CPUINFO && has_sub_command) { - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); - goto usage; - } else if (opts.mode == CR_CPUINFO && !has_sub_command) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", argv[optind]); + goto usage; } - if (opts.stream && image_dir_mode(argv, optind) == -1) { + if (opts.stream && image_dir_mode() == -1) { pr_err("--stream cannot be used with the %s command\n", argv[optind]); goto usage; } /* We must not open imgs dir, if service is called */ if (opts.mode != CR_SERVICE) { - ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); + ret = open_image_dir(opts.imgs_dir, image_dir_mode()); if (ret < 0) { pr_err("Couldn't open image dir %s\n", opts.imgs_dir); return 1; @@ -335,15 +339,12 @@ int main(int argc, char *argv[], char *envp[]) if (opts.mode == CR_DEDUP) return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO) { - if (!argv[optind + 1]) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } - if (!strcmp(argv[optind + 1], "dump")) - return cpuinfo_dump(); - else if (!strcmp(argv[optind + 1], "check")) - return cpuinfo_check(); + if (opts.mode == CR_CPUINFO_DUMP) { + return cpuinfo_dump(); + } + + if (opts.mode == CR_CPUINFO_CHECK) { + return cpuinfo_check(); } if (opts.mode == CR_EXEC_DEPRECATED) { diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index ab0bd8fa3..4df8056b7 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -125,7 +125,8 @@ enum criu_mode { CR_SERVICE, CR_SWRK, CR_DEDUP, - CR_CPUINFO, + CR_CPUINFO_DUMP, + CR_CPUINFO_CHECK, CR_EXEC_DEPRECATED, CR_SHOW_DEPRECATED, }; From 99ba6db89b288b81beff3bfeace72552dedf5579 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 20 May 2025 14:47:55 +0000 Subject: [PATCH 129/257] crtools: do a few minor cleanups Signed-off-by: Andrei Vagin --- criu/crtools.c | 142 +++++++++++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 69 deletions(-) diff --git a/criu/crtools.c b/criu/crtools.c index 4734c90f2..509e73d74 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -74,40 +74,55 @@ static int image_dir_mode(void) return -1; } -static int parse_criu_mode(char *mode, char 
*subcommand) -{ - if (!strcmp(mode, "dump")) - opts.mode = CR_DUMP; - else if (!strcmp(mode, "pre-dump")) - opts.mode = CR_PRE_DUMP; - else if (!strcmp(mode, "restore")) - opts.mode = CR_RESTORE; - else if (!strcmp(mode, "lazy-pages")) - opts.mode = CR_LAZY_PAGES; - else if (!strcmp(mode, "check")) - opts.mode = CR_CHECK; - else if (!strcmp(mode, "page-server")) - opts.mode = CR_PAGE_SERVER; - else if (!strcmp(mode, "service")) - opts.mode = CR_SERVICE; - else if (!strcmp(mode, "swrk")) - opts.mode = CR_SWRK; - else if (!strcmp(mode, "dedup")) - opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo") && subcommand == NULL) - return -2; - else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "dump")) - opts.mode = CR_CPUINFO_DUMP; - else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "check")) - opts.mode = CR_CPUINFO_CHECK; - else if (!strcmp(mode, "exec")) - opts.mode = CR_EXEC_DEPRECATED; - else if (!strcmp(mode, "show")) - opts.mode = CR_SHOW_DEPRECATED; - else - return -1; +struct { + char *cmd; + int mode; +} commands[] = { + { "dump", CR_DUMP }, + { "pre-dump", CR_PRE_DUMP }, + { "restore", CR_RESTORE }, + { "lazy-pages", CR_LAZY_PAGES }, + { "check", CR_CHECK }, + { "page-server", CR_PAGE_SERVER }, + { "service", CR_SERVICE }, + { "swrk", CR_SWRK }, + { "dedup", CR_DEDUP }, + { "exec", CR_EXEC_DEPRECATED }, + { "show", CR_SHOW_DEPRECATED }, +}; - return 0; +static int parse_criu_mode(int argc, char **argv, int *optind) +{ + char *cmd = argv[*optind]; + bool has_sub_command = (argc - *optind) > 1; + char *subcommand = has_sub_command ? 
argv[*optind + 1] : NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(cmd, commands[i].cmd)) + continue; + opts.mode = commands[i].mode; + return 0; + } + + if (!strcmp(cmd, "cpuinfo")) { + if (subcommand == NULL) { + pr_err("cpuinfo requires an action: dump or check\n"); + return -1; + } + if (!strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; + else { + pr_err("unknown cpuinfo sub-command: %s\n", subcommand); + return -1; + } + (*optind)++; + return 0; + } + pr_err("unknown command: %s\n", argv[*optind]); + return -1; } int main(int argc, char *argv[], char *envp[]) @@ -117,7 +132,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; - char *subcommand; + char *cmd; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -168,16 +183,11 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - has_sub_command = (argc - optind) > 1; - subcommand = has_sub_command ? 
argv[optind + 1] : NULL; - ret = parse_criu_mode(argv[optind], subcommand); - if (ret == -1) { - pr_err("unknown command: %s\n", argv[optind]); + cmd = argv[optind]; + ret = parse_criu_mode(argc, argv, &optind); + if (ret) goto usage; - } else if (ret == -2) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } + /* * util_init initializes criu_run_id and compel_run_id so that sockets * are generated with an unique name identifying the specific process @@ -232,14 +242,13 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else if (opts.mode != CR_CPUINFO_DUMP && opts.mode != CR_CPUINFO_CHECK && has_sub_command) { - /* No subcommands except for cpuinfo and restore --exec-cmd */ - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); + } else if (has_sub_command) { + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", cmd); goto usage; } if (opts.stream && image_dir_mode() == -1) { - pr_err("--stream cannot be used with the %s command\n", argv[optind]); + pr_err("--stream cannot be used with the %s command\n", cmd); goto usage; } @@ -290,14 +299,13 @@ int main(int argc, char *argv[], char *envp[]) if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); - if (opts.mode == CR_DUMP) { + switch (opts.mode) { + case CR_DUMP: if (!opts.tree_id) goto opt_pid_missing; return cr_dump_tasks(opts.tree_id); - } - - if (opts.mode == CR_PRE_DUMP) { + case CR_PRE_DUMP: if (!opts.tree_id) goto opt_pid_missing; @@ -307,9 +315,7 @@ int main(int argc, char *argv[], char *envp[]) } return cr_pre_dump_tasks(opts.tree_id) != 0; - } - - if (opts.mode == CR_RESTORE) { + case CR_RESTORE: if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); @@ -322,43 +328,41 @@ int main(int argc, char *argv[], char *envp[]) } return ret != 0; - } - if (opts.mode == CR_LAZY_PAGES) + case CR_LAZY_PAGES: return cr_lazy_pages(opts.daemon_mode) != 0; - if (opts.mode == CR_CHECK) + case CR_CHECK: return cr_check() != 0; - if (opts.mode == CR_PAGE_SERVER) + case CR_PAGE_SERVER: return cr_page_server(opts.daemon_mode, false, -1) != 0; - if (opts.mode == CR_SERVICE) + case CR_SERVICE: return cr_service(opts.daemon_mode); - if (opts.mode == CR_DEDUP) + case CR_DEDUP: return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO_DUMP) { + case CR_CPUINFO_DUMP: return cpuinfo_dump(); - } - if (opts.mode == CR_CPUINFO_CHECK) { + case CR_CPUINFO_CHECK: return cpuinfo_check(); - } - if (opts.mode == CR_EXEC_DEPRECATED) { + case CR_EXEC_DEPRECATED: pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; - } - if (opts.mode == CR_SHOW_DEPRECATED) { + case CR_SHOW_DEPRECATED: pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; - } - pr_err("unknown command: %s\n", 
argv[optind]); + case CR_UNSET: + default: + pr_err("unknown command: %s\n", cmd); + } usage: pr_msg("\n" "Usage:\n" From a79b33d0c5f5a56c58cb1201f2b5dfa9aed159bc Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 23 May 2025 08:33:20 +0100 Subject: [PATCH 130/257] cpuinfo: show error when image is missing The `criu cpuinfo check` command calls cpu_validate_cpuinfo(), which attempts to open the cpuinfo.img file using `open_image()`. If the image file is not found, `open_image()` returns an "empty image" object. As a result, `cpu_validate_cpuinfo()` tries to read from it and fails with the following error: (00.002473) Error (criu/protobuf.c:72): Unexpected EOF on (empty-image) This patch adds a check for an empty image and appropriate error message. Signed-off-by: Radostin Stoyanov --- criu/arch/ppc64/cpu.c | 6 ++++++ criu/arch/s390/cpu.c | 6 ++++++ criu/arch/x86/cpu.c | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/criu/arch/ppc64/cpu.c b/criu/arch/ppc64/cpu.c index bb5b7256e..b87230f40 100644 --- a/criu/arch/ppc64/cpu.c +++ b/criu/arch/ppc64/cpu.c @@ -64,6 +64,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/s390/cpu.c b/criu/arch/s390/cpu.c index 3f430f455..e227fad5e 100644 --- a/criu/arch/s390/cpu.c +++ b/criu/arch/s390/cpu.c @@ -87,6 +87,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + ret = 0; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index dfa31569f..2e1f2de9a 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -407,6 +407,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return 
-1; + } + if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; From 922754dffd9efd99b051215c477e0bf6d70562aa Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Tue, 6 May 2025 22:40:25 -0400 Subject: [PATCH 131/257] rpc/log: return first error always Use shared first error buffer to return correct first error in rpc. Fixes: #338 Signed-off-by: Ivan Pravdin --- criu/cr-service.c | 24 +++++++++++++++++++++++- criu/log.c | 4 ++++ test/others/rpc/errno.py | 22 +++++++++++++++++++++- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index d8c5967bc..a1089ad5c 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -895,6 +895,11 @@ static int check(int sk, CriuOpts *req) resp.type = CRIU_REQ_TYPE__CHECK; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -919,6 +924,7 @@ static int check(int sk, CriuOpts *req) resp.success = true; out: + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -927,6 +933,11 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) int pid, status; bool success = false; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1005,6 +1016,11 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; struct ps_info info; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; @@ -1078,6 +1094,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -1252,6 +1269,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) bool success = false; int pid, status; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto 
out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1301,7 +1323,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) out: resp.type = msg->type; resp.success = success; - + set_resp_err(&resp); return send_criu_msg(sk, &resp); } diff --git a/criu/log.c b/criu/log.c index 70e267fd6..a02a8df20 100644 --- a/criu/log.c +++ b/criu/log.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -114,6 +115,9 @@ static struct str_and_lock *first_err; int log_keep_err(void) { + if (first_err) + return 0; + first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index 4ea6c9d44..a5a3eb54d 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -40,7 +40,7 @@ class test: resp.ParseFromString(self.s.recv(self._MAX_MSG_SIZE)) return resp - def check_resp(self, resp, typ, err): + def check_resp(self, resp, typ, err, errmsg = None): if resp.type != typ: raise Exception('Unexpected response type ' + str(resp.type)) @@ -49,6 +49,9 @@ class test: if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) + + if errmsg and errmsg not in resp.cr_errmsg: + raise Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') def no_process(self): print('Try to dump unexisting process') @@ -131,12 +134,29 @@ class test: self.check_resp(resp, rpc.EMPTY, None) print('Success') + + def child_first_err(self): + print('Receive correct first error message') + + req = self.get_base_req() + req.type = rpc.CHECK + + # mntns_compat_mode options is only allowed on restore + req.opts.mntns_compat_mode = True + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.CHECK, None, "Option --mntns-compat-mode is only valid on restore\n") + + print('Success') def run(self): self.no_process() self.process_exists() self.bad_options() self.bad_request() + self.child_first_err() t = test() From 
4c7d42f67a0da8fbd60b811c58f2d18950d88d1a Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 13:24:11 +0800 Subject: [PATCH 132/257] ipc/sysctl: fix CTL_FLAGS_IPC_EACCES_SKIP by making it a flag Having CTL_FLAGS_IPC_EACCES_SKIP == (CTL_FLAGS_OPTIONAL | CTL_FLAGS_READ_EIO_SKIP) is probably not what we want. So let's make it a real distinct flag. Fixes: 840735aa0 ("ipc_sysctl: Prioritize restoring IPC variables using non usernsd approach") Signed-off-by: Pavel Tikhomirov --- criu/include/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index cb3eba817..2d689a9a0 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -37,6 +37,6 @@ enum { #define CTL_FLAGS_OPTIONAL 1 #define CTL_FLAGS_HAS 2 #define CTL_FLAGS_READ_EIO_SKIP 4 -#define CTL_FLAGS_IPC_EACCES_SKIP 5 +#define CTL_FLAGS_IPC_EACCES_SKIP 8 #endif /* __CR_SYSCTL_H__ */ From 4f057a6aeb6ced50ec412e425cd214975ceea42b Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 13:34:19 +0800 Subject: [PATCH 133/257] net/sysctl: fix misprint in an error message Fixes: f38e58836 ("net/sysctl: c/r ipv4/ping_group_range value") Signed-off-by: Pavel Tikhomirov --- criu/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 300df480b..e5d2f1c4d 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2147,7 +2147,7 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) size_t n = *pn; if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("unix: Unexpected entries in sysctlig (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + pr_err("unix: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); return -EINVAL; } From 45d09ae17e9524250e31750abdc34e9a34710e94 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 10 Jun 2025 11:33:59 +0800 Subject: [PATCH 134/257] net/sysctl: fix broken ipv4_sysctls_op We have ability to skip sysctl 
if there is no value, but we still give n requests to sysctl_op, that is not correct and probably can segfault on nullptr access. Fix it by adding ri to count non skipped requests. To be on the safe side, let's add a check that ri == n on read, as we should not do any skips there. While on it lets fix bad error message prefix: s/unix/ipv4/. Remove excess has_iarg set, and add sarg reset to NULL for the case sysctl_op skipped it. Signed-off-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/net.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/criu/net.c b/criu/net.c index e5d2f1c4d..2c018ef7b 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2144,51 +2144,53 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; SysctlEntry **sysctl = *rsysctl; - size_t n = *pn; + size_t n = *pn, ri; if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("unix: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + pr_err("ipv4: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < n; i++) { - snprintf(path[i], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); - req[i].name = path[i]; - req[i].flags = flags; + for (i = 0, ri = 0; i < n; i++) { + snprintf(path[ri], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); + req[ri].name = path[ri]; + req[ri].flags = flags; switch (sysctl[i]->type) { case SYSCTL_TYPE__CTL_STR: - req[i].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); + req[ri].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); /* skip write if have no value */ if (op == CTL_WRITE && !sysctl[i]->sarg) continue; - req[i].arg = sysctl[i]->sarg; + req[ri].arg = sysctl[i]->sarg; break; default: pr_err("ipv4: Unknown 
sysctl type %d\n", sysctl[i]->type); return -1; } + ri++; } - ret = sysctl_op(req, n, op, CLONE_NEWNET); + ret = sysctl_op(req, ri, op, CLONE_NEWNET); if (ret < 0) { - pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); + pr_err("ipv4: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); return -1; } if (op == CTL_READ) { bool has_entries = false; + BUG_ON(ri != n); for (i = 0; i < n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { - sysctl[i]->has_iarg = true; - if (!has_entries) - has_entries = true; + has_entries = true; + } else { + sysctl[i]->sarg = NULL; } } From 87bd09a0d18b1388dde831ee2ff6bef7bc9f0845 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 14:07:13 +0800 Subject: [PATCH 135/257] net/sysctl: make ipv4/ping_group_range work in user namespaces We dump sysctls from criu user namespace, but restore from restored user namespace. So group id values should be mapped to the restored user namespace gid space to restore correctly. Signed-off-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/net.c | 44 ++++++++++++++++++++++++++ test/zdtm/static/netns_sub_sysctl.desc | 2 +- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 2c018ef7b..e5775a328 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2203,6 +2203,42 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) return 0; } +static int ipv4_sysctls_ping_group_range_map_gid(SysctlEntry *ent, size_t size) +{ + int start, end, ustart, uend, ret; + + if (sscanf(ent->sarg, "%d %d", &start, &end) != 2) { + pr_err("Failed to parse ping_group_range: %s\n", ent->sarg); + return -1; + } + + /* + * The default is "1 0", which means no group + * is allowed to create ICMP Echo sockets. 
+ */ + if (start == 1 && end == 0) { + pr_debug("The ping_group_range is set to default, skipping it.\n"); + ent->sarg = NULL; + return 0; + } + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + ustart = userns_gid(start); + uend = userns_gid(end); + pr_debug("Mapping ping_group_range %d %d to userns -> %d %d\n", + start, end, ustart, uend); + + ret = snprintf(ent->sarg, size, "%d\t%d\n", ustart, uend); + if (ret < 0 || ret >= size) { + pr_err("Failed to map ping_group_range: %d\t%d\n", ustart, uend); + return -1; + } + + return 0; +} + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2220,6 +2256,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) SysctlEntry *ipv4_sysctls = NULL; size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; + int ping_group_range_id = -1; NetnsId *ids; struct netns_id *p; @@ -2310,6 +2347,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { netns.ipv4_sysctl[i]->type = SYSCTL_TYPE__CTL_STR; netns.ipv4_sysctl[i]->sarg = ping_group_range; + ping_group_range_id = i; } else { /* Need to handle this case when we have more sysctls */ BUG(); @@ -2338,6 +2376,12 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + BUG_ON(ping_group_range_id == -1); + ret = ipv4_sysctls_ping_group_range_map_gid(netns.ipv4_sysctl[ping_group_range_id], + MAX_STR_IPV4_SYSCTL_LEN + 1); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); diff --git a/test/zdtm/static/netns_sub_sysctl.desc b/test/zdtm/static/netns_sub_sysctl.desc index 535842668..0c357aefe 100644 --- a/test/zdtm/static/netns_sub_sysctl.desc +++ b/test/zdtm/static/netns_sub_sysctl.desc @@ -1,4 +1,4 @@ { - 'flavor': 'ns', + 'flavor': 'ns uns', 'flags': 'suid' } From 
677a56891917b873dc30278be0063730d901717d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 9 Jun 2025 21:17:57 -0700 Subject: [PATCH 136/257] zdtm/netns_sub_sysctl: skip unsupported sysctls net/unix/max_dgram_qlen can't be tuned from non-root userns before: v5.17-rc1~170^2~215 ("net: Enable max_dgram_qlen unix sysctl to be configurable by non-init user namespaces") Signed-off-by: Andrei Vagin --- test/zdtm/static/netns_sub_sysctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 0f94c40a7..03b478b7d 100644 --- a/test/zdtm/static/netns_sub_sysctl.c +++ b/test/zdtm/static/netns_sub_sysctl.c @@ -1,4 +1,6 @@ #include +#include +#include #include "zdtmtst.h" #include "sysctl.h" @@ -20,6 +22,7 @@ typedef struct { int new; char s_old[MAX_STR_SYSCTL_LEN]; char s_new[MAX_STR_SYSCTL_LEN]; + bool set; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" @@ -38,6 +41,11 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { + if (access(p->path, W_OK) != 0) { + test_msg("%s doesn't exist\n", p->path); + continue; + } + p->set = true; if (p->type == SYSCTL_INT) { p->old = (((unsigned)lrand48()) % 1023) + 1; if (sysctl_write_int(p->path, p->old)) { @@ -56,6 +64,8 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { + if (!p->set) + continue; if (p->type == SYSCTL_INT) { if (sysctl_read_int(p->path, &p->new)) ret = 1; From a80c54484559a8bf7670f246db897b2710bb8688 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Fri, 27 Dec 2024 03:47:35 +0530 Subject: [PATCH 137/257] sk-inet: Add support for checkpoint/restore of ICMP sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there is no option to checkpoint/restore programs 
that use ICMP sockets, such as `ping`. This patch adds support for the same. Fixes #2557 Signed-off-by: समीर सिंह Sameer Singh --- criu/sk-inet.c | 7 +++++-- criu/sockets.c | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 1238b03dc..6e0acf2ce 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -130,6 +130,8 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); @@ -922,8 +924,9 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) } if (ie->src_port) { - if (inet_bind(sk, ii)) - goto err; + if (ie->proto != IPPROTO_ICMP && ie->proto != IPPROTO_ICMPV6) + if (inet_bind(sk, ii)) + goto err; } /* diff --git a/criu/sockets.c b/criu/sockets.c index f9ce999be..0affccad0 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -65,7 +65,7 @@ const char *socket_proto_name(unsigned int proto, char *nm, size_t size) [IPPROTO_IPV6] = __stringify_1(IPPROTO_IPV6), [IPPROTO_RSVP] = __stringify_1(IPPROTO_RSVP), [IPPROTO_GRE] = __stringify_1(IPPROTO_GRE), [IPPROTO_ESP] = __stringify_1(IPPROTO_ESP), [IPPROTO_AH] = __stringify_1(IPPROTO_AH), [IPPROTO_UDPLITE] = __stringify_1(IPPROTO_UDPLITE), - [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), + [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), [IPPROTO_ICMPV6] = __stringify_1(IPPROTO_ICMPV6), }; return __socket_const_name(nm, size, protos, ARRAY_SIZE(protos), proto); } @@ -131,10 +131,12 @@ enum socket_cl_bits { INET_UDP_CL_BIT, INET_UDPLITE_CL_BIT, INET_RAW_CL_BIT, + INET_ICMP_CL_BIT, INET6_TCP_CL_BIT, INET6_UDP_CL_BIT, INET6_UDPLITE_CL_BIT, INET6_RAW_CL_BIT, + INET6_ICMP_CL_BIT, UNIX_CL_BIT, PACKET_CL_BIT, _MAX_CL_BIT, @@ -161,6 +163,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET_UDPLITE_CL_BIT; if (proto == 
IPPROTO_RAW) return INET_RAW_CL_BIT; + if (proto == IPPROTO_ICMP) + return INET_ICMP_CL_BIT; } if (family == AF_INET6) { if (proto == IPPROTO_TCP) @@ -171,6 +175,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET6_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET6_RAW_CL_BIT; + if (proto == IPPROTO_ICMPV6) + return INET6_ICMP_CL_BIT; } pr_err("Unknown pair family %d proto %d\n", family, proto); @@ -282,6 +288,12 @@ void preload_socket_modules(void) req.r.i.sdiag_protocol = IPPROTO_RAW; probe_diag(nl, &req, -ENOENT); + req.r.i.sdiag_protocol = IPPROTO_ICMP; + probe_diag(nl, &req, -ENOENT); + + req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + probe_diag(nl, &req, -ENOENT); + close(nl); pr_info("Done probing\n"); } @@ -773,6 +785,10 @@ static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg) case IPPROTO_RAW: type = SOCK_RAW; break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + type = SOCK_DGRAM; + break; default: BUG_ON(1); return -1; @@ -797,7 +813,7 @@ static int collect_err(int err, struct ns_id *ns, void *arg) char family[32], proto[32]; char msg[256]; - snprintf(msg, sizeof(msg), "Sockects collect procedure family %s proto %s", + snprintf(msg, sizeof(msg), "Sockets collect procedure family %s proto %s", socket_family_name(gr->family, family, sizeof(family)), socket_proto_name(gr->protocol, proto, sizeof(proto))); @@ -905,6 +921,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv4 ICMP sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_ICMP; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + /* Collect IPv6 TCP sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_TCP; @@ -944,6 +967,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv6 ICMP sockets */ + req.r.i.sdiag_family = AF_INET6; + req.r.i.sdiag_protocol = 
IPPROTO_ICMPV6; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + req.r.p.sdiag_family = AF_PACKET; req.r.p.sdiag_protocol = 0; req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST | PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG; From 3dc865bc80a4dfa5378bed9fe0434433d65379e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Sat, 28 Dec 2024 09:35:11 +0530 Subject: [PATCH 138/257] test: add static tests for ICMP socket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ZDTM static tests for IP4/ICMP and IP6/ICMP socket feature. Signed-off-by: समीर सिंह Sameer Singh Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 3 + test/zdtm/static/socket6_icmp.c | 1 + test/zdtm/static/socket_icmp.c | 128 ++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+) create mode 120000 test/zdtm/static/socket6_icmp.c create mode 100644 test/zdtm/static/socket_icmp.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 34fc90513..d427659e0 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -37,6 +37,8 @@ TST_NOFILE := \ socket_udp-corked \ socket6_udp \ socket_udp_shutdown \ + socket_icmp \ + socket6_icmp \ sk-freebind \ sk-freebind-false \ socket_udplite \ @@ -630,6 +632,7 @@ socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV4V6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp-skip-in-flight: CFLAGS += -D ZDTM_IPV4V6 +socket6-icmp: CFLAGS += -DZDTM_IPV6 sock_ip_opts01: CFLAGS += -DZDTM_VAL_ZERO sock_tcp_opts01: CFLAGS += -DZDTM_VAL_ZERO tun_ns: CFLAGS += -DTUN_NS diff --git a/test/zdtm/static/socket6_icmp.c b/test/zdtm/static/socket6_icmp.c new file mode 120000 index 000000000..24d8fd806 --- /dev/null +++ b/test/zdtm/static/socket6_icmp.c @@ 
-0,0 +1 @@ +socket_icmp.c \ No newline at end of file diff --git a/test/zdtm/static/socket_icmp.c b/test/zdtm/static/socket_icmp.c new file mode 100644 index 000000000..f72e348bf --- /dev/null +++ b/test/zdtm/static/socket_icmp.c @@ -0,0 +1,128 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for ICMP socket\n"; +const char *test_author = "समीर सिंह Sameer Singh \n"; + +/* Description: + * Send a ping to localhost using ICMP socket + */ + +#include +#include +#include +#include +#if defined(ZDTM_IPV6) +#include +#else +#include +#endif +#include +#include +#include + +#include "sysctl.h" + +#define PACKET_SIZE 64 +#define RECV_TIMEOUT 1 + +static int echo_id = 1234; + +#if defined(ZDTM_IPV6) +#define TEST_ICMP_ECHOREPLY ICMP6_ECHOREPLY +#else +#define TEST_ICMP_ECHOREPLY ICMP_ECHOREPLY +#endif +int main(int argc, char **argv) +{ + int ret, sock, seq = 0; + char packet[PACKET_SIZE], recv_packet[PACKET_SIZE]; + + struct timeval tv; +#if defined(ZDTM_IPV6) + struct sockaddr_in6 addr, recv_addr; +#else + struct icmphdr icmp_header, *icmp_reply; +#endif + struct sockaddr_in addr, recv_addr; + socklen_t addr_len; + + // Allow GIDs 0-58468 to open an unprivileged ICMP socket + if (sysctl_write_str("/proc/sys/net/ipv4/ping_group_range", "0 58468")) + return -1; + + test_init(argc, argv); + +#if defined(ZDTM_IPV6) + sock = socket(PF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6); +#else + sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP); +#endif + if (sock < 0) { + pr_perror("Can't create socket"); + return 1; + } + + tv.tv_sec = RECV_TIMEOUT; + tv.tv_usec = 0; + if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) { + pr_perror("Can't set socket option"); + return 1; + } + + memset(&addr, 0, sizeof(addr)); + memset(&icmp_header, 0, sizeof(icmp_header)); +#if defined(ZDTM_IPV6) + addr.sin6_family = AF_INET6; + inet_pton(AF_INET6, "::1", &addr.sin6_addr); + + icmp_header.icmp6_type = ICMP6_ECHO_REQUEST; + icmp_header.icmp6_code = 0; + icmp_header.icmp6_id = 
echo_id; + icmp_header.icmp6_seq = seq; +#else + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + icmp_header.type = ICMP_ECHO; + icmp_header.code = 0; + icmp_header.un.echo.id = echo_id; + icmp_header.un.echo.sequence = seq; +#endif + + memcpy(packet, &icmp_header, sizeof(icmp_header)); + memset(packet + sizeof(icmp_header), 0xa5, + PACKET_SIZE - sizeof(icmp_header)); + + test_daemon(); + test_waitsig(); + + ret = sendto(sock, packet, PACKET_SIZE, 0, + (struct sockaddr *)&addr, sizeof(addr)); + + if (ret < 0) { + fail("Can't send"); + return 1; + } + + addr_len = sizeof(recv_addr); + + ret = recvfrom(sock, recv_packet, sizeof(recv_packet), 0, + (struct sockaddr *)&recv_addr, &addr_len); + + if (ret < 0) { + fail("Can't recv"); + return 1; + } + + icmp_reply = (struct icmphdr *)recv_packet; + + if (icmp_reply->type != ICMP_ECHOREPLY) { + fail("Got no ICMP_ECHO_REPLY"); + return 1; + } + + close(sock); + + pass(); + return 0; +} From e31828ed8ce4fc7cbac4d412f7e33911f3a63b17 Mon Sep 17 00:00:00 2001 From: Chuan Qiu Date: Thu, 12 Jun 2025 22:49:26 -0700 Subject: [PATCH 139/257] mount: Fix trailing / when a file is bind-mounted E.g. I have a /etc/hosts in workspace mounted from the host, and get the following message. (00.141008) 1: mnt-v2: Create plain mountpoint /tmp/.criu.mntns.K1biY1/mnt-0000000938 for 938 (00.141546) 1: mnt-v2: Mounting unsupported @938 (0) (00.141887) 1: mnt-v2: Bind /tmp/agent/1-d8c746c6fda3a8b2/workspace/etc/hosts/ to /tmp/.criu.mntns.K1biY1/mnt-0000000938 (00.142179) 1: Error (criu/mount-v2.c:319): mnt-v2: Failed to open_tree /tmp/agent/1-d8c746c6fda3a8b2/workspace/etc/hosts/: Not a directory (00.143774) Error (criu/cr-restore.c:2320): Restoring FAILED. 
Signed-off-by: Chuan Qiu --- criu/mount.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/criu/mount.c b/criu/mount.c index 06b959542..b643a7f26 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -888,7 +888,11 @@ static int resolve_external_mounts(struct mount_info *info) cut_root = cut_root_for_bind(m->root, match->root); - p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + if (cut_root[0] == '\0') { + p = xstrdup(match->ns_mountpoint + 1); + } else { + p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + } if (!p) return -1; From 455c67739914a6d504733605e6d70538204aa3cf Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 20 Jun 2025 13:44:32 +0800 Subject: [PATCH 140/257] zdtm: Add static/mnt_ext_file_bind_auto test The test creates a file bindmount in criu mntns and binds it into test mntns, this external file bindmount is autodetected and restored via "--external mnt[]" criu option. Note: The previous patch fixes the problem on this code path where file bindmount restore fails because of an excess "/" in the source path. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_ext_file_bind_auto.c | 104 +++++++++++++++++++ test/zdtm/static/mnt_ext_file_bind_auto.desc | 4 + 3 files changed, 109 insertions(+) create mode 100644 test/zdtm/static/mnt_ext_file_bind_auto.c create mode 100644 test/zdtm/static/mnt_ext_file_bind_auto.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index d427659e0..ab69f389e 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -381,6 +381,7 @@ TST_FILE = \ sk-unix-listen02 \ sk-unix-listen03 \ sk-unix-listen04 \ + mnt_ext_file_bind_auto \ TST_DIR = \ cwd00 \ diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.c b/test/zdtm/static/mnt_ext_file_bind_auto.c new file mode 100644 index 000000000..0c3b9f5fb --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if external file mount works"; +const char *test_author = "Pavel Tikhomirov "; + +char *filename = "mnt_ext_file_bind_auto_bind_auto.file"; +TEST_OPTION(filename, string, "file name", 1); + +char *source = "mnt_ext_file_bind_auto_bind_auto.source"; + +int create_file(const char *path) +{ + int fd; + + fd = open(path, O_CREAT | O_RDWR, 0644); + if (fd < 0) { + pr_perror("open"); + return -1; + } + + close(fd); + return 0; +} + +int main(int argc, char **argv) +{ + char *zdtm_newns = getenv("ZDTM_NEWNS"); + char *tmp = "/tmp/zdtm_ext_file_bind_auto.tmp"; + char *sourcefile = "/tmp/zdtm_ext_file_bind_auto.file"; + char *root, tmpfile[PATH_MAX], testfile[PATH_MAX]; + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_perror("root"); + return 1; + } + + if (!zdtm_newns) { + pr_perror("ZDTM_NEWNS is not set"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare file bindmount in criu root (source for external file bindmount) */ + mkdir(tmp, 0755); + 
if (mount(source, tmp, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + + sprintf(tmpfile, "%s/%s", tmp, filename); + if (create_file(tmpfile)) + return 1; + + if (create_file(sourcefile)) + return 1; + + if (mount(tmpfile, sourcefile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + + umount2(tmp, MNT_DETACH); + + /* Prepare file in test root (mount point for external file bindmount) */ + sprintf(testfile, "%s/%s", root, filename); + if (create_file(testfile)) + return 1; + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + * and will be inherited into test mntns + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + if (mount(sourcefile, testfile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } +test: + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.desc b/test/zdtm/static/mnt_ext_file_bind_auto.desc new file mode 100644 index 000000000..825b08127 --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.desc @@ -0,0 +1,4 @@ +{ 'opts': '--external mnt[]', + 'feature': 'mnt_id', + 'flavor': 'ns uns', + 'flags': 'suid'} From 7fbf7b2be4afd5768599cf104b8e05e52c671479 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 11 Jul 2025 22:16:49 +0100 Subject: [PATCH 141/257] images: remove symlink for descriptor.proto Currently the build scripts create the following symlink: criu-4.1/images/google/protobuf/descriptor.proto -> /usr/include/google/protobuf/descriptor.proto This symlink points to a system-wide absolute-path target. Also, this symlink ends up in the release tarball. The tarball may later be downloaded and unpacked by e.g. OS distributions. If unpacking is done using Python 3.14+, it will fail. 
This happens because Python 3.14 will switch the default behavior of extractall() from "fully trusting the content of archive" to "disallow common attack vectors while extracting the archive". With this new behavior, extractall() raises an exception when at least one file in the archive extracts or points to outside of the extraction directory (these are called path traversal attacks and zip slip attacks). Reported-by: Dmitrii Kuvaiskii Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 7 ------- .lgtm.yml | 5 ----- images/Makefile | 17 ++++++++++++++++- images/google/protobuf/descriptor.proto | 1 - 4 files changed, 16 insertions(+), 14 deletions(-) delete mode 120000 images/google/protobuf/descriptor.proto diff --git a/.cirrus.yml b/.cirrus.yml index a4b53a54b..bddd5a3f1 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -15,7 +15,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-no-vdso @@ -33,7 +32,6 @@ task: memory: 8G setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel @@ -67,7 +65,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-rawhide @@ -88,7 +85,6 
@@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-non-root @@ -101,7 +97,6 @@ task: script: uname -a build_script: | scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local task: @@ -113,7 +108,6 @@ task: script: uname -a build_script: | scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local CLANG=1 task: @@ -125,6 +119,5 @@ task: script: uname -a build_script: | scripts/ci/prepare-for-fedora-rawhide.sh - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 make -C test/zdtm -j 4 diff --git a/.lgtm.yml b/.lgtm.yml index 0dd49cda4..4beadcc63 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -23,8 +23,3 @@ extraction: - "python3-yaml" - "libnl-route-3-dev" - "gnutls-dev" - configure: - command: - - "ls -laR images/google" - - "ln -s /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto" - - "ls -laR images/google" diff --git a/images/Makefile b/images/Makefile index d966fbfca..e94346eee 100644 --- a/images/Makefile +++ b/images/Makefile @@ -58,7 +58,6 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -91,6 +90,22 @@ endef makefile-deps := Makefile $(obj)/Makefile +# +# Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. 
+PROTOBUF_DIR := images/google +DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf +GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf +$(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto + $$(Q) echo "Generating descriptor.pb-c.c" + $$(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + +cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d + +submrproper: + $$(Q) rm -rf $(PROTOBUF_DIR) +.PHONY: submrproper +mrproper: submrproper + # # Generates rules needed to compile protobuf files. define gen-proto-rules diff --git a/images/google/protobuf/descriptor.proto b/images/google/protobuf/descriptor.proto deleted file mode 120000 index 07a4c9add..000000000 --- a/images/google/protobuf/descriptor.proto +++ /dev/null @@ -1 +0,0 @@ -/usr/include/google/protobuf/descriptor.proto \ No newline at end of file From 21c3b9c005d64db9bb6f998951384919f28d957f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 20:14:45 -0700 Subject: [PATCH 142/257] images/Makefile: fix using $(Q) Commit 68f92b551 used `$$(Q)` instead of `$(Q)` in the Makefile target, which resulted in the following error: $(Q) echo "Generating descriptor.pb-c.c" /bin/sh: 1: Q: not found Generating descriptor.pb-c.c $(Q) protoc --proto_path=/usr/include --proto_path=images/ --c_out=images/ /usr/include/google/protobuf/descriptor.proto /bin/sh: 1: Q: not found as well as: $(Q) rm -rf images/google /bin/sh: line 1: Q: command not found Fix it. 
Signed-off-by: Kir Kolyshkin --- images/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/images/Makefile b/images/Makefile index e94346eee..cb30a5126 100644 --- a/images/Makefile +++ b/images/Makefile @@ -96,13 +96,13 @@ PROTOBUF_DIR := images/google DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto - $$(Q) echo "Generating descriptor.pb-c.c" - $$(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + $(Q) echo "Generating descriptor.pb-c.c" + $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d submrproper: - $$(Q) rm -rf $(PROTOBUF_DIR) + $(Q) rm -rf $(PROTOBUF_DIR) .PHONY: submrproper mrproper: submrproper From 066bf7bf3c68c899644aab9ecfa9d7c8d551ea8f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 22:44:50 -0700 Subject: [PATCH 143/257] Keep images/google/protobuf directory Commit 68f92b551 removed images/google/protobuf directory, so it is re-created each time during the build process. This resulted in a weird behavior change. Previously, one could do something like this: git clone $CRURL criu (cd criu && sudo make install-criu) rm -rf criu This worked fine, including running rm -rf as a non-root user, since no new directories were created under criu -- all directories were still owned by the original user. Since commit 68f92b551 the same sequence fails: rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.c': Permission denied rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.d': Permission denied rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.h': Permission denied A workaround is to keep empty images/google/protobuf directory, which is what this commit does. 
Signed-off-by: Kir Kolyshkin --- .gitignore | 2 -- images/Makefile | 5 ++--- images/google/protobuf/.gitignore | 2 ++ 3 files changed, 4 insertions(+), 5 deletions(-) create mode 100644 images/google/protobuf/.gitignore diff --git a/.gitignore b/.gitignore index 854657d1c..94daa13ea 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,6 @@ compel/compel compel/compel-host-bin images/*.c images/*.h -images/google/protobuf/*.c -images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest diff --git a/images/Makefile b/images/Makefile index cb30a5126..6f310e553 100644 --- a/images/Makefile +++ b/images/Makefile @@ -92,8 +92,7 @@ makefile-deps := Makefile $(obj)/Makefile # # Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. -PROTOBUF_DIR := images/google -DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf +DESCRIPTOR_DIR := images/google/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto $(Q) echo "Generating descriptor.pb-c.c" @@ -102,7 +101,7 @@ $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d submrproper: - $(Q) rm -rf $(PROTOBUF_DIR) + $(Q) rm -f $(DESCRIPTOR_DIR)/* .PHONY: submrproper mrproper: submrproper diff --git a/images/google/protobuf/.gitignore b/images/google/protobuf/.gitignore new file mode 100644 index 000000000..68359a786 --- /dev/null +++ b/images/google/protobuf/.gitignore @@ -0,0 +1,2 @@ +*.c +*.h From 22c83e3eba403c1402826dc9edd770d74965879c Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 23:07:37 -0700 Subject: [PATCH 144/257] images/Makefile: use msg-gen In general, we use "$(E)" instead of "$(Q) echo", but we also have a msg-gen macro which can be used here. 
Signed-off-by: Kir Kolyshkin --- images/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/Makefile b/images/Makefile index 6f310e553..2c33152e9 100644 --- a/images/Makefile +++ b/images/Makefile @@ -95,7 +95,7 @@ makefile-deps := Makefile $(obj)/Makefile DESCRIPTOR_DIR := images/google/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto - $(Q) echo "Generating descriptor.pb-c.c" + $(call msg-gen, $@) $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d From 95d5e2e59b1b83ba5400e7eea6db57f77424fb80 Mon Sep 17 00:00:00 2001 From: Ignacio Moreno Gonzalez Date: Wed, 16 Jul 2025 16:32:25 +0200 Subject: [PATCH 145/257] compel: flush caches after parasite injection After the CRIU process saves the parasite code for the target thread in the shared mmap, it is necessary to call __clear_cache before the target thread executes the code. Without this step, the target thread may not see the correct code to execute, which can result in a SIGILL signal. For the specific arm64 case, this is important so that the newly copied code is flushed from d-cache to RAM, so that the target thread sees the new code. The change is based on commit 6be10a2 by @fu.lin and on input received from @adrianreber. 
[ avagin: tweak code comment ] Signed-off-by: Ignacio Moreno Gonzalez Signed-off-by: Andrei Vagin --- compel/src/lib/infect.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 4ea27bc63..22fcf24fa 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1054,6 +1054,16 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); compel_relocs_apply(ctl->local_map, ctl->remote_map, &ctl->pblob); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(ctl->local_map, ctl->local_map + ctl->pblob.hdr.bsize); p = parasite_size; From 64276874d89825452baee6c756046e1277a41c48 Mon Sep 17 00:00:00 2001 From: Ignacio Moreno Gonzalez Date: Wed, 16 Jul 2025 16:38:13 +0200 Subject: [PATCH 146/257] restore: flush caches during restore See the previous commit for rationale and architecture-specific details. [ avagin: tweak code comment ] Signed-off-by: Ignacio Moreno Gonzalez Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index c1d1f4b9d..b37603563 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2569,6 +2569,17 @@ static int remap_restorer_blob(void *addr) restorer_setup_c_header_desc(&pbd, true); compel_relocs_apply(addr, addr, &pbd); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. 
+ * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(addr, addr + pbd.hdr.bsize); + return 0; } From 0d1e280d09d1a7422f9706cadb332586d520c352 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 07:53:55 +0100 Subject: [PATCH 147/257] vagrant: fix 'qemu' install Installing this package currently fails with the following message: Package qemu is not available, but is referred to by another package. This may mean that the package is missing, has been obsoleted, or is only available from another source E: Package 'qemu' has no installation candidate Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index ed5a01178..c3e15007c 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -22,7 +22,7 @@ setup() { wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ + ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ openssh-client systemctl restart libvirtd From 2762b21e4a529f14b845f5bfe5153864d59b3e02 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 08:34:31 +0100 Subject: [PATCH 148/257] vagrant: update image to fedora 42 Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c3e15007c..81af5d2e5 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,9 +6,9 @@ set -e set -x 
-VAGRANT_VERSION=2.4.1 -FEDORA_VERSION=40 -FEDORA_BOX_VERSION=40.20240414.0 +VAGRANT_VERSION=2.4.7 +FEDORA_VERSION=42 +FEDORA_BOX_VERSION=1.1.0 setup() { if [ -n "$TRAVIS" ]; then @@ -27,7 +27,7 @@ setup() { openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt - vagrant init fedora/${FEDORA_VERSION}-cloud-base --box-version ${FEDORA_BOX_VERSION} + vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} # The default libvirt Vagrant VM uses 512MB. # Travis VMs should have around 7.5GB. # Increasing it to 4GB should work. From d586b30c6bede3767f86ef40217d462085b734e7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 08:50:29 +0100 Subject: [PATCH 149/257] vagrant: fix tar including archive in itself The tar command was failing with the following message: $ tar cf criu.tar ../../../criu tar: Removing leading `../../../' from member names tar: ../../../criu/scripts/ci/criu.tar: archive cannot contain itself; not dumped In addition, the /vagrant directory no longer exists in the new Fedora images. bash: line 1: cd: /vagrant: No such file or directory Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 81af5d2e5..008a01fb3 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -17,7 +17,7 @@ setup() { fi # Tar up the git checkout to have vagrant rsync it to the VM - tar cf criu.tar ../../../criu + tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate. 
wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb @@ -28,10 +28,16 @@ setup() { systemctl restart libvirtd vagrant plugin install vagrant-libvirt vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} + # The default libvirt Vagrant VM uses 512MB. # Travis VMs should have around 7.5GB. # Increasing it to 4GB should work. sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' + # Sync /tmp/criu.tar into the VM + # We want to use $HOME without expansion + # shellcheck disable=SC2016 + sed -i Vagrantfile -e 's|^end$| config.vm.provision "file", source: "/tmp/criu.tar", destination: "$HOME/criu.tar"'"\n"'end|g' + vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config @@ -40,8 +46,11 @@ setup() { libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel + # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd + + ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' ssh default cat /proc/cmdline } @@ -49,7 +58,7 @@ fedora-no-vdso() { ssh default sudo grubby --update-kernel ALL --args="vdso=0" vagrant reload ssh default cat /proc/cmdline - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going' # This test (pidfd_store_sk) requires pidfd_getfd syscall which is guaranteed in Fedora 33. 
# It is also skipped from -a because it runs in RPC mode only @@ -74,12 +83,12 @@ fedora-rawhide() { # In the container it is not possible to change the state of selinux. # Let's just disable it for this test run completely. ssh default 'sudo setenforce Permissive' - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' + ssh default 'cd /vagrant/criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } fedora-non-root() { ssh default uname -a - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' # Setting the capability should be the only line needed to run as non-root on Fedora # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu ssh default 'sudo setcap cap_checkpoint_restore+eip /vagrant/criu/criu/criu' From c6c6f6f231c8142ea8ee562e92c0bd4b6984f113 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 10 Aug 2025 21:46:39 +0000 Subject: [PATCH 150/257] zdtm/socket-tcp-closing: fill socket buffers effectively Send large chunks to fill socket buffers. 
Signed-off-by: Andrei Vagin --- test/zdtm/static/socket-tcp-closing.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/socket-tcp-closing.c b/test/zdtm/static/socket-tcp-closing.c index 87e1d7533..df291d446 100644 --- a/test/zdtm/static/socket-tcp-closing.c +++ b/test/zdtm/static/socket-tcp-closing.c @@ -31,10 +31,13 @@ static int port = 8880; int fill_sock_buf(int fd) { + char zdtm[512]; int flags; int size; int ret; + memset(zdtm, 5, sizeof(zdtm)); + flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_perror("Can't get flags"); @@ -47,7 +50,6 @@ int fill_sock_buf(int fd) size = 0; while (1) { - char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) From 5f94dd71e7fc59f31633faab57d59b924c3f0273 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 10 Aug 2025 21:50:41 +0000 Subject: [PATCH 151/257] CI: Consolidate arm64 tests on GitHub runners The arm64 tests are currently being executed on both actuated and GitHub runners. This change removes the actuated runner to avoid redundancy and streamline our CI process. Signed-off-by: Andrei Vagin --- .github/workflows/aarch64-test.yaml | 32 +++++++++++ .github/workflows/actuated-aarch64-test.yaml | 58 -------------------- 2 files changed, 32 insertions(+), 58 deletions(-) create mode 100644 .github/workflows/aarch64-test.yaml delete mode 100644 .github/workflows/actuated-aarch64-test.yaml diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml new file mode 100644 index 000000000..32b19e176 --- /dev/null +++ b/.github/workflows/aarch64-test.yaml @@ -0,0 +1,32 @@ +name: aarch64 test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: aarch64-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-24.04-arm + strategy: + matrix: + target: [GCC=1, CLANG=1] + + steps: + - uses: actions/checkout@v4 + - name: Run Tests ${{ matrix.target }} + # Following tests are failing on the VMs: + # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out + # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) + # + # In combination with '--remote-lazy-pages' following error occurs: + # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) + run: | + # The 'sched_policy00' needs the following: + sudo sysctl -w kernel.sched_rt_runtime_us=-1 + # etc/hosts entry is needed for netns_lock_iptables + echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts + sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ + ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml deleted file mode 100644 index 567746a5f..000000000 --- a/.github/workflows/actuated-aarch64-test.yaml +++ /dev/null @@ -1,58 +0,0 @@ -name: aarch64 test - -on: [push, pull_request] - -# Cancel any preceding run on the pull request. -concurrency: - group: actuated-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - -jobs: - build: - # Actuated runners are not available in all repositories. - if: ${{ github.repository == 'checkpoint-restore/criu' }} - # The memory size and the number of CPUs can be freely selected for - # the actuated runners. 3GB and 4 CPUs seems to be enough according to the - # result from 'vmmeter'. 
- runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [actuated-arm64-4cpu-3gb, ubuntu-24.04-arm] - target: [GCC=1, CLANG=1] - - steps: - # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md - # vmmeter start - - name: Prepare arkade - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - uses: alexellis/arkade-get@master - with: - crane: latest - print-summary: false - - - name: Install vmmeter - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - run: | - crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin - - - name: Run vmmeter - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - uses: self-actuated/vmmeter-action@master - # vmmeter end - - - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }}/${{ matrix.os }} - # Following tests are failing on the actuated VMs: - # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out - # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) - # - # In combination with '--remote-lazy-pages' following error occurs: - # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) - run: | - # The 'sched_policy00' needs the following: - sudo sysctl -w kernel.sched_rt_runtime_us=-1 - # etc/hosts entry is needed for netns_lock_iptables - echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts - sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ - ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" From fce491113bcb5bfe95e078ba92e2601b7f671c23 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:22:57 +0200 Subject: [PATCH 152/257] criu/include/mman: define MADV_GUARD_INSTALL Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index 086753bcf..43e0b6cc7 100644 --- 
a/criu/include/mman.h +++ b/criu/include/mman.h @@ -19,5 +19,8 @@ #ifndef MADV_WIPEONFORK #define MADV_WIPEONFORK 18 #endif +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif #endif /* __CR_MMAN_H__ */ From 2bb77daa92d26266d32e08ac21c0ed91f438a945 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:37:48 +0200 Subject: [PATCH 153/257] kerndat: add madvise(MADV_GUARD_INSTALL) feature-detection Signed-off-by: Alexander Mikhalitsyn --- criu/include/kerndat.h | 1 + criu/kerndat.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index c5deb3283..66db75649 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -91,6 +91,7 @@ struct kerndat_s { bool has_close_range; bool has_timer_cr_ids; bool has_breakpoints; + bool has_madv_guard; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index fa43f7d3f..7e2edb72d 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -31,6 +31,7 @@ #include "kerndat.h" #include "fs-magic.h" #include "mem.h" +#include "mman.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" @@ -1813,6 +1814,33 @@ err: return exit_code; } +static int kerndat_has_madv_guard(void) +{ + void *map; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap a page for has_madv_guard feature test"); + return -1; + } + + if (madvise(map, PAGE_SIZE, MADV_GUARD_INSTALL)) { + if (errno != EINVAL) { + pr_perror("madvise failed (has_madv_guard check)"); + goto mmap_cleanup; + } + } else { + kdat.has_madv_guard = true; + } + + munmap(map, PAGE_SIZE); + return 0; + +mmap_cleanup: + munmap(map, PAGE_SIZE); + return -1; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. 
There are cases that we cannot determine the @@ -2081,6 +2109,10 @@ int kerndat_init(void) pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_madv_guard()) { + pr_err("kerndat_has_madv_guard has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); From 4fc07a8a41f468b72e912fe38c96be18d37518d6 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:36:45 +0200 Subject: [PATCH 154/257] kerndat: add pagemap_scan_guard_pages feature check logic Signed-off-by: Alexander Mikhalitsyn --- criu/cr-check.c | 8 ++++++++ criu/include/kerndat.h | 3 +++ criu/include/pagemap_scan.h | 1 + criu/kerndat.c | 12 ++++++++++++ 4 files changed, 24 insertions(+) diff --git a/criu/cr-check.c b/criu/cr-check.c index 9c4778490..7c3dc76dd 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1599,6 +1599,12 @@ static int check_breakpoints(void) return 0; } +static int check_pagemap_scan_guard_pages(void) +{ + kerndat_warn_about_madv_guards(); + + return kdat.has_pagemap_scan_guard_pages ? 
0 : -1; +} static int (*chk_feature)(void); @@ -1724,6 +1730,7 @@ int cr_check(void) ret |= check_pagemap_scan(); ret |= check_overlayfs_maps(); ret |= check_timer_cr_ids(); + ret |= check_pagemap_scan_guard_pages(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1853,6 +1860,7 @@ static struct feature_list feature_list[] = { { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, { "breakpoints", check_breakpoints }, + { "pagemap_scan_guard_pages", check_pagemap_scan_guard_pages }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 66db75649..e4922f401 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -92,6 +92,7 @@ struct kerndat_s { bool has_timer_cr_ids; bool has_breakpoints; bool has_madv_guard; + bool has_pagemap_scan_guard_pages; }; extern struct kerndat_s kdat; @@ -114,4 +115,6 @@ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); extern int kerndat_has_nspid(void); +extern void kerndat_warn_about_madv_guards(void); + #endif /* __CR_KERNDAT_H__ */ diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h index 0ad4c9bc0..9046e01ed 100644 --- a/criu/include/pagemap_scan.h +++ b/criu/include/pagemap_scan.h @@ -14,6 +14,7 @@ #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) #define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) /* * struct page_region - Page region with flags diff --git a/criu/kerndat.c b/criu/kerndat.c index 7e2edb72d..997181ce7 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -87,6 +87,10 @@ static int check_pagemap(void) if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { pr_debug("PAGEMAP_SCAN is supported\n"); kdat.has_pagemap_scan = true; + + args.return_mask |= PAGE_IS_GUARD; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) + kdat.has_pagemap_scan_guard_pages = true; } else { switch (errno) { case EINVAL: @@ -1841,6 +1845,14 @@ mmap_cleanup: return -1; } +void 
kerndat_warn_about_madv_guards(void) +{ + if (kdat.has_madv_guard && !kdat.has_pagemap_scan_guard_pages) + pr_warn("ioctl(PAGEMAP_SCAN) doesn't support PAGE_IS_GUARD flag. " + "CRIU dump will fail if dumped processes use madvise(MADV_GUARD_INSTALL). " + "Please, consider updating your kernel.\n"); +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the From 1873e8f502f2495d8792716df277664f6e3c4852 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:42:43 +0200 Subject: [PATCH 155/257] cr-dump: warn if MADV_GUARD is supported but isn't shown in pagemap Signed-off-by: Alexander Mikhalitsyn --- criu/cr-dump.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index b8cf7d64d..f02db1a57 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2124,6 +2124,8 @@ int cr_dump_tasks(pid_t pid) int pre_dump_ret = 0; int ret = -1; + kerndat_warn_about_madv_guards(); + pr_info("========================================\n"); pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); From 42580fcb1614a002c54ab0115e81a77a81871418 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:51:24 +0200 Subject: [PATCH 156/257] criu/pagemap-cache: pagescan: look for PAGE_IS_GUARD pages Signed-off-by: Alexander Mikhalitsyn --- criu/pagemap-cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index f04a517de..457c0d649 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -194,6 +194,9 @@ int pmc_fill(pmc_t *pmc, u64 start, u64 end) }; long ret; + if (kdat.has_pagemap_scan_guard_pages) + args.return_mask |= PAGE_IS_GUARD; + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); if (ret == -1) { pr_perror("PAGEMAP_SCAN"); From 5843cbf97552f8ddb794931a8daf179aae71d78d Mon Sep 17 00:00:00 2001 From: Alexander 
Mikhalitsyn Date: Thu, 1 May 2025 20:02:37 +0200 Subject: [PATCH 157/257] criu/mem: refactor should_dump_page helper Make should_dump_page return int to indicate failure, and return useful data back through the struct page_info structure passed as a pointer. Also, correspondingly convert all call sites. No functional changes intended, except fixing a bug in should_dump_page() as it could return (-1) when pmc_fill() fails, while the caller didn't expect that before. Signed-off-by: Alexander Mikhalitsyn --- criu/include/mem.h | 8 +++++- criu/mem.c | 69 ++++++++++++++++++++++++++++++---------------- criu/shmem.c | 27 +++++++++++------- 3 files changed, 69 insertions(+), 35 deletions(-) diff --git a/criu/include/mem.h b/criu/include/mem.h index 3618c9cc3..0ce97822b 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -49,5 +49,11 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty); +struct page_info { + u64 next; + bool softdirty; +}; + +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info); + #endif /* __CR_MEM_H__ */ diff --git a/criu/mem.c b/criu/mem.c index 803cb545b..9fcf7a44c 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -115,27 +115,37 @@ static bool should_dump_entire_vma(VmaEntry *vmae) } /* - * should_dump_page returns vaddr if an addressed page has to be dumped. - * Otherwise, it returns an address that has to be inspected next. + * should_dump_page writes vaddr in page_info->next if an addressed page has to be dumped. + * Otherwise, it writes an address that has to be inspected next. 
*/ -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info) { + if (!page_info) + goto err; + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) - return -1; + goto err; if (pmc->regs) { while (1) { - if (pmc->regs_idx == pmc->regs_len) - return pmc->end; + if (pmc->regs_idx == pmc->regs_len) { + page_info->next = pmc->end; + return 0; + } + if (vaddr < pmc->regs[pmc->regs_idx].end) break; pmc->regs_idx++; } - if (vaddr < pmc->regs[pmc->regs_idx].start) - return pmc->regs[pmc->regs_idx].start; - if (softdirty) - *softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; - return vaddr; + + if (vaddr < pmc->regs[pmc->regs_idx].start) { + page_info->next = pmc->regs[pmc->regs_idx].start; + return 0; + } + + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + page_info->next = vaddr; + return 0; } else { u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; @@ -143,16 +153,26 @@ u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) * Optimisation for private mapping pages, that haven't * yet being COW-ed */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return vaddr + PAGE_SIZE; - if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { - if (softdirty) - *softdirty = pme & PME_SOFT_DIRTY; - return vaddr; + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) { + page_info->next = vaddr + PAGE_SIZE; + return 0; } - return vaddr + PAGE_SIZE; + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { + page_info->softdirty = pme & PME_SOFT_DIRTY; + page_info->next = vaddr; + return 0; + } + + page_info->next = vaddr + PAGE_SIZE; + return 0; } + +err: + pr_err("should_dump_page failed on vma " + "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", + vmae->start, vmae->end, vaddr); + return -1; } bool page_is_zero(u64 pme) @@ -202,14 +222,15 @@ 
static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct nr_scanned = 0; for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; - bool softdirty = false; - u64 next; + struct page_info page_info = {}; int st; /* If dump_all_pages is true, should_dump_page is called to get pme. */ - next = should_dump_page(pmc, vma->e, vaddr, &softdirty); - if (!dump_all_pages && next != vaddr) { - vaddr = next - PAGE_SIZE; + if (should_dump_page(pmc, vma->e, vaddr, &page_info)) + return -1; + + if (!dump_all_pages && page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } @@ -223,7 +244,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. */ - if (has_parent && page_in_parent(softdirty)) { + if (has_parent && page_in_parent(page_info.softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { diff --git a/criu/shmem.c b/criu/shmem.c index 9e3178352..bc7aa3669 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,31 +206,34 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) +static int update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; u64 vaddr; if (!is_shmem_tracking_en()) - return; + return 0; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { - bool softdirty = false; - u64 next; + struct page_info page_info = {}; - next = should_dump_page(pmc, vma, vaddr, &softdirty); - if (next != vaddr) { - vaddr = next - PAGE_SIZE; + if (should_dump_page(pmc, vma, vaddr, &page_info)) + return -1; + + if (page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } shmem_pfn = vma_pfn + 
DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (softdirty) + if (page_info.softdirty) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } + + return 0; } int collect_sysv_shmem(unsigned long shmid, unsigned long size) @@ -667,7 +670,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } @@ -684,7 +689,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } From cc047d595f742e416220d2d7740334500eb96a85 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:42:26 +0200 Subject: [PATCH 158/257] criu/mem: dump: skip MADV_GUARD pages content dump 1. get info about MADV_GUARD_INSTALL-protected pages with help of pagemap by looking for PME_GUARD_REGION flag if /proc/[pid]/pagemap is used or by looking for PAGE_IS_GUARD flag if ioctl(PAGEMAP_SCAN) is used 2. 
skip those pages Signed-off-by: Alexander Mikhalitsyn --- criu/include/mem.h | 1 + criu/mem.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/criu/include/mem.h b/criu/include/mem.h index 0ce97822b..b2cbd4b64 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -35,6 +35,7 @@ extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_l #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) #define PME_FILE (1ULL << 61) +#define PME_GUARD_REGION (1ULL << 58) #define PME_SOFT_DIRTY (1ULL << 55) #define PME_PSHIFT_BITS (6) #define PME_STATUS_BITS (3) diff --git a/criu/mem.c b/criu/mem.c index 9fcf7a44c..58c4130c6 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -143,12 +143,18 @@ int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *pa return 0; } + if (pmc->regs[pmc->regs_idx].categories & PAGE_IS_GUARD) + goto skip_guard_page; + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; page_info->next = vaddr; return 0; } else { u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + if (pme & PME_GUARD_REGION) + goto skip_guard_page; + /* * Optimisation for private mapping pages, that haven't * yet being COW-ed @@ -173,6 +179,10 @@ err: "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", vmae->start, vmae->end, vaddr); return -1; + +skip_guard_page: + page_info->next = vaddr + PAGE_SIZE; + return 0; } bool page_is_zero(u64 pme) From 63c7029686ea90c649b2909f37ae93c111f11418 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 4 Aug 2025 09:42:10 +0200 Subject: [PATCH 159/257] criu/{mem, vdso, cr-restore}: introduce VMA_AREA_GUARD fake VMAs Introduce a new kind of VMA - VMA_AREA_GUARD. In fact, it is not a real VMA as it is not represented as struct vm_area_struct in the kernel. We want to reuse an existing vma infrastructure in CRIU to dump an information about MADV_GUARD_INSTALL-covered address space ranges as VMAs. 
Then, on restore, we need to carefully skip those fake VMAs everywhere we expect normal VMAs to be processed. Only in the restorer do we use these VMAs to get information about where to call MADV_GUARD_INSTALL. Suggested-by: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/cr-restore.c | 6 ++++-- criu/include/image.h | 7 +++++++ criu/mem.c | 13 +++++++++++-- criu/vdso.c | 6 ++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b37603563..1c3b36451 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2447,7 +2447,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he while (1) { if (prev_vma_end + vma_len > s_vma->e->start) { - if (s_vma->list.next == self_vma_list) { + if ((s_vma->list.next == self_vma_list) || + vma_area_is(vma_next(s_vma), VMA_AREA_GUARD)) { s_vma = &end_vma; continue; } @@ -2460,7 +2461,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he } if (prev_vma_end + vma_len > t_vma->e->start) { - if (t_vma->list.next == tgt_vma_list) { + if ((t_vma->list.next == tgt_vma_list) || + vma_area_is(vma_next(t_vma), VMA_AREA_GUARD)) { t_vma = &end_vma; continue; } diff --git a/criu/include/image.h b/criu/include/image.h index afa7d5e12..934f7d4e9 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -68,6 +68,12 @@ * processing exiting with error; while the rest of bits * are part of image ABI, this particular one must never * be used in image. + * - guard + * stands for a fake VMA (not represented in the kernel + * by a struct vm_area_struct). Used to keep an information + * about virtual address space ranges covered by + * MADV_GUARD_INSTALL guards. These ones must be always at + * the end of the vma_area_list and properly skipped a.e. 
*/ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -87,6 +93,7 @@ #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_SHSTK (1 << 15) +#define VMA_AREA_GUARD (1 << 16) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/mem.c b/criu/mem.c index 58c4130c6..ee841aca2 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -599,6 +599,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit parent_predump_mode = mdc->parent_ie->pre_dump_mode; list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (vma_area_is(vma_area, VMA_AREA_GUARD)) + continue; + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); if (ret < 0) @@ -861,14 +864,14 @@ static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); - if (&vma->list == &vmas->h) + if ((&vma->list == &vmas->h) || vma_area_is(vma, VMA_AREA_GUARD)) return; } /* ... 
no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); - if (&pvma->list == &pvmas->h) + if ((&pvma->list == &pvmas->h) || vma_area_is(pvma, VMA_AREA_GUARD)) return; } } @@ -1069,6 +1072,9 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (task_size_check(vpid(t), vma->e)) { ret = -1; break; @@ -1276,6 +1282,9 @@ err_read: unsigned long size, i = 0; void *addr = decode_pointer(vma->premmaped_addr); + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (!vma_inherited(vma)) continue; diff --git a/criu/vdso.c b/criu/vdso.c index d4d351131..2d9e57c4d 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -145,6 +145,9 @@ static void drop_rt_vdso(struct vm_area_list *vma_area_list, struct vdso_quarter * Also BTW search for rt-vvar to remove it later. */ list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (vma->e->start == addr->orig_vdso) { vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO; pr_debug("vdso: Restore orig vDSO status at %lx\n", (long)vma->e->start); @@ -276,6 +279,9 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list } list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + /* * Defer handling marked vdso until we walked over * all vmas and restore potentially remapped vDSO From 59b4d662ae8fd704dfc16f47628a676852f4c886 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:10:10 +0200 Subject: [PATCH 160/257] criu/pie/restorer: add madvise(MADV_GUARD_INSTALL) restore logic Signed-off-by: Alexander Mikhalitsyn --- criu/pie/restorer.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9867a3ddd..394d3dea0 
100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -28,6 +28,7 @@ #include #include #include +#include "mman.h" #include "signal.h" #include "prctl.h" #include "criu-log.h" @@ -1665,6 +1666,30 @@ static int restore_membarrier_registrations(int mask) return ret; } +static int restore_madv_guard_regions(struct task_restore_args *args) +{ + int i, ret; + + for (i = 0; i < args->vmas_n; i++) { + VmaEntry *vma_entry = args->vmas + i; + size_t len; + + if (!vma_entry_is(vma_entry, VMA_AREA_GUARD)) + continue; + + len = vma_entry->end - vma_entry->start; + ret = sys_madvise(vma_entry->start, len, MADV_GUARD_INSTALL); + if (ret) { + pr_err("madvise(%" PRIx64 ", %zu, MADV_GUARD_INSTALL) " + "failed with %d\n", + vma_entry->start, len, ret); + return -1; + } + } + + return 0; +} + /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -1972,6 +1997,13 @@ __visible long __export_restore_task(struct task_restore_args *args) } } + /* + * Restore madvise(MADV_GUARD_INSTALL) + */ + ret = restore_madv_guard_regions(args); + if (ret) + goto core_restore_end; + /* * Tune up the task fields. */ From 9c0f725a625126063e09d01ebc087d1e36a0dcc5 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 4 Aug 2025 10:48:47 +0200 Subject: [PATCH 161/257] criu/mem: dump: note MADV_GUARD pages as VMA_AREA_GUARD VMAs Signed-off-by: Alexander Mikhalitsyn --- criu/cr-dump.c | 17 ++++++++++++ criu/include/mem.h | 1 + criu/mem.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f02db1a57..10c485cbe 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -130,6 +130,23 @@ int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap if (ret < 0) goto err; + /* + * In addition to real process VMAs we should keep an info about + * madvise(MADV_GUARD_INSTALL) pages. 
While these are not represented + * as a struct vm_area_struct in the kernel, it is convenient to treat + * them as mappings in CRIU and reuse the same VMA images but with only + * VMA_AREA_GUARD flag set. + * + * Also, we don't need to dump them during pre-dump. + */ + if (dump_file) { + ret = collect_madv_guards(pid, vma_area_list); + if (ret < 0) { + pr_err("Collect MADV_GUARD_INSTALL pages (pid: %d) failed with %d\n", pid, ret); + goto err; + } + } + pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->nr_priv_pages_longest); pr_info_vma_list(&vma_area_list->h); diff --git a/criu/include/mem.h b/criu/include/mem.h index b2cbd4b64..e9ce3518a 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -31,6 +31,7 @@ extern int do_task_reset_dirty_track(int pid); extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl); +extern int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list); #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) diff --git a/criu/mem.c b/criu/mem.c index ee841aca2..0636273cb 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1548,3 +1548,72 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) return prepare_vma_ios(t, ta); } + +int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list) +{ + int pagemap_fd = -1; + struct page_region *regs = NULL; + long regs_len = 0; + int i, ret = -1; + + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = 0, + .end = kdat.task_size, + .walk_end = 0, + .vec_len = 1000, /* this should be enough for most cases */ + .max_pages = 0, + .category_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + }; + + if (!kdat.has_pagemap_scan_guard_pages) { + ret = 0; + goto out; + } + + pagemap_fd = open_proc(pid, "pagemap"); + if (pagemap_fd < 0) + 
goto out; + + regs = xmalloc(args.vec_len * sizeof(struct page_region)); + if (!regs) + goto out; + args.vec = (long)regs; + + do { + /* start from where we finished the last time */ + args.start = args.walk_end; + regs_len = ioctl(pagemap_fd, PAGEMAP_SCAN, &args); + if (regs_len == -1) { + pr_perror("PAGEMAP_SCAN"); + goto out; + } + + for (i = 0; i < regs_len; i++) { + struct vma_area *vma; + + BUG_ON(!(regs[i].categories & PAGE_IS_GUARD)); + + vma = alloc_vma_area(); + if (!vma) + goto out; + + vma->e->start = regs[i].start; + vma->e->end = regs[i].end; + vma->e->status = VMA_AREA_GUARD; + + list_add_tail(&vma->list, &vma_area_list->h); + vma_area_list->nr++; + } + } while (args.walk_end != kdat.task_size); + + ret = 0; + +out: + xfree(regs); + if (pagemap_fd >= 0) + close(pagemap_fd); + return ret; +} From 01265cfc69e178ca5cb1ae691e1b615c2ddc7eb1 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 20 Apr 2025 20:20:20 +0200 Subject: [PATCH 162/257] test/zdtm/static/maps12: add madv guards test Test for madvise(MADV_GUARD_INSTALL). 
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/Makefile | 1 + test/zdtm/static/maps12.c | 350 +++++++++++++++++++++++++++++++++++ test/zdtm/static/maps12.desc | 1 + 3 files changed, 352 insertions(+) create mode 100644 test/zdtm/static/maps12.c create mode 100644 test/zdtm/static/maps12.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index ab69f389e..e73f964be 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -315,6 +315,7 @@ TST_FILE = \ write_read02 \ write_read10 \ maps00 \ + maps12 \ link10 \ file_attr \ deleted_unix_sock \ diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c new file mode 100644 index 000000000..b645595be --- /dev/null +++ b/test/zdtm/static/maps12.c @@ -0,0 +1,350 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test madvise(MADV_GUARD_INSTALL)"; +const char *test_author = "Alexander Mikhalitsyn "; +/* some parts of code were taken from Linux kernel's kselftest guard-pages.c + written by Lorenzo Stoakes */ + +char *filename; +int fd; +TEST_OPTION(filename, string, "file name", 1); + +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif + +uint8_t *map_base; + +struct { + unsigned int pages_num; + bool filemap; +} vmas[] = { + { 2, false }, + { 2, false }, + { 2, false }, + { 2, true }, + { 2, true }, + { 2, true }, +}; + +struct { + bool guarded; + bool wipeonfork; +} pages[] = { + { false, false }, /* vmas[0] */ + { true, false }, + { true, false }, /* vmas[1] */ + { false, false }, + { false, false }, /* vmas[2] */ + { true, true }, + { true, false }, /* vmas[3] */ + { false, false }, + { true, false }, /* vmas[4] */ + { true, false }, + { false, false }, /* vmas[5] */ + { true, false }, +}; + +static volatile sig_atomic_t signal_jump_set; +static sigjmp_buf signal_jmp_buf; + +static void handle_sigsegv(int signo) +{ + if (!signal_jump_set) + return; + + 
siglongjmp(signal_jmp_buf, 1); +} + +static bool try_write_to_addr(uint8_t *ptr) +{ + bool failed; + + /* Tell signal handler to jump back here on fatal signal. */ + signal_jump_set = true; + /* If a fatal signal arose, we will jump back here and failed is set. */ + failed = sigsetjmp(signal_jmp_buf, 1) != 0; + + if (!failed) + *ptr = 'x'; + + signal_jump_set = false; + return !failed; +} + +static int setup_sigsegv_handler(void) +{ + uint8_t write_me; + + if (signal(SIGSEGV, handle_sigsegv) == SIG_ERR) { + pr_perror("setting SIGSEGV handler failed"); + return 1; + } + + /* ensure that try_write_to_addr() works properly */ + if (!try_write_to_addr(&write_me)) { + pr_err("Failed to write at valid addr. Buggy try_write_to_addr()?\n"); + return 1; + } + + if (try_write_to_addr(NULL)) { + pr_err("Failed to detect an invalid write. Buggy try_write_to_addr()?\n"); + return 1; + } + + return 0; +} + +static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap) +{ + char *map; + + map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, + MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), + filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); + if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) + return MAP_FAILED; + + return map; +} + +static int __check_guards(const char *when, bool in_child) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + /* + * Skip pages that were never guarded, and also those + * that were, but have MADV_WIPEONFORK which means that + * guards were removed on fork. 
+ */ + if (!pages[i].guarded || (in_child && pages[i].wipeonfork)) + continue; + + if (try_write_to_addr(&map_base[i * PAGE_SIZE])) { + pr_err("successful write to a guarded area %d %s C/R\n", + i, when); + return 1; + } + } + + return 0; +} + +static int check_guards(const char *when) +{ + int status; + pid_t pid; + + /* + * First of all, check that guards are on their places + * in a main test process. + */ + if (__check_guards(when, false)) { + return 1; + } + + /* + * Now, check that guards are on their places + * after fork(). This allows to ensure that + * combo MADV_WIPEONFORK + MADV_GUARD_INSTALL + * is restored properly too. + */ + + pid = test_fork(); + if (pid < 0) { + pr_perror("check_guards: fork failed"); + return 1; + } + + if (pid == 0) { + if (__check_guards(when, true)) { + pr_err("check_guards(\"%s\") failed in child\n", when); + exit(1); + } + + exit(0); + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("check_guards: waitpid"); + return 1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + pr_err("check_guards: process didn't exit cleanly: status=%d\n", status); + return 1; + } + + return 0; +} + +static void gen_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + datagen(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc); + } +} + +static int set_pages_madvs(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + if (pages[i].guarded) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_GUARD_INSTALL)) { + pr_perror("MADV_GUARD_INSTALL failed on page %d", i); + return 1; + } + } + + if (pages[i].wipeonfork) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_WIPEONFORK)) { + pr_perror("MADV_WIPEONFORK failed on page %d", i); + return 1; + } + } + } + + return 0; +} + +static int check_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + 
+ crc = ~0; + if (datachk(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc)) { + pr_err("Page %d is corrupted\n", i); + return 1; + } + } + + return 0; +} + +static int prepare_vmas(void) +{ + char *map; + int i, shift; + + shift = 0; + for (i = 0; i < ARRAY_SIZE(vmas); i++) { + map = mmap_pages(&map_base[shift * PAGE_SIZE], + vmas[i].pages_num, vmas[i].filemap); + if (map == MAP_FAILED) { + pr_err("mmap of [%d,%d] pages failed\n", + shift, shift + vmas[i].pages_num); + return 1; + } + + shift += vmas[i].pages_num; + } + + if (shift != ARRAY_SIZE(pages)) { + pr_err("Different number of pages in vmas and pages arrays.\n"); + return 1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + unsigned int pages_num = ARRAY_SIZE(pages); + + test_init(argc, argv); + + fd = open(filename, O_TRUNC | O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("Unable to create a test file"); + return -1; + } + + if (ftruncate(fd, pages_num * PAGE_SIZE)) { + pr_perror("Unable to ftruncate a test file"); + return -1; + } + + if (setup_sigsegv_handler()) { + pr_err("setup_sigsegv_handler() failed\n"); + return 1; + } + + /* let's find a large enough area in address space */ + map_base = mmap_pages(NULL, pages_num, false); + if (map_base == MAP_FAILED) { + pr_err("mmap of %d pages failed\n", pages_num); + return 1; + } + + /* + * Now we know that we have a free vm address space area + * [map_base, map_base + pages_num * PAGE_SIZE). + * We can use (map_base) as a hint for our further mmaps. 
+ */ + if (prepare_vmas()) { + pr_err("prepare_vmas() failed\n"); + return 1; + } + + /* fill non-guarded pages with data and preserve checksums */ + gen_pages_data(); + + if (set_pages_madvs()) { + pr_err("set_pages_madvs() failed\n"); + return 1; + } + + /* ensure that madvise(MADV_GUARD_INSTALL) works like expected */ + if (check_guards("before")) { + pr_err("check_guards(\"before\") failed\n"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* ensure that guards are at their places */ + if (check_guards("after")) { + fail("check_guards(\"after\") failed"); + return 1; + } + + /* check that non-guarded pages still contain original data */ + if (check_pages_data()) { + fail("check_pages_data() failed"); + return 1; + } + + pass(); + munmap(map_base, pages_num * PAGE_SIZE); + close(fd); + return 0; +} diff --git a/test/zdtm/static/maps12.desc b/test/zdtm/static/maps12.desc new file mode 100644 index 000000000..3f7627ff3 --- /dev/null +++ b/test/zdtm/static/maps12.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'feature': 'pagemap_scan_guard_pages'} From 98f2bd525a5eb6db84bdabf4566b18aeaacf32af Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 10 Aug 2025 18:22:23 +0200 Subject: [PATCH 163/257] ci/vagrant: install vanilla kernel for Fedora Rawhide test We need at least 6.16 to test MADV_GUARD_INSTALL support, but our current Fedora Rawhide test uses only Rawhide's user space, while using Fedora 42 kernel. Let's start using a vanilla kernel. 
Suggested-by: Adrian Reber Signed-off-by: Alexander Mikhalitsyn --- scripts/ci/vagrant.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 008a01fb3..98942e756 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -66,6 +66,10 @@ fedora-no-vdso() { } fedora-rawhide() { + # Upgrade the kernel to the latest vanilla one + ssh default sudo dnf -y copr enable @kernel-vanilla/stable + ssh default sudo dnf upgrade -y + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously # installed this reboots the VM. From dcee5bd6ff2d632bd4e1d4d09d2ffb2bf683d6a2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 15 Aug 2025 01:44:01 +0000 Subject: [PATCH 164/257] make: Disable branch-protection for PIE code on ARM64 Branch protection uses PAC. It cryptographically "signs" a function's return address before it is stored on the stack. Upon return, the address is authenticated using a secret key. If the signature is invalid, the program will fault. The PIE code is used for the parasite and the restorer. In both cases, it runs in a foreign process. The case of the restorer is even trickier because it needs to restore the original PAC keys, which invalidates all previously "signed" pointers within the restorer itself. 
Fixes #2709 Signed-off-by: Andrei Vagin --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 5d8e89ac1..7272cfce1 100644 --- a/Makefile +++ b/Makefile @@ -64,6 +64,8 @@ endif ifeq ($(ARCH),aarch64) DEFINES := -DCONFIG_AARCH64 + CC_MBRANCH_PROT := $(shell $(CC) -c -x c /dev/null -mbranch-protection=none -o /dev/null >/dev/null 2>&1 && echo "-mbranch-protection=none") + CFLAGS_PIE := $(CC_MBRANCH_PROT) endif ifeq ($(ARCH),ppc64) From 2ba343010663f12979ca29fa22c54e511f2d6473 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 16 Aug 2025 15:45:05 +0100 Subject: [PATCH 165/257] test/zdtm/static/maps12: fix pointer-to-int cast The `offset` argument to `mmap()` was computed with a direct cast from pointer to `off_t`: `(off_t)addr_hint - (off_t)map_base` This causes a build failure when compiling since pointers and `off_t` may differ in size on some platforms. maps12.c: In function 'mmap_pages': maps12.c:114:50: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 114 | filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); | ^ maps12.c:114:69: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 114 | filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); The fix in this patch is to cast both pointers to `intptr_t`, perform the subtraction in that type, and then cast the result back to `off_t`. Signed-off-by: Radostin Stoyanov --- test/zdtm/static/maps12.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c index b645595be..f0d6c2381 100644 --- a/test/zdtm/static/maps12.c +++ b/test/zdtm/static/maps12.c @@ -111,7 +111,8 @@ static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), - filemap ? 
fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); + filemap ? fd : -1, + filemap ? (off_t)((intptr_t)addr_hint - (intptr_t)map_base) : 0); if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) return MAP_FAILED; From fa1b399064575be2aff7d3c6486f0503b0098038 Mon Sep 17 00:00:00 2001 From: Dong Sunchao Date: Wed, 20 Aug 2025 12:38:18 +0000 Subject: [PATCH 166/257] zdtm/static/sock_opts00: use unix socket to test SO_PASSCRED and SO_PASSSEC SO_PASSCRED and SO_PASSSEC are only valid for AF_UNIX and AF_NETLINK This patch updates the test logic to use a unix socket for these options, while preserving the original value consistency check Fixes: #2705 Signed-off-by: Dong Sunchao --- test/zdtm/static/sock_opts00.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index fcf00ffed..854aaa591 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -31,7 +31,7 @@ int main(int argc, char **argv) static const int NOPTS = sizeof(vname) / sizeof(*vname); #undef OPT - int sock, ret = 0, val[NOPTS], rval, i; + int sock, usock, sk, ret = 0, val[NOPTS], rval, i; socklen_t len = sizeof(int); test_init(argc, argv); @@ -42,8 +42,15 @@ int main(int argc, char **argv) return 1; } + usock = socket(AF_UNIX, SOCK_STREAM, 0); + if (usock < 0) { + pr_perror("can't create unix socket"); + return 1; + } + for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? 
usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { pr_perror("can't get %s", vname[i].name); return 1; @@ -51,13 +58,13 @@ int main(int argc, char **argv) val[i]++; - ret = setsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], len); + ret = setsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { pr_perror("can't set %s = %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't re-get %s", vname[i].name); return 1; @@ -78,7 +85,8 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't verify %s", vname[i].name); return 1; @@ -93,6 +101,7 @@ int main(int argc, char **argv) pass(); close(sock); + close(usock); return 0; } From 4b73985955ecc01604d8ed1247605a5875042e4c Mon Sep 17 00:00:00 2001 From: Dong Sunchao Date: Wed, 20 Aug 2025 12:38:37 +0000 Subject: [PATCH 167/257] criu/sockets: Restrict SO_PASSCRED and SO_PASSSEC to supported families Linux 6.16+ restricts SO_PASSCRED and SO_PASSSEC to AF_UNIX, AF_NETLINK, and AF_BLUETOOTH This patch updates CRIU to check the socket family before dumping these options Fixes: #2705 Signed-off-by: Dong Sunchao --- criu/include/sockets.h | 2 +- criu/sk-inet.c | 2 +- criu/sk-netlink.c | 2 +- criu/sk-packet.c | 2 +- criu/sk-unix.c | 2 +- criu/sockets.c | 16 +++++++++------- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/criu/include/sockets.h b/criu/include/sockets.h index c3e7c879a..6c81d3edd 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -25,7 +25,7 @@ struct socket_desc { }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); -extern 
int dump_socket_opts(int sk, SkOptsEntry *soe); +extern int dump_socket_opts(int sk, int family, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 6e0acf2ce..422edc656 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -581,7 +581,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa if (dump_ip_opts(lfd, family, type, proto, &ipopts)) goto err; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, family, &skopts)) goto err; pr_info("Dumping inet socket at %d\n", p->fd); diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index a219b69be..dc2baa1b8 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -165,7 +165,7 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) ne.fown = (FownEntry *)&p->fown; ne.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_NETLINK, &skopts)) goto err; fe.type = FD_TYPES__NETLINKSK; diff --git a/criu/sk-packet.c b/criu/sk-packet.c index 1d2e23522..6530bff58 100644 --- a/criu/sk-packet.c +++ b/criu/sk-packet.c @@ -173,7 +173,7 @@ static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p) psk.fown = (FownEntry *)&p->fown; psk.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_PACKET, &skopts)) return -1; psk.protocol = sd->proto; diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 70ca16be4..6145fe734 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -527,7 +527,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) } } dump: - if (dump_socket_opts(lfd, skopts)) + if (dump_socket_opts(lfd, AF_UNIX, skopts)) goto err; pr_info("Dumping unix socket at %d\n", p->fd); diff --git a/criu/sockets.c b/criu/sockets.c index 0affccad0..e4adae03c 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -649,7 
+649,7 @@ int do_dump_opt(int sk, int level, int name, void *val, int len) return 0; } -int dump_socket_opts(int sk, SkOptsEntry *soe) +int dump_socket_opts(int sk, int family, SkOptsEntry *soe) { int ret = 0, val; struct timeval tv; @@ -688,13 +688,15 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) soe->so_reuseport = val ? true : false; soe->has_so_reuseport = true; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); - soe->has_so_passcred = true; - soe->so_passcred = val ? true : false; + if (family == AF_UNIX || family == AF_NETLINK) { + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); + soe->has_so_passcred = true; + soe->so_passcred = val ? true : false; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); - soe->has_so_passsec = true; - soe->so_passsec = val ? true : false; + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); + soe->has_so_passsec = true; + soe->so_passsec = val ? true : false; + } ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); soe->has_so_dontroute = true; From 254ba3e8cc60790eec2369e2fb9ca3702a3f7019 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 8 Sep 2025 12:48:34 -0700 Subject: [PATCH 168/257] ci: avoid Docker 28 due to regression This change modifies the CI script to avoid Docker version 28, which has a known regression that breaks Checkpoint/Restore (C/R) functionality. The issue is tracked in the moby/moby project as https://github.com/moby/moby/issues/50750. Signed-off-by: Andrei Vagin --- scripts/ci/docker-test.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index aaf443afd..ae7f52454 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,6 +2,24 @@ set -x -e -o pipefail +# Workaround: Docker 28.x has a known regression that breaks the checkpoint and +# restore (C/R) feature. Let's install previous, or next major version. See +# https://github.com/moby/moby/issues/50750 for details on the bug. 
+export DEBIAN_FRONTEND=noninteractive +apt remove -y docker-ce docker-ce-cli +./apt-install -y ca-certificates curl +install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +chmod a+r /etc/apt/keyrings/docker.asc +# shellcheck disable=SC1091 +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" > /etc/apt/sources.list.d/docker.list +apt update -y +apt-cache madison docker-ce | awk '{ print $3 }' +verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" +./apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" + # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json service docker restart From a779417a3fa59e55209c50a1a0c40f48a1c456ee Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 14 Sep 2025 19:29:16 -0700 Subject: [PATCH 169/257] zdtm: stop importing junit_xml We are dropping support for generating JUnit XML reports in zdtm.py as we've migrated testing infrastructure entirely to `GitHub Actions` and other third-party test runners. This package has been removed from some distribution repositories (e.g., Fedora), making it simpler to remove the dependency than to force installation via pip. 
Signed-off-by: Andrei Vagin --- .cirrus.yml | 2 +- scripts/build/Dockerfile.alpine | 2 -- scripts/build/Dockerfile.archlinux | 1 - scripts/build/Dockerfile.centos8 | 2 -- scripts/ci/prepare-for-fedora-rawhide.sh | 1 - scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 2 +- test/jenkins/criu-lazy-migration.pipeline | 1 - test/zdtm.py | 24 +---------------------- 9 files changed, 4 insertions(+), 33 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index bddd5a3f1..848e14132 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -34,7 +34,7 @@ task: setup_script: | dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python3-importlib-metadata xmlto libdrm-devel libuuid-devel # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. 
dnf -y upgrade nettle nettle-devel diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index d843793ea..819fda0c3 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -48,6 +48,4 @@ RUN apk add \ # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml --break-system-packages - RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index 9d11194bb..d4b432f8d 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -32,7 +32,6 @@ RUN pacman -Syu --noconfirm \ go \ python-yaml \ asciidoctor \ - python-junit-xml \ python-importlib-metadata \ libdrm \ util-linux-libs \ diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index a67212344..5ab6c9cfa 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -45,6 +45,4 @@ RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 test -RUN pip3 install junit_xml - RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f8ad9cf97..f8f797c1e 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -26,7 +26,6 @@ dnf install -y \ protobuf-devel \ python3-PyYAML \ python3-protobuf \ - python3-junit_xml \ python3-pip \ python3-importlib-metadata \ python-unversioned-command \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 0c4a08975..617f54fc6 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -6,7 +6,7 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time libbsd-dev python3-yaml uuid-dev 
libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata python3-junit.xml libdrm-dev) + python3-importlib-metadata libdrm-dev) X86_64_PKGS=(gcc-multilib) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 98942e756..c222e30e0 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -44,7 +44,7 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ + protobuf-devel python3-protobuf python3-importlib-metadata \ rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket diff --git a/test/jenkins/criu-lazy-migration.pipeline b/test/jenkins/criu-lazy-migration.pipeline index 2c863f170..45dc2c776 100644 --- a/test/jenkins/criu-lazy-migration.pipeline +++ b/test/jenkins/criu-lazy-migration.pipeline @@ -21,7 +21,6 @@ pipeline { stage('Test'){ steps { sh './test/jenkins/run_ct sh -c "mount --make-rprivate / && mount --rbind . 
/mnt && cd /mnt && ./test/jenkins/criu-lazy-migration.sh"' - junit 'test/report/criu-testreport*.xml' } } } diff --git a/test/zdtm.py b/test/zdtm.py index 3339dd816..7e83aa4df 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2078,8 +2078,6 @@ class Launcher: self.__subs = {} self.__fail = False self.__file_report = None - self.__junit_file = None - self.__junit_test_cases = None self.__failed = [] self.__nr_skip = 0 if self.__max > 1 and self.__total > 1: @@ -2091,22 +2089,14 @@ class Launcher: if opts['report'] and (opts['keep_going'] or self.__total == 1): global TestSuite, TestCase - from junit_xml import TestCase, TestSuite now = datetime.datetime.now() att = 0 reportname = os.path.join(report_dir, "criu-testreport.tap") - junitreport = os.path.join(report_dir, "criu-testreport.xml") - while os.access(reportname, os.F_OK) or os.access( - junitreport, os.F_OK): + while os.access(reportname, os.F_OK): reportname = os.path.join(report_dir, "criu-testreport" + ".%d.tap" % att) - junitreport = os.path.join(report_dir, - "criu-testreport" + ".%d.xml" % att) att += 1 - self.__junit_file = open(junitreport, 'a') - self.__junit_test_cases = [] - self.__file_report = open(reportname, 'a') print(u"TAP version 13", file=self.__file_report) print(u"# Hardware architecture: " + arch, file=self.__file_report) @@ -2141,10 +2131,6 @@ class Launcher: self.__runtest += 1 self.__nr_skip += 1 - if self.__junit_test_cases is not None: - tc = TestCase(name) - tc.add_skipped_info(reason) - self.__junit_test_cases.append(tc) if self.__file_report: testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print(testline, file=self.__file_report) @@ -2247,10 +2233,6 @@ class Launcher: # It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() tc = None - if self.__junit_test_cases is not None: - tc = TestCase(sub['name'], - elapsed_sec=time.time() - sub['start']) - self.__junit_test_cases.append(tc) if status != 0: self.__fail = True failed_flavor = 
decode_flav(os.WEXITSTATUS(status)) @@ -2307,10 +2289,6 @@ class Launcher: if not opts['fault'] and check_core_files(): self.__fail = True if self.__file_report: - ts = TestSuite(opts['title'], self.__junit_test_cases, - os.getenv("NODE_NAME")) - self.__junit_file.write(TestSuite.to_xml_string([ts])) - self.__junit_file.close() self.__file_report.close() if opts['keep_going']: From 053a22a23bf05c91223d48dc609defa641354a87 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Thu, 18 Sep 2025 10:01:48 +0200 Subject: [PATCH 170/257] pagemap: prevent integer overflow in pagemap_len Fixes #2738 Original-patch-by: Andrey Vagin Signed-off-by: Lorenzo Fontana --- criu/include/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 3ae15deb9..fae110108 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -121,7 +121,7 @@ extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned lo static inline unsigned long pagemap_len(PagemapEntry *pe) { - return pe->nr_pages * PAGE_SIZE; + return (unsigned long)pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) From 80c280610e43fc78e1479ad681bc22e69b4b5287 Mon Sep 17 00:00:00 2001 From: dong sunchao Date: Thu, 18 Sep 2025 03:09:30 +1000 Subject: [PATCH 171/257] compel/mips: Relax ELF magic check to support MIPS libraries On MIPS platforms, shared libraries may use EI_ABIVERSION = 5 to indicate support for .MIPS.xhash sections. The previous ELF header check in handle_binary() strictly compared e_ident against a hardcoded value, causing legitimate shared objects to be rejected. This patch replaces the memcmp-based check with a structured validation of ELF magic and class, and allows EI_ABIVERSION values other than 0.
fixes: #2745 Signed-off-by: dong sunchao --- compel/arch/mips/src/lib/handle-elf.c | 31 +++++++++++++++++++-------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/compel/arch/mips/src/lib/handle-elf.c b/compel/arch/mips/src/lib/handle-elf.c index a605a5a45..e086761c2 100644 --- a/compel/arch/mips/src/lib/handle-elf.c +++ b/compel/arch/mips/src/lib/handle-elf.c @@ -5,18 +5,31 @@ #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - extern int __handle_elf(void *mem, size_t size); int handle_binary(void *mem, size_t size) { - if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) - return __handle_elf(mem, size); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)mem; - pr_err("Unsupported Elf format detected\n"); - return -EINVAL; + /* check ELF magic */ + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return -EINVAL; + } + + /* check ELF class and data encoding */ + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64 || + ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF class or data encoding\n"); + return -EINVAL; + } + + if (ehdr->e_ident[EI_ABIVERSION] != 0) { + pr_warn("Unusual ABI version: %d\n", ehdr->e_ident[EI_ABIVERSION]); + } + + return __handle_elf(mem, size); } From a8c5e11715673926f95ecaebd6e805c2d311636b Mon Sep 17 00:00:00 2001 From: Filip Hejsek Date: Sat, 13 Sep 2025 19:49:24 +0200 Subject: [PATCH 172/257] lsm: use attr/apparmor/current to get apparmor label On some kernels, attr/current can be intercepted by BPF LSM, causing errors (#2033). Using attr/apparmor/current is preferable, because it is guaranteed to return the apparmor label. attr/current will still be used as a fallback for older kernels. 
Fixes: #2033 Signed-off-by: Filip Hejsek --- criu/lsm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/lsm.c b/criu/lsm.c index 70b66d42e..5faf3e5b2 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -29,7 +29,9 @@ static int apparmor_get_label(pid_t pid, char **profile_name) FILE *f; char *space; - f = fopen_proc(pid, "attr/current"); + f = fopen_proc(pid, "attr/apparmor/current"); + if (!f) + f = fopen_proc(pid, "attr/current"); if (!f) return -1; From c7395f4cbedc5cf0dd86a2c7aa12e58e33ffc2f4 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 14 Sep 2025 18:44:51 -0700 Subject: [PATCH 173/257] files: fork helpers without CLONE_FILES | CLONE_FS On restore, CRIU needs to change mount namespaces to properly restore files and unix sockets. However, the kernel prevents this if a process is sharing its file system information (fs) with other processes. Fixes #2687 Signed-off-by: Andrei Vagin --- criu/files.c | 1 - criu/pstree.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/criu/files.c b/criu/files.c index f16ec32a2..af4b8aeac 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1329,7 +1329,6 @@ int prepare_fds(struct pstree_item *me) } } - BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) diff --git a/criu/pstree.c b/criu/pstree.c index 75c2fc8d0..cee8b5741 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -237,9 +237,8 @@ int init_pstree_helper(struct pstree_item *ret) { BUG_ON(!ret->parent); ret->pid->state = TASK_HELPER; - rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS; - if (shared_fdt_prepare(ret) < 0) - return -1; + rsti(ret)->clone_flags = 0; + INIT_LIST_HEAD(&rsti(ret)->fds); task_entries->nr_helpers++; return 0; } From afb2e6c3f95dd0b15f739d9669bd7eaf120a2f31 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 18 Sep 2025 14:48:42 +0000 Subject: [PATCH 174/257] pagemap: change PagemapEntry.nr_pages to uint64 to support huge mappings Update the nr_pages field in 
PagemapEntry to uint64 to prepare for checkpointing and restoring huge memory mappings. Backward compatibility with older pagemap images is preserved. Signed-off-by: Andrei Vagin --- criu/include/pagemap.h | 2 +- criu/page-xfer.c | 1 + criu/pagemap.c | 5 ++++- images/pagemap.proto | 3 ++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index fae110108..3ae15deb9 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -121,7 +121,7 @@ extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned lo static inline unsigned long pagemap_len(PagemapEntry *pe) { - return (unsigned long)pe->nr_pages * PAGE_SIZE; + return pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 0314963e6..b0e04d82c 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -326,6 +326,7 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag pe.nr_pages = iov->iov_len / PAGE_SIZE; pe.has_flags = true; pe.flags = flags; + pe.has_nr_pages = true; if (flags & PE_PRESENT) { if (opts.auto_dedup && xfer->parent != NULL) { diff --git a/criu/pagemap.c b/criu/pagemap.c index 85bb92259..d9ccc03eb 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%u vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lu vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } @@ -682,6 +682,9 @@ static void init_compat_pagemap_entry(PagemapEntry *pe) pe->flags |= PE_PARENT; else if (!pe->has_flags) pe->flags = PE_PRESENT; + + if (!pe->has_nr_pages) + pe->nr_pages = pe->compat_nr_pages; } /* 
diff --git a/images/pagemap.proto b/images/pagemap.proto index e6d341b0f..f2436a51a 100644 --- a/images/pagemap.proto +++ b/images/pagemap.proto @@ -10,7 +10,8 @@ message pagemap_head { message pagemap_entry { required uint64 vaddr = 1 [(criu).hex = true]; - required uint32 nr_pages = 2; + required uint32 compat_nr_pages = 2; optional bool in_parent = 3; optional uint32 flags = 4 [(criu).flags = "pmap.flags" ]; + optional uint64 nr_pages = 5; } From 7e0da4d9757e67d8bd0ee8441a581483ad97b12e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 18 Sep 2025 15:20:32 +0000 Subject: [PATCH 175/257] pagemap: use unsigned long for page counts Variables storing page counts were previously `unsigned int`, limiting them to a maximum of 2^32 pages. With a 4k page size, this corresponds to a 16TB memory mapping, which is insufficient for larger mappings. This commit changes the type for these variables to `unsigned long` to support larger memory mappings. Signed-off-by: Andrei Vagin --- criu/include/page-pipe.h | 6 +++--- criu/include/page-xfer.h | 6 +++--- criu/include/pagemap.h | 6 +++--- criu/include/parasite.h | 2 +- criu/mem.c | 2 +- criu/page-pipe.c | 6 +++--- criu/page-xfer.c | 20 ++++++++++---------- criu/pagemap.c | 22 +++++++++++----------- criu/pie/parasite.c | 2 +- criu/uffd.c | 25 ++++++++++++------------- 10 files changed, 48 insertions(+), 49 deletions(-) diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h index 15178c015..65292b7ab 100644 --- a/criu/include/page-pipe.h +++ b/criu/include/page-pipe.h @@ -92,9 +92,9 @@ struct kernel_pipe_buffer { struct page_pipe_buf { int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be fit into pipe */ - unsigned int pipe_off; /* where this buf is started in a pipe */ - unsigned int pages_in; /* how many pages are there */ unsigned int nr_segs; /* how many iov-s are busy */ + unsigned long pipe_off; /* where this buf is started in a pipe */ + unsigned long pages_in; /* how many pages 
are there */ #define PPB_LAZY (1 << 0) unsigned int flags; struct iovec *iov; /* vaddr:len map */ @@ -149,7 +149,7 @@ struct pipe_read_dest { }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); -extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index 36fe67092..0d9b35019 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -69,9 +69,9 @@ extern int check_parent_page_xfer(int fd_type, unsigned long id); */ /* async request/receive of remote pages */ -extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); +extern int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages); -typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); -extern int page_server_start_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); +typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, unsigned long nr_pages, void *); +extern int page_server_start_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 3ae15deb9..4cbc87cc6 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -44,7 +44,7 @@ struct page_read { /* reads page from current pagemap */ - int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *, unsigned flags); + int (*read_pages)(struct page_read *, unsigned long vaddr, unsigned long nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct page_read *pr); 
void (*close)(struct page_read *); @@ -52,8 +52,8 @@ struct page_read { int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); - int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); - int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags); + int (*io_complete)(struct page_read *, unsigned long vaddr, unsigned long nr); + int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; diff --git a/criu/include/parasite.h b/criu/include/parasite.h index b33d6710f..176357711 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -63,7 +63,7 @@ struct parasite_dump_pages_args { unsigned int add_prot; unsigned int off; unsigned int nr_segs; - unsigned int nr_pages; + unsigned long nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) diff --git a/criu/mem.c b/criu/mem.c index 0636273cb..f8c550842 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -336,7 +336,7 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; - pr_debug("PPB: %d pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, + pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); diff --git a/criu/page-pipe.c b/criu/page-pipe.c index aab6742be..f8e3520f7 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -381,7 +381,7 @@ int pipe_read_dest_init(struct pipe_read_dest *prd) return 0; } -int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +int page_pipe_read(struct page_pipe *pp, 
struct pipe_read_dest *prd, unsigned long addr, unsigned long int *nr_pages, unsigned int ppb_flags) { struct page_pipe_buf *ppb; @@ -406,7 +406,7 @@ int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned lo } /* clamp the request if it passes the end of iovec */ - len = min((unsigned long)iov->iov_base + iov->iov_len - addr, (unsigned long)(*nr_pages) * PAGE_SIZE); + len = min((unsigned long)iov->iov_base + iov->iov_len - addr, *nr_pages * PAGE_SIZE); *nr_pages = len / PAGE_SIZE; skip += ppb->pipe_off * PAGE_SIZE; @@ -446,7 +446,7 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - pr_debug("\tbuf %u pages, %u iovs, flags: %x pipe_off: %x :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lu pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; diff --git a/criu/page-xfer.c b/criu/page-xfer.c index b0e04d82c..4d057163d 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -32,7 +32,7 @@ static int page_server_sk = -1; struct page_server_iov { u32 cmd; - u32 nr_pages; + u64 nr_pages; u64 vaddr; u64 dst_id; }; @@ -886,7 +886,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %ld/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -1071,7 +1071,7 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%u\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 "/%lu\n", pi->vaddr, pi->nr_pages); if (prep_loc_xfer(pi)) return -1; @@ -1348,7 +1348,7 @@ 
static int fill_page_pipe(struct page_read *pr, struct page_pipe *pp) static int page_pipe_from_pagemap(struct page_pipe **pp, int pid) { struct page_read pr; - int nr_pages = 0; + unsigned long nr_pages = 0; if (open_page_read(pid, &pr, PR_TASK) <= 0) { pr_err("Failed to open page read for %d\n", pid); @@ -1551,13 +1551,13 @@ struct ps_async_read { static LIST_HEAD(async_reads); -static inline void async_read_set_goal(struct ps_async_read *ar, int nr_pages) +static inline void async_read_set_goal(struct ps_async_read *ar, unsigned long nr_pages) { ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE; ar->nr_pages = nr_pages; } -static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages, ps_async_read_complete complete, +static void init_ps_async_read(struct ps_async_read *ar, void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { ar->pages = buf; @@ -1567,7 +1567,7 @@ static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages async_read_set_goal(ar, nr_pages); } -static int page_server_start_async_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv) +static int page_server_start_async_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { struct ps_async_read *ar; @@ -1667,7 +1667,7 @@ int connect_to_page_server_to_recv(int epfd) return epoll_add_rfd(epfd, &ps_rfd); } -int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) +int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages) { struct page_server_iov pi = { .cmd = PS_IOV_GET, @@ -1684,7 +1684,7 @@ int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) return 0; } -static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete complete, void *priv) +static int page_server_start_sync_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv) { struct ps_async_read ar; int 
ret = 1; @@ -1695,7 +1695,7 @@ static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete return ret; } -int page_server_start_read(void *buf, int nr, ps_async_read_complete complete, void *priv, unsigned flags) +int page_server_start_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv, unsigned flags) { if (flags & PR_ASYNC) return page_server_start_async_read(buf, nr, complete, priv); diff --git a/criu/pagemap.c b/criu/pagemap.c index d9ccc03eb..16d680fdb 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -168,15 +168,15 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) return 0; } -static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) +static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lu vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lu vs %lx:%lu\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } -static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_parent_page(struct page_read *pr, unsigned long vaddr, unsigned long int nr, void *buf, unsigned flags) { struct page_read *ppr = pr->parent; int ret; @@ -195,7 +195,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v */ do { - int p_nr; + unsigned long int p_nr; pr_debug("\tpr%lu-%u Read from parent\n", pr->img_id, pr->id); ret = ppr->seek_pagemap(ppr, vaddr); @@ -210,7 +210,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v * read as much as we can. 
*/ p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE; - pr_info("\tparent has %u pages in\n", p_nr); + pr_info("\tparent has %lu pages in\n", p_nr); if (p_nr > nr) p_nr = nr; @@ -374,7 +374,7 @@ int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, st return 0; } -static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; unsigned long len = nr * PAGE_SIZE; @@ -402,7 +402,7 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int * We cannot use maybe_read_page_local() for streaming images as it uses * pread(), seeking in the file. Instead, we use this custom page reader. */ -static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { unsigned long len = nr * PAGE_SIZE; int fd; @@ -445,7 +445,7 @@ static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vadd return ret; } -static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) +static int read_page_complete(unsigned long img_id, unsigned long vaddr, unsigned long int nr_pages, void *priv) { int ret = 0; struct page_read *pr = priv; @@ -463,7 +463,7 @@ static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_ return ret; } -static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; @@ -474,9 +474,9 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int return ret; } -static int 
read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { - pr_info("pr%lu-%u Read %lx %u pages\n", pr->img_id, pr->id, vaddr, nr); + pr_info("pr%lu-%u Read %lx %lu pages\n", pr->img_id, pr->id, vaddr, nr); pagemap_bound_check(pr->pe, vaddr, nr); if (pagemap_in_parent(pr->pe)) { diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 1bc03dc2a..c966e9e62 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -101,7 +101,7 @@ static int dump_pages(struct parasite_dump_pages_args *args) } if (spliced_bytes != args->nr_pages * PAGE_SIZE) { sys_close(p); - pr_err("Can't splice all pages to pipe (%ld/%d)\n", spliced_bytes, args->nr_pages); + pr_err("Can't splice all pages to pipe (%ld/%ld)\n", spliced_bytes, args->nr_pages); return -1; } diff --git a/criu/uffd.c b/criu/uffd.c index 98c2b7e07..8e12dcd63 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -668,12 +668,11 @@ static int remap_iovs(struct lazy_pages_info *lpi, unsigned long from, unsigned */ static int collect_iovs(struct lazy_pages_info *lpi) { + unsigned long start, end, len, nr_pages = 0; + int n_vma = 0, max_iov_len = 0, ret = -1; struct page_read *pr = &lpi->pr; struct lazy_iov *iov; MmEntry *mm; - int nr_pages = 0, n_vma = 0, max_iov_len = 0; - int ret = -1; - unsigned long start, end, len; mm = init_mm_entry(lpi); if (!mm) @@ -728,7 +727,7 @@ free_mm: return ret; } -static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr); +static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, unsigned long nr); static int ud_open(int client, struct lazy_pages_info **_lpi) { @@ -822,7 +821,7 @@ static bool uffd_recoverable_error(int mcopy_rc) return false; } -static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int *nr_pages, long mcopy_rc) +static int uffd_check_op_error(struct lazy_pages_info 
*lpi, const char *op, unsigned long *nr_pages, long mcopy_rc) { if (errno == ENOSPC || errno == ESRCH) { handle_exit(lpi); @@ -844,7 +843,7 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int return 0; } -static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) +static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages) { struct uffdio_copy uffdio_copy; unsigned long len = *nr_pages * page_size(); @@ -865,12 +864,12 @@ static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) return 0; } -static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr) +static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsigned long nr) { struct lazy_pages_info *lpi; - unsigned long addr = 0; - int req_pages, ret; + unsigned long addr = 0, req_pages; struct lazy_iov *req; + int ret; lpi = container_of(pr, struct lazy_pages_info, pr); @@ -920,7 +919,7 @@ static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr return drop_iovs(lpi, addr, nr * PAGE_SIZE); } -static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) +static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long nr_pages) { struct uffdio_zeropage uffdio_zeropage; unsigned long len = page_size() * nr_pages; @@ -946,7 +945,7 @@ static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) * Returns 0 for zero pages, 1 for "real" pages and negative value on * error */ -static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) +static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr) { int ret; @@ -961,7 +960,7 @@ static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) return 0; } -static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags) +static int uffd_handle_pages(struct 
lazy_pages_info *lpi, __u64 address, unsigned long nr, unsigned flags) { int ret; @@ -1003,7 +1002,7 @@ static void update_xfer_len(struct lazy_pages_info *lpi, bool pf) static int xfer_pages(struct lazy_pages_info *lpi) { struct lazy_iov *iov; - unsigned int nr_pages; + unsigned long nr_pages; unsigned long len; int err; From 2e26b36d44e9ccee7d9b6978a36cbaa308f9a119 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 19 Sep 2025 15:10:25 +0000 Subject: [PATCH 176/257] pagemap: print page regions in the format `start - end` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During investigations, it’s much easier to read logs when regions are printed in the start - end format rather than `start/size`. In addition, all page counters and memory sizes are now printed in hexadecimal, as they are hard to read in decimal form. Signed-off-by: Andrei Vagin --- criu/cr-dedup.c | 3 ++- criu/page-pipe.c | 6 +++--- criu/page-xfer.c | 23 +++++++++++++---------- criu/pagemap.c | 2 +- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c index c0c21f53e..feeb9ebb0 100644 --- a/criu/cr-dedup.c +++ b/criu/cr-dedup.c @@ -87,7 +87,8 @@ static int cr_dedup_one_pagemap(unsigned long img_id, int flags) if (ret <= 0) goto exit; - pr_debug("dedup iovec base=%" PRIx64 ", len=%lu\n", pr.pe->vaddr, pagemap_len(pr.pe)); + pr_debug("dedup iovec %" PRIx64 " - %" PRIx64 "\n", + pr.pe->vaddr, pr.pe->vaddr + pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) diff --git a/criu/page-pipe.c b/criu/page-pipe.c index f8e3520f7..4601d8f9c 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -446,17 +446,17 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - pr_debug("\tbuf %lu pages, %u iovs, flags: 
%x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lx pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; - pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } pr_debug("* %u holes:\n", pp->free_hole); for (i = 0; i < pp->free_hole; i++) { iov = &pp->holes[i]; - pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 4d057163d..e2913b924 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -178,12 +178,12 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le ssize_t ret, left = len; if (opts.tls) { - pr_debug("Sending %lu bytes / %lu pages\n", len, len / PAGE_SIZE); + pr_debug("Sending %lx bytes\n", len); if (tls_send_data_from_fd(p, len)) return -1; } else { - pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE); + pr_debug("Splicing %lx bytes into socket\n", len); while (left > 0) { ret = splice(p, NULL, xfer->sk, NULL, left, SPLICE_F_MOVE); @@ -192,7 +192,7 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le return -1; } - pr_debug("\tSpliced: %lu bytes sent\n", (unsigned long)ret); + pr_debug("\tSpliced: %lx bytes sent\n", (unsigned long)ret); left -= ret; } } @@ -288,7 +288,7 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) * read_pagemap_page routine. 
*/ - pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); + pr_debug("Checking %p - %p hole\n", iov->iov_base, iov->iov_base + iov->iov_len); off = (unsigned long)iov->iov_base; end = off + iov->iov_len; while (1) { @@ -300,7 +300,8 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) return -1; } - pr_debug("\tFound %" PRIx64 "/%lu\n", p->pe->vaddr, pagemap_len(p->pe)); + pr_debug("\tFound %" PRIx64 " - %" PRIx64 "\n", + p->pe->vaddr, p->pe->vaddr + pagemap_len(p->pe)); /* * The pagemap entry in parent may happen to be @@ -340,7 +341,8 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag if (xfer->parent != NULL) { ret = check_pagehole_in_parent(xfer->parent, iov); if (ret) { - pr_err("Hole %p/%zu not found in parent\n", iov->iov_base, iov->iov_len); + pr_err("Hole %p - %p not found in parent\n", + iov->iov_base, iov->iov_base + iov->iov_len); return -1; } } @@ -850,7 +852,7 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\t p %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\t p %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); @@ -886,7 +888,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %ld/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %lx/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -898,7 +900,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\tp %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\tp %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); 
@@ -1071,7 +1073,8 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%lu\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 " - %" PRIx64 "\n", + pi->vaddr, pi->vaddr + pi->nr_pages * PAGE_SIZE); if (prep_loc_xfer(pi)) return -1; diff --git a/criu/pagemap.c b/criu/pagemap.c index 16d680fdb..b6ec3e333 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lu vs %lx:%lu\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lx vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } From 2d2168fc9c142a2eaf18f319b0d21825775d5660 Mon Sep 17 00:00:00 2001 From: dong sunchao Date: Tue, 23 Sep 2025 01:00:12 +1000 Subject: [PATCH 177/257] vdso: relax EI_OSABI check to support linux in ELF header On some ARM/aarch64 systems, the VDSO ELF header sets EI_OSABI to 3 (Linux), while CRIU expects 0 (System V). This strict check causes restore to fail with "ELF header magic mismatch" This patch relaxes the check to accept both values, improving compatibility with modern toolchains and kernels (e.g. Linux 6.12+) Fixes: #2751 Signed-off-by: dong sunchao --- criu/pie/util-vdso.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 8daf5c71f..45fb6a648 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -98,25 +98,45 @@ static unsigned long elf_gnu_hash(const unsigned char *name) static int has_elf_identity(Ehdr_t *ehdr) { - /* - * See Elf specification for this magic values. 
- */ + /* check ELF magic */ + + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return false; + }; + + /* check ELF class */ #if defined(CONFIG_VDSO_32) - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x01, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS32) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #else - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #endif - BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); - - if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { - pr_err("ELF header magic mismatch\n"); + /* check ELF data encoding */ + if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF data encoding: %d\n", ehdr->e_ident[EI_DATA]); return false; - } + }; + /* check ELF version */ + if (ehdr->e_ident[EI_VERSION] != EV_CURRENT) { + pr_err("Unsupported ELF version: %d\n", ehdr->e_ident[EI_VERSION]); + return false; + }; + /* check ELF OSABI */ + if (ehdr->e_ident[EI_OSABI] != ELFOSABI_NONE && + ehdr->e_ident[EI_OSABI] != ELFOSABI_LINUX) { + pr_err("Unsupported OSABI version: %d\n", ehdr->e_ident[EI_OSABI]); + return false; + }; return true; } From 91758a68e929f1bf9fb2e682aa53e924806ac475 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 22 Sep 2025 17:59:29 +0000 Subject: [PATCH 178/257] zdtm: Remove junit_xml leftovers The previous commit 4cd4a6b1ac15 ("zdtm: stop importing junit_xml") removed the junit_xml library, but some variables related to it were left in the code. 
This commit removes the unused `tc` variable and a call to its `add_error_info` method. Fixes: 4cd4a6b1ac15 ("zdtm: stop importing junit_xml") Signed-off-by: Andrei Vagin --- test/zdtm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index 7e83aa4df..e21356c30 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2232,7 +2232,6 @@ class Launcher: # The following wait() is not useful for our domain logic. # It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() - tc = None if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) @@ -2243,7 +2242,6 @@ class Launcher: with open(sub['log']) as sublog: output = sublog.read() details = {'output': output} - tc.add_error_info(output=output) print(testline, file=self.__file_report) print("%s" % yaml.safe_dump(details, explicit_start=True, From 67751bc11b2906a3bc6e7bf65fce19717c272356 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 19 Sep 2025 00:34:56 +0000 Subject: [PATCH 179/257] docs: add developer overviews for AI assistants This commit adds the document to provide high-level overviews of the CRIU project for AI assistants like Claude and Gemini. These documents are intended to be used as context for AI-powered developer assistants to help them understand the project's goals, architecture, and development process. This will allow them to provide more accurate and helpful responses to developer questions. 
The documents include: - A brief introduction to CRIU - A quick start guide for checkpointing and restoring a simple process - An overview of the dump and restore process - A description of the Compel subproject - Information about the project's coding style, code layout, and tests Signed-off-by: Andrei Vagin --- CLAUDE.md | 1 + GEMINI.md | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 120000 CLAUDE.md create mode 100644 GEMINI.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 000000000..e3c5a92d9 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +GEMINI.md \ No newline at end of file diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 000000000..e56c1de12 --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,136 @@ +# CRIU (Checkpoint/Restore In User-space) + +CRIU is a tool for saving the state of a running application to a set of files +(checkpointing) and restoring it back to a live state. It is primarily used for +live migration of containers, in-place updates, and fast application startup. + +It is implemented as a command-line tool called `criu`. The two primary commands +are `dump` and `restore`. + +- `dump`: Saves a process tree and all its related resources (file + descriptors, IPC, sockets, namespaces, etc.) into a collection of image + files. +- `restore`: Restores processes from image files to the same state they were + in before the dump. + +## Quick Start + +To get a feel for `criu`, you can try checkpointing and restoring a simple +process. + +1. **Run a simple process:** + Open a terminal and run a command that will run for a while. Find its PID. + ```bash + sleep 1000 & + [1] 12345 + ``` + +2. **Dump the process:** + As root, use `criu dump` with the process ID (`-t`) and a directory for the + image files (`-D`). + ```bash + sudo criu dump -t 12345 -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will no longer be running. + +3. 
**Restore the process:** + Use `criu restore` to bring the process back to life from the images. + ```bash + sudo criu restore -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will be running again as if nothing happened. + +# For Developers and Contributors + +This section contains more technical details about CRIU's internals and +development process. + +## Dump Process + +On dump, CRIU uses available kernel interfaces to collect information about +processes. For properties that can only be retrieved from within the process +itself, CRIU injects a binary blob (called a "parasite") into the process's +address space and executes it in the context of one of the process's threads. +This injection is handled by a subproject called **Compel**. + +## Restore Process + +On restore, CRIU reads the image files to reconstruct the processes. The goal is +to restore them to the exact state they were in before the dump. The restore +process is divided into several stages (defined as `CR_STATE_*` in +`./criu/include/restorer.h`). + +The main `criu` process acts as a coordinator. It first restores resources with +inter-process dependencies (file descriptors, sockets, shared memory, +namespaces, etc.). It then forks the process tree and sets up namespaces. +Finally, it restores process-specific resources like file descriptors and memory +mappings. + +A key step involves a small, self-contained binary called the "restorer". All +restored processes switch to executing this code, which unmaps the CRIU-specific +memory and restores the application's original memory mappings. On the final +step, the restorer calls `sigreturn` on a prepared signal frame to resume the +process with the state it had at the moment of the dump. + +## Compel + +Compel is a subproject responsible for generating the binary blobs used for the +parasite code (for dumping) and the restorer code (for restoring). 
It provides a +library for injecting and executing this code within the target process's +address space. It is a separate project because the logic for generating and +injecting Position-Independent Executable (PIE) code is complex and +self-contained. + +## Coding Style + +The C code in the CRIU project follows the +[Linux Kernel Coding Style](https://www.kernel.org/doc/html/latest/process/coding-style.html). +Here are some of the main points: + +- **Indentation**: Use tabs, which are set to 8 characters. +- **Line Length**: The preferred line limit is 80 characters, but it can be + extended to 120 if it improves code readability. +- **Braces**: + - The opening brace for a function goes on a new line. + - The opening brace for a block (like `if`, `for`, `while`, `switch`) goes + on the same line. +- **Spaces**: Use spaces around operators (`+`, `-`, `*`, `/`, `%`, `<`, `>`, + `=`, etc.). +- **Naming**: Use descriptive names for functions and variables. +- **Comments**: Use C-style comments (`/* ... */`). For multi-line comments, + the preferred format is: + ```c + /* + * This is a multi-line + * comment. + */ + ``` + +## Code Layout + +The code is organized into the following directories: + +- `./compel`: The Compel sub-project. +- `./criu`: The main `criu` tool source code. +- `./images`: Protobuf descriptions for the image files. +- `./test`: All tests. +- `./test/zdtm`: The Zero-Downtime Migration (ZDTM) test suite. +- `./test/zdtm.py`: The executor script for ZDTM tests. +- `./scripts`: Helper scripts. +- `./scripts/build`: Docker image files used for CI and cross-compilation + checks. +- `./crit`: A tool to inspect and manipulate CRIU image files. +- `./soccr`: A library for TCP socket checkpoint/restore. + +## Tests + +The main test suite is ZDTM. Here is an example of how to run a single test: + +```bash +sudo ./test/zdtm.py run -t zdtm/static/env00 +``` + +Each ZDTM test has three stages: preparation, C/R, and results checks. 
During +the test, a process calls `test_daemon()` to signal it is ready for C/R, then +calls `test_waitsig()` to wait for the C/R stage to complete. After being +restored, the test checks that all its resources are still in a valid state. From 25f8be0f6016bd6ef0e0a1222cd3fbfca8e0b6fd Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 17 Sep 2025 19:14:36 +0900 Subject: [PATCH 180/257] ci: use package-manager dependency install scripts Currently, adding a package which is required either for development or testing requires it to be added in multiple places due to many duplicated Dockerfiles and installation scripts. This makes it difficult to ensure that all scripts are updated appropriately and can lead to some places being missed. This patch consolidates the list of dependencies and adds installation scripts for each package-manager used in our CI (apk, apt, dnf, pacman). This change also replaces the `debian/dev-packages.lst` as this subfolder conflicts with the Ubuntu/Debian packaging scripts used for CRIU: https://github.com/rst0git/criu-deb-packages This patch also removes the CentOS 8 build scripts as it is EOL and the container registry is no longer available.
Signed-off-by: Shashank Balaji Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 12 +-- .github/workflows/check-commits.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/nftables-test.yml | 2 +- CONTRIBUTING.md | 84 ++++++++++++------- Makefile | 3 +- {scripts/ci => contrib}/apt-install | 0 contrib/debian/dev-packages.lst | 19 ----- contrib/dependencies/apk-packages.sh | 38 +++++++++ contrib/dependencies/apt-cross-packages.sh | 34 ++++++++ contrib/dependencies/apt-packages.sh | 40 +++++++++ contrib/dependencies/dnf-packages.sh | 35 ++++++++ contrib/dependencies/pacman-packages.sh | 31 +++++++ scripts/build/Dockerfile.alpine | 43 +--------- scripts/build/Dockerfile.archlinux | 35 +------- scripts/build/Dockerfile.centos8 | 48 ----------- scripts/build/Dockerfile.fedora.tmpl | 5 +- scripts/build/Dockerfile.hotspot-alpine | 25 +----- scripts/build/Dockerfile.hotspot-ubuntu | 28 +------ scripts/build/Dockerfile.linux32.tmpl | 26 +----- scripts/build/Dockerfile.openj9-ubuntu | 28 +------ .../Dockerfile.riscv64-stable-cross.tmpl | 33 +------- scripts/build/Dockerfile.stable-cross.tmpl | 25 +----- scripts/build/Dockerfile.tmpl | 36 +------- scripts/build/Dockerfile.unstable-cross.tmpl | 26 +----- scripts/build/Dockerfile.x86_64.hdr | 2 +- scripts/build/Makefile | 2 +- scripts/ci/Makefile | 2 +- scripts/ci/docker-test.sh | 4 +- scripts/ci/java-test.sh | 2 + scripts/ci/loongarch64-qemu-test.sh | 4 +- scripts/ci/prepare-for-fedora-rawhide.sh | 29 +------ scripts/ci/run-ci-tests.sh | 12 +-- scripts/ci/vagrant.sh | 12 +-- scripts/install-debian-pkgs.sh | 25 ------ 35 files changed, 295 insertions(+), 459 deletions(-) rename {scripts/ci => contrib}/apt-install (100%) delete mode 100644 contrib/debian/dev-packages.lst create mode 100755 contrib/dependencies/apk-packages.sh create mode 100755 contrib/dependencies/apt-cross-packages.sh create mode 100755 contrib/dependencies/apt-packages.sh create mode 100755 contrib/dependencies/dnf-packages.sh create mode 100755 
contrib/dependencies/pacman-packages.sh delete mode 100644 scripts/build/Dockerfile.centos8 delete mode 100755 scripts/install-debian-pkgs.sh diff --git a/.cirrus.yml b/.cirrus.yml index 848e14132..99dd70d63 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -13,7 +13,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-no-vdso @@ -34,7 +34,7 @@ task: setup_script: | dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python3-importlib-metadata xmlto libdrm-devel libuuid-devel + contrib/dependencies/dnf-packages.sh # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. 
dnf -y upgrade nettle nettle-devel @@ -63,7 +63,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-rawhide @@ -83,7 +83,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-non-root @@ -96,7 +96,7 @@ task: memory: 4G script: uname -a build_script: | - scripts/ci/apt-install make + contrib/apt-install make make -C scripts/ci local task: @@ -107,7 +107,7 @@ task: memory: 4G script: uname -a build_script: | - scripts/ci/apt-install make + contrib/apt-install make make -C scripts/ci local CLANG=1 task: diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index 354873909..bf7d06697 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -19,7 +19,7 @@ jobs: # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev + run: sudo contrib/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 88e21d3d1..9c9e46c1b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ 
-34,7 +34,7 @@ jobs: - name: Install Packages (cpp) if: ${{ matrix.language == 'cpp' }} run: | - sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev + sudo contrib/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev - name: Initialize CodeQL uses: github/codeql-action/init@v3 with: diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml index eb3d8e814..7a7d8bd30 100644 --- a/.github/workflows/nftables-test.yml +++ b/.github/workflows/nftables-test.yml @@ -15,7 +15,7 @@ jobs: - name: Remove iptables run: sudo apt remove -y iptables - name: Install libnftables-dev - run: sudo scripts/ci/apt-install libnftables-dev + run: sudo contrib/apt-install libnftables-dev - name: chmod 755 /home/runner # CRIU's tests are sometimes running as some random user and need # to be able to access the test files. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 712e7b813..3ad4aa101 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,19 +27,43 @@ The repository may contain multiple branches. Development happens in the **criu- To clone CRIU repo and switch to the proper branch, run: ``` - git clone https://github.com/checkpoint-restore/criu criu - cd criu - git checkout criu-dev +git clone https://github.com/checkpoint-restore/criu criu +cd criu +git checkout criu-dev ``` -### Compile +### Building from source -First, you need to install compile-time dependencies. 
Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. Alternatively, you can use the Nix flake to set up a development environment by running `nix develop`. +Follow these steps to compile CRIU from source code. -To compile CRIU, run: +#### Installing build dependencies + +First, you need to install the required build dependencies. We provide scripts to simplify this process for several Linux distributions in [contrib/dependencies](contrib/dependencies). For a complete list of dependencies, please refer to the [installation guide](https://criu.org/Installation). + +##### On Ubuntu/Debian-based systems: ``` - make +./contrib/dependencies/apt-packages.sh +``` + +##### On Fedora/CentOS-based systems: + +``` +./contrib/dependencies/dnf-packages.sh +``` + +##### Using Nix: + +``` +nix develop +``` + +#### Compiling CRIU + +Once the dependencies are installed, you can compile CRIU by running the `make` command from the root of the source directory: + +``` +make ``` This should create the `./criu/criu` executable. @@ -63,7 +87,7 @@ The following command can be used to automatically run a code linter for Python text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). ``` - make lint +make lint ``` In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) @@ -73,7 +97,7 @@ results in decreased readability, we may choose to ignore these errors. Run the following command to check if your changes are compliant with the clang-format rules: ``` - make indent +make indent ``` This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to @@ -83,7 +107,7 @@ can use `BASE=origin/criu-dev`. 
The `OPTS` option can be used to pass additional to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. ``` - make indent OPTS=--diff BASE=HEAD~N +make indent OPTS=--diff BASE=HEAD~N ``` Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected @@ -96,7 +120,7 @@ Here are some bad examples of clang-format-ing: ``` @@ -58,8 +59,7 @@ static int register_membarriers(void) } - + if (!all_ok) { - fail("can't register membarrier()s - tried %#x, kernel %#x", - barriers_registered, barriers_supported); @@ -129,7 +153,7 @@ Here are some bad examples of clang-format-ing: CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run ``` - make test +make test ``` The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. @@ -166,21 +190,21 @@ If your change fixes a bug in a specific commit, e.g. you found an issue using the SHA-1 ID, and the one line summary. For example: ``` - Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") +Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") ``` The following `git config` settings can be used to add a pretty format for outputting the above style in the `git log` or `git show` commands: ``` - [pretty] - fixes = Fixes: %h (\"%s\") +[pretty] + fixes = Fixes: %h (\"%s\") ``` If your change address an issue listed in GitHub, please use `Fixes:` tag with the number of the issue. For instance: ``` - Fixes: #339 +Fixes: #339 ``` The `Fixes:` tags should be put at the end of the detailed description. 
@@ -263,7 +287,7 @@ can certify the below: then you just add a line saying ``` - Signed-off-by: Random J Developer +Signed-off-by: Random J Developer ``` using your real name (please, no pseudonyms or anonymous contributions if @@ -275,14 +299,14 @@ commit message. To append such line to a commit you already made, use ``` From: Random J Developer - Subject: [PATCH] component: Short patch description +Subject: [PATCH] component: Short patch description - Long patch description (could be skipped if patch - is trivial enough) +Long patch description (could be skipped if patch +is trivial enough) - Signed-off-by: Random J Developer - --- - Patch body here +Signed-off-by: Random J Developer +--- +Patch body here ``` ## Submit your work upstream @@ -316,8 +340,8 @@ contains the following: revisions should be listed. For example: ``` - v3: rebase on the current criu-dev - v2: add commit to foo() and update bar() coding style +v3: rebase on the current criu-dev +v2: add commit to foo() and update bar() coding style ``` If there are only minor updates to the commits in a pull request, it is @@ -335,7 +359,7 @@ Historically, CRIU worked with mailing lists and patches so if you still prefer To create a patch, run ``` - git format-patch --signoff origin/criu-dev +git format-patch --signoff origin/criu-dev ``` You might need to read GIT documentation on how to prepare patches @@ -346,8 +370,8 @@ at all. 
We recommend to post patches using `git send-email` ``` - git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev +git send-email --cover-letter --no-chain-reply-to --annotate \ + --confirm=always --to=criu@openvz.org criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -359,14 +383,14 @@ If this is your first time using git send-email, you might need to configure it to point it to your SMTP server with something like: ``` - git config --global sendemail.smtpServer stmp.example.net +git config --global sendemail.smtpServer smtp.example.net ``` If you get tired of typing `--to=criu@openvz.org` all the time, you can configure that to be automatically handled as well: ``` - git config sendemail.to criu@openvz.org +git config sendemail.to criu@openvz.org ``` If a developer is sending another version of the patch (e.g. to address diff --git a/Makefile b/Makefile index 7272cfce1..3e5d62726 100644 --- a/Makefile +++ b/Makefile @@ -464,7 +464,8 @@ ruff: shellcheck: shellcheck --version shellcheck scripts/*.sh - shellcheck scripts/ci/*.sh scripts/ci/apt-install + shellcheck scripts/ci/*.sh + shellcheck contrib/apt-install contrib/dependencies/*.sh shellcheck -x test/others/crit/*.sh shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh diff --git a/scripts/ci/apt-install b/contrib/apt-install similarity index 100% rename from scripts/ci/apt-install rename to contrib/apt-install diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst deleted file mode 100644 index ce45f1b7c..000000000 --- a/contrib/debian/dev-packages.lst +++ /dev/null @@ -1,19 +0,0 @@ -# Required packages for development in Debian -build-essential -libprotobuf-dev -libprotobuf-c-dev -protobuf-c-compiler -protobuf-compiler -python3-protobuf -libnet-dev - -# Extra packages, required for testing and building other tools -pkg-config -libnl-3-dev -libbsd0
-libbsd-dev -iproute2 -libcap-dev -libaio-dev -python3-yaml -libnl-route-3-dev diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh new file mode 100755 index 000000000..0084dea3a --- /dev/null +++ b/contrib/dependencies/apk-packages.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env sh + +apk add --no-cache \ + asciidoctor \ + bash \ + build-base \ + coreutils \ + e2fsprogs \ + git \ + gnutls-dev \ + go \ + ip6tables \ + iproute2 \ + iptables \ + iptables-legacy \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libcap-utils \ + libdrm-dev \ + libnet-dev \ + libnl3-dev \ + nftables \ + nftables-dev \ + pkgconfig \ + procps \ + protobuf-c-compiler \ + protobuf-c-dev \ + protobuf-dev \ + py3-importlib-metadata \ + py3-pip \ + py3-protobuf \ + py3-yaml \ + python3 \ + sudo \ + tar \ + util-linux \ + util-linux-dev diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh new file mode 100755 index 000000000..588be40d0 --- /dev/null +++ b/contrib/dependencies/apt-cross-packages.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + crossbuild-essential-"${DEBIAN_ARCH}" \ + iproute2:"${DEBIAN_ARCH}" \ + libaio-dev:"${DEBIAN_ARCH}" \ + libbz2-dev:"${DEBIAN_ARCH}" \ + libc6-"${DEBIAN_ARCH}"-cross \ + libc6-dev-"${DEBIAN_ARCH}"-cross \ + libcap-dev:"${DEBIAN_ARCH}" \ + libexpat1-dev:"${DEBIAN_ARCH}" \ + libgnutls28-dev:"${DEBIAN_ARCH}" \ + libnet-dev:"${DEBIAN_ARCH}" \ + libnftables-dev:"${DEBIAN_ARCH}" \ + libnl-3-dev:"${DEBIAN_ARCH}" \ + libnl-route-3-dev:"${DEBIAN_ARCH}" \ + libprotobuf-c-dev:"${DEBIAN_ARCH}" \ + libprotobuf-dev:"${DEBIAN_ARCH}" \ + libssl-dev:"${DEBIAN_ARCH}" \ + ncurses-dev:"${DEBIAN_ARCH}" \ + uuid-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + build-essential \ + pkg-config \ + git \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh new file mode 100755 index 000000000..c60ba9041 --- /dev/null +++ b/contrib/dependencies/apt-packages.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + asciidoctor \ + bash \ + bsdmainutils \ + build-essential \ + gdb \ + git-core \ + iptables \ + kmod \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libdrm-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnet-dev \ + libnl-3-dev \ + libnl-route-3-dev \ + libperl-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-importlib-metadata \ + python3-pip \ + python3-protobuf \ + python3-yaml \ + time \ + util-linux \ + uuid-dev diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh new file mode 100755 index 000000000..efbb659c5 --- /dev/null +++ b/contrib/dependencies/dnf-packages.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env sh + +dnf install -y \ + asciidoc \ + binutils \ + gcc \ + git \ + glibc-devel \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libbpf-devel \ + libbsd-devel \ + libcap-devel \ + libdrm-devel \ + libnet-devel \ + libnl3-devel \ + libselinux-devel \ + libuuid-devel \ + make \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + protobuf-c-devel \ + protobuf-compiler \ + protobuf-devel \ + python-devel \ + python3-importlib-metadata \ + python3-protobuf \ + python3-pyyaml \ + rubygem-asciidoctor \ + xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh new file mode 100755 index 000000000..5fe6995fb --- /dev/null +++ b/contrib/dependencies/pacman-packages.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env sh + +pacman -Syu --noconfirm \ + asciidoctor \ + base-devel \ + bash \ + coreutils \ + diffutils \ + git \ + gnutls \ + go \ + iproute2 \ + iptables \ + libaio \ + libbsd \ + libcap \ + libdrm \ + libnet \ + libnl \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + python-importlib-metadata \ + python-pip \ + python-protobuf \ + python-yaml \ + sudo \ + tar 
\ + util-linux \ + util-linux-libs diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 819fda0c3..ed883f300 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -1,49 +1,12 @@ FROM alpine ARG CC=gcc -RUN apk update && apk add \ - $CC \ - bash \ - build-base \ - coreutils \ - procps \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - nftables \ - nftables-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - py3-pip \ - py3-protobuf \ - python3 \ - sudo \ - libcap-utils \ - libdrm-dev \ - util-linux \ - util-linux-dev - COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date -RUN apk add \ - ip6tables \ - iptables \ - iptables-legacy \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - e2fsprogs \ - py-yaml \ - py3-importlib-metadata \ - asciidoctor +RUN apk add --no-cache "$CC" && /criu/contrib/dependencies/apk-packages.sh + +RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index d4b432f8d..261bd2d79 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -5,40 +5,11 @@ ARG CC=gcc # Initialize machine ID RUN systemd-machine-id-setup -RUN pacman -Syu --noconfirm \ - $CC \ - bash \ - make \ - coreutils \ - git \ - gnutls \ - libaio \ - libcap \ - libnet \ - libnl \ - nftables \ - pkgconfig \ - protobuf-c \ - protobuf \ - python-pip \ - python-protobuf \ - which \ - sudo \ - iptables \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - python-yaml \ - asciidoctor \ - python-importlib-metadata \ - libdrm \ - util-linux-libs \ - diffutils - COPY . 
/criu WORKDIR /criu + +RUN pacman -Syu --noconfirm "$CC" && contrib/dependencies/pacman-packages.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 deleted file mode 100644 index 5ab6c9cfa..000000000 --- a/scripts/build/Dockerfile.centos8 +++ /dev/null @@ -1,48 +0,0 @@ -FROM registry.centos.org/centos/centos:8 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core -RUN yum config-manager --set-enabled powertools -RUN yum install -y --allowerasing \ - asciidoc \ - coreutils \ - chkconfig \ - diffutils \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libselinux-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-devel \ - python3-PyYAML \ - python3-protobuf \ - python3-pip \ - sudo \ - tar \ - which \ - xmlto - -RUN alternatives --set python /usr/bin/python3 -ENV PYTHON=python3 - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.fedora.tmpl b/scripts/build/Dockerfile.fedora.tmpl index 9d3bb0f87..c26a5fd57 100644 --- a/scripts/build/Dockerfile.fedora.tmpl +++ b/scripts/build/Dockerfile.fedora.tmpl @@ -1,11 +1,10 @@ ARG CC=gcc -COPY scripts/ci/prepare-for-fedora-rawhide.sh /bin/prepare-for-fedora-rawhide.sh -RUN /bin/prepare-for-fedora-rawhide.sh - COPY . 
/criu WORKDIR /criu +RUN dnf install -y "$CC" && scripts/ci/prepare-for-fedora-rawhide.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index 6caf9d0b1..cd632dddf 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,30 +1,11 @@ FROM docker.io/library/eclipse-temurin:11-alpine ARG CC=gcc -RUN apk update && apk add \ - bash \ - build-base \ - coreutils \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - python3 \ - sudo \ - maven \ - ip6tables \ - iptables \ - util-linux-dev \ - bash - COPY . /criu WORKDIR /criu +RUN apk add --no-cache maven "$CC" && contrib/dependencies/apk-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 67de916ac..76aa571fa 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,33 +1,11 @@ FROM docker.io/library/eclipse-temurin:11-focal ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index d218e0641..a37f16e49 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -1,32 +1,10 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - uuid-dev \ - python3-minimal - COPY . /criu WORKDIR /criu +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh + RUN uname -m && setarch linux32 uname -m && setarch --list RUN make mrproper && date && \ diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 0ae4727d2..825495659 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,34 +1,12 @@ FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl index e95a43306..8933a6c82 100644 --- a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -1,5 +1,3 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 @@ -12,33 +10,6 @@ COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ apt-get update -y -# Install required packages -RUN apt-get install -y --no-install-recommends \ - build-essential \ - pkg-config \ - git \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libnftables-dev:${DEBIAN_ARCH} \ - libgnutls28-dev:${DEBIAN_ARCH} \ - iproute2:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -55,4 +26,6 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 65ae55833..56104081f 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -1,30 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ stable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libdrm-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -41,6 +18,8 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu +RUN contrib/dependencies/apt-cross-packages.sh + # amdgpu_plugin with armv7 is not supported RUN make mrproper && date && \ make -j $(nproc) && \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 3d6de1044..498b99be9 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -1,40 +1,12 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -# On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default -# We need to install kmod to enable iptables to load these modules for us. -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libbsd-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnftables-dev \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - iproute2 \ - kmod \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-minimal \ - python3-protobuf \ - uuid-dev \ - python3-yaml - COPY . /criu WORKDIR /criu +# On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default +# We need to install kmod to enable iptables to load these modules for us. 
+RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh + RUN git clean -dfx && date && \ # Check single object build make -j $(nproc) CC="$CC" criu/parasite-syscall.o && \ diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl index 3504b0433..7edb289b6 100644 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -1,29 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ unstable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -40,4 +18,6 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index 566b4c916..a666f6c26 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,5 +1,5 @@ FROM ubuntu:24.04 -COPY scripts/ci/apt-install /bin/apt-install +COPY contrib/apt-install /bin/apt-install RUN apt-install gcc-multilib diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 389315227..a420cea94 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,4 +1,4 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 +ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 9dc0190b3..ed30e4268 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,7 +11,7 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide centos8 archlinux +TARGETS := alpine fedora-rawhide archlinux ZDTM_OPTS := UNAME := $(shell uname -m) export UNAME diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index ae7f52454..bc5a74667 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -7,7 +7,7 @@ set -x -e -o pipefail # https://github.com/moby/moby/issues/50750 for details on the bug. 
export DEBIAN_FRONTEND=noninteractive apt remove -y docker-ce docker-ce-cli -./apt-install -y ca-certificates curl +../../contrib/apt-install -y ca-certificates curl install -m 0755 -d /etc/apt/keyrings curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc chmod a+r /etc/apt/keyrings/docker.asc @@ -18,7 +18,7 @@ echo \ apt update -y apt-cache madison docker-ce | awk '{ print $3 }' verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" -./apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" +../../contrib/apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh index 7cf704f07..a5b13a107 100755 --- a/scripts/ci/java-test.sh +++ b/scripts/ci/java-test.sh @@ -2,6 +2,8 @@ cd ../.. || exit 1 +sudo modprobe iptable_filter + failures="" docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh index d5646468e..7e00ab65a 100755 --- a/scripts/ci/loongarch64-qemu-test.sh +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -4,7 +4,7 @@ set -o nounset set -o errexit set -x -./apt-install \ +../../contrib/apt-install \ apt-transport-https \ ca-certificates \ curl \ @@ -19,7 +19,7 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce +../../contrib/apt-install docker-ce # shellcheck source=/dev/null . 
/etc/lsb-release diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f8f797c1e..ff75717c5 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -1,43 +1,22 @@ #!/bin/bash set -e -x +contrib/dependencies/dnf-packages.sh dnf install -y \ diffutils \ + e2fsprogs \ findutils \ gawk \ - gcc \ - git \ - gnutls-devel \ gzip \ - iproute \ - iptables \ - nftables \ - nftables-devel \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libbsd-devel \ + kmod \ libselinux-utils \ - make \ procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-PyYAML \ - python3-protobuf \ python3-pip \ - python3-importlib-metadata \ python-unversioned-command \ redhat-rpm-config \ sudo \ tar \ - which \ - e2fsprogs \ - rubygem-asciidoctor \ - libdrm-devel \ - libuuid-devel \ - kmod + which # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 617f54fc6..9fbdd8e30 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,12 +1,7 @@ #!/bin/bash set -x -e -CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev - libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev - libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time libbsd-dev python3-yaml uuid-dev - libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata libdrm-dev) +CI_PKGS=() X86_64_PKGS=(gcc-multilib) @@ -60,7 +55,8 @@ ci_prep () { CI_PKGS+=("${X86_64_PKGS[@]}") fi - scripts/ci/apt-install "${CI_PKGS[@]}" + contrib/dependencies/apt-packages.sh + contrib/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" } @@ -187,7 +183,7 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then done apt-get remove "${INCOMPATIBLE_LIBS[@]}" dpkg --add-architecture i386 - scripts/ci/apt-install "${IA32_PKGS[@]}" + contrib/apt-install 
"${IA32_PKGS[@]}" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c222e30e0..f69b11352 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -22,9 +22,8 @@ setup() { wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ - ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ - openssh-client + ../../contrib/apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ + ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} @@ -41,16 +40,13 @@ setup() { vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config - ssh default sudo dnf upgrade -y - ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ - libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-protobuf python3-importlib-metadata \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' + ssh default sudo dnf upgrade -y + ssh default sudo /vagrant/criu/contrib/dependencies/dnf-packages.sh ssh default cat /proc/cmdline } diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh deleted file mode 100755 index 8be49c787..000000000 --- 
a/scripts/install-debian-pkgs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Install required packages for development environment in Debian Distro - -REQ_PKGS=${REQ_PKGS:=contrib/debian/dev-packages.lst} - -help_msg="Install required packages for development environment in Debian Distro -Usage: - scripts/install-debian-pkgs.sh" - -function print_help() -{ - exec echo -e "$help_msg" -} - -function process() -{ - sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' "${REQ_PKGS}" )" -} - -if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then - print_help -else - process -fi From b25ff1d3363ce9ccfc0854009ac0f96431439848 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 26 Sep 2025 16:54:49 +0100 Subject: [PATCH 181/257] Remove travis-ci leftovers Travis CI stopped providing CI minutes for open-source projects some time ago and we have migrated to GitHub actions. Signed-off-by: Radostin Stoyanov --- .travis.yml | 35 ----------------------------------- CONTRIBUTING.md | 7 ------- Makefile | 2 +- Makefile.compel | 4 ++-- scripts/ci/Makefile | 4 ++-- scripts/ci/run-ci-tests.sh | 16 +++++++--------- scripts/ci/vagrant.sh | 7 +------ test/inhfd/memfd.py.checkskip | 2 +- test/zdtm/Makefile.inc | 2 +- 9 files changed, 15 insertions(+), 64 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 94841b3f3..000000000 --- a/.travis.yml +++ /dev/null @@ -1,35 +0,0 @@ -language: c -os: linux -dist: bionic -services: - - docker -jobs: - include: - - os: linux - arch: ppc64le - env: TR_ARCH=local - dist: bionic - - os: linux - arch: ppc64le - env: TR_ARCH=local CLANG=1 - dist: bionic - - os: linux - arch: s390x - env: TR_ARCH=local - dist: bionic - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local RUN_TESTS=1 - dist: focal - group: edge - virt: vm - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local CLANG=1 RUN_TESTS=1 - group: edge - virt: vm - dist: bionic -script: - - sudo make -C 
scripts/ci $TR_ARCH -after_success: - - make -C scripts/ci after_success diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3ad4aa101..2d1dc8227 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -158,11 +158,6 @@ make test The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. -In case you'd rather have someone else run the tests, you can use travis-ci for your -own GitHub fork of CRIU. It will check the compilation for various supported platforms, -as well as run most of the tests from the suite. See https://travis-ci.org/checkpoint-restore/criu -for more details. - ## Describe your changes Describe your problem. Whether your change is a one-line bug fix or @@ -420,5 +415,3 @@ sometimes a patch may fly around a week before it gets reviewed. Wiki article: [Continuous integration](https://criu.org/Continuous_integration) CRIU tests are run for each series sent to the mailing list. If you get a message from our patchwork that patches failed to pass the tests, you have to investigate what is wrong. - -We also recommend you to [enable Travis CI for your repo](https://criu.org/Continuous_integration#Enable_Travis_CI_for_your_repo) to check patches in your git branch, before sending them to the mailing list. diff --git a/Makefile b/Makefile index 3e5d62726..611bcdd5a 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ ifeq ($(ARCH),arm) endif ifeq ($(ARMV),8) - # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. + # Running 'setarch linux32 uname -m' returns armv8l on aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. 
ARCHCFLAGS += -march=armv7-a diff --git a/Makefile.compel b/Makefile.compel index 764afadc8..a4209edc5 100644 --- a/Makefile.compel +++ b/Makefile.compel @@ -50,8 +50,8 @@ compel/plugins/%: $(compel-deps) .FORCE # # GNU make 4.x supports targets matching via wide -# match targeting, where GNU make 3.x series (used on -# Travis) is not, so we have to write them here explicitly. +# match targeting, where GNU make 3.x series is not, +# so we have to write them here explicitly. compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index ed30e4268..bad8065f2 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -30,9 +30,9 @@ endif export CONTAINER_TERMINAL +# Here we assume that any CPU architecture besides x86_64 is running in containers +# that may not support running docker with '--privileged'. ifeq ($(UNAME),x86_64) - # On anything besides x86_64 Travis is running unprivileged LXD - # containers which do not support running docker with '--privileged'. CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run else CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 9fbdd8e30..7a8345b7c 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -11,13 +11,11 @@ IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # For Travis only x86_64 seems to be baremetal. Other - # architectures are running in unprivileged LXD containers. - # That seems to block most of CRIU's interfaces. - - # But with the introduction of baremetal aarch64 systems in - # Travis (arch: arm64-graviton2) we can override this using - # an environment variable + # Some tests rely on kernel features that may not be availble + # when running in a container. 
Here we assume that x86_64 + # systems are baremetal, and skip the tests for all other + # CPU architectures. We can override this using the RUN_TESTS + # environment variable (e.g., for aarch64). [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi @@ -31,7 +29,7 @@ ci_prep () { # not run anymore with 'sudo -u \#1000' if the UID does not exist. adduser -u 1000 --disabled-password --gecos "criutest" criutest || : - # This can fail on aarch64 travis + # This can fail on aarch64 service apport stop || : # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user @@ -258,7 +256,7 @@ if [ -z "$SKIP_EXT_DEV_TEST" ]; then fi make -C test/others/make/ run CC="$CC" -if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then +if [ -n "$CIRCLECI" ]; then # GitHub Actions (and Cirrus CI) does not provide a real TTY and CRIU will fail with: # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index f69b11352..5f2de32b8 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -11,11 +11,6 @@ FEDORA_VERSION=42 FEDORA_BOX_VERSION=1.1.0 setup() { - if [ -n "$TRAVIS" ]; then - # Load the kvm modules for vagrant to use qemu - modprobe kvm kvm_intel - fi - # Tar up the git checkout to have vagrant rsync it to the VM tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate. @@ -29,7 +24,7 @@ setup() { vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} # The default libvirt Vagrant VM uses 512MB. - # Travis VMs should have around 7.5GB. + # VMs in our CI typically have around 16GB. # Increasing it to 4GB should work. 
sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' # Sync /tmp/criu.tar into the VM diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 27e2b7b15..32c57d929 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -3,5 +3,5 @@ import ctypes libc = ctypes.CDLL(None) -# libc may not have memfd_create (e.g., centos on travis) +# libc may not have memfd_create (e.g., centos) libc.memfd_create("test".encode('utf8'), 0) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index c19888da3..3b349ed4d 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -27,7 +27,7 @@ ifeq ($(ARCH),arm) else ifeq ($(ARMV),7) ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) - # To build aarch32 on armv8 Travis-CI (see criu Makefile) + # To build aarch32 on armv8 (see criu Makefile) ARCHCFLAGS += -march=armv7-a ARMV := 7 endif From 0a81dc8bbe9aa4acadc5a47f7a0e276940f9edb5 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 26 Sep 2025 23:38:08 +0900 Subject: [PATCH 182/257] ci/java: update base image from focal to jammy Ubuntu Focal Fossa (20.04) reached its end-of-life on 31 May 2025. So, move over to using Ubuntu Jammy (22.04) base images. Also, focal repos do not have libtracefs, which the uprobes zdtm test needs. Signed-off-by: Shashank Balaji --- scripts/build/Dockerfile.hotspot-ubuntu | 2 +- scripts/build/Dockerfile.openj9-ubuntu | 2 +- scripts/ci/run-ci-tests.sh | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 76aa571fa..a459e1ec7 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/eclipse-temurin:11-focal +FROM docker.io/library/eclipse-temurin:11-jammy ARG CC=gcc COPY . 
/criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 825495659..18664f100 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal +FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-jammy ARG CC=gcc RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 7a8345b7c..05a3b71e8 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -11,11 +11,10 @@ IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # Some tests rely on kernel features that may not be availble - # when running in a container. Here we assume that x86_64 - # systems are baremetal, and skip the tests for all other - # CPU architectures. We can override this using the RUN_TESTS - # environment variable (e.g., for aarch64). + # Some tests rely on kernel features that may not be available + # when running in a container. Here we assume that x86_64 systems + # are baremetal, and skip the tests for all other CPU architectures. + # The RUN_TESTS environment variable can override this, e.g., for aarch64. [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi From 76394e93a818af92a682946a0dcb97fdabb71099 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 27 Sep 2025 09:21:26 +0100 Subject: [PATCH 183/257] ci: consolidate aarch64 tests on GitHub runners Currently we run aarch64 tests on both Cirrus CI and GitHub runners. However, Cirrus CI fails with "Monthly compute limit exceeded!". This change removes the redundant tests to streamline our CI process. 
Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 22 ---------------------- .github/workflows/aarch64-test.yaml | 6 ++++-- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 99dd70d63..72dbb3898 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -88,28 +88,6 @@ task: build_script: | make -C scripts/ci vagrant-fedora-non-root -task: - name: aarch64 build GCC (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - contrib/apt-install make - make -C scripts/ci local - -task: - name: aarch64 build CLANG (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - contrib/apt-install make - make -C scripts/ci local CLANG=1 - task: name: aarch64 Fedora Rawhide arm_container: diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml index 32b19e176..ebbecadb3 100644 --- a/.github/workflows/aarch64-test.yaml +++ b/.github/workflows/aarch64-test.yaml @@ -9,14 +9,16 @@ concurrency: jobs: build: - runs-on: ubuntu-24.04-arm strategy: matrix: + os: [ubuntu-24.04-arm, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} + steps: - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} + - name: Run Tests ${{ matrix.target }} on ${{ matrix.os }} # Following tests are failing on the VMs: # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) From 7a4b35a91032d36be3469ac4c142ea2d0c399313 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 1 Oct 2025 11:20:13 +0100 Subject: [PATCH 184/257] contributing: update links to mailing list Our previous mailing list had some technical issues and we created a new one that is hopefully more reliable. 
Signed-off-by: Radostin Stoyanov --- CONTRIBUTING.md | 12 ++++++------ crit/pyproject.toml | 2 +- crit/setup.cfg | 2 +- lib/pyproject.toml | 2 +- lib/setup.cfg | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d1dc8227..03875639d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,8 +8,8 @@ Here are some useful hints to get involved. * We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; * CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; -* Feedback is expected on the GitHub issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); -* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lists.openvz.org/mailman/listinfo/criu). +* Feedback is expected on the GitHub issues page and on the [mailing list](https://lore.kernel.org/criu); +* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lore.kernel.org/criu). Below we describe in more detail recommend practices for CRIU development. 
* Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); @@ -366,7 +366,7 @@ We recommend to post patches using `git send-email` ``` git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev + --confirm=always --to=criu@lists.linux.dev criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -381,11 +381,11 @@ configure it to point it to your SMTP server with something like: git config --global sendemail.smtpServer stmp.example.net ``` -If you get tired of typing `--to=criu@openvz.org` all the time, +If you get tired of typing `--to=criu@lists.linux.dev` all the time, you can configure that to be automatically handled as well: ``` -git config sendemail.to criu@openvz.org +git config sendemail.to criu@lists.linux.dev ``` If a developer is sending another version of the patch (e.g. to address @@ -398,7 +398,7 @@ version if needed though). ### Mail patches -The patches should be sent to CRIU development mailing list, `criu AT openvz.org`. Note that you need to be subscribed first in order to post. The list web interface is available at https://openvz.org/mailman/listinfo/criu; you can also use standard mailman aliases to work with it. +The patches should be sent to CRIU development mailing list, `criu AT lists.linux.dev`. Note that you need to be subscribed first in order to post. The list web interface is available at https://lore.kernel.org/criu; you can also use standard mailman aliases to work with it. Please make sure the email client you're using doesn't screw your patch (line wrapping and so on). 
diff --git a/crit/pyproject.toml b/crit/pyproject.toml index 9089f0a39..f0b185eb7 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "crit" description = "CRiu Image Tool" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] license = {text = "GPLv2"} dynamic = ["version"] diff --git a/crit/setup.cfg b/crit/setup.cfg index fbc9a5143..37895923f 100644 --- a/crit/setup.cfg +++ b/crit/setup.cfg @@ -7,7 +7,7 @@ name = crit description = CRiu Image Tool author = CRIU team -author_email = criu@openvz.org +author_email = criu@lists.linux.dev license = GPLv2 version = attr: crit.__version__ diff --git a/lib/pyproject.toml b/lib/pyproject.toml index 8eb4b7084..c9e11551b 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "pycriu" description = "Python bindings for CRIU" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] license = {text = "GPLv2"} dynamic = ["version"] diff --git a/lib/setup.cfg b/lib/setup.cfg index 23ee48dd5..5d75719ca 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -7,7 +7,7 @@ name = pycriu description = Python bindings for CRIU author = CRIU team -author_email = criu@openvz.org +author_email = criu@lists.linux.dev license = GPLv2 version = attr: pycriu.__version__ From 3379c122e53524a47a31867fa96d5809253c7c4a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 2 Oct 2025 08:39:30 +0100 Subject: [PATCH 185/257] page-xfer: fix incompatible pointer type on armv7 page_pipe_read() expects an 'unsigned long *', but pi->nr_pages is u64. On 32-bit platforms (e.g., armv7), passing &pi->nr_pages directly causes a compiler error. To fix this we introduce a temporary variable and copy the result back to pi->nr_pages. 
Fixes: #2756 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/page-xfer.c | 9 +++++++-- criu/pagemap.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index e2913b924..463d4c506 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -1139,13 +1139,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) { struct pstree_item *item; struct page_pipe *pp; - unsigned long len; + unsigned long len, nr_pages; int ret; item = pstree_item_by_virt(pi->dst_id); pp = dmpi(item)->mem_pp; - ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &pi->nr_pages, PPB_LAZY); + /* page_pipe_read() uses 'unsigned long *' but pi->nr_pages is u64. + * Use a temporary variable to fix the incompatible pointer type + * on 32-bit platforms (e.g. armv7). */ + nr_pages = pi->nr_pages; + ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY); if (ret) return ret; @@ -1154,6 +1158,7 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) * .dst_id all remain intact. 
*/ + pi->nr_pages = nr_pages; if (pi->nr_pages == 0) { pr_debug("no iovs found, zero pages\n"); return -1; diff --git a/criu/pagemap.c b/criu/pagemap.c index b6ec3e333..6c9c4f7fe 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lx vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%" PRIx64 " vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } From 77553f07d3057dec544ed243cb6a20d933bdd7b5 Mon Sep 17 00:00:00 2001 From: Pepper Gray Date: Tue, 30 Sep 2025 22:58:29 +0200 Subject: [PATCH 186/257] make: prevent redefinition of 'struct sigcontext' Compilation on gentoo/arm64 (llvm+musl) fails with: In file included from compel/include/uapi/compel/asm/sigframe.h:4, from compel/plugins/std/infect.c:14: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ In file included from criu/arch/aarch64/include/asm/restorer.h:4, from criu/arch/aarch64/crtools.c:11: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ This is happening because <asm/sigcontext.h> and <signal.h> are mutually incompatible on Linux. To fix, use <signal.h> instead of <asm/sigcontext.h> for arm64 (like all others arches do).
Fixes: #2766 Signed-off-by: Pepper Gray --- compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h | 3 ++- criu/arch/aarch64/include/asm/restorer.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index 9152024fd..a3528500d 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -1,10 +1,11 @@ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ -#include +#include #include #include +#include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index 64a9c24eb..2174df4fa 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ -#include +#include #include #include "asm/types.h" From 790b3cf425400cdea794466f3f11c55ca42e8552 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 2 Oct 2025 12:03:57 -0700 Subject: [PATCH 187/257] ci: run alpine tests on arm64 These tests reveal the following build error: In file included from compel/include/uapi/compel/asm/sigframe.h:4, from compel/plugins/std/infect.c:14: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ In file included from criu/arch/aarch64/include/asm/restorer.h:4, from criu/arch/aarch64/crtools.c:11: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ Inspired by #2766 / #2767. 
Signed-off-by: Kir Kolyshkin Signed-off-by: Radostin Stoyanov --- .github/workflows/alpine-test.yml | 3 ++- contrib/dependencies/apk-packages.sh | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 73530d79a..0f5c20f48 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -9,10 +9,11 @@ concurrency: jobs: build: - runs-on: ubuntu-22.04 strategy: matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh index 0084dea3a..d02704b15 100755 --- a/contrib/dependencies/apk-packages.sh +++ b/contrib/dependencies/apk-packages.sh @@ -22,6 +22,7 @@ apk add --no-cache \ libnl3-dev \ nftables \ nftables-dev \ + perl \ pkgconfig \ procps \ protobuf-c-compiler \ From 520266d8959b48bac345985874f6008f70755af2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 3 Oct 2025 17:02:25 +0100 Subject: [PATCH 188/257] zdtm: add sk-unix-restore-fs-share test Add a ZDTM test case where CRIU uses a helper process to restore a non-empty process group with a terminated leader and a Unix domain socket. 
This reproduces a corner case in which mount namespace switching can fail during restore: https://github.com/checkpoint-restore/criu/issues/2687 Signed-off-by: Qiao Ma Signed-off-by: Radostin Stoyanov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/sk-unix-restore-fs-share.c | 196 ++++++++++++++++++ .../zdtm/static/sk-unix-restore-fs-share.desc | 1 + 3 files changed, 198 insertions(+) create mode 100644 test/zdtm/static/sk-unix-restore-fs-share.c create mode 100644 test/zdtm/static/sk-unix-restore-fs-share.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index e73f964be..6b262c443 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -382,6 +382,7 @@ TST_FILE = \ sk-unix-listen02 \ sk-unix-listen03 \ sk-unix-listen04 \ + sk-unix-restore-fs-share \ mnt_ext_file_bind_auto \ TST_DIR = \ diff --git a/test/zdtm/static/sk-unix-restore-fs-share.c b/test/zdtm/static/sk-unix-restore-fs-share.c new file mode 100644 index 000000000..d4f6dde75 --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test non-empty process group with terminated parent and unix socket"; +const char *test_author = "Qiao Ma "; + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +static int create_and_connect(void) +{ + struct sockaddr_un addr; + int client_fd; + + client_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (client_fd == -1) { + pr_perror("socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", filename) >= (int)sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + close(client_fd); + return -1; + } + + if (connect(client_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("connect"); + close(client_fd); + return -1; + } + + return 0; +} + +static int 
child(int ready_fd) +{ + int listen_fd; + struct sockaddr_un addr; + int ret = EXIT_FAILURE; + + listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_fd == -1) { + pr_perror("socket"); + return EXIT_FAILURE; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (strlen(filename) >= sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + goto cleanup; + } + strncpy(addr.sun_path, filename, sizeof(addr.sun_path)); + + unlink(filename); /* Ignore error if file doesn't exist */ + + if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("bind"); + goto cleanup; + } + + if (listen(listen_fd, 5) == -1) { + pr_perror("listen"); + goto cleanup; + } + + if (create_and_connect() != 0) { + pr_err("Failed to create and connect\n"); + goto cleanup; + } + + /* Signal parent that socket is ready */ + if (write(ready_fd, "1", 1) != 1) { + pr_perror("write ready_fd"); + goto cleanup; + } + + /* Wait indefinitely */ + pause(); + + ret = EXIT_SUCCESS; +cleanup: + if (listen_fd != -1) + close(listen_fd); + unlink(filename); + + return ret; +} + +static int zombie_leader(int *cpid) +{ + char buf; + pid_t pid; + int pipefd[2]; + + if (pipe(pipefd) == -1) { + pr_perror("pipe"); + return EXIT_FAILURE; + } + + if (setpgid(0, 0) == -1) { + pr_perror("setpgid"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork child"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /* Close read end */ + close(pipefd[0]); + exit(child(pipefd[1])); + } + + /* Close write end in parent */ + close(pipefd[1]); + + /* Wait for child to set up socket */ + if (read(pipefd[0], &buf, 1) != 1) { + pr_err("Failed to receive readiness signal from child\n"); + close(pipefd[0]); + return EXIT_FAILURE; + } + close(pipefd[0]); + + *cpid = pid; + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + int ret = EXIT_FAILURE, status; + pid_t pid; + int *cpid; + + test_init(argc, argv); + + cpid = mmap(NULL, sizeof(int), 
PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (cpid == MAP_FAILED) { + pr_perror("mmap"); + return EXIT_FAILURE; + } + *cpid = 0; + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork zombie"); + goto out; + } + + if (pid == 0) + exit(zombie_leader(cpid)); + + if (waitpid(pid, &status, 0) < 0) { + pr_perror("Failed to waitpid zombie"); + goto out; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS) { + pr_err("Unexpected exit code: %d\n", WEXITSTATUS(status)); + goto out; + } + + if (!*cpid) { + pr_err("Don't know grandchild's pid\n"); + goto out; + } + + test_daemon(); + test_waitsig(); + + ret = EXIT_SUCCESS; + pass(); +out: + /* Clean up */ + if (*cpid) + kill(*cpid, SIGKILL); + + munmap(cpid, sizeof(int)); + + return ret; +} diff --git a/test/zdtm/static/sk-unix-restore-fs-share.desc b/test/zdtm/static/sk-unix-restore-fs-share.desc new file mode 100644 index 000000000..6c4afe5f0 --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} From 7bf402f6b3f117e9e464c39fcebf23b2a1af3644 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 11:00:07 +0900 Subject: [PATCH 189/257] vma: introduce VMA_AREA_UPROBES flag This flag will be used for a "[uprobes]" vma. Signed-off-by: Shashank Balaji --- criu/include/image.h | 7 +++++++ criu/util.c | 1 + 2 files changed, 8 insertions(+) diff --git a/criu/include/image.h b/criu/include/image.h index 934f7d4e9..b5951d3d4 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -74,6 +74,12 @@ * about virtual address space ranges covered by * MADV_GUARD_INSTALL guards. These ones must be always at * the end of the vma_area_list and properly skipped a.e. + * - uprobes + * stands for a "[uprobes]" vma that's automatically mapped by + * the kernel when an active uprobe is hit. Contents of this vma + * are not dumped and neither are its madvise bits restored, + * because the kernel is in complete control of this vma. 
This is + * just used to track the existence of the uprobes vma. */ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -94,6 +100,7 @@ #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_SHSTK (1 << 15) #define VMA_AREA_GUARD (1 << 16) +#define VMA_AREA_UPROBES (1 << 17) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/util.c b/criu/util.c index 58c18e20b..e2f80e4c6 100644 --- a/criu/util.c +++ b/criu/util.c @@ -195,6 +195,7 @@ static void vma_opt_str(const struct vma_area *v, char *opt) opt2s(VMA_ANON_PRIVATE, "ap"); opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); + opt2s(VMA_AREA_UPROBES, "uprobes"); #undef opt2s } From 0ff2e0a66e49c0ad0f8b8997ea773a0fc94b1223 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 10:53:18 +0900 Subject: [PATCH 190/257] criu-coredump: add VMA_AREA_UPROBES flag Signed-off-by: Shashank Balaji --- coredump/criu_coredump/coredump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index c6a758c8a..9454d8f0b 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -55,6 +55,7 @@ status = { "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, "VMA_AREA_MEMFD": 1 << 14, + "VMA_AREA_UPROBES": 1 << 17, "VMA_AREA_UNSUPP": 1 << 31 } From 74bf40feeb683a668a9f1b192da627bb2d16fa67 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 10:54:28 +0900 Subject: [PATCH 191/257] crit: add VMA_AREA_UPROBES flag Signed-off-by: Shashank Balaji --- lib/pycriu/images/pb2dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 6c4f68889..a35dd3c3f 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -105,6 +105,7 @@ mmap_status_map = [ ('VMA_AREA_AIORING', 1 << 13), ('VMA_AREA_MEMFD', 1 << 14), ('VMA_AREA_SHSTK', 1 << 15), + ('VMA_AREA_UPROBES', 1 << 17), 
('VMA_UNSUPP', 1 << 31), ] From bab72af9a5d5d9f715c351cdc5de51eabc3f7727 Mon Sep 17 00:00:00 2001 From: "Mahadasyam, Shashank (SGC)" Date: Mon, 18 Aug 2025 01:03:39 +0900 Subject: [PATCH 192/257] vma: introduce --allow-uprobes option This commit teaches criu to deal with processes which have a "[uprobes]" vma. This vma is mapped by the kernel when execution hits a uprobe location. This is done so as to execute the uprobe'd instruction out-of-line in the special vma. The uprobe'd location is replaced by a software breakpoint instruction, which is int3 on x86. When execution reaches that location, control is transferred over to the kernel, which then executes whatever handler code it has to, for the uprobe, and then executes the replaced instruction out-of-line in the special vma. For more details, refer to this commit: https://github.com/torvalds/linux/commit/d4b3b6384f98f8692ad0209891ccdbc7e78bbefe Reason for adding a new option ------------------------------ A new option is added instead of making the uprobes vma handling transparent to the user, so that when a dump is attempted on a process tree in which a process has the uprobes vma, criu will error, asking the user to use this option. This gives the user a chance to check what uprobes are attached to the processes being dumped, and try to ensure that those uprobes are active on restore as well. Again, the same reason for requiring this option on restore as well. Because if a process is dumped with an active uprobe, and on restore if the uprobe is not active, then if execution reaches the uprobe location, then the process will be sent a SIGTRAP, whose default behaviour will terminate and core dump the process. This is because the code pages are dumped with the software breakpoint instruction replacement at the uprobe'd locations. On restore, if execution reaches these locations and the kernel sees no associated active uprobes, then it'll send a SIGTRAP. 
So, using this option on dump and restore is an implicit guarantee on the user's behalf that they'll take care of the active uprobes and that any future SIGTRAPs because of this are not on us! :) Handling uprobes vma on dump ---------------------------- We don't need to store any information about the uprobes vma because it's completely handled by the kernel, transparent to userspace. So, when a uprobes vma is detected, we check if the --allow-uprobes option was specified or not. If so, then the allow_uprobes boolean in the inventory image is set (this is used on restore). The uprobes vma is skipped from being added to the vma list. Handling uprobes vma on restore ------------------------------- If allow_uprobes is set in the inventory image, then check if --allow-uprobes is specified or not. Restoring the vma is not required. Fixes: checkpoint-restore#1961 Signed-off-by: Shashank Balaji --- criu/config.c | 2 ++ criu/cr-dump.c | 4 ++++ criu/crtools.c | 2 ++ criu/image.c | 5 +++++ criu/include/cr_options.h | 1 + criu/include/image.h | 2 ++ criu/include/proc_parse.h | 2 ++ criu/proc_parse.c | 24 +++++++++++++++++++++++- images/inventory.proto | 1 + 9 files changed, 42 insertions(+), 1 deletion(-) diff --git a/criu/config.c b/criu/config.c index 1322a490a..d7ef3f8e8 100644 --- a/criu/config.c +++ b/criu/config.c @@ -18,6 +18,7 @@ #include "cr_options.h" #include "filesystems.h" #include "file-lock.h" +#include "image.h" #include "irmap.h" #include "mount.h" #include "mount-v2.h" @@ -703,6 +704,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), BOOL_OPT("unprivileged", &opts.unprivileged), BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), + BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes), {}, }; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 10c485cbe..60b8e793c 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2319,6 +2319,10 @@ int cr_dump_tasks(pid_t pid) goto err; 
he.has_pre_dump_mode = false; + if (found_uprobes_vma()) { + he.has_allow_uprobes = true; + he.allow_uprobes = true; + } ret = write_img_inventory(&he); if (ret) diff --git a/criu/crtools.c b/criu/crtools.c index 509e73d74..203bded81 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -427,6 +427,8 @@ usage: " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" " consult documentation for further details\n" + " --allow-uprobes allow dump/restore with uprobes vma\n" + " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" diff --git a/criu/image.c b/criu/image.c index f3747d6ff..c4f05e159 100644 --- a/criu/image.c +++ b/criu/image.c @@ -95,6 +95,11 @@ int check_img_inventory(bool restore) goto out_err; } + if (restore && he->allow_uprobes && !opts.allow_uprobes) { + pr_err("Dumped with --" OPT_ALLOW_UPROBES ". Need to set it on restore as well.\n"); + goto out_err; + } + if (restore) { if (!he->has_network_lock_method) { /* diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 4df8056b7..8c5707b41 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -196,6 +196,7 @@ struct cr_options { char *work_dir; int network_lock_method; int skip_file_rwx_check; + int allow_uprobes; /* * When we scheduler for removal some functionality we first diff --git a/criu/include/image.h b/criu/include/image.h index b5951d3d4..b06dbf706 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -114,6 +114,8 @@ #define CR_PARENT_LINK "parent" +#define OPT_ALLOW_UPROBES "allow-uprobes" + extern bool ns_per_id; extern bool img_common_magic; diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 0bd79bf55..76d3242d2 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -105,4 +105,6 @@ extern int parse_uptime(uint64_t *upt); extern int 
parse_timens_offsets(struct timespec *boff, struct timespec *moff); +extern bool found_uprobes_vma(void); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/criu/proc_parse.c b/criu/proc_parse.c index d7eb25662..0d3b5b23f 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -74,6 +74,8 @@ struct buffer { static struct buffer __buf; static char *buf = __buf.buf; +/* only ever goes from false to true, if at all */ +static bool uprobes_vma_exists = false; /* * This is how AIO ring buffers look like in proc @@ -202,8 +204,11 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + * + * The uprobes vma is also mapped by the kernel with VM_IO, among other flags */ - if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) + if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED) + && !vma_area_is(vma_area, VMA_AREA_UPROBES)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) @@ -603,6 +608,14 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; + } else if (!strcmp(file_path, "[uprobes]")) { + uprobes_vma_exists = true; + if (!opts.allow_uprobes) { + pr_err("PID %d has uprobes vma. 
Consider using --" OPT_ALLOW_UPROBES ".\n", + pid); + goto err; + } + vma_area->e->status |= VMA_AREA_UPROBES; } else { vma_area->e->status = VMA_AREA_REGULAR; } @@ -739,6 +752,10 @@ static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area */ pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " supported via device plugins\n", vma_area->e->start, vma_area->e->end); + } else if (vma_area->e->status & VMA_AREA_UPROBES) { + pr_debug("Skipping uprobes vma %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, + vma_area->e->end); + return 0; } else if (vma_area->e->status & VMA_UNSUPP) { pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, vma_area->e->end); @@ -2929,3 +2946,8 @@ int parse_uptime(uint64_t *upt) fclose(f); return 0; } + +bool found_uprobes_vma(void) +{ + return uprobes_vma_exists; +} diff --git a/images/inventory.proto b/images/inventory.proto index 1e18815bb..feed5b850 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -33,4 +33,5 @@ message inventory_entry { // This is currently used to delete the correct nftables // network locking rule. optional string dump_criu_run_id = 13; + optional bool allow_uprobes = 14; } From aeec40bf026df5218be0a8c381f33bc47de94203 Mon Sep 17 00:00:00 2001 From: "Mahadasyam, Shashank (SGC)" Date: Mon, 18 Aug 2025 01:04:10 +0900 Subject: [PATCH 193/257] docs: add documentation for --allow-uprobes Signed-off-by: Shashank Balaji --- Documentation/criu.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 606935790..40ede84e2 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -465,6 +465,30 @@ The 'mode' may be one of the following: *skip*::: Don't lock the network. If *--tcp-close* is not used, the network must be locked externally to allow CRIU to dump TCP connections. +*--allow-uprobes*:: + Allow dumping when uprobes vma is present. 
When used on dump, this option is + required on restore as well. + + A uprobes vma is automatically created by the kernel once a uprobe is + triggered. This mapping is not removed even once the uprobe is deleted. So, + even if a process once had uprobes attached to it, and they're removed by + the time the process is dumped, this option is still required because criu + has no way of knowing whether there are active uprobes or not. + + When using this option on restore, make sure the uprobes (if any) active on + the dumped processes are still active. Otherwise, when execution reaches + a uprobe'd location in any of the restored processes, that process will be + sent a SIGTRAP. + + As an example, say a uprobe is set at function foo in the executable of the + process p_bar. Whenever execution in p_bar reaches function foo, the uprobe + is triggered. If the uprobe has been triggered at least once, then the kernel + will have created the uprobes vma. To dump p_bar, this option is + necessary. After dumping, say the uprobe is deleted. Now, on restoring with + this option, once execution reaches function foo, SIGTRAP will be sent to + the restored p_bar. Unless it has a signal handler installed for SIGTRAP, + it will be terminated and core dumped. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. @@ -692,6 +716,10 @@ The 'mode' may be one of the following: *--skip-file-rwx-check*:: Skip checking file permissions (r/w/x for u/g/o) on restore. +*--allow-uprobes*:: + Required when dumped with this option. Refer to this option in the section + on dumping for more details. + *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to From f548d3af4a8fd2d71dcb0592dec7d66e54786f26 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 20 Aug 2025 22:05:03 +0900 Subject: [PATCH 194/257] crtools: remove "consult documentation" Most people know this, don't they? 
:) Suggested-by: Radostin Stoyanov Signed-off-by: Shashank Balaji --- criu/crtools.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/criu/crtools.c b/criu/crtools.c index 203bded81..e207133ac 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -426,9 +426,7 @@ usage: " --network-lock METHOD network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" - " consult documentation for further details\n" " --allow-uprobes allow dump/restore with uprobes vma\n" - " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" From dcce9bd0e2fb330cf2dc124c6ea2ee09af5133e1 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 22 Aug 2025 12:47:16 +0900 Subject: [PATCH 195/257] zdtm: add a test for --allow-uprobes option Program flow: - Parse the test's own executable to calculate the file offset of the uprobe target function symbol - Enable the uprobe at the target function - Call the target function to trigger the uprobe, and hence the uprobes vma creation - C/R - Call the target function again to check that no SIGTRAP is sent, since the uprobe is still active At least v1.7 of libtracefs is required because that's when tracefs_instance_reset was introduced. The uprobes API was introduced in v1.4, and the dynamic events API was introduced in v1.3. Ubuntu Focal doesn't have libtracefs. Jammy has v1.2.5, and Noble has v1.7. 
Signed-off-by: Shashank Balaji --- contrib/dependencies/apk-packages.sh | 3 + contrib/dependencies/apt-cross-packages.sh | 5 +- contrib/dependencies/apt-packages.sh | 3 + contrib/dependencies/dnf-packages.sh | 5 +- contrib/dependencies/pacman-packages.sh | 3 + test/zdtm/static/Makefile | 9 +- test/zdtm/static/uprobes.c | 295 +++++++++++++++++++++ test/zdtm/static/uprobes.desc | 6 + 8 files changed, 326 insertions(+), 3 deletions(-) create mode 100644 test/zdtm/static/uprobes.c create mode 100644 test/zdtm/static/uprobes.desc diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh index d02704b15..c47fb9fe0 100755 --- a/contrib/dependencies/apk-packages.sh +++ b/contrib/dependencies/apk-packages.sh @@ -6,6 +6,7 @@ apk add --no-cache \ build-base \ coreutils \ e2fsprogs \ + elfutils-dev \ git \ gnutls-dev \ go \ @@ -20,6 +21,8 @@ apk add --no-cache \ libdrm-dev \ libnet-dev \ libnl3-dev \ + libtraceevent-dev \ + libtracefs-dev \ nftables \ nftables-dev \ perl \ diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh index 588be40d0..30ce6874c 100755 --- a/contrib/dependencies/apt-cross-packages.sh +++ b/contrib/dependencies/apt-cross-packages.sh @@ -14,6 +14,8 @@ fi libc6-"${DEBIAN_ARCH}"-cross \ libc6-dev-"${DEBIAN_ARCH}"-cross \ libcap-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + libelf-dev:"${DEBIAN_ARCH}" \ libexpat1-dev:"${DEBIAN_ARCH}" \ libgnutls28-dev:"${DEBIAN_ARCH}" \ libnet-dev:"${DEBIAN_ARCH}" \ @@ -23,9 +25,10 @@ fi libprotobuf-c-dev:"${DEBIAN_ARCH}" \ libprotobuf-dev:"${DEBIAN_ARCH}" \ libssl-dev:"${DEBIAN_ARCH}" \ + libtraceevent-dev:"${DEBIAN_ARCH}" \ + libtracefs-dev:"${DEBIAN_ARCH}" \ ncurses-dev:"${DEBIAN_ARCH}" \ uuid-dev:"${DEBIAN_ARCH}" \ - libdrm-dev:"${DEBIAN_ARCH}" \ build-essential \ pkg-config \ git \ diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh index c60ba9041..1fd42d4e6 100755 --- 
a/contrib/dependencies/apt-packages.sh +++ b/contrib/dependencies/apt-packages.sh @@ -19,6 +19,7 @@ fi libbsd-dev \ libcap-dev \ libdrm-dev \ + libelf-dev \ libgnutls28-dev \ libgnutls30 \ libnet-dev \ @@ -28,6 +29,8 @@ fi libprotobuf-c-dev \ libprotobuf-dev \ libselinux-dev \ + libtraceevent-dev \ + libtracefs-dev \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index efbb659c5..00dc91a2e 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -3,6 +3,7 @@ dnf install -y \ asciidoc \ binutils \ + elfutils-libelf-devel \ gcc \ git \ glibc-devel \ @@ -18,6 +19,8 @@ dnf install -y \ libnet-devel \ libnl3-devel \ libselinux-devel \ + libtraceevent-devel \ + libtracefs-devel \ libuuid-devel \ make \ nftables \ @@ -27,9 +30,9 @@ dnf install -y \ protobuf-c-devel \ protobuf-compiler \ protobuf-devel \ - python-devel \ python3-importlib-metadata \ python3-protobuf \ python3-pyyaml \ + python-devel \ rubygem-asciidoctor \ xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh index 5fe6995fb..260797606 100755 --- a/contrib/dependencies/pacman-packages.sh +++ b/contrib/dependencies/pacman-packages.sh @@ -15,8 +15,11 @@ pacman -Syu --noconfirm \ libbsd \ libcap \ libdrm \ + libelf \ libnet \ libnl \ + libtraceevent \ + libtracefs \ nftables \ pkg-config \ protobuf \ diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 6b262c443..ea901a805 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -290,6 +290,7 @@ TST_NOFILE := \ PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') +pkg-config-atleast-version = $(shell sh -c '$(PKG_CONFIG) --atleast-version=$(2) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ @@ -298,7 +299,10 @@ endif ifneq ($(ARCH),arm) ifneq ($(COMPAT_TEST),y) - 
TST_NOFILE += maps03 + TST_NOFILE += maps03 +ifeq ($(call pkg-config-atleast-version,libtracefs,1.7),y) + TST_NOFILE += uprobes +endif endif endif @@ -727,6 +731,9 @@ sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 cgroupv2_01: LDLIBS += -pthread +uprobes: CFLAGS += $(call pkg-cflags, libtracefs libtraceevent) +uprobes: LDLIBS += $(call pkg-libs, libtracefs libelf) + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) diff --git a/test/zdtm/static/uprobes.c b/test/zdtm/static/uprobes.c new file mode 100644 index 000000000..4164375b7 --- /dev/null +++ b/test/zdtm/static/uprobes.c @@ -0,0 +1,295 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test the --allow-uprobes option"; +const char *test_author = "Shashank Balaji "; + +#define UPROBE_GROUP_NAME "zdtm" +#define UPROBE_EVENT_NAME "uprobes_test" +#define UPROBED_FUNCTION uprobe_target + +/* + * A uprobe can be set at the start of a function, but not all instructions + * will trigger the creation of a uprobes vma. 
+ * + * Examples: + * - aarch64: if the function is a single `ret`, then no vma creation + * - x64: if the function is `nop; ret`, then no vma creation + * + * So to guarantee vma creation, create a volatile dummy variable (to prevent + * compiler optimization) and use it (to prevent "unused variable" warning) + */ +void UPROBED_FUNCTION(void) { + volatile int dummy = 0; + dummy += 1; +} +/* Calling via volatile function pointer ensures noinline at callsite */ +typedef void (*func_ptr)(void); +volatile func_ptr uprobe_target_alias = UPROBED_FUNCTION; + +struct uprobe_context { + struct tracefs_instance *instance; + struct tracefs_dynevent *uprobe; +}; + +volatile bool got_sigtrap = false; + +/* + * Returns the file offset of a symbol in the executable of this program + * Returns 0 on failure +*/ +uint64_t calc_sym_offset(const char *sym_name) +{ + GElf_Shdr section_header; + Elf_Scn *section = NULL; + Elf_Data *symtab_data; + uint64_t offset = 0; + char buf[PATH_MAX]; + GElf_Sym symbol; + ssize_t n_bytes; + int n_entries; + Elf *elf; + int fd; + int i; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_err("ELF version of libelf is lower than that of the program\n"); + return 0; + } + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 0; + } + buf[n_bytes] = '\0'; + + fd = open(buf, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open self-executable"); + return 0; + } + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_err("%s\n", elf_errmsg(elf_errno())); + goto out_fd; + } + + /* Look for the symbol table section and its header */ + while ((section = elf_nextscn(elf, section)) != NULL) { + gelf_getshdr(section, &section_header); + if (section_header.sh_type == SHT_SYMTAB) + break; + } + if (!section) { + pr_err("Failed to find symbol table\n"); + goto out_elf; + } + symtab_data = elf_getdata(section, NULL); + n_entries = section_header.sh_size / 
section_header.sh_entsize; + + /* Look for a symbol with the required name */ + for (i = 0; i < n_entries; i++) { + gelf_getsym(symtab_data, i, &symbol); + /* Symbol table's sh_link is the index of the string table section header */ + if (!strcmp(sym_name, + elf_strptr(elf, section_header.sh_link, symbol.st_name))) + break; + } + if (i == n_entries) { + pr_err("Failed to find symbol \"%s\"\n", sym_name); + goto out_elf; + } + + /* Get the section the symbol belongs to (mostly .text) */ + section = elf_getscn(elf, symbol.st_shndx); + gelf_getshdr(section, &section_header); + offset = symbol.st_value - section_header.sh_addr + section_header.sh_offset; + +out_elf: + elf_end(elf); +out_fd: + close(fd); + return offset; +} + +/* + * Set and enable a uprobe on the file at the given offset + * Returns struct uprobe_context with members set to NULL on failure +*/ +struct uprobe_context enable_uprobe(const char *file, uint64_t offset) +{ + struct tracefs_instance *trace_instance; + struct tracefs_dynevent *uprobe; + struct uprobe_context context = {}; + + trace_instance = tracefs_instance_create("zdtm_uprobes_test"); + if (!trace_instance) { + pr_perror("Failed to create tracefs instance"); + return context; + } + tracefs_instance_reset(trace_instance); + + uprobe = tracefs_uprobe_alloc(UPROBE_GROUP_NAME, UPROBE_EVENT_NAME, file, offset, NULL); + if (!uprobe) { + pr_perror("Failed to allocate uprobe"); + goto instance_destroy; + } + + if (tracefs_dynevent_create(uprobe)) { + pr_perror("Failed to create uprobe"); + goto uprobe_free; + } + + if (tracefs_event_enable(trace_instance, UPROBE_GROUP_NAME, UPROBE_EVENT_NAME)) { + pr_perror("Failed to enable uprobe"); + goto uprobe_destroy; + } + + context.instance = trace_instance; + context.uprobe = uprobe; + return context; + +uprobe_destroy: + tracefs_dynevent_destroy(uprobe, false); +uprobe_free: + tracefs_dynevent_free(uprobe); +instance_destroy: + tracefs_instance_destroy(trace_instance); + tracefs_instance_free(trace_instance); 
+ return context; +} + +void destroy_uprobe(struct uprobe_context context) +{ + tracefs_dynevent_destroy(context.uprobe, true); + tracefs_dynevent_free(context.uprobe); + tracefs_instance_destroy(context.instance); + tracefs_instance_free(context.instance); +} + +/* + * Check for the existence of the "[uprobes]" vma in /proc/self/maps + * Returns -1 on failure, 0 if not found, 1 if found +*/ +int uprobes_vma_exists(void) +{ + FILE *f; + char buf[LINE_MAX]; + int ret = 0; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + pr_perror("Failed to open /proc/self/maps"); + return -1; + } + + while (fgets(buf, sizeof(buf), f)) { + if (strstr(buf, "[uprobes]")) { + ret = 1; + break; + } + } + if (ret == 0 && !feof(f)) { + pr_err("Failed to finish reading /proc/self/maps\n"); + ret = -1; + } + + fclose(f); + return ret; +} + +/* + * SIGTRAP is sent if execution reaches a previously set uprobed location, and + * the corresponding uprobe is not active. We don't want this to happen on restore +*/ +void sigtrap_handler(int signo, siginfo_t *info, void* context) +{ + if (info->si_code == SI_KERNEL) { + got_sigtrap = true; + fail("SIGTRAP on attempting to call uprobed function"); + } +} + +int main(int argc, char **argv) +{ + struct uprobe_context context; + struct sigaction sa; + char buf[PATH_MAX]; + uint64_t offset; + int n_bytes; + int ret = 1; + + test_init(argc, argv); + + offset = calc_sym_offset(__stringify(UPROBED_FUNCTION)); + if (!offset) + return 1; + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 1; + } + buf[n_bytes] = '\0'; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = sigtrap_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGTRAP, &sa, NULL)) { + pr_perror("Failed to set SIGTRAP handler"); + return 1; + } + + context = enable_uprobe(buf, offset); + if (!context.instance) + return 1; + + /* + * Execution must reach the uprobed location at least once + * for the 
kernel to create the uprobes vma + */ + uprobe_target_alias(); + + switch (uprobes_vma_exists()) { + case -1: + goto out_uprobe; + break; + case 0: + pr_err("uprobes vma does not exist\n"); + goto out_uprobe; + break; + case 1: + test_msg("Found uprobes vma\n"); + break; + } + + test_daemon(); + test_waitsig(); + + /* + * Calling the uprobed function after restore should not cause + * a SIGTRAP, since the uprobe is still active + */ + uprobe_target_alias(); + if (!got_sigtrap) { + pass(); + ret = 0; + } + +out_uprobe: + destroy_uprobe(context); + return ret; +} diff --git a/test/zdtm/static/uprobes.desc b/test/zdtm/static/uprobes.desc new file mode 100644 index 000000000..6eab1f498 --- /dev/null +++ b/test/zdtm/static/uprobes.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'cgroupns', + 'flags': 'suid nouser', + 'flavor': 'h', + 'opts': '--allow-uprobes' +} From c03c08d1bca96132a34833c0233ddd48b016f2d7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 10 Sep 2025 10:50:46 +0100 Subject: [PATCH 196/257] cr-service: refactor rpc config parsing When an additional configuration file is specified via RPC, this file is parsed twice: first at an early stage to load options such as --log-file, --work-dir, and --images-dir; and again after all RPC options and configuration files have been evaluated. This allows users to overwrite options specified via RPC by the container runtime (e.g., --tcp-established). However, processing the RPC config file twice leads to silently duplicating the values of repeatable options such as `--action-script`. To address this problem, we adjust the order of options parsing so that the RPC config file is evaluated only once. This change should not introduce any functional changes. Note that this change does not affect the logging functionality, as early log messages are temporarily buffered and only written to the log file once it has been initialized (see commit 1ff2333 "Printout early log messages"). 
Fixes #2727 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 299 +++++++++++++++++++++------------------------- 1 file changed, 138 insertions(+), 161 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index a1089ad5c..e6aac232e 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -312,156 +312,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) BUG_ON(st.st_ino == -1); service_sk_ino = st.st_ino; - /* - * Evaluate an additional configuration file if specified. - * This needs to happen twice, because it is needed early to detect - * things like work_dir, imgs_dir and logfile. The second parsing - * of the optional RPC configuration file happens at the end and - * overwrites all options set via RPC. - */ - if (req->config_file) { - char *tmp_output = opts.output; - char *tmp_work = opts.work_dir; - char *tmp_imgs = opts.imgs_dir; - - opts.output = NULL; - opts.work_dir = NULL; - opts.imgs_dir = NULL; - - rpc_cfg_file = req->config_file; - i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) { - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - goto err; - } - /* If this is non-NULL, the RPC configuration file had a value, use it.*/ - if (opts.output) - output_changed_by_rpc_conf = true; - /* If this is NULL, use the old value if it was set. */ - if (!opts.output && tmp_output) { - opts.output = tmp_output; - tmp_output = NULL; - } - - if (opts.work_dir) - work_changed_by_rpc_conf = true; - if (!opts.work_dir && tmp_work) { - opts.work_dir = tmp_work; - tmp_work = NULL; - } - - if (opts.imgs_dir) - imgs_changed_by_rpc_conf = true; - /* - * As the images directory is a required RPC setting, it is not - * necessary to use the value from other configuration files. - * Either it is set in the RPC configuration file or it is set - * via RPC. 
- */ - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - } - - /* - * open images_dir - images_dir_fd is a required RPC parameter - * - * This assumes that if opts.imgs_dir is set we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. - * The idea is that only the RPC configuration file is able to - * overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else if (req->images_dir_fd != -1) - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - else if (req->images_dir) - strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); - else { - pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); - goto err; - } - - if (req->parent_img) - SET_CHAR_OPTS(img_parent, req->parent_img); - - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - goto err; - } - - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; - } - - /* chdir to work dir */ - if (work_changed_by_rpc_conf) - /* Use the value from the RPC configuration file first. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - /* Use the value set via RPC. 
*/ - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); - else if (opts.work_dir) - /* Use the value from one of the other configuration files. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - /* Use the images directory a work directory. */ - strcpy(work_dir_path, images_dir_path); - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - goto err; - } - - /* initiate log file in work dir */ - if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. - */ - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - goto err; - } - - SET_CHAR_OPTS(output, req->log_file); - } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { - xfree(opts.output); - opts.output = NULL; - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - /* This is needed later to correctly set the log_level */ - opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { - pr_perror("Can't initiate log"); - goto err; - } - - if (req->config_file) { - pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); - } - if (req->has_unprivileged) opts.unprivileged = req->unprivileged; @@ -753,14 +603,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->empty_ns & ~(CLONE_NEWNET)) goto err; } - - if (req->n_irmap_scan_paths) { - for (i = 0; i < req->n_irmap_scan_paths; i++) { - if (irmap_scan_path_add(req->irmap_scan_paths[i])) - goto err; - } - } - if (req->has_status_fd) { pr_warn("status_fd is obsoleted; use status-ready notification instead\n"); @@ -781,13 +623,148 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_display_stats) opts.display_stats = req->display_stats; - /* Evaluate additional configuration file a second time to overwrite - * all 
RPC settings. */ + /* Evaluate additional configuration file (e.g., runc.conf) to overwrite all RPC settings. */ if (req->config_file) { + char *tmp_output = opts.output; + char *tmp_work = opts.work_dir; + + opts.output = NULL; + opts.work_dir = NULL; + + /* + * As the images directory is a required RPC setting, it is not + * necessary to use the value from other configuration files. + * Either it is set in the RPC configuration file or it is set + * via RPC. + */ + xfree(opts.imgs_dir); + opts.imgs_dir = NULL; + + pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); + rpc_cfg_file = req->config_file; i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) + if (i) { + xfree(tmp_output); + xfree(tmp_work); goto err; + } + + /* If opts.{output,work_dir} is non-NULL, the RPC configuration file had a value, use it.*/ + /* If opts.{output,work_dir} is NULL, use the old value if it was set. */ + if (opts.output) { + output_changed_by_rpc_conf = true; + } else { + opts.output = tmp_output; + tmp_output = NULL; + } + + if (opts.work_dir) { + work_changed_by_rpc_conf = true; + } else { + opts.work_dir = tmp_work; + tmp_work = NULL; + } + + if (opts.imgs_dir) + imgs_changed_by_rpc_conf = true; + + xfree(tmp_output); + xfree(tmp_work); + } + + /* + * open images_dir - images_dir_fd is a required RPC parameter + * + * This assumes that if opts.imgs_dir is set we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. 
The idea is that only the + * RPC configuration file is able to overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) { + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + } else if (req->images_dir_fd != -1) { + sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + } else if (req->images_dir) { + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + } else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + goto err; + } + + if (req->parent_img) + SET_CHAR_OPTS(img_parent, req->parent_img); + + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + goto err; + } + + /* get full path to images_dir to use in process title */ + if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { + pr_perror("Can't readlink %s", images_dir_path); + goto err; + } + + if (work_changed_by_rpc_conf) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); + else if (opts.work_dir) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else + strcpy(work_dir_path, images_dir_path); + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + goto err; + } + + if (req->n_irmap_scan_paths) { + for (i = 0; i < req->n_irmap_scan_paths; i++) { + if (irmap_scan_path_add(req->irmap_scan_paths[i])) + goto err; + } + } + + /* initiate log file in work dir */ + if (req->log_file && !output_changed_by_rpc_conf) { + /* + * If RPC sets a log file and if there nothing from the + * RPC configuration file, use the RPC value. 
+ */ + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + goto err; + } + + SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + /* This is needed later to correctly set the log_level */ + opts.log_level = req->log_level; + log_set_loglevel(req->log_level); + if (log_init(opts.output) == -1) { + pr_perror("Can't initiate log"); + goto err; } if (req->mntns_compat_mode) From 9d072222ef7a895c644ffe5be30ed4821dc9e30c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 3 Sep 2025 18:29:34 +0100 Subject: [PATCH 197/257] test/others/rpc: parse action-script via config Extend the test for overwriting config options via RPC with repeatable option (--action-script) and verify that the value will not be silently duplicated. Signed-off-by: Radostin Stoyanov --- test/others/rpc/Makefile | 1 + test/others/rpc/action-script.sh | 17 +++++++++++++++++ test/others/rpc/config_file.py | 11 +++++++++++ 3 files changed, 29 insertions(+) create mode 100755 test/others/rpc/action-script.sh diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index 384eb0539..c0e56d528 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -12,6 +12,7 @@ run: all chmod a+rwx build chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status + rm -f build/_marker_* @# Create all log files to be accessible for anybody @# so that they can be displayed by any user. 
for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ diff --git a/test/others/rpc/action-script.sh b/test/others/rpc/action-script.sh new file mode 100755 index 000000000..991e315de --- /dev/null +++ b/test/others/rpc/action-script.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +MARKER_FILE="_marker_${CRTOOLS_SCRIPT_ACTION}" + +if [ -z "$CRTOOLS_SCRIPT_ACTION" ]; then + echo "Error: CRTOOLS_SCRIPT_ACTION is not set." + exit 2 +fi + +if [ ! -f "$MARKER_FILE" ]; then + touch "$MARKER_FILE" +else + echo "Error: Running the same action hook for the second time" + exit 1 +fi + +exit 0 diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 6cffe270d..f5ec40818 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -13,6 +13,9 @@ from setup_swrk import setup_swrk log_file = 'config_file_test.log' does_not_exist = 'does-not.exist' +script_path = os.path.dirname(os.path.abspath(__file__)) +action_script_file = os.path.join(script_path, 'action-script.sh') + def setup_config_file(content): # Creating a temporary file which will be used as configuration file. @@ -156,6 +159,7 @@ def test_rpc_with_configuration_file_overwriting_rpc(): # file settings in the default configuration. 
log = does_not_exist content = 'log-file ' + log + '\n' + content += 'action-script ' + action_script_file + '\n' content += 'no-tcp-established\nno-shell-job' path = setup_config_file(content) # Only set the configuration file via RPC; @@ -180,11 +184,18 @@ args = vars(parser.parse_args()) cleanup_output(args['dir']) +print("*** Test broken config file ***") test_broken_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC without config file ***") test_rpc_without_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC with config file ***") test_rpc_with_configuration_file() cleanup_output(args['dir']) + +print("*** Test configuration file overwriting RPC ***") test_rpc_with_configuration_file_overwriting_rpc() cleanup_output(args['dir']) From bb9a7202a7a7965495456d3bd5f7aa07e9d06af3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 3 Sep 2025 21:40:02 +0100 Subject: [PATCH 198/257] test/others/rpc: show logs on error Signed-off-by: Radostin Stoyanov --- test/others/rpc/config_file.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index f5ec40818..c1a8276d8 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -92,29 +92,37 @@ def test_broken_configuration_file(): sys.exit(-1) -def search_in_log_file(log, message): - with open(os.path.join(args['dir'], log)) as f: +def search_in_log_file(log_path, message): + with open(log_path) as f: if message not in f.read(): - print( - 'FAIL: Missing the expected error message (%s) in the log file' - % message) + print('FAIL: Missing the expected error message (%s) in the log file' % message) sys.exit(-1) +def print_log_file(log_path): + print("\n--- Begin log file: %s ---" % log_path) + with open(log_path, 'r') as f: + print(f.read()) + print("--- End log file ---\n") + + def check_results(resp, log): # Check if the specified log file exists - if 
not os.path.isfile(os.path.join(args['dir'], log)): + log_path = os.path.join(args['dir'], log) + if not os.path.isfile(log_path): print('FAIL: Expected log file %s does not exist' % log) sys.exit(-1) # Dump should have failed with: 'The criu itself is within dumped tree' if resp.type != rpc.DUMP: print('FAIL: Unexpected msg type %r' % resp.type) + print_log_file(log_path) sys.exit(-1) if 'The criu itself is within dumped tree' not in resp.cr_errmsg: print('FAIL: Missing the expected error message in RPC response') + print_log_file(log_path) sys.exit(-1) # Look into the log file for the same message - search_in_log_file(log, 'The criu itself is within dumped tree') + search_in_log_file(log_path, 'The criu itself is within dumped tree') def test_rpc_without_configuration_file(): From 3365c7c02583b6e6cdab976d484b47b1fae5f19d Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Thu, 4 Sep 2025 21:35:37 +0200 Subject: [PATCH 199/257] restorer: shstk: add restorer shadow stack stubs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * shstk_restorer_stack_size() – restorer shadow stack size * shstk_set_restorer_stack() – set restorer shadow stack start Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/include/restore.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/criu/include/restore.h b/criu/include/restore.h index 04d006505..2c4e4e267 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -9,6 +9,7 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); struct task_restore_args; struct pstree_item; +struct rst_shstk_info; #ifndef arch_shstk_prepare static inline int arch_shstk_prepare(struct pstree_item *item, @@ -38,4 +39,18 @@ static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *cor #define arch_shstk_trampoline arch_shstk_trampoline #endif +#ifndef shstk_restorer_stack_size +static always_inline 
long shstk_restorer_stack_size(void) +{ + return 0; +} +#endif + +#ifndef shstk_set_restorer_stack +static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + return 0; +} +#endif + #endif From f29cb750dbf292249126402b5f4d40e03d6cefd7 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Thu, 4 Sep 2025 21:45:19 +0200 Subject: [PATCH 200/257] x86/criu: shstk restorer memory accounting functions * shstk_restorer_stack_size(): PAGE_SIZE * shstk_set_restorer_stack(): set restorer temporary shadow stack start Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index 7814c351d..2b9a303b8 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -73,6 +73,17 @@ int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, int (*func)(void *arg), void *arg); #define arch_shstk_trampoline arch_shstk_trampoline +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + info->tmp_shstk = (unsigned long)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + #ifdef CR_NOGLIBC #include From b18c07d8a856bce56387e30c851858ee4745b5fa Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 17 Oct 2025 18:53:01 +0200 Subject: [PATCH 201/257] restorer: shstk: add shstk_min_mmap_addr() * default: return whatever is passed in, e.g. to be used as shstk_min_mmap_addr(kdat.mmap_min_addr) * x86: ignore def and return 4G On x86, CET shadow stack is required to be mapped above 4GiB. On the other hand, forcing 4GiB globally would break 32-bit restores.
Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 6 ++++++ criu/cr-restore.c | 9 +++++---- criu/include/restore.h | 7 +++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index 2b9a303b8..f62b8c3e9 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -84,6 +84,12 @@ static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, } #define shstk_set_restorer_stack shstk_set_restorer_stack +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long __maybe_unused def) +{ + return !(info->cet & ARCH_SHSTK_SHSTK) ? def : (4UL << 30); +} +#define shstk_min_mmap_addr shstk_min_mmap_addr + #ifdef CR_NOGLIBC #include diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1c3b36451..9781dbfa0 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2431,16 +2431,15 @@ err: return ret; } -static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long vma_len) +static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long min_addr, long vma_len) { struct vma_area *t_vma, *s_vma; - long prev_vma_end = 0; + long prev_vma_end = min_addr; struct vma_area end_vma; VmaEntry end_e; end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; - prev_vma_end = kdat.mmap_min_addr; s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); @@ -3226,7 +3225,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * or inited from scratch). 
*/ - mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, task_args->bootstrap_len); + mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, + shstk_min_mmap_addr(&task_args->shstk, kdat.mmap_min_addr), + task_args->bootstrap_len); if (mem == (void *)-1) { pr_err("No suitable area for task_restore bootstrap (%ldK)\n", task_args->bootstrap_len); goto err; diff --git a/criu/include/restore.h b/criu/include/restore.h index 2c4e4e267..189051826 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -53,4 +53,11 @@ static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, } #endif +#ifndef shstk_min_mmap_addr +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long def) +{ + return def; +} +#endif + #endif From 02462c19c443e18ee6cdd54d849086eb22815b7d Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 17:25:06 +0200 Subject: [PATCH 202/257] restorer: shstk: allocate restorer shadow stack * reserve space for restorer shadow stack * set tmp_shstk at mem, advance mem by PAGE_SIZE * forget the extra PAGE_SIZE (shstk) for premapped VMAs Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin [ alex: small code cleanups ] Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/shstk.c | 1 - criu/cr-restore.c | 6 +++++- criu/mem.c | 9 --------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c index b752f114a..0810efac5 100644 --- a/criu/arch/x86/shstk.c +++ b/criu/arch/x86/shstk.c @@ -45,7 +45,6 @@ static int shstk_prepare_task(struct vm_area_list *vmas, shstk->vma_start = vma->e->start; shstk->vma_size = size; shstk->premmaped_addr = premmaped_addr; - shstk->tmp_shstk = premmaped_addr + size; break; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9781dbfa0..057ec0e93 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3195,7 +3195,7 @@ static int sigreturn_restore(pid_t pid, struct 
task_restore_args *task_args, uns rst_mem_size = rst_mem_lock(); memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); - task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; + task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size + shstk_restorer_stack_size(); BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); pr_info("%d threads require %ldK of memory\n", current->nr_threads, KBYTES(task_args->bootstrap_len)); @@ -3466,6 +3466,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * self-vmas are unmaped. */ mem += rst_mem_size; + + shstk_set_restorer_stack(&task_args->shstk, mem); + mem += shstk_restorer_stack_size(); + task_args->vdso_rt_parked_at = (unsigned long)mem; task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; diff --git a/criu/mem.c b/criu/mem.c index f8c550842..9e8740c07 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -787,8 +787,6 @@ int prepare_mm_pid(struct pstree_item *i) ri->vmas.rst_priv_size += vma_area_len(vma); if (vma_has_guard_gap_hidden(vma)) ri->vmas.rst_priv_size += PAGE_SIZE; - if (vma_area_is(vma, VMA_AREA_SHSTK)) - ri->vmas.rst_priv_size += PAGE_SIZE; } pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); @@ -931,13 +929,6 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void size = vma_entry_len(vma->e); - /* - * map an extra page for shadow stack VMAs, it will be used as a - * temporary shadow stack - */ - if (vma_area_is(vma, VMA_AREA_SHSTK)) - size += PAGE_SIZE; - if (!vma_inherited(vma)) { int flag = 0; /* From abf4a71d9945cb841fe8d5406cd32c3b46e9e2a0 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 18:02:37 +0200 Subject: [PATCH 203/257] x86/criu: shstk: add shstk_vma_restore() 1. create shadow stack vma during vma_remap cycle 2. copy contents from a premapped non-shstk VMA into it 3. 
unmap premapped non-shstk VMA 4. Mark shstk VMA for remap into the final destination Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Co-Authored-By: Alexander Mikhalitsyn [ alex: debugging, rework together with Andrei and code cleanup ] Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 37 +++++++++++++++++++++++++++++++ criu/include/restorer.h | 7 ++++++ 2 files changed, 44 insertions(+) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index f62b8c3e9..da4fb80cd 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -163,6 +163,43 @@ static inline int shstk_finalize(void) return ret; } +/* + * Create shadow stack vma and restore its content from premmapped anonymous (non-shstk) vma + */ +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + long shstk, i; + unsigned long *shstk_data = (void *)vma_premmaped_start(vma_entry); + unsigned long vma_size = vma_entry_len(vma_entry); + long ret; + + shstk = sys_map_shadow_stack(0, vma_size, SHADOW_STACK_SET_TOKEN); + if (shstk < 0) { + pr_err("Failed to map shadow stack: %ld\n", shstk); + return -1; + } + + /* restore shadow stack contents */ + for (i = 0; i < vma_size / 8; i++) + wrssq(shstk + i * 8, shstk_data[i]); + + ret = sys_munmap(shstk_data, vma_size); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + /* + * From that point premapped vma is (shstk) and we need + * to mremap() it to the final location. Originally premapped + * (shstk_data) has been unmapped already. 
+ */ + vma_premmaped_start(vma_entry) = shstk; + + return 0; +} +#define shstk_vma_restore shstk_vma_restore + /* * Restore contents of the shadow stack and set shadow stack pointer */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 56bea0fcc..14c0a3768 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -357,4 +357,11 @@ static inline int arch_shstk_restore(struct rst_shstk_info *shstk) #define arch_shstk_restore arch_shstk_restore #endif +#ifndef shstk_vma_restore +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + return -1; +} +#endif + #endif /* __CR_RESTORER_H__ */ From 6fd71b9ee9775f7b275051d0cd028397235f86e8 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 18:13:37 +0200 Subject: [PATCH 204/257] x86/criu: shstk: restore SHSTK via premap loops * call shstk_vma_restore() for VMA_AREA_SHSTK in vma_remap() * delete map/copy/unmap from shstk_restore() and keep token setup + finalize * previously the content-restore loop naturally left ssp at cet->ssp - 8; with the loop removed, ssp must now be set to cet->ssp - 8 explicitly Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin [ alex: small code cleanups ] Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 26 ++------------------------ criu/pie/restorer.c | 31 +++++++++++++++++-------------- 2 files changed, 19 insertions(+), 38 deletions(-) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index da4fb80cd..d113fd8ab 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -205,28 +205,11 @@ static always_inline int shstk_vma_restore(VmaEntry *vma_entry) */ static always_inline int shstk_restore(struct rst_shstk_info *cet) { - unsigned long *shstk_data = (unsigned long *)cet->premmaped_addr; - unsigned long ssp = cet->vma_start + cet->vma_size - 8; - unsigned long shstk_top = cet->vma_size / 8 - 1; - unsigned long val; - long ret; + unsigned long ssp, val; if (!(cet->cet & ARCH_SHSTK_SHSTK))
return 0; - if (shstk_map(cet->vma_start, cet->vma_size)) - return -1; - - /* - * Switch shadow stack from temporary location to the actual task's - * shadow stack VMA - */ - shstk_switch_ssp(ssp); - - /* restore shadow stack contents */ - for (; ssp >= cet->ssp; ssp -= 8, shstk_top--) - wrssq(ssp, shstk_data[shstk_top]); - /* * Add tokens for sigreturn frame and for switch of the shadow stack. * The sigreturn token will be checked by the kernel during @@ -236,6 +219,7 @@ static always_inline int shstk_restore(struct rst_shstk_info *cet) */ /* token for sigreturn frame */ + ssp = cet->ssp - 8; val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; wrssq(ssp, val); @@ -247,12 +231,6 @@ static always_inline int shstk_restore(struct rst_shstk_info *cet) /* reset shadow stack pointer to the proper location */ shstk_switch_ssp(ssp); - ret = sys_munmap(shstk_data, cet->vma_size + PAGE_SIZE); - if (ret < 0) { - pr_err("Failed to unmap premmaped shadow stack\n"); - return ret; - } - return shstk_finalize(); } #define arch_shstk_restore shstk_restore diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 394d3dea0..5c40b0e93 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1112,6 +1112,23 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) pr_info("Remap %lx->%lx len %lx\n", src, dst, len); + /* + * SHSTK VMAs are a bit special, in fact we create shstk vma right in the + * shstk_vma_restore() and populate it with contents from a premapped VMA + * (which in turns is just a normal anonymous VMA!). Then, we munmap() this + * premapped VMA. After, we need to adjust vma_premmaped_start(vma_entry) + * to point to a created shstk vma and treat it as a premmaped one in vma_remap(). 
+ */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) { + if (shstk_vma_restore(vma_entry)) { + pr_err("Unable to prepare shadow stack vma for remap %lx -> %lx\n", src, dst); + return -1; + } + + /* shstk_vma_restore() modifies vma premapped address */ + src = vma_premmaped_start(vma_entry); + } + if (src - dst < len) guard = dst; else if (dst - src < len) @@ -1811,13 +1828,6 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start > vma_entry->shmid) break; - /* - * shadow stack VMAs cannot be remapped, they must be - * recreated with map_shadow_stack system call - */ - if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) - continue; - if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -1835,13 +1845,6 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start < vma_entry->shmid) break; - /* - * shadow stack VMAs cannot be remapped, they must be - * recreated with map_shadow_stack system call - */ - if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) - continue; - if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } From 697c31abe442c3fe5e783994312ccfdbe5b4d265 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 19:40:35 +0200 Subject: [PATCH 205/257] zdtm: shstk: add SHSTK_ENABLE test build option * add SHSTK_ENABLE=1 toggle * passes -mshstk to compiler and -z shstk to linker Example: $ make -C test/zdtm/static clean $ make -C test/zdtm/static V=1 SHSTK_ENABLE=1 env00 $ readelf --notes test/zdtm/static/env00 | grep SHSTK Properties: x86 feature: SHSTK Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/Makefile.inc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 3b349ed4d..465285f08 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -66,6 +66,11 @@ endif export PKG_CONFIG_PATH endif +ifeq ($(SHSTK_ENABLE),1) + CFLAGS += 
-mshstk + LDFLAGS += -Wl,-z,shstk +endif + define pkg-libs $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --libs $(1)) endef From a5ae3c184be47ca76b3c09f47406bcc234480966 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Sat, 18 Oct 2025 06:39:17 +0100 Subject: [PATCH 206/257] pycriu: set licence to LGPLv2.1 We use LGPL-v2.1 license for the libcriu and pycriu as they are intended to be usable by both proprietary and open-source applications. Signed-off-by: Andrii Herheliuk Signed-off-by: Radostin Stoyanov --- lib/pyproject.toml | 2 +- lib/setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pyproject.toml b/lib/pyproject.toml index c9e11551b..63d9b5f47 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -8,7 +8,7 @@ description = "Python bindings for CRIU" authors = [ {name = "CRIU team", email = "criu@lists.linux.dev"}, ] -license = {text = "GPLv2"} +license = {text = "LGPLv2.1"} dynamic = ["version"] requires-python = ">=3.6" diff --git a/lib/setup.cfg b/lib/setup.cfg index 5d75719ca..902fed9ee 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -8,7 +8,7 @@ name = pycriu description = Python bindings for CRIU author = CRIU team author_email = criu@lists.linux.dev -license = GPLv2 +license = LGPLv2.1 version = attr: pycriu.__version__ [options] From 540c631dd006b071cc5f46968ec3a01757d7e66b Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Fri, 17 Oct 2025 06:05:14 +0100 Subject: [PATCH 207/257] pycriu: add missing protobuf dependency pycriu depends on protobuf to function correctly. Currently, it raises an error if protobuf is not installed. Adding protobuf to the dependencies ensures it is available after installing pycriu. 
Signed-off-by: Andrii Herheliuk --- lib/pyproject.toml | 1 + lib/setup.cfg | 2 ++ 2 files changed, 3 insertions(+) diff --git a/lib/pyproject.toml b/lib/pyproject.toml index 63d9b5f47..ea9f88dcc 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -11,6 +11,7 @@ authors = [ license = {text = "LGPLv2.1"} dynamic = ["version"] requires-python = ">=3.6" +dependencies = ["protobuf"] [tool.setuptools] packages = ["pycriu", "pycriu.images"] diff --git a/lib/setup.cfg b/lib/setup.cfg index 902fed9ee..28c9e49c3 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -14,3 +14,5 @@ version = attr: pycriu.__version__ [options] packages = find: python_requires = >=3.6 +install_requires = + protobuf From d5c81f810816ae69d83d71ecd09c562f5bd50167 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Sat, 18 Oct 2025 04:00:08 +0100 Subject: [PATCH 208/257] pycriu: prevent always appending "Unknown" to error messages Regardless of the actual error message, "Unknown" was always appended to the end of the string, resulting in messages like: "DUMP failed: Error(3): No process with such pidUnknown". Fixed by changing standalone if statements to else-if blocks so "Unknown" is only added when no specific error condition matches. 
Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index f3e018095..5bd7ffecd 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -181,15 +181,14 @@ class CRIUExceptionExternal(CRIUException): if self.errno == errno.EBADRQC: s += "Bad options" - if self.typ == rpc.DUMP: - if self.errno == errno.ESRCH: - s += "No process with such pid" + elif self.typ == rpc.DUMP and self.errno == errno.ESRCH: + s += "No process with such pid" - if self.typ == rpc.RESTORE: - if self.errno == errno.EEXIST: - s += "Process with requested pid already exists" + elif self.typ == rpc.RESTORE and self.errno == errno.EEXIST: + s += "Process with requested pid already exists" - s += "Unknown" + else: + s += "Unknown" return s From f824dc735bb905d2a3c7adc70a7abd68cdea8a99 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 12:06:56 +0100 Subject: [PATCH 209/257] ci: consolidate action-script tests This patch consolidates the action-script tests into `test/others/action-script` to ensure all tests are executed consistently and reduce duplication. Since we had two tests that appear to do the same thing, we can remove the one that doesn't use zdtm.py. 
Signed-off-by: Radostin Stoyanov --- Makefile | 1 + test/jenkins/actions.sh | 8 --- test/others/action-script/.gitignore | 2 +- test/others/action-script/Makefile | 2 - test/others/action-script/action-script.sh | 2 - .../action-script}/check_actions.py | 0 test/others/action-script/run.sh | 59 ++----------------- .../{ => others/action-script}/show_action.sh | 3 +- 8 files changed, 9 insertions(+), 68 deletions(-) delete mode 100755 test/jenkins/actions.sh delete mode 100755 test/others/action-script/action-script.sh rename test/{ => others/action-script}/check_actions.py (100%) rename test/{ => others/action-script}/show_action.sh (66%) diff --git a/Makefile b/Makefile index 611bcdd5a..e6653bd6c 100644 --- a/Makefile +++ b/Makefile @@ -451,6 +451,7 @@ ruff: test/zdtm.py \ test/inhfd/*.py \ test/others/rpc/config_file.py \ + test/others/action-script/check_actions.py \ lib/pycriu/images/pb2dict.py \ lib/pycriu/images/images.py \ scripts/criu-ns \ diff --git a/test/jenkins/actions.sh b/test/jenkins/actions.sh deleted file mode 100755 index 801904500..000000000 --- a/test/jenkins/actions.sh +++ /dev/null @@ -1,8 +0,0 @@ -# Check how crit de/encodes images -set -e -source `dirname $0`/criu-lib.sh -# prep -rm -f actions_called.txt -./test/zdtm.py run -t zdtm/static/env00 --script "$(pwd)/test/show_action.sh" || fail -./test/check_actions.py || fail -exit 0 diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore index c0b6a2490..ca9a0b541 100644 --- a/test/others/action-script/.gitignore +++ b/test/others/action-script/.gitignore @@ -1 +1 @@ -img-dir-* +actions_called.txt diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile index f1ce191db..594edc070 100644 --- a/test/others/action-script/Makefile +++ b/test/others/action-script/Makefile @@ -1,5 +1,3 @@ run: - @make -C .. 
loop ./run.sh - .PHONY: run diff --git a/test/others/action-script/action-script.sh b/test/others/action-script/action-script.sh deleted file mode 100755 index aba8292c0..000000000 --- a/test/others/action-script/action-script.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -touch action-hook-"$CRTOOLS_SCRIPT_ACTION" diff --git a/test/check_actions.py b/test/others/action-script/check_actions.py similarity index 100% rename from test/check_actions.py rename to test/others/action-script/check_actions.py diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh index a82fccf35..f18301502 100755 --- a/test/others/action-script/run.sh +++ b/test/others/action-script/run.sh @@ -1,60 +1,11 @@ #!/bin/bash -set -ebm +set -e -# shellcheck source=test/others/env.sh -source ../env.sh || exit 1 +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -SELFDIR="$(dirname "$(readlink -f "$0")")" -SCRIPT="$SELFDIR/action-script.sh" -IMGDIR="$SELFDIR/img-dir-$$" +rm -f "${SCRIPT_DIR}"/actions_called.txt +"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 --script "$SCRIPT_DIR/show_action.sh" || exit 1 +"${SCRIPT_DIR}"/check_actions.py || exit 1 -rm -rf "$IMGDIR" -mkdir "$IMGDIR" - -trap "cleanup" QUIT TERM INT HUP EXIT - -# shellcheck disable=SC2317 -# https://github.com/koalaman/shellcheck/issues/2660 -function cleanup() -{ - if [[ -n "$PID" ]]; then - kill -9 "$PID" - fi -} - -PID=$(../loop) -if ! $CRIU dump -v4 -o dump.log -t "$PID" -D "$IMGDIR" --action-script "$SCRIPT"; then - echo "Failed to checkpoint process $PID" - cat dump.log - kill -9 "$PID" - exit 1 -fi - -if ! $CRIU restore -v4 -o restore.log -D "$IMGDIR" -d --pidfile test.pidfile --action-script "$SCRIPT"; then - echo "CRIU restore failed" - echo FAIL - exit 1 -fi - -PID=$(cat "$IMGDIR"/test.pidfile) - -found_missing_file=false -hooks=("pre-dump" "post-dump" "pre-restore" "pre-resume" "post-restore" "post-resume") - -for hook in "${hooks[@]}" -do - if [ ! 
-e "$IMGDIR/action-hook-$hook" ]; then - echo "ERROR: action-hook-$hook does not exist" - found_missing_file=true - fi -done - -if [ "$found_missing_file" = true ]; then - exit 1 -fi - -echo PASS - -rm -rf "$IMGDIR" exit 0 diff --git a/test/show_action.sh b/test/others/action-script/show_action.sh similarity index 66% rename from test/show_action.sh rename to test/others/action-script/show_action.sh index 86468b67a..afbfc3f27 100755 --- a/test/show_action.sh +++ b/test/others/action-script/show_action.sh @@ -1,3 +1,4 @@ #!/bin/bash + echo "${CRTOOLS_SCRIPT_ACTION} ${CRTOOLS_IMAGE_DIR} ${CRTOOLS_INIT_PID}" \ - >> "$(dirname $0)/actions_called.txt" + >> "$(dirname "$0")/actions_called.txt" From f74e68daf90aa2401024b1f106d84677e6354e47 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 12:11:45 +0100 Subject: [PATCH 210/257] ci: verify call order of action-script hooks The existing test collects all action-script hooks triggered during `h`, `ns`, and `uns` runs with ZDTM into `actions_called.txt`, then verifies that each hook appears at least once. However, the test does not verify that hooks are invoked *exactly once* or in *correct order*. This change updates the test to run ZDTM only with ns flavour as this seems to cover all action-script hooks, and checks that all hooks are called correctly. 
Signed-off-by: Radostin Stoyanov --- test/others/action-script/check_actions.py | 65 +++++++++++++--------- test/others/action-script/run.sh | 2 +- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/test/others/action-script/check_actions.py b/test/others/action-script/check_actions.py index 84d738dbb..0140d8762 100755 --- a/test/others/action-script/check_actions.py +++ b/test/others/action-script/check_actions.py @@ -1,41 +1,54 @@ #!/usr/bin/env python3 -import sys import os +import sys + +EXPECTED_ACTIONS = [ + 'pre-dump', + 'network-lock', + 'post-dump', + 'pre-restore', + 'setup-namespaces', + 'post-setup-namespaces', + 'post-restore', + 'network-unlock', + 'pre-resume', + 'post-resume', +] -actions = set(['pre-dump', 'pre-restore', 'post-dump', 'setup-namespaces', \ - 'post-setup-namespaces', 'post-restore', 'post-resume', \ - 'network-lock', 'network-unlock' ]) errors = [] -af = os.path.dirname(os.path.abspath(__file__)) + '/actions_called.txt' +actions_called = [] +actions_called_file = os.path.join(os.path.dirname(__file__), 'actions_called.txt') -for act in open(af): - act = act.strip().split() - act.append('EMPTY') - act.append('EMPTY') +with open(actions_called_file) as f: + for index, line in enumerate(f): + parts = line.strip().split() + parts += ['EMPTY'] * (3 - len(parts)) + action_hook, image_dir, pid = parts - if act[0] == 'EMPTY': - raise Exception("Error in test, bogus actions line") + if action_hook == 'EMPTY': + raise ValueError("Error in test: bogus actions line") - if act[1] == 'EMPTY': - errors.append('Action %s misses CRTOOLS_IMAGE_DIR' % act[0]) + expected_action = EXPECTED_ACTIONS[index] if index < len(EXPECTED_ACTIONS) else None + if action_hook != expected_action: + raise ValueError(f"Invalid action: {action_hook} != {expected_action}") - if act[0] in ('post-dump', 'setup-namespaces', 'post-setup-namespaces', \ - 'post-restore', 'post-resume', 'network-lock', 'network-unlock'): - if act[2] == 'EMPTY': - 
errors.append('Action %s misses CRTOOLS_INIT_PID' % act[0]) - elif not act[2].isdigit() or int(act[2]) == 0: - errors.append('Action %s PID is not number (%s)' % - (act[0], act[2])) + if image_dir == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_IMAGE_DIR') - actions -= set([act[0]]) + if action_hook != 'pre-restore': + if pid == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_INIT_PID') + elif not pid.isdigit() or int(pid) == 0: + errors.append(f'Action {action_hook} PID is not a valid number ({pid})') -if actions: - errors.append('Not all actions called: %r' % actions) + actions_called.append(action_hook) + +if actions_called != EXPECTED_ACTIONS: + errors.append(f'Not all actions called: {actions_called!r}') if errors: - for x in errors: - print(x) + print('\n'.join(errors)) sys.exit(1) -print('PASS') +print('Check Actions PASS') diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh index f18301502..574f6fc86 100755 --- a/test/others/action-script/run.sh +++ b/test/others/action-script/run.sh @@ -5,7 +5,7 @@ set -e SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" rm -f "${SCRIPT_DIR}"/actions_called.txt -"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 --script "$SCRIPT_DIR/show_action.sh" || exit 1 +"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 -f ns --script "$SCRIPT_DIR/show_action.sh" || exit 1 "${SCRIPT_DIR}"/check_actions.py || exit 1 exit 0 From d3dfb663b1022ec89431a0e61113f55a771bc73c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 10:51:46 +0100 Subject: [PATCH 211/257] make: don't install external dependencies Don't install external pip dependencies when running `make install`. As we are not really into developing a Python project, we should not install additional packages. CRIU does that nowhere else. 
Signed-off-by: Radostin Stoyanov --- Makefile.install | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Makefile.install b/Makefile.install index 455735f3b..70c607ec6 100644 --- a/Makefile.install +++ b/Makefile.install @@ -46,9 +46,13 @@ endif endif # Default flags for pip install: -# --upgrade: Upgrade crit/pycriu packages -# --ignore-installed: Ignore existing packages and reinstall them -PIPFLAGS ?= --upgrade --ignore-installed +# --ignore-installed: Overwrite already installed pycriu/crit packages +# --no-build-isolation: Use current Python environment to build pycriu/crit packages +# --no-deps: Don't install any dependencies +# --no-index: Don't use PyPI index to find packages +# --progress-bar: Cleaner output +# --upgrade: Treat the install as an upgrade when replacing the installed version +PIPFLAGS ?= --ignore-installed --no-build-isolation --no-deps --no-index --progress-bar off --upgrade export SKIP_PIP_INSTALL PIPFLAGS From 68601814747470bc0ef28b3ce42f5b8d61f230e8 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 11:43:17 +0100 Subject: [PATCH 212/257] ci: add wheel and setuptools in dnf packages These dependencies are required for `pip install`. 
Signed-off-by: Radostin Stoyanov --- contrib/dependencies/dnf-packages.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index 00dc91a2e..793f267a5 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -30,9 +30,11 @@ dnf install -y \ protobuf-c-devel \ protobuf-compiler \ protobuf-devel \ + python-devel \ python3-importlib-metadata \ python3-protobuf \ python3-pyyaml \ - python-devel \ + python3-setuptools \ + python3-wheel \ rubygem-asciidoctor \ xmlto From afcfcd3bf68bb0e1c45c1951b0469fe9588512b4 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 12:49:00 +0100 Subject: [PATCH 213/257] ci: add which dependency in dnf packages which is used in Makefiles to check for dependencies: Example: export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) Signed-off-by: Radostin Stoyanov --- contrib/dependencies/dnf-packages.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index 793f267a5..60f21db6d 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -37,4 +37,5 @@ dnf install -y \ python3-setuptools \ python3-wheel \ rubygem-asciidoctor \ + which \ xmlto From 07ad2473f27a2afd09e0379d18cf046782752d6c Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Sun, 26 Oct 2025 17:14:03 -0700 Subject: [PATCH 214/257] Use command -v instead of which Unlike "which", which is a separate executable not always installed by default, "command -v" is a shell built-in available at least for bash, dash, and busybox shell. Unlike "which", "command -v" is also easier to grep for, and it is already used in a few places here. Inspired by commit 57251d811. 
Signed-off-by: Kir Kolyshkin --- .github/workflows/lint.yml | 2 +- Makefile | 2 +- contrib/dependencies/dnf-packages.sh | 1 - contrib/docker_cr.sh | 4 ++-- scripts/ci/prepare-for-fedora-rawhide.sh | 3 +-- scripts/nmk/scripts/tools.mk | 4 ++-- 6 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 862d68245..f7da4f6f6 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,7 +14,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make ruff xz clang-tools-extra which codespell git-clang-format ShellCheck + run: sudo dnf -y install git make ruff xz clang-tools-extra codespell git-clang-format ShellCheck - uses: actions/checkout@v4 diff --git a/Makefile b/Makefile index e6653bd6c..1824ea180 100644 --- a/Makefile +++ b/Makefile @@ -489,7 +489,7 @@ lint: ruff shellcheck codespell ! git --no-pager grep -E '\s+$$' \*.c \*.h .PHONY: lint ruff shellcheck codespell -codecov: SHELL := $(shell which bash) +codecov: SHELL := $(shell command -v bash) codecov: curl -Os https://uploader.codecov.io/latest/linux/codecov chmod +x codecov diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index 60f21db6d..793f267a5 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -37,5 +37,4 @@ dnf install -y \ python3-setuptools \ python3-wheel \ rubygem-asciidoctor \ - which \ xmlto diff --git a/contrib/docker_cr.sh b/contrib/docker_cr.sh index 9b43d8ba1..04ef676cd 100755 --- a/contrib/docker_cr.sh +++ b/contrib/docker_cr.sh @@ -418,7 +418,7 @@ resolve_path() { local p p="${2}" - if which realpath > /dev/null; then + if command -v realpath > /dev/null; then p=$(realpath "${p}") fi ${ECHO} "${1}: ${p}" @@ -427,7 +427,7 @@ resolve_path() { resolve_cmd() { local cpath - cpath=$(which "${2}") + cpath=$(command -v "${2}") resolve_path "${1}" "${cpath}" } diff --git 
a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index ff75717c5..b0b45fcc3 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -15,8 +15,7 @@ dnf install -y \ python-unversioned-command \ redhat-rpm-config \ sudo \ - tar \ - which + tar # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index 724204a03..de5782c13 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell which python3 2>/dev/null) +FULL_PYTHON := $(shell command -v python3 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ @@ -36,7 +36,7 @@ CTAGS := ctags export RM HOSTLD LD HOSTCC CC CPP AS AR STRIP OBJCOPY OBJDUMP export NM SH MAKE MKDIR AWK PERL PYTHON SH CSCOPE -export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) +export USE_ASCIIDOCTOR ?= $(shell command -v asciidoctor 2>/dev/null) # # Footer. From 2878faa74c96c0b816453d0a0c86e219e4e33fed Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 7 Oct 2025 16:31:17 +0100 Subject: [PATCH 215/257] libcriu: enable setting of RPC config file Container runtimes that use libcriu (e.g., crun) need to specify a CRIU configuration file that allows overwriting default options set via RPC. This is particularly useful to set options such as `--tcp-established` via `/etc/criu/runc.conf` in Kubernetes. 
Signed-off-by: Radostin Stoyanov --- lib/c/criu.c | 19 +++++++++++++++++++ lib/c/criu.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/lib/c/criu.c b/lib/c/criu.c index c16fe5dcd..485c8b178 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -2041,3 +2041,22 @@ void criu_set_empty_ns(int namespaces) { criu_local_set_empty_ns(global_opts, namespaces); } + +int criu_local_set_config_file(criu_opts *opts, const char *path) +{ + char *new; + + new = strdup(path); + if (!new) + return -ENOMEM; + + free(opts->rpc->config_file); + opts->rpc->config_file = new; + + return 0; +} + +int criu_set_config_file(const char *path) +{ + return criu_local_set_config_file(global_opts, path); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index c1c607869..44446f664 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -116,6 +116,7 @@ void criu_set_pidfd_store_sk(int sk); int criu_set_network_lock(enum criu_network_lock_method method); int criu_join_ns_add(const char *ns, const char *ns_file, const char *extra_opt); void criu_set_mntns_compat_mode(bool val); +int criu_set_config_file(const char *path); /* * The criu_notify_arg_t na argument is an opaque @@ -281,6 +282,7 @@ void criu_local_set_pidfd_store_sk(criu_opts *opts, int sk); int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method); int criu_local_join_ns_add(criu_opts *opts, const char *ns, const char *ns_file, const char *extra_opt); void criu_local_set_mntns_compat_mode(criu_opts *opts, bool val); +int criu_local_set_config_file(criu_opts *opts, const char *path); void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na)); From 3f97cfe876b4e54be42c19263796de61633402ac Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 9 Oct 2025 11:21:35 +0100 Subject: [PATCH 216/257] test/libcriu: check setting of RPC config file Signed-off-by: Radostin Stoyanov --- test/others/libcriu/.gitignore | 1 + test/others/libcriu/Makefile | 1 + test/others/libcriu/run.sh | 1 + 
test/others/libcriu/test_rpc_config.c | 223 ++++++++++++++++++++++++++ 4 files changed, 226 insertions(+) create mode 100644 test/others/libcriu/test_rpc_config.c diff --git a/test/others/libcriu/.gitignore b/test/others/libcriu/.gitignore index 0f6e52bb4..30a56999c 100644 --- a/test/others/libcriu/.gitignore +++ b/test/others/libcriu/.gitignore @@ -8,3 +8,4 @@ test_pre_dump test_feature_check output/ libcriu.so.* +test_rpc_config diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index ae7330533..e0ee5b2ab 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -3,6 +3,7 @@ include ../../../../criu/Makefile.versions TESTS += test_sub TESTS += test_self TESTS += test_notify +TESTS += test_rpc_config TESTS += test_iters TESTS += test_errno TESTS += test_join_ns diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index f7d363aab..804af9b83 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -55,6 +55,7 @@ run_test() { run_test test_sub run_test test_self run_test test_notify +run_test test_rpc_config if [ "$(uname -m)" = "x86_64" ]; then # Skip this on aarch64 as aarch64 has no dirty page tracking run_test test_iters diff --git a/test/others/libcriu/test_rpc_config.c b/test/others/libcriu/test_rpc_config.c new file mode 100644 index 000000000..529f13637 --- /dev/null +++ b/test/others/libcriu/test_rpc_config.c @@ -0,0 +1,223 @@ +#include "criu.h" +#include "lib.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RANDOM_NAME_LEN 6 +#define PATH_BUF_SIZE 128 + +static volatile sig_atomic_t stop = 0; +static char base_name[RANDOM_NAME_LEN + 1]; +static char log_file[PATH_BUF_SIZE]; +static char conf_file[PATH_BUF_SIZE]; + +static void handle_signal(int sig) +{ + (void)sig; + stop = 1; +} + +static void generate_random_base_name(void) +{ + const char charset[] = 
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + size_t charset_len; + int i; + + charset_len = sizeof(charset) - 1; + + for (i = 0; i < RANDOM_NAME_LEN; i++) { + base_name[i] = charset[rand() % charset_len]; + } + base_name[i] = '\0'; + + snprintf(log_file, sizeof(log_file), "/tmp/criu-%s.log", base_name); + snprintf(conf_file, sizeof(conf_file), "/tmp/criu-%s.conf", base_name); +} + +static int create_criu_config_file(void) +{ + int fd; + FILE *fp; + + srand(time(NULL)); + generate_random_base_name(); + + fd = open(conf_file, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (fd < 0) { + perror("Failed to create config file"); + return -1; + } + + fp = fdopen(fd, "w"); + if (!fp) { + perror("fdopen failed"); + close(fd); + unlink(conf_file); + return -1; + } + + fprintf(fp, "log-file=%s\n", log_file); + fflush(fp); + fclose(fp); + + return 0; +} + +static int check_log_file(void) +{ + struct stat st; + + if (stat(log_file, &st) < 0) { + perror("Config file does not exist"); + return -1; + } + + if (st.st_size == 0) { + fprintf(stderr, "Config file is empty\n"); + return -1; + } + + unlink(log_file); + return 0; +} + +int main(int argc, char **argv) +{ + int pipe_fd[2]; + pid_t pid; + int ret; + int child_ret; + + int img_fd = open(argv[2], O_DIRECTORY); + if (img_fd < 0) { + perror("Failed to open images directory"); + goto cleanup; + } + + if (create_criu_config_file() < 0) { + printf("Failed to create config file\n"); + return EXIT_FAILURE; + } + + if (pipe(pipe_fd) < 0) { + perror("pipe"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + perror("fork failed"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /** child process **/ + printf(" `- loop: initializing\n"); + + if (setsid() < 0 || signal(SIGUSR1, handle_signal) == SIG_ERR) { + _exit(EXIT_FAILURE); + } + + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); + close(pipe_fd[0]); + + child_ret = SUCC_ECODE; + write(pipe_fd[1], &child_ret, sizeof(child_ret)); + 
close(pipe_fd[1]); + + while (!stop) { + sleep(1); + } + + _exit(SUCC_ECODE); + } + + /** parent process **/ + close(pipe_fd[1]); + + ret = -1; + if (read(pipe_fd[0], &ret, sizeof(ret)) != sizeof(ret) || ret != SUCC_ECODE) { + printf("Error starting loop\n"); + goto cleanup; + } + + read(pipe_fd[0], &ret, 1); + close(pipe_fd[0]); + + printf("--- Loop process started (pid: %d) ---\n", pid); + + printf("--- Checkpoint ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_images_dir_fd(img_fd); + criu_set_pid(pid); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting dump RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + criu_set_log_file("dump.log"); + + ret = criu_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + printf("criu dump failed\n"); + goto cleanup; + } + + printf(" `- Dump succeeded\n"); + waitpid(pid, NULL, 0); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + + printf("--- Restore loop ---\n"); + criu_init_opts(); + criu_set_images_dir_fd(img_fd); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting restore RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + criu_set_log_file("restore.log"); + + pid = criu_restore_child(); + if (pid <= 0) { + what_err_ret_mean(pid); + ret = EXIT_FAILURE; + goto cleanup; + } + + printf(" `- Restore returned pid %d\n", pid); + kill(pid, SIGUSR1); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + +cleanup: + if (waitpid(pid, &ret, 0) < 0) { + perror("waitpid failed"); + return EXIT_FAILURE; + } + + printf("Remove RPC config file: %s\n", conf_file); + unlink(conf_file); + return chk_exit(ret, SUCC_ECODE); +} From 7aad7317b407925519d2b9137f87b1a0f53fc879 Mon Sep 
17 00:00:00 2001 From: Andrii Herheliuk Date: Wed, 22 Oct 2025 21:51:28 +0100 Subject: [PATCH 217/257] lib/pycriu: changing the default behavior to use the system binary Use system-installed CRIU binary instead of a local file Thanks to @avagin for suggesting this solution. Co-authored-by: Andrei Vagin Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 5bd7ffecd..5973b4b91 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -103,7 +103,7 @@ class _criu_comm_bin(_criu_comm): os.close(2) css[0].send(struct.pack('i', os.getpid())) - os.execv(self.comm, + os.execvp(self.comm, [self.comm, 'swrk', "%d" % css[0].fileno()]) os._exit(1) From d2c46b92b0d394e04b7da5d16909ed7f88e84271 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Thu, 23 Oct 2025 10:50:40 +0100 Subject: [PATCH 218/257] pycriu: better socket error handling [Errno 2] No such file or directory -> Socket file not found. [Errno 111] Connection refused -> Service not running. 
Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 5973b4b91..43550c3ca 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -45,7 +45,14 @@ class _criu_comm_sk(_criu_comm): def connect(self, daemon): self.sk = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) - self.sk.connect(self.comm) + try: + self.sk.connect(self.comm) + + except FileNotFoundError: + raise FileNotFoundError("Socket file not found.") + + except ConnectionRefusedError: + raise ConnectionRefusedError("Service not running.") return self.sk From 71a637923f420dd50cef02912519b44722338ae4 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Mon, 27 Oct 2025 21:57:41 +0000 Subject: [PATCH 219/257] pycriu: set default value for sk_name This change allows users to call criu.use_sk() without any parameters to use the default socket name. Co-authored-by: Radostin Stoyanov Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 43550c3ca..05a85c58d 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -8,6 +8,7 @@ import struct import pycriu.rpc_pb2 as rpc +CR_DEFAULT_SERVICE_ADDRESS = "./criu_service.socket" class _criu_comm: """ @@ -213,7 +214,7 @@ class criu: self.opts = rpc.criu_opts() self.sk = None - def use_sk(self, sk_name): + def use_sk(self, sk_name=CR_DEFAULT_SERVICE_ADDRESS): """ Access criu using unix socket which that belongs to criu service daemon. """ From ee4100c09f7de7ef9e9db59288118646f28cd4b4 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 06:42:53 +0100 Subject: [PATCH 220/257] cr-service: refactor images/workdir setup Move the code that opens the images directory, resolves its absolute path via readlink(), selects the work_dir, and chdir()s into it into a new function: setup_images_and_workdir(). 
This reduces the size of `setup_opts_from_req()`, improves its readability, and allows this functionality to be reused. While at it, change open_image_dir() to take a const char *dir parameter, reflecting that the path is not modified by the function and allowing callers to pass string literals without casts. No functional changes are intended. Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 74 +++++++++++++++++++++++++------------------- criu/image.c | 2 +- criu/include/image.h | 2 +- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index e6aac232e..36ef8d72b 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -285,13 +285,54 @@ int exec_rpc_query_external_files(char *name, int sk) static char images_dir[PATH_MAX]; +static int setup_images_and_workdir(const char *images_dir_path, + bool work_changed_by_rpc_conf, + CriuOpts *req, + pid_t peer_pid) +{ + char work_dir_path[PATH_MAX]; + + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + return -1; + } + + /* get full path to images_dir to use in process title */ + if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { + pr_perror("Can't readlink %s", images_dir_path); + return -1; + } + + if (work_changed_by_rpc_conf) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); + else if (opts.work_dir) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else + strcpy(work_dir_path, images_dir_path); + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + return -1; + } + + return 0; +} + static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); char images_dir_path[PATH_MAX]; - char work_dir_path[PATH_MAX]; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; @@ -701,37 +742,8 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. 
- */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); + if (setup_images_and_workdir(images_dir_path, work_changed_by_rpc_conf, req, ids.pid)) goto err; - } - - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; - } - - if (work_changed_by_rpc_conf) - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); - else if (opts.work_dir) - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - strcpy(work_dir_path, images_dir_path); - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - goto err; - } if (req->n_irmap_scan_paths) { for (i = 0; i < req->n_irmap_scan_paths; i++) { diff --git a/criu/image.c b/criu/image.c index c4f05e159..91101c3eb 100644 --- a/criu/image.c +++ b/criu/image.c @@ -717,7 +717,7 @@ struct cr_img *img_from_fd(int fd) * This is used when opts.stream is enabled for picking the right streamer * socket name. `mode` is ignored when opts.stream is not enabled. 
*/ -int open_image_dir(char *dir, int mode) +int open_image_dir(const char *dir, int mode) { int fd, ret; diff --git a/criu/include/image.h b/criu/include/image.h index b06dbf706..30e32323d 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -165,7 +165,7 @@ static inline int img_raw_fd(struct cr_img *img) extern off_t img_raw_size(struct cr_img *img); -extern int open_image_dir(char *dir, int mode); +extern int open_image_dir(const char *dir, int mode); extern void close_image_dir(void); /* * Return -1 -- parent symlink points to invalid target From 60a731ab38d53c69fbf0fc8bf7bb02701930424c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 11:04:35 +0100 Subject: [PATCH 221/257] cr-service: drop images_dir from setproctitle Commit 9089ce8 ("service: use setproctitle") extended cr-service to get the full path of images_dir using readlink(). However, the RPC API was later extended to allow a custom path (folder) to be set instead of passing a file descriptor, which causes readlink() to fail as the path is not a symbolic link. It would be better to drop the code setting the images-dir path as a string in the proctitle. 
Fixes: #2794 Suggested-by: Andrei Vagin Co-authored-by: Andrii Herheliuk Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 36ef8d72b..0808be3e7 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -283,8 +283,6 @@ int exec_rpc_query_external_files(char *name, int sk) return ret; } -static char images_dir[PATH_MAX]; - static int setup_images_and_workdir(const char *images_dir_path, bool work_changed_by_rpc_conf, CriuOpts *req, @@ -304,12 +302,6 @@ static int setup_images_and_workdir(const char *images_dir_path, return -1; } - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - return -1; - } - if (work_changed_by_rpc_conf) strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); else if (req->has_work_dir_fd) @@ -802,7 +794,7 @@ static int dump_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -845,7 +837,7 @@ static int restore_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("restore --rpc -D %s", images_dir); + __setproctitle("restore --rpc"); if (cr_restore_tasks()) goto exit; @@ -940,7 +932,7 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) if (setup_opts_from_req(sk, req)) goto cout; - __setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("pre-dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -1276,8 +1268,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (setup_opts_from_req(sk, msg->opts)) goto cout; - __setproctitle("cpuinfo %s --rpc -D %s", msg->type == 
CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check", - images_dir); + __setproctitle("cpuinfo %s --rpc", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check"); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); From 5966ffe8a7fa452a8c8256962436dceb4479237e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 06:50:31 +0100 Subject: [PATCH 222/257] cr-service: refactor images_dir path resolution Move the images_dir selection logic from setup_opts_from_req() into a new function: resolve_images_dir_path(). This improves readability and allows the code to be reused. While at it, use snprintf() instead of sprintf() for the /proc path and ensure NULL termination after strncpy(). Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 59 +++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 0808be3e7..7d17a63e0 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -283,6 +283,41 @@ int exec_rpc_query_external_files(char *name, int sk) return ret; } +static int resolve_images_dir_path(char *images_dir_path, + bool imgs_changed_by_rpc_conf, + const CriuOpts *req, + pid_t peer_pid) +{ + /* + * images_dir_fd is a required RPC parameter with -1 as default value. + * + * This assumes that if opts.imgs_dir is set, we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. 
The idea is that only the + * RPC configuration file is able to overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) { + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else if (req->images_dir_fd != -1) { + snprintf(images_dir_path, PATH_MAX, "/proc/%d/fd/%d", peer_pid, req->images_dir_fd); + } else if (req->images_dir) { + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + return -1; + } + + return 0; +} + static int setup_images_and_workdir(const char *images_dir_path, bool work_changed_by_rpc_conf, CriuOpts *req, @@ -706,30 +741,8 @@ static int setup_opts_from_req(int sk, CriuOpts *req) xfree(tmp_work); } - /* - * open images_dir - images_dir_fd is a required RPC parameter - * - * This assumes that if opts.imgs_dir is set we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. 
The idea is that only the - * RPC configuration file is able to overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) { - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - } else if (req->images_dir_fd != -1) { - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - } else if (req->images_dir) { - strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); - } else { - pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + if (resolve_images_dir_path(images_dir_path, imgs_changed_by_rpc_conf, req, ids.pid) < 0) goto err; - } if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); From 72ca94db4de93105f89c9411b87a70e8f2353745 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 28 Oct 2025 18:37:31 +0000 Subject: [PATCH 223/257] cr-service: refactor logging setup Move the logging initialization into a helper function that can be reused. No functional change intended. 
Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 51 ++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 7d17a63e0..b4e8629c9 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -354,6 +354,31 @@ static int setup_images_and_workdir(const char *images_dir_path, return 0; } +static int setup_logging_from_req(CriuOpts *req, bool output_changed_by_rpc_conf) +{ + if (req->log_file && !output_changed_by_rpc_conf) { + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + return -1; + } + SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; /* log_init(NULL) writes to stderr */ + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + opts.log_level = req->log_level; + log_set_loglevel(opts.log_level); + if (log_init(opts.output)) { + pr_perror("Can't initiate log"); + return -1; + } + + return 0; +} + static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; @@ -758,36 +783,12 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } /* initiate log file in work dir */ - if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. 
- */ - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - goto err; - } - - SET_CHAR_OPTS(output, req->log_file); - } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { - xfree(opts.output); - opts.output = NULL; - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - /* This is needed later to correctly set the log_level */ - opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { - pr_perror("Can't initiate log"); + if (setup_logging_from_req(req, output_changed_by_rpc_conf)) goto err; - } if (req->mntns_compat_mode) opts.mntns_compat_mode = true; - log_set_loglevel(opts.log_level); if (check_options()) goto err; From 9371c4a789889f26d11ca04a4c7c9847a2abbbcc Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 07:12:03 +0100 Subject: [PATCH 224/257] cr-service: refactor RPC opts parsing for check() The check() functionality is very different from dump, pre-dump, and restore. It is used only to check if the kernel supports required features, and does not need the majority of options set via RPC. In particular, we don't need to open `image_dir` when running `check()` because this functionality doesn't create or process image files. In this case, `image_dir` is used as `work_dir`, only when the latter is not specified and a log file is used. This patch updates the RPC options parser so that it only handles the logging options when check() is used. Logging to a file is required when log_file is explicitly set or no log_to_stderr is used. In such case, we also resolve images_dir and work_dir where the log file will be created. 
Fixes: #2758 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 57 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index b4e8629c9..b4718dde2 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -311,6 +311,12 @@ static int resolve_images_dir_path(char *images_dir_path, strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); images_dir_path[PATH_MAX - 1] = '\0'; } else { + /* + * Since images dir is not required in CHECK mode, we need to + * check for work_dir_fd in setup_images_and_workdir() + */ + if (opts.mode == CR_CHECK) + return 0; pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); return -1; } @@ -323,18 +329,21 @@ static int setup_images_and_workdir(const char *images_dir_path, CriuOpts *req, pid_t peer_pid) { - char work_dir_path[PATH_MAX]; + char work_dir_path[PATH_MAX] = ""; - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - return -1; + /* We don't need to open images dir in CHECK mode. */ + if (opts.mode != CR_CHECK) { + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + return -1; + } } if (work_changed_by_rpc_conf) @@ -343,9 +352,14 @@ static int setup_images_and_workdir(const char *images_dir_path, sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); else if (opts.work_dir) strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else + else if (images_dir_path[0] != '\0') strcpy(work_dir_path, images_dir_path); + if (work_dir_path[0] == '\0') { + pr_err("images-dir or work-dir is required when using log file\n"); + return -1; + } + if (chdir(work_dir_path)) { pr_perror("Can't chdir to work_dir"); return -1; @@ -384,7 +398,7 @@ static int setup_opts_from_req(int sk, CriuOpts *req) struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); - char images_dir_path[PATH_MAX]; + char images_dir_path[PATH_MAX] = ""; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; @@ -397,6 +411,23 @@ static int setup_opts_from_req(int sk, CriuOpts *req) goto err; } + /* + * The options relevant in CHECK mode are: log_file, log_to_stderr, and log_level. + * When logging to a file, we also need to resolve images_dir and work_dir. + */ + if (opts.mode == CR_CHECK) { + if (!req) + return 0; /* nothing to do */ + + /* + * A log file is needed only if: + * - log_file is explicitly set, or + * - log_to_stderr is NOT requested (i.e., using DEFAULT_LOG_FILENAME) + */ + if (!req->log_file || (req->has_log_to_stderr && req->log_to_stderr)) + return 0; /* no log file, don't require images_dir or work_dir */ + } + if (fstat(sk, &st)) { pr_perror("Can't get socket stat"); goto err; From f7ccb63bdd496409d968390aa15a3a8c4b877110 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 09:28:28 +0100 Subject: [PATCH 225/257] pycriu: set RPC opts for CHECK This allows users to specify RPC options when using the check() functionality. 
Co-authored-by: Andrii Herheliuk Signed-off-by: Radostin Stoyanov --- lib/pycriu/criu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 05a85c58d..760d2be78 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -211,7 +211,8 @@ class criu: def __init__(self): self.use_binary('criu') - self.opts = rpc.criu_opts() + # images_dir_fd is required field with default value of -1 + self.opts = rpc.criu_opts(images_dir_fd=-1) self.sk = None def use_sk(self, sk_name=CR_DEFAULT_SERVICE_ADDRESS): @@ -273,6 +274,7 @@ class criu: """ req = rpc.criu_req() req.type = rpc.CHECK + req.opts.MergeFrom(self.opts) resp = self._send_req_and_recv_resp(req) From 3c841af2cf1f1769c2fa1527bf2706b705da1202 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 25 Oct 2025 12:35:19 +0100 Subject: [PATCH 226/257] pycriu: use explicit imports for __init__ __init__.py defines the public API for pycriu. It is important to use explicit imports to avoid leaking every symbol from criu.py into the pycriu namespace. This avoids import-time side effects, prevents name collisions, and circular-import traps. Fixes the following lint error: F403 `from .criu import *` used; unable to detect undefined names Signed-off-by: Radostin Stoyanov --- Makefile | 2 ++ lib/pycriu/__init__.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1824ea180..05834d682 100644 --- a/Makefile +++ b/Makefile @@ -452,6 +452,8 @@ ruff: test/inhfd/*.py \ test/others/rpc/config_file.py \ test/others/action-script/check_actions.py \ + lib/pycriu/criu.py \ + lib/pycriu/__init__.py \ lib/pycriu/images/pb2dict.py \ lib/pycriu/images/images.py \ scripts/criu-ns \ diff --git a/lib/pycriu/__init__.py b/lib/pycriu/__init__.py index 2abcf029d..28f1e9424 100644 --- a/lib/pycriu/__init__.py +++ b/lib/pycriu/__init__.py @@ -1,4 +1,15 @@ from . import rpc_pb2 as rpc from . 
import images -from .criu import * -from .version import __version__ \ No newline at end of file +from .criu import criu, CRIUExceptionExternal, CRIUException +from .criu import CR_DEFAULT_SERVICE_ADDRESS +from .version import __version__ + +__all__ = ( + "rpc", + "images", + "criu", + "CRIUExceptionExternal", + "CRIUException", + "CR_DEFAULT_SERVICE_ADDRESS", + "__version__", +) \ No newline at end of file From a1dc885027f6866f01df38c9f14e71a9102e298a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 19:32:23 +0100 Subject: [PATCH 227/257] test/rpc: update errno check The --mntns-compat-mode option is no longer parsed with CHECK. Use --log-file instead to test the error message. Signed-off-by: Radostin Stoyanov --- test/others/rpc/errno.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index a5a3eb54d..ea841199f 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -49,8 +49,8 @@ class test: if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) - - if errmsg and errmsg not in resp.cr_errmsg: + + if errmsg and errmsg not in str(resp.cr_errmsg): raise Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') def no_process(self): @@ -134,20 +134,19 @@ class test: self.check_resp(resp, rpc.EMPTY, None) print('Success') - + def child_first_err(self): print('Receive correct first error message') req = self.get_base_req() req.type = rpc.CHECK - - # mntns_compat_mode options is only allowed on restore - req.opts.mntns_compat_mode = True + # Log file must not have subdirectory + req.opts.log_file = "/foo/bar.log" self.send_req(req) resp = self.recv_resp() - self.check_resp(resp, rpc.CHECK, None, "Option --mntns-compat-mode is only valid on restore\n") + self.check_resp(resp, rpc.CHECK, None, "No subdirs are allowed in log_file name") print('Success') From 567f70ce191326c56cd223ce94a079dceb7c71fd Mon Sep 
17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 20 Oct 2025 10:24:49 +0100 Subject: [PATCH 228/257] test/others: add test for check() with libcriu Signed-off-by: Radostin Stoyanov --- test/others/libcriu/Makefile | 1 + test/others/libcriu/run.sh | 1 + test/others/libcriu/test_check.c | 17 +++++++++++++++++ 3 files changed, 19 insertions(+) create mode 100644 test/others/libcriu/test_check.c diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index e0ee5b2ab..927f17c23 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -8,6 +8,7 @@ TESTS += test_iters TESTS += test_errno TESTS += test_join_ns TESTS += test_pre_dump +TESTS += test_check TESTS += test_feature_check all: $(TESTS) diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index 804af9b83..6b36d4496 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -63,6 +63,7 @@ if [ "$(uname -m)" = "x86_64" ]; then fi run_test test_errno run_test test_join_ns +run_test test_check if criu check --feature mem_dirty_track > /dev/null; then export CRIU_FEATURE_MEM_TRACK=1 fi diff --git a/test/others/libcriu/test_check.c b/test/others/libcriu/test_check.c new file mode 100644 index 000000000..4af3b3630 --- /dev/null +++ b/test/others/libcriu/test_check.c @@ -0,0 +1,17 @@ +#include +#include "criu.h" +#include "lib.h" + +int main(int argc, char **argv) +{ + int ret; + + printf("--- Start check ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + + if (criu_check()) + return -1; + + return 0; +} From 0fa6ff3d188245091dce1516fa7804ebfa6be337 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 26 Oct 2025 10:00:39 +0000 Subject: [PATCH 229/257] test/others: add tests for check() with pycriu Signed-off-by: Radostin Stoyanov --- Makefile | 1 + test/others/pycriu/.gitignore | 1 + test/others/pycriu/Makefile | 63 ++++++++++++++++++++ test/others/pycriu/read.py | 1 + test/others/pycriu/test_check.py | 29 +++++++++ 
test/others/pycriu/test_check_fail.py | 32 ++++++++++ test/others/pycriu/test_check_images_dir.py | 44 ++++++++++++++ test/others/pycriu/test_check_work_dir_fd.py | 44 ++++++++++++++ test/others/rpc/read.py | 0 9 files changed, 215 insertions(+) create mode 100644 test/others/pycriu/.gitignore create mode 100644 test/others/pycriu/Makefile create mode 120000 test/others/pycriu/read.py create mode 100755 test/others/pycriu/test_check.py create mode 100755 test/others/pycriu/test_check_fail.py create mode 100755 test/others/pycriu/test_check_images_dir.py create mode 100755 test/others/pycriu/test_check_work_dir_fd.py mode change 100644 => 100755 test/others/rpc/read.py diff --git a/Makefile b/Makefile index 05834d682..e26807158 100644 --- a/Makefile +++ b/Makefile @@ -452,6 +452,7 @@ ruff: test/inhfd/*.py \ test/others/rpc/config_file.py \ test/others/action-script/check_actions.py \ + test/others/pycriu/*.py \ lib/pycriu/criu.py \ lib/pycriu/__init__.py \ lib/pycriu/images/pb2dict.py \ diff --git a/test/others/pycriu/.gitignore b/test/others/pycriu/.gitignore new file mode 100644 index 000000000..567609b12 --- /dev/null +++ b/test/others/pycriu/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/test/others/pycriu/Makefile b/test/others/pycriu/Makefile new file mode 100644 index 000000000..b6e3b4814 --- /dev/null +++ b/test/others/pycriu/Makefile @@ -0,0 +1,63 @@ +.SHELLFLAGS := -eu -o pipefail -c +.ONESHELL: + +CRIU ?= ../../../criu/criu +BUILD_DIR ?= build +SOCKET_NAME ?= criu_service.socket +PIDFILE_NAME ?= pidfile +SERVICE_LOG ?= service.log +PYTHON ?= python3 + +PIDFILE := $(BUILD_DIR)/$(PIDFILE_NAME) +CRIU_SOCKET := $(BUILD_DIR)/$(SOCKET_NAME) +STATUS_FIFO := $(BUILD_DIR)/startup.status +STATUS_FD := 200 + +run: start + cleanup() { $(MAKE) --no-print-directory stop || true; } + trap cleanup EXIT INT TERM + "$(PYTHON)" test_check.py + "$(PYTHON)" test_check_fail.py + "$(PYTHON)" test_check_images_dir.py + "$(PYTHON)" test_check_work_dir_fd.py + +start: + mkdir -p 
"$(BUILD_DIR)" + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + echo "Service running (PID $$(cat "$(PIDFILE)"))." + exit 0 + fi + if ! command -v "$(CRIU)" >/dev/null 2>&1; then + echo "CRIU not found at $(CRIU)" + exit 1 + fi + mkfifo "$(STATUS_FIFO)" + exec $(STATUS_FD)<>"$(STATUS_FIFO)" + "$(CRIU)" service \ + -v4 \ + -W "$(BUILD_DIR)" \ + --address "$(SOCKET_NAME)" \ + -d \ + --pidfile "$(PIDFILE_NAME)" \ + -o "$(SERVICE_LOG)" \ + --status-fd "$(STATUS_FD)" + "$(PYTHON)" read.py "$(STATUS_FIFO)" + +stop: + if [ ! -s "$(PIDFILE)" ]; then + echo "pidfile missing or empty" + exit 1 + fi + pid=$$(cat "$(PIDFILE)") + if kill -0 "$$pid" 2>/dev/null; then + kill -9 "$$pid" || true + fi + rm -f "$(PIDFILE)" "$(CRIU_SOCKET)" "$(STATUS_FIFO)" + +clean: + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + kill -9 "$$(cat "$(PIDFILE)")" || true + fi + rm -rf "$(BUILD_DIR)" + +.PHONY: start stop clean run \ No newline at end of file diff --git a/test/others/pycriu/read.py b/test/others/pycriu/read.py new file mode 120000 index 000000000..c2c1e1365 --- /dev/null +++ b/test/others/pycriu/read.py @@ -0,0 +1 @@ +../rpc/read.py \ No newline at end of file diff --git a/test/others/pycriu/test_check.py b/test/others/pycriu/test_check.py new file mode 100755 index 000000000..9888158db --- /dev/null +++ b/test/others/pycriu/test_check.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") + return 1 + + print("PASS") + return 0 + +if __name__ == 
"__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_fail.py b/test/others/pycriu/test_check_fail.py new file mode 100755 index 000000000..b5634c60b --- /dev/null +++ b/test/others/pycriu/test_check_fail.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + # Intentionally set only log_file (no images/work dir) to ensure check() fails + criu.opts.log_file = "check.log" + + try: + criu.check() + except Exception: + print("PASS") + return 0 + + print("FAIL: check() did not fail when log_file is set without images/work dir") + return 1 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_images_dir.py b/test/others/pycriu/test_check_images_dir.py new file mode 100755 index 000000000..f479c2a88 --- /dev/null +++ b/test/others/pycriu/test_check_images_dir.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def _log_path(images_dir, log_file): + return log_file if os.path.isabs(log_file) else os.path.join(images_dir, log_file) + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = os.path.join(build_dir, "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.images_dir = build_dir + criu.opts.log_file = "check.log" + criu.opts.log_level = 4 + + try: + criu.check() + 
except Exception as e: + lp = _log_path(build_dir, criu.opts.log_file) + msg = f"FAIL: {e} ({'see log: ' + lp if os.path.exists(lp) else 'no log found'})" + print(msg) + return 1 + + lp = _log_path(build_dir, criu.opts.log_file) + if not (os.path.isfile(lp) and os.path.getsize(lp) > 0): + print(f"FAIL: log file missing or empty: {lp}") + return 1 + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_work_dir_fd.py b/test/others/pycriu/test_check_work_dir_fd.py new file mode 100755 index 000000000..e20a83097 --- /dev/null +++ b/test/others/pycriu/test_check_work_dir_fd.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = os.path.join(build_dir, "criu_service.socket") + os.makedirs(build_dir, exist_ok=True) + + # Open a directory FD to use as work_dir_fd (prefer O_PATH if available) + flags = getattr(os, "O_PATH", 0) or os.O_RDONLY + fd = os.open(build_dir, flags) + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.work_dir_fd = fd + criu.opts.log_file = "check.log" + criu.opts.log_level = 4 + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") + return 1 + finally: + try: + os.close(fd) + except Exception: + pass + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/rpc/read.py b/test/others/rpc/read.py old mode 100644 new mode 100755 From cb8e1da3f483f53fcacb642b574866625f7dbb0a Mon Sep 17 00:00:00 2001 From: alam0rt Date: Fri, 31 Oct 2025 14:32:17 +1100 Subject: [PATCH 230/257] coredump: use compat_nr_pages as fallback Use nr_pages when available, falling back to 
compat_nr_pages for compatibility. Signed-off-by: alam0rt Signed-off-by: Radostin Stoyanov --- coredump/criu_coredump/coredump.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 9454d8f0b..3c9cd45aa 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -794,7 +794,8 @@ class coredump_generator: off = 0 # in pages for m in pagemap[1:]: found = False - for i in range(m["nr_pages"]): + num_pages = m.get("nr_pages", m.compat_nr_pages) + for i in range(num_pages): if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE: found = True break From 1d08ff8ca7b2a8bee5238e80bddd52a627c637cf Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 9 Nov 2025 16:24:48 +0000 Subject: [PATCH 231/257] coredump: fix handling of num_pages This patch fixes the following error: $ sudo make -C test/others/criu-coredump run ... Traceback (most recent call last): File "/home/circleci/criu/coredump/coredump", line 55, in main() File "/home/circleci/criu/coredump/coredump", line 47, in main coredump(opts) File "/home/circleci/criu/coredump/coredump", line 14, in coredump cores = generator(os.path.realpath(opts['in'])) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 192, in __call__ self.coredumps[pid] = self._gen_coredump(pid) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 214, in _gen_coredump cd.vmas = self._gen_vmas(pid) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 992, in _gen_vmas v.data = self._gen_mem_chunk(pid, vma, v.filesz) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 879, in _gen_mem_chunk page_mem = self._get_page(pid, page_no) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 797, in _get_page num_pages = m.get("nr_pages", m.compat_nr_pages) AttributeError: 'dict' object has no attribute 'compat_nr_pages' + exit 1 make[1]: *** [Makefile:3: 
run] Error 1 Signed-off-by: Radostin Stoyanov Signed-off-by: Andrei Vagin --- coredump/criu_coredump/coredump.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 3c9cd45aa..acb806ace 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -794,7 +794,8 @@ class coredump_generator: off = 0 # in pages for m in pagemap[1:]: found = False - num_pages = m.get("nr_pages", m.compat_nr_pages) + num_pages = m.get("nr_pages", m["compat_nr_pages"]) + for i in range(num_pages): if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE: found = True From ce680fc6c71ddac19fec25669dffe123c36595e7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 8 Nov 2025 15:57:22 +0000 Subject: [PATCH 232/257] Revert "plugins/amdgpu: Implement parallel restore" This functionality (#2527) is being reverted and excluded from this release due to issue #2812. It will be included in a subsequent release once all associated issues are resolved. 
Signed-off-by: Andrei Vagin --- Documentation/criu-amdgpu-plugin.txt | 1 - plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/README.md | 23 +- plugins/amdgpu/amdgpu_plugin.c | 420 +++--------------------- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- plugins/amdgpu/amdgpu_plugin_topology.h | 1 - plugins/amdgpu/amdgpu_socket_utils.c | 320 ------------------ plugins/amdgpu/amdgpu_socket_utils.h | 54 --- 8 files changed, 52 insertions(+), 771 deletions(-) delete mode 100644 plugins/amdgpu/amdgpu_socket_utils.c delete mode 100644 plugins/amdgpu/amdgpu_socket_utils.h diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index fe76fc3bc..68803f3db 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,7 +15,6 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer -Parallel Restore DESCRIPTION ----------- diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 870a039cd..3d55f8bb4 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index b808fbc4f..1078eafe6 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,8 +3,7 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _
-_Yanning Yang _ +_David Yat Sin _ # Introduction @@ -225,26 +224,6 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* -## Restoring BO content in parallel - -Restoring the BO content is an important part in the restore of GPU state and -usually takes a significant amount of time. A possible location for this -procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook -blocks the target process from performing other restore operations, which -hinders further optimization of the restore process. - -Therefore, a new plugin hook that runs in the master restore process is -introduced, and it interacts with the `cr_plugin_restore_file` hook to complete -the restore of BO content. Specifically, the target process only needs to send -the relevant BOs to the master restore process, while this new hook handles all -the restore of buffer objects. Through this method, during the restore of the BO -content, the target process can perform other restore operations, thus -accelerating the restore procedure. This is an implementation of the gCROP -method proposed in the ACM SoCC'24 paper: [On-demand and Parallel -Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). 
- -*This optimization technique is enabled by the `__POST_FORKING` hook.* - ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 69194fbc7..96c086162 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,13 +28,11 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" -#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" -#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -66,18 +64,6 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; -/* - * In the case of a single process (common case), this optimization can effectively - * reduce the restore latency with parallel restore. In the case of multiple processes, - * states are already restored in parallel within different processes. Therefore, this - * optimization does not introduce further improvement and will be disabled by default - * in this case. The flag, parallel_disabled, is used to control whether the - * optimization is enabled or disabled. 
- */ -bool parallel_disabled = false; - -pthread_t parallel_thread = 0; -int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -365,15 +351,6 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { - if (has_children(root_item)) { - pr_info("Parallel restore disabled\n"); - parallel_disabled = true; - } else { - if (install_parallel_sock() < 0) { - pr_err("Failed to install parallel socket\n"); - return -1; - } - } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1462,9 +1439,14 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas = NULL; + struct thread_data *thread_datas; int thread_i, ret = 0; - int offset = 0; + + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; + goto exit; + } for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1507,101 +1489,56 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - if (!parallel_disabled) { - parallel_restore_cmd restore_cmd; - pr_info("Begin to send parallel restore cmd\n"); - ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); - if (ret) - goto exit_parallel; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - uint32_t target_gpu_id; - struct tp_node *dev; + if (!e->device_entries[i]->gpu_id) + continue; - if (!e->device_entries[i]->gpu_id) - continue; + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ + 
target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); - target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; - goto exit_parallel; - } - parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); - - for (int j = 0; j < e->num_of_bos; j++) { - if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) - continue; - if (bo_buckets[j].alloc_flags & - (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { - parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, - bo_buckets[j].size, offset, &restore_cmd); - offset += bo_buckets[j].size; - } - } - } - ret = send_parallel_restore_cmd(&restore_cmd); -exit_parallel: - free_parallel_restore_cmd(&restore_cmd); - } else { - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; goto exit; } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; - if (!e->device_entries[i]->gpu_id) - continue; - - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); - - /* We need the fd for actual_gpu_id */ - dev = 
sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; - goto exit; - } - - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; - - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; - } - - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; - } - thread_i++; + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; } - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; + } - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; - } + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; } } exit: @@ -1609,8 +1546,8 @@ exit: if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) close(bo_buckets[i].dmabuf_fd); } - if (thread_datas) - 
xfree(thread_datas); + + xfree(thread_datas); return ret; } @@ -1899,24 +1836,6 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; - if (!parallel_disabled) { - pr_info("Close parallel restore server\n"); - if (close_parallel_restore_server()) { - pr_err("Close parallel restore server fail\n"); - return -1; - } - - exit_code = pthread_join(parallel_thread, NULL); - if (exit_code) { - pr_err("Failed to join parallel thread ret:%d\n", exit_code); - return -1; - } - if (parallel_thread_result) { - pr_err("Parallel restore fail\n"); - return parallel_thread_result; - } - } - pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1943,244 +1862,3 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) - -int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, - amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) -{ - return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); -} - -int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) -{ - int ret = 0; - int drm_fd = -1; - uint32_t major, minor; - - struct amdgpu_gpu_info gpu_info = { 0 }; - - drm_fd = open_drm_render_device(dev_minor); - if (drm_fd < 0) { - return drm_fd; - } - - ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); - if (ret) { - pr_perror("Failed to initialize device"); - goto err; - } - - ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); - if (ret) { - pr_perror("failed to query gpuinfo via libdrm"); - goto err; - } - *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; - return 0; -err: - amdgpu_device_deinitialize(*h_dev); - return ret; -} - -FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) -{ - char img_path[PATH_MAX]; - size_t image_size = 0; - FILE *bo_contents_fp = NULL; - - snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); - bo_contents_fp = open_img_file(img_path, false, &image_size); - if (!bo_contents_fp) { - pr_perror("Cannot fopen %s", img_path); - return NULL; - } - - if (tot_size != image_size) { - pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); - fclose(bo_contents_fp); - return NULL; - } - return bo_contents_fp; -} - -struct parallel_thread_data { - pthread_t thread; - uint32_t gpu_id; - int minor; - parallel_restore_cmd *restore_cmd; - int ret; -}; - -void *parallel_restore_bo_contents(void *_thread_data) -{ - struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; - amdgpu_device_handle h_dev; - uint64_t max_copy_size; - size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; - FILE *bo_contents_fp = NULL; - parallel_restore_entry *entry; - parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; - int ret = 0; - int offset = 0; - void *buffer = NULL; - - ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); - if (ret) { - goto err; - } - - for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { - if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { - total_bo_size += restore_cmd->entries[i].size; - max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); - } - } - - buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; - - bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); - if (bo_contents_fp == NULL) { - ret = -1; - goto err_sdma; - } - offset = ftell(bo_contents_fp); - - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); - if (!buffer) { - pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); - ret = -ENOMEM; - goto err_sdma; - } - - for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { - if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) - continue; - - entry = &restore_cmd->entries[i]; - fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); - ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); - if (ret) { - pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); - goto err_sdma; - } - } - -err_sdma: - if (bo_contents_fp) - fclose(bo_contents_fp); - if (buffer) - xfree(buffer); - amdgpu_device_deinitialize(h_dev); -err: - thread_data->ret = ret; - return NULL; -} - -void *restore_device_parallel_worker(void *arg) -{ - while (1) { - parallel_restore_cmd restore_cmd = { 0 }; - struct parallel_thread_data *thread_datas = NULL; - int ret; - int error_occurred = 0, join_ret = 0, created_threads = 0; - - ret = recv_parallel_restore_cmd(&restore_cmd); - if (ret) { - if (ret == 1) { - *(int *)arg = 0; - goto exit; - } - goto err; - } - - thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); - if (!thread_datas) { - ret = -ENOMEM; - goto err; - } - - for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { - thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; - thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; - thread_datas[created_threads].restore_cmd = &restore_cmd; - - ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, - (void *)&thread_datas[created_threads]); - if (ret) { - pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); - error_occurred = 1; - break; - } - } - - for (int i = 0; i < created_threads; i++) { - join_ret = pthread_join(thread_datas[i].thread, NULL); - if (join_ret != 0) { - pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", - thread_datas[i].gpu_id, join_ret); - if (!error_occurred) { - ret = join_ret; - error_occurred = 1; - } - } - - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - /* Check thread return value */ - if (thread_datas[i].ret && !error_occurred) { - ret = thread_datas[i].ret; - error_occurred = 1; - } - } - - if (thread_datas) - xfree(thread_datas); -err: - free_parallel_restore_cmd(&restore_cmd); - - if (ret) { - *(int *)arg = ret; - return NULL; - } - } -exit: - return NULL; -} - -/* - * While the background thread is running, some processing functions (e.g., stop_cgroupd) - * in the main thread need to block SIGCHLD. To prevent interference from this background - * thread, SIGCHLD is blocked in this thread. 
- */ -static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) -{ - int ret = 0; - sigset_t blockmask, oldmask; - - sigemptyset(&blockmask); - sigaddset(&blockmask, SIGCHLD); - sigprocmask(SIG_BLOCK, &blockmask, &oldmask); - - ret = pthread_create(newthread, NULL, f, arg); - if (ret) { - pr_err("Create worker thread fail: %d\n", ret); - return -1; - } - - sigprocmask(SIG_SETMASK, &oldmask, NULL); - return 0; -} - -int amdgpu_plugin_post_forking(void) -{ - if (plugin_disabled) - return -ENOTSUP; - - if (parallel_disabled) - return 0; - - return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 730f2e028..5b4396a0c 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -int open_drm_render_device(int minor) +static int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index e19f8e7ce..c890e3dda 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,7 +118,6 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); -int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c deleted file mode 100644 index c8bf6d1ba..000000000 ---
a/plugins/amdgpu/amdgpu_socket_utils.c +++ /dev/null @@ -1,320 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "amdgpu_socket_utils.h" -#include "criu-log.h" -#include "common/scm.h" -#include "fdstore.h" -#include "util-pie.h" -#include "util.h" - -int parallel_socket_addr_len; -struct sockaddr_un parallel_socket_addr; -int parallel_socket_id = 0; - -static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) -{ - addr->sun_family = AF_UNIX; - snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); - *len = SUN_LEN(addr); - *addr->sun_path = '\0'; -} - -int install_parallel_sock(void) -{ - int ret = 0; - int sock_fd; - - sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (sock_fd < 0) { - pr_perror("socket creation failed"); - return -1; - } - - amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); - ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); - if (ret < 0) { - pr_perror("bind failed"); - goto err; - } - - ret = listen(sock_fd, SOMAXCONN); - if (ret < 0) { - pr_perror("listen failed"); - goto err; - } - - parallel_socket_id = fdstore_add(sock_fd); - if (parallel_socket_id < 0) { - ret = -1; - goto err; - } -err: - close(sock_fd); - return ret; -} - -void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, - parallel_restore_cmd *restore_cmd) -{ - parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; - restore_entry->gpu_id = gpu_id; - restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; - restore_entry->write_offset = 0; - restore_entry->read_offset = offset; - restore_entry->size = size; - - restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; - - restore_cmd->cmd_head.entry_num += 1; - restore_cmd->cmd_head.fd_write_num += 1; -} - -void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd
*restore_cmd) -{ - restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; - restore_cmd->cmd_head.gpu_num += 1; -} - -static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { - pr_perror("Send parallel restore command head fail"); - return -1; - } - return 0; -} - -static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { - pr_perror("Send GPU ids of parallel restore command fail"); - return -1; - } - return 0; -} - -static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { - pr_perror("Send parallel restore command fail"); - return -1; - } - return 0; -} - -static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { - pr_perror("Send dmabuf fds fail"); - return -1; - } - return 0; -} - -int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) -{ - int sock_fd; - int ret = 0; - - sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (sock_fd < 0) { - pr_perror("Socket creation failed"); - return -1; - } - - ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); - if (ret < 0) { - pr_perror("Connect failed"); - goto err; - } - - ret = send_metadata(sock_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = send_gpu_ids(sock_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = send_cmds(sock_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = send_dmabuf_fds(sock_fd, restore_cmd); - -err: - close(sock_fd); - return ret; -} - -int init_parallel_restore_cmd(int num, int id, int gpu_num,
parallel_restore_cmd *restore_cmd) -{ - restore_cmd->cmd_head.id = id; - restore_cmd->cmd_head.fd_write_num = 0; - restore_cmd->cmd_head.entry_num = 0; - restore_cmd->cmd_head.gpu_num = 0; - - restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); - if (!restore_cmd->gpu_ids) - return -ENOMEM; - restore_cmd->fds_write = xzalloc(num * sizeof(int)); - if (!restore_cmd->fds_write) - return -ENOMEM; - restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); - if (!restore_cmd->entries) - return -ENOMEM; - return 0; -} - -void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) -{ - if (restore_cmd->gpu_ids) - xfree(restore_cmd->gpu_ids); - if (restore_cmd->fds_write) - xfree(restore_cmd->fds_write); - if (restore_cmd->entries) - xfree(restore_cmd->entries); -} - -static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) -{ - restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); - if (!restore_cmd->gpu_ids) - return -ENOMEM; - restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); - if (!restore_cmd->fds_write) - return -ENOMEM; - restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); - if (!restore_cmd->entries) - return -ENOMEM; - return 0; -} - -static int check_quit_cmd(parallel_restore_cmd *restore_cmd) -{ - return restore_cmd->cmd_head.fd_write_num == 0; -} - -static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { - pr_perror("Recv parallel restore command head fail"); - return -1; - } - return 0; -} - -static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { - pr_perror("Recv parallel restore command fail"); - return -1; - } - return 0; -} - -static int 
recv_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { - pr_perror("Send GPU ids of parallel restore command fail"); - return -1; - } - return 0; -} - -static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { - pr_perror("Recv dmabuf fds fail"); - return -1; - } - return 0; -} - -int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) -{ - int sock_fd, client_fd; - int ret = 0; - - sock_fd = fdstore_get(parallel_socket_id); - if (sock_fd < 0) - return -1; - - client_fd = accept(sock_fd, NULL, NULL); - if (client_fd < 0) { - ret = client_fd; - goto err_accept; - } - - ret = recv_metadata(client_fd, restore_cmd); - if (ret) { - goto err; - } - - // Return 1 to quit - if (check_quit_cmd(restore_cmd)) { - ret = 1; - goto err; - } - - ret = init_parallel_restore_cmd_by_head(restore_cmd); - if (ret) { - goto err; - } - - ret = recv_gpu_ids(client_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = recv_cmds(client_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = recv_dmabuf_fds(client_fd, restore_cmd); - -err: - close(client_fd); -err_accept: - close(sock_fd); - return ret; -} - -int close_parallel_restore_server(void) -{ - int sock_fd; - int ret = 0; - parallel_restore_cmd_head cmd_head; - - sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (sock_fd < 0) { - pr_perror("Socket creation failed"); - return -1; - } - - ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); - if (ret < 0) { - pr_perror("Connect failed"); - goto err; - } - - memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); - if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { - pr_perror("Send parallel restore command head fail"); - return -1; - } - -err: - close(sock_fd); -
return ret; -} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h deleted file mode 100644 index d7200c6bd..000000000 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ -#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ - -typedef struct { - int id; - int fd_write_num; /* The number of buffer objects to be restored. */ - int entry_num; /* The number of restore commands.*/ - int gpu_num; -} parallel_restore_cmd_head; - -typedef struct { - int gpu_id; - int minor; -} parallel_gpu_info; - -typedef struct { - int gpu_id; - int write_id; - uint64_t read_offset; - uint64_t write_offset; - uint64_t size; -} parallel_restore_entry; - -typedef struct { - parallel_restore_cmd_head cmd_head; - int *fds_write; - parallel_gpu_info *gpu_ids; - parallel_restore_entry *entries; -} parallel_restore_cmd; - -/* - * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU - * buffer object. However, initially, the ownership of these buffer objects and the metadata for - * restoration are all with the target process. Therefore, we introduce a series of functions to - * help the target process send these tasks to the main CRIU process. 
- */ -int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); - -void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); - -int install_parallel_sock(void); - -int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); - -int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); - -void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, - parallel_restore_cmd *restore_cmd); - -void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); - -int close_parallel_restore_server(void); - -#endif \ No newline at end of file From a525b3c32ea0a4b8bff66ad31941fc574914d18d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 9 Nov 2025 20:26:50 -0800 Subject: [PATCH 233/257] test/vdso-proxy: handle merged vma-s When we compare two lists of vma-s, we need to take into account that some of them could be merged. Fixes #12286 Signed-off-by: Andrei Vagin --- test/zdtm/static/vdso-proxy.c | 51 +++++++++++++++-------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/test/zdtm/static/vdso-proxy.c b/test/zdtm/static/vdso-proxy.c index 43334974f..a53e6cdc0 100644 --- a/test/zdtm/static/vdso-proxy.c +++ b/test/zdtm/static/vdso-proxy.c @@ -70,6 +70,7 @@ static int parse_maps(struct vm_area *vmas) #endif v->is_vvar_or_vdso |= strstr(buf, "[vdso]") != NULL; v->is_vvar_or_vdso |= strstr(buf, "[vvar]") != NULL; + v->is_vvar_or_vdso |= strstr(buf, "[vvar_vclock]") != NULL; test_msg("[NOTE]\tVMA: [%#" PRIx64 ", %#" PRIx64 "]\n", v->start, v->end); } @@ -86,42 +87,35 @@ static int parse_maps(struct vm_area *vmas) return i; } -int compare_vmas(struct vm_area *vmax, struct vm_area *vmay) -{ - if (vmax->start > vmay->start) - return 1; - if (vmax->start < vmay->start) - return -1; - if (vmax->end > vmay->end) - return 1; - if (vmax->end < vmay->end) - return -1; - - return 0; -} - -static int check_vvar_vdso(struct vm_area *before, struct
vm_area *after) +static int check_vvar_vdso(struct vm_area *before, int nr_before, struct vm_area *after, int nr_after) { int i, j = 0; - for (i = 0; i < MAX_VMAS && j < MAX_VMAS; i++, j++) { - int cmp = compare_vmas(&before[i], &after[j]); - - if (cmp == 0) - continue; - - if (cmp < 0) { /* Lost mapping */ + for (i = 0, j = 0; i < nr_before || j < nr_after;) { + if (j == nr_after || before[i].start < after[j].start) { test_msg("[NOTE]\tLost mapping: %#" PRIx64 "-%#" PRIx64 "\n", before[i].start, before[i].end); - j--; if (before[i].is_vvar_or_vdso) { fail("Lost vvar/vdso mapping"); return -1; } + i++; continue; } - - test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end); - i--; + if (i == nr_before || before[i].start > after[j].start) { + test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end); + j++; + continue; + } + if (before[i].end == after[j].end) { + i++; + j++; + } else if (before[i].end > after[j].end) { + before[i].start = after[j].end; + j++; + } else { + after[j].start = before[i].end; + i++; + } } return 0; @@ -129,11 +123,10 @@ static int check_vvar_vdso(struct vm_area *before, struct vm_area *after) static struct vm_area vmas_before[MAX_VMAS]; static struct vm_area vmas_after[MAX_VMAS]; +static int nr_before, nr_after; int main(int argc, char *argv[]) { - int nr_before, nr_after; - test_init(argc, argv); test_msg("[NOTE]\tMappings before:\n"); @@ -154,7 +147,7 @@ int main(int argc, char *argv[]) } /* After restore vDSO/VVAR blobs must remain in the old place. 
*/ - if (check_vvar_vdso(vmas_before, vmas_after)) + if (check_vvar_vdso(vmas_before, nr_before, vmas_after, nr_after)) return -1; if (nr_before + 2 < nr_after) { From 6344e8d71c57d44600fca0c34ec64827688c737d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 11 Nov 2025 22:10:36 +0000 Subject: [PATCH 234/257] cr-service: move kerndat_init after log_init kerndat_init() can generate a significant volume of logs. If called before log_init(), all these messages will be saved in the early_log_buffer, which has a limited capacity. Additionally, saving to the early_log_buffer can introduce a performance penalty, especially when verbose mode is not enabled. Signed-off-by: Radostin Stoyanov Signed-off-by: Andrei Vagin --- criu/cr-service.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index b4718dde2..dccf4ef38 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -439,12 +439,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_unprivileged) opts.unprivileged = req->unprivileged; - if (check_caps()) - return 1; - - if (kerndat_init()) - return 1; - if (log_keep_err()) { pr_perror("Can't tune log"); goto err; @@ -738,9 +732,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } } - if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk)) - goto err; - if (req->orphan_pts_master) opts.orphan_pts_master = true; @@ -817,6 +808,16 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (setup_logging_from_req(req, output_changed_by_rpc_conf)) goto err; + if (check_caps()) + goto err; + + if (kerndat_init()) + goto err; + + /* init_pidfd_store_sk must be called after kerndat_init.
*/ + if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk)) + goto err; + if (req->mntns_compat_mode) opts.mntns_compat_mode = true; From e689d902b3d5dabcad8107c00a463a607ef49ebb Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 11 Nov 2025 15:21:09 -0800 Subject: [PATCH 235/257] criu/log: properly handle truncated length from vsnprintf vsnprintf does not always return the number of bytes actually written to the buffer. If the output was truncated due to the buffer limit, the return value is the total number of bytes which WOULD have been written to the final string if enough space had been available. This means we must cap the return value to the buffer size excluding the terminating null byte to correctly calculate the log entry size. Signed-off-by: Andrei Vagin --- criu/log.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/criu/log.c b/criu/log.c index a02a8df20..fe7077702 100644 --- a/criu/log.c +++ b/criu/log.c @@ -202,7 +202,7 @@ void flush_early_log_buffer(int fd) } pos += hdr->len; } - if (early_log_buf_off == EARLY_LOG_BUF_LEN) + if ((early_log_buf_off + sizeof(struct early_log_hdr)) >= EARLY_LOG_BUF_LEN) pr_warn("The early log buffer is full, some messages may have been lost\n"); early_log_buf_off = 0; } @@ -320,7 +320,7 @@ unsigned int log_get_loglevel(void) static void early_vprint(const char *format, unsigned int loglevel, va_list params) { - unsigned int log_size = 0; + int log_size = 0, log_space; struct early_log_hdr *hdr; if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN) @@ -332,6 +332,7 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para hdr->level = loglevel; /* Skip the log entry size */ early_log_buf_off += sizeof(hdr); + log_space = EARLY_LOG_BUF_LEN - early_log_buf_off; if (loglevel >= LOG_TIMESTAMP) { /* * If logging is not yet setup we just write zeros @@ -339,12 +340,17 @@ static void early_vprint(const char *format, unsigned int 
loglevel, va_list para * keep the same format as the other messages on * log levels with timestamps (>=LOG_TIMESTAMP). */ - log_size = snprintf(early_log_buffer + early_log_buf_off, sizeof(early_log_buffer) - early_log_buf_off, + log_size = snprintf(early_log_buffer + early_log_buf_off, log_space, "(00.000000) "); } - log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, - sizeof(early_log_buffer) - early_log_buf_off - log_size, format, params); + if (log_size < log_space) + log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, + log_space - log_size, format, params); + if (log_size > log_space) { + /* vsnprintf always add the terminating null byte. */ + log_size = log_space - 1; + } /* Save log entry size */ hdr->len = log_size; From 0a7e7d09dd91354277e697495bd8fb05626987a9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 12 Nov 2025 05:50:23 +0000 Subject: [PATCH 236/257] log: use sizeof(*hdr) instead of sizeof(hdr) Using sizeof(hdr) where hdr is a pointer gives the size of the pointer, not the size of the structure it points to. Reported-by: Kir Kolyshkin Signed-off-by: Andrei Vagin --- criu/log.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/criu/log.c b/criu/log.c index fe7077702..bf6f657f2 100644 --- a/criu/log.c +++ b/criu/log.c @@ -190,7 +190,7 @@ void flush_early_log_buffer(int fd) * with reading the log_level. 
*/ struct early_log_hdr *hdr = (void *)early_log_buffer + pos; - pos += sizeof(hdr); + pos += sizeof(*hdr); if (hdr->level <= current_loglevel) { size_t size = 0; while (size < hdr->len) { @@ -323,7 +323,7 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para int log_size = 0, log_space; struct early_log_hdr *hdr; - if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN) + if ((early_log_buf_off + sizeof(*hdr)) >= EARLY_LOG_BUF_LEN) return; /* Save loglevel */ @@ -331,7 +331,7 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para hdr = (void *)early_log_buffer + early_log_buf_off; hdr->level = loglevel; /* Skip the log entry size */ - early_log_buf_off += sizeof(hdr); + early_log_buf_off += sizeof(*hdr); log_space = EARLY_LOG_BUF_LEN - early_log_buf_off; if (loglevel >= LOG_TIMESTAMP) { /* From 3c7d4fa013297b431da48eff821db7f2e8b90c27 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 8 Nov 2025 06:53:19 +0000 Subject: [PATCH 237/257] criu: Version 4.2 (CRIUTIBILITY) Major changes: * plugins/amdgpu: Implement parallel restore * Handle processes with uprobes vma * Fix: getsockopt usage for SO_PASSCRED/SO_PASSSEC on Linux 6.16 * Relax ELF magic check to support MIPS libraries * pagemap: prevent integer overflow in pagemap_len This release's name is a nod to the growing challenge we face in maintaining compatibility across the rapidly evolving Linux kernel ecosystem. The full changelog can be found here: https://criu.org/Download/criu/4.2. Signed-off-by: Andrei Vagin --- Makefile.versions | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.versions b/Makefile.versions index 0b1a46a16..3e6c9ed22 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. 
CRIU_VERSION_MAJOR := 4 -CRIU_VERSION_MINOR := 1 -CRIU_VERSION_SUBLEVEL := 1 +CRIU_VERSION_MINOR := 2 +CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := CRISCV +CRIU_VERSION_NAME := CRIUTIBILITY CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL From ddbb3dbd8d84a785ad211be42d2ba0d034c0291f Mon Sep 17 00:00:00 2001 From: Pengda Yang Date: Wed, 15 Mar 2023 16:58:31 +0800 Subject: [PATCH 238/257] limit the field width of 'scanf' Fixes: #2121 Signed-off-by: Pengda Yang --- criu/proc_parse.c | 6 +++--- test/zdtm/lib/fs.c | 2 +- test/zdtm/static/apparmor.c | 2 +- test/zdtm/static/apparmor_stacking.c | 2 +- test/zdtm/static/cgroup01.c | 2 +- test/zdtm/static/cgroup02.c | 2 +- test/zdtm/static/change_mnt_context.c | 2 +- test/zdtm/static/file_locks01.c | 2 +- test/zdtm/static/file_locks02.c | 2 +- test/zdtm/static/file_locks03.c | 2 +- test/zdtm/static/file_locks04.c | 2 +- test/zdtm/static/netns-dev.c | 2 +- test/zdtm/static/ofd_file_locks.c | 2 +- 13 files changed, 15 insertions(+), 15 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 0d3b5b23f..f51f2e801 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1477,7 +1477,7 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) goto err; new->mountpoint[0] = '.'; - ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, + ret = sscanf(str, "%i %i %u:%u %ms %4094s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; @@ -2302,10 +2302,10 @@ static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { - num = sscanf(buf, "%lld: -> %s %s %s 
%d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { - num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index bf8cd9cd3..efcc7a1d0 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -54,7 +54,7 @@ mnt_info_t *get_cwd_mnt_info(void) while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); - ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); + ret = sscanf(str, "%i %i %u:%u %4095s %4095s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index 713ffaa46..dc1636821 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -59,7 +59,7 @@ int checkprofile(void) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/apparmor_stacking.c b/test/zdtm/static/apparmor_stacking.c index 76de8b8b4..0bc36048c 100644 --- a/test/zdtm/static/apparmor_stacking.c +++ b/test/zdtm/static/apparmor_stacking.c @@ -56,7 +56,7 @@ static int checkprofile(pid_t pid, char *expected) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/cgroup01.c b/test/zdtm/static/cgroup01.c index bc8515264..7bfb67762 
100644 --- a/test/zdtm/static/cgroup01.c +++ b/test/zdtm/static/cgroup01.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) if (!s) continue; - sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(paux, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { diff --git a/test/zdtm/static/cgroup02.c b/test/zdtm/static/cgroup02.c index 6229a8a08..8a925c0a4 100644 --- a/test/zdtm/static/cgroup02.c +++ b/test/zdtm/static/cgroup02.c @@ -75,7 +75,7 @@ bool test_exists(char *mountinfo_line, char *path) char aux[1024], paux[1024]; struct stat st; - sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); ssprintf(paux, "%s/%s", aux, path); diff --git a/test/zdtm/static/change_mnt_context.c b/test/zdtm/static/change_mnt_context.c index 6d436014b..8787ae5cf 100644 --- a/test/zdtm/static/change_mnt_context.c +++ b/test/zdtm/static/change_mnt_context.c @@ -46,7 +46,7 @@ int main(int argc, char **argv) if (!pos) continue; - result = sscanf(pos, " - %*s %*s %s", opts); + result = sscanf(pos, " - %*s %*s %1023s", opts); if (result != 1) { fail("Not able to sscanf line from mountinfo"); goto out; diff --git a/test/zdtm/static/file_locks01.c b/test/zdtm/static/file_locks01.c index beea171f5..bfdca51d9 100644 --- a/test/zdtm/static/file_locks01.c +++ b/test/zdtm/static/file_locks01.c @@ -107,7 +107,7 @@ static int check_file_lock(int fd, char *expected_type, char *expected_option, u memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no); if (num < 7) { pr_err("Invalid lock info\n"); diff --git a/test/zdtm/static/file_locks02.c b/test/zdtm/static/file_locks02.c index d2049ebaa..ae4827de9 
100644 --- a/test/zdtm/static/file_locks02.c +++ b/test/zdtm/static/file_locks02.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks03.c b/test/zdtm/static/file_locks03.c index 35ef41a21..228e66892 100644 --- a/test/zdtm/static/file_locks03.c +++ b/test/zdtm/static/file_locks03.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks04.c b/test/zdtm/static/file_locks04.c index 11d224fa7..7e0d2654e 100644 --- a/test/zdtm/static/file_locks04.c +++ b/test/zdtm/static/file_locks04.c @@ -34,7 +34,7 @@ static int check_file_locks(pid_t child_pid, int fd, int child_fd) continue; test_msg("c: %s", buf); - num = sscanf(buf, "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index 1e6ee1dea..f268f2fec 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -414,7 +414,7 @@ static int check_stable_secret(struct test_conf *tc) return -1; } - ret = fscanf(fp, "%s", val); + ret = fscanf(fp, "%200s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); diff 
--git a/test/zdtm/static/ofd_file_locks.c b/test/zdtm/static/ofd_file_locks.c index 68b6f22f5..a68fa38ee 100644 --- a/test/zdtm/static/ofd_file_locks.c +++ b/test/zdtm/static/ofd_file_locks.c @@ -16,7 +16,7 @@ static int parse_ofd_lock(char *buf, struct flock *lck) if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't lock, skip record */ - num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, &start, fl_end); + num = sscanf(buf, "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); From 63861407544172a04c8b03d3387ea6a8b23d9be2 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:36:33 +0000 Subject: [PATCH 239/257] plugins/amdgpu: Add socket operations When enabling parallel restore, the target process and the main CRIU process need an IPC interface to communicate and transfer restore commands. This patch adds a Unix domain TCP socket and stores this socket in `fdstore`. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 59 ++++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 6 +++ 2 files changed, 65 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_socket_utils.c create mode 100644 plugins/amdgpu/amdgpu_socket_utils.h diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c new file mode 100644 index 000000000..9e957ae54 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include + +#include "amdgpu_socket_utils.h" +#include "criu-log.h" +#include "common/scm.h" +#include "fdstore.h" +#include "util-pie.h" +#include "util.h" + +int parallel_socket_addr_len; +struct sockaddr_un parallel_socket_addr; +int parallel_socket_id = 0; + +static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); + *len = SUN_LEN(addr); + *addr->sun_path = '\0'; +} + +int install_parallel_sock(void) +{ + int ret = 0; + int sock_fd; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("socket creation failed"); + return -1; + } + + amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); + ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("bind failed"); + goto err; + } + + ret = listen(sock_fd, SOMAXCONN); + if (ret < 0) { + pr_perror("listen failed"); + goto err; + } + + parallel_socket_id = fdstore_add(sock_fd); + if (parallel_socket_id < 0) { + ret = -1; + goto err; + } +err: + close(sock_fd); + return ret; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h new file mode 100644 index 000000000..4e7aa2aa4 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -0,0 +1,6 @@ +#ifndef
__KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ + +int install_parallel_sock(void); + +#endif \ No newline at end of file From 33ed774c8dd13fc48955557434bad9908031379e Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:38:48 +0000 Subject: [PATCH 240/257] plugins/amdgpu: Add parallel restore command Currently the restore of buffer objects consumes a significant amount of time. However, this part has no logical dependencies with other restore operations. This patch introduces some structures and some helper functions for the target process to offload this task to the main CRIU process. Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 261 +++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 48 +++++ 2 files changed, 309 insertions(+) diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c index 9e957ae54..c8bf6d1ba 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.c +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "amdgpu_socket_utils.h" #include "criu-log.h" @@ -53,6 +54,266 @@ int install_parallel_sock(void) ret = -1; goto err; } +err: + close(sock_fd); + return ret; +} + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd) +{ + parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; + restore_entry->gpu_id = gpu_id; + restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; + restore_entry->write_offset = 0; + restore_entry->read_offset = offset; + restore_entry->size = size; + + restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; + + restore_cmd->cmd_head.entry_num += 1; + restore_cmd->cmd_head.fd_write_num += 1; +} + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd) +{ +
restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; + restore_cmd->cmd_head.gpu_num += 1; +} + +static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + return 0; +} + +static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Send parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Send dmabuf fds fail"); + return -1; + } + return 0; +} + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd; + int ret = 0; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + ret = send_metadata(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_gpu_ids(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_cmds(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_dmabuf_fds(sock_fd, restore_cmd); + +err: + close(sock_fd); + return ret; +} + +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd
*restore_cmd) +{ + restore_cmd->cmd_head.id = id; + restore_cmd->cmd_head.fd_write_num = 0; + restore_cmd->cmd_head.entry_num = 0; + restore_cmd->cmd_head.gpu_num = 0; + + restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + if (restore_cmd->gpu_ids) + xfree(restore_cmd->gpu_ids); + if (restore_cmd->fds_write) + xfree(restore_cmd->fds_write); + if (restore_cmd->entries) + xfree(restore_cmd->entries); +} + +static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +static int check_quit_cmd(parallel_restore_cmd *restore_cmd) +{ + return restore_cmd->cmd_head.fd_write_num == 0; +} + +static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Recv parallel restore command head fail"); + return -1; + } + return 0; +} + +static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Recv parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_gpu_ids(int 
sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Recv dmabuf fds fail"); + return -1; + } + return 0; +} + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd, client_fd; + int ret = 0; + + sock_fd = fdstore_get(parallel_socket_id); + if (sock_fd < 0) + return -1; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) { + ret = client_fd; + goto err_accept; + } + + ret = recv_metadata(client_fd, restore_cmd); + if (ret) { + goto err; + } + + // Return 1 to quit + if (check_quit_cmd(restore_cmd)) { + ret = 1; + goto err; + } + + ret = init_parallel_restore_cmd_by_head(restore_cmd); + if (ret) { + goto err; + } + + ret = recv_gpu_ids(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_cmds(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_dmabuf_fds(client_fd, restore_cmd); + +err: + close(client_fd); +err_accept: + close(sock_fd); + return ret; +} + +int close_parallel_restore_server(void) +{ + int sock_fd; + int ret = 0; + parallel_restore_cmd_head cmd_head; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); + if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + err: close(sock_fd); return ret; diff --git
a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h index 4e7aa2aa4..d7200c6bd 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -1,6 +1,54 @@ #ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ #define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +typedef struct { + int id; + int fd_write_num; /* The number of buffer objects to be restored. */ + int entry_num; /* The number of restore commands.*/ + int gpu_num; +} parallel_restore_cmd_head; + +typedef struct { + int gpu_id; + int minor; +} parallel_gpu_info; + +typedef struct { + int gpu_id; + int write_id; + uint64_t read_offset; + uint64_t write_offset; + uint64_t size; +} parallel_restore_entry; + +typedef struct { + parallel_restore_cmd_head cmd_head; + int *fds_write; + parallel_gpu_info *gpu_ids; + parallel_restore_entry *entries; +} parallel_restore_cmd; + +/* + * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU + * buffer object. However, initially, the ownership of these buffer objects and the metadata for + * restoration are all with the target process. Therefore, we introduce a series of functions to + * help the target process send these tasks to the main CRIU process. 
+ */ +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + int install_parallel_sock(void); +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd); + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); + +int close_parallel_restore_server(void); + #endif \ No newline at end of file From 4a3a695dfb9da7338174549b0cadcc4279cbf51a Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Wed, 15 Jan 2025 06:38:27 +0000 Subject: [PATCH 241/257] plugins/amdgpu: Implement parallel restore This patch implements the entire logic to enable the offloading of buffer object content restoration. The goal of this patch is to offload the buffer object content restoration to the main CRIU process so that this restoration can occur in parallel with other restoration logic (mainly the restoration of memory state in the restore blob, which is time-consuming) to speed up the restore phase. The restoration of buffer object content usually takes a significant amount of time for GPU applications, so parallelizing it with other operations can reduce the overall restore time. It has three parts: the first replaces the restoration of buffer objects in the target process by sending a parallel restore command to the main CRIU process; the second implements the POST_FORKING hook in the amdgpu plugin to enable buffer object content restoration in the main CRIU process; the third stops the parallel thread in the RESUME_DEVICES_LATE hook. This optimization only focuses on the single-process situation (common case). In other scenarios, it will turn to the original method. This is achieved with the new `parallel_disabled` flag. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 420 +++++++++++++++++++++--- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- plugins/amdgpu/amdgpu_plugin_topology.h | 1 + 4 files changed, 374 insertions(+), 51 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 3d55f8bb4..870a039cd 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 96c086162..69194fbc7 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,11 +28,13 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" +#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +/* + * In the case of a single process (common case), this optimization can effectively + * reduce the restore latency with parallel restore. In the case of multiple processes, + * states are already restored in parallel within different processes. Therefore, this + * optimization does not introduce further improvement and will be disabled by default + * in this case. The flag, parallel_disabled, is used to control whether the + * optimization is enabled or disabled. 
+ */ +bool parallel_disabled = false; + +pthread_t parallel_thread = 0; +int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (has_children(root_item)) { + pr_info("Parallel restore disabled\n"); + parallel_disabled = true; + } else { + if (install_parallel_sock() < 0) { + pr_err("Failed to install parallel socket\n"); + return -1; + } + } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas; + struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } + int offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + if (!parallel_disabled) { + parallel_restore_cmd restore_cmd; + pr_info("Begin to send parallel restore cmd\n"); + ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); + if (ret) + goto exit_parallel; - if (!e->device_entries[i]->gpu_id) - continue; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + uint32_t target_gpu_id; + struct tp_node *dev; - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, 
e->device_entries[i]->gpu_id); + if (!e->device_entries[i]->gpu_id) + continue; - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit_parallel; + } + parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); + + for (int j = 0; j < e->num_of_bos; j++) { + if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) + continue; + if (bo_buckets[j].alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, + bo_buckets[j].size, offset, &restore_cmd); + offset += bo_buckets[j].size; + } + } + } + ret = send_parallel_restore_cmd(&restore_cmd); +exit_parallel: + free_parallel_restore_cmd(&restore_cmd); + } else { + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; goto exit; } - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; + if (!e->device_entries[i]->gpu_id) + continue; + + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is 
actual_gpu_id */ + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit; + } + + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; + + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; + } + + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; } - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; - } - thread_i++; - } + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; + } } } exit: @@ -1546,8 +1609,8 @@ exit: if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) close(bo_buckets[i].dmabuf_fd); } - - xfree(thread_datas); + if 
(thread_datas) + xfree(thread_datas); return ret; } @@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; + if (!parallel_disabled) { + pr_info("Close parallel restore server\n"); + if (close_parallel_restore_server()) { + pr_err("Close parallel restore server fail\n"); + return -1; + } + + exit_code = pthread_join(parallel_thread, NULL); + if (exit_code) { + pr_err("Failed to join parallel thread ret:%d\n", exit_code); + return -1; + } + if (parallel_thread_result) { + pr_err("Parallel restore fail\n"); + return parallel_thread_result; + } + } + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) + +int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +{ + return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); +} + +int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) +{ + int ret = 0; + int drm_fd = -1; + uint32_t major, minor; + + struct amdgpu_gpu_info gpu_info = { 0 }; + + drm_fd = open_drm_render_device(dev_minor); + if (drm_fd < 0) { + return drm_fd; + } + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); + if (ret) { + pr_perror("Failed to initialize device"); + goto err; + } + + ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto err; + } + *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + return 0; +err: + amdgpu_device_deinitialize(*h_dev); + return ret; +} + +FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) +{ + char img_path[PATH_MAX]; + size_t image_size = 0; + FILE *bo_contents_fp = NULL; + + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); + bo_contents_fp = open_img_file(img_path, false, &image_size); + if (!bo_contents_fp) { + pr_perror("Cannot fopen %s", img_path); + return NULL; + } + + if (tot_size != image_size) { + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); + fclose(bo_contents_fp); + return NULL; + } + return bo_contents_fp; +} + +struct parallel_thread_data { + pthread_t thread; + uint32_t gpu_id; + int minor; + parallel_restore_cmd *restore_cmd; + int ret; +}; + +void *parallel_restore_bo_contents(void *_thread_data) +{ + struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; + FILE *bo_contents_fp = NULL; + parallel_restore_entry *entry; + parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; + int ret = 0; + int offset = 0; + void *buffer = NULL; + + ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); + if (ret) { + goto err; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { + total_bo_size += restore_cmd->entries[i].size; + max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); + } + } + + buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); + if (bo_contents_fp == NULL) { + ret = -1; + goto err_sdma; + } + offset = ftell(bo_contents_fp); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto err_sdma; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) + continue; + + entry = &restore_cmd->entries[i]; + fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + goto err_sdma; + } + } + +err_sdma: + if (bo_contents_fp) + fclose(bo_contents_fp); + if (buffer) + xfree(buffer); + amdgpu_device_deinitialize(h_dev); +err: + thread_data->ret = ret; + return NULL; +} + +void *restore_device_parallel_worker(void *arg) +{ + while (1) { + parallel_restore_cmd restore_cmd = { 0 }; + struct parallel_thread_data *thread_datas = NULL; + int ret; + int error_occurred = 0, join_ret = 0, created_threads = 0; + + ret = recv_parallel_restore_cmd(&restore_cmd); + if (ret) { + if (ret == 1) { + *(int *)arg = 0; + goto exit; + } + goto err; + } + + thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); + if (!thread_datas) { + ret = -ENOMEM; + goto err; + } + + for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { + thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; + thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; + thread_datas[created_threads].restore_cmd = &restore_cmd; + + ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, + (void *)&thread_datas[created_threads]); + if (ret) { + pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); + error_occurred = 1; + break; + } + } + + for (int i = 0; i < created_threads; i++) { + join_ret = pthread_join(thread_datas[i].thread, NULL); + if (join_ret != 0) { + pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", + thread_datas[i].gpu_id, join_ret); + if (!error_occurred) { + ret = join_ret; + error_occurred = 1; + } + } + + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + /* Check thread return value */ + if (thread_datas[i].ret && !error_occurred) { + ret = thread_datas[i].ret; + error_occurred = 1; + } + } + + if (thread_datas) + xfree(thread_datas); +err: + free_parallel_restore_cmd(&restore_cmd); + + if (ret) { + *(int *)arg = ret; + return NULL; + } + } +exit: + return NULL; +} + +/* + * While the background thread is running, some processing functions (e.g., stop_cgroupd) + * in the main thread need to block SIGCHLD. To prevent interference from this background + * thread, SIGCHLD is blocked in this thread. 
+ */ +static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) +{ + int ret = 0; + sigset_t blockmask, oldmask; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + ret = pthread_create(newthread, NULL, f, arg); + if (ret) { + pr_err("Create worker thread fail: %d\n", ret); + return -1; + } + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + return 0; +} + +int amdgpu_plugin_post_forking(void) +{ + if (plugin_disabled) + return -ENOTSUP; + + if (parallel_disabled) + return 0; + + return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 5b4396a0c..730f2e028 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -static int open_drm_render_device(int minor) +int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index c890e3dda..e19f8e7ce 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); +int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); From 920437205c4f5359e4c54765c9e23d2d57c2f4ec Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:44:35 +0000 Subject: [PATCH
242/257] plugins/amdgpu: Update `README.md` and `criu-amdgpu-plugin.txt` Signed-off-by: Yanning Yang --- Documentation/criu-amdgpu-plugin.txt | 1 + plugins/amdgpu/README.md | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 68803f3db..fe76fc3bc 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer +Parallel Restore DESCRIPTION ----------- diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 1078eafe6..b808fbc4f 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,7 +3,8 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _ +_David Yat Sin _
+_Yanning Yang _ # Introduction @@ -224,6 +225,26 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* +## Restoring BO content in parallel + +Restoring the BO content is an important part in the restore of GPU state and +usually takes a significant amount of time. A possible location for this +procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook +blocks the target process from performing other restore operations, which +hinders further optimization of the restore process. + +Therefore, a new plugin hook that runs in the master restore process is +introduced, and it interacts with the `cr_plugin_restore_file` hook to complete +the restore of BO content. Specifically, the target process only needs to send +the relevant BOs to the master restore process, while this new hook handles all +the restore of buffer objects. Through this method, during the restore of the BO +content, the target process can perform other restore operations, thus +accelerating the restore procedure. This is an implementation of the gCROP +method proposed in the ACM SoCC'24 paper: [On-demand and Parallel +Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). + +*This optimization technique is enabled by the `__POST_FORKING` hook.* + ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to From 7a4ee0ae8effdbf475804b72995912b0911ad28a Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 12 Feb 2025 09:26:21 -0500 Subject: [PATCH 243/257] restorer: Skip non-regular VMAs amdgpu represents allocated device memory as a memory mapping of the device file. This is a non-standard VMA that must be handled by the plugin, not the normal VMA code. Ignore all VMAs on device files. 
Signed-off-by: David Francis --- criu/pie/restorer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 5c40b0e93..008e1398d 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1989,6 +1989,9 @@ __visible long __export_restore_task(struct task_restore_args *args) for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { if (vma_entry->madv & (1ul << m)) { + if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR))) + continue; + ret = sys_madvise(vma_entry->start, vma_entry_len(vma_entry), m); if (ret) { pr_err("madvise(%" PRIx64 ", %" PRIu64 ", %ld) " From fb02dbf68582c6589724d7aa3bb06ce3d588cc71 Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 12 Feb 2025 09:45:37 -0500 Subject: [PATCH 244/257] files-ext: Allow plugin files to retry amdgpu dmabuf CRIU requires the ability of the amdgpu plugin to retry. Change files_ext.c to read a response of 1 from a plugin restore function to mean retry. Signed-off-by: David Francis --- criu/files-ext.c | 10 +++++++--- criu/include/criu-plugin.h | 2 +- plugins/amdgpu/amdgpu_plugin.c | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/criu/files-ext.c b/criu/files-ext.c index 95ec8e37c..4cc99d921 100644 --- a/criu/files-ext.c +++ b/criu/files-ext.c @@ -45,10 +45,11 @@ static int open_fd(struct file_desc *d, int *new_fd) { struct ext_file_info *xfi; int fd; + bool retry_needed; xfi = container_of(d, struct ext_file_info, d); - fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id); + fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id, &retry_needed); if (fd < 0) { pr_err("Unable to restore %#x\n", xfi->xfe->id); return -1; @@ -57,8 +58,11 @@ static int open_fd(struct file_desc *d, int *new_fd) if (restore_fown(fd, xfi->xfe->fown)) return -1; - *new_fd = fd; - return 0; + if (!retry_needed) + *new_fd = fd; + else + *new_fd = -1; + return retry_needed; } static struct file_desc_ops ext_desc_ops = { diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h 
index 9fb21a449..ee84ccdf6 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -70,7 +70,7 @@ enum { DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id, bool *retry_needed); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 69194fbc7..e3b4ead3f 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1614,7 +1614,7 @@ exit: return ret; } -int amdgpu_plugin_restore_file(int id) +int amdgpu_plugin_restore_file(int id, bool *retry_needed) { int ret = 0, fd; char img_path[PATH_MAX]; @@ -1625,6 +1625,8 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; + *retry_needed = false; + if (plugin_disabled) return -ENOTSUP; From 0b7ca29c1944a8021c22a8e7041f047facb52e48 Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 19 Feb 2025 14:30:07 -0500 Subject: [PATCH 245/257] plugin/amdgpu: Add amdgpu drm header For amdgpu plugin to call the new amdgpu drm CRIU ioctls, it needs the amdgpu drm header file, copied from the kernel's includes. 
Signed-off-by: David Francis --- plugins/amdgpu/amdgpu_drm.h | 1688 +++++++++++++++++++++++++++++++++++ 1 file changed, 1688 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_drm.h diff --git a/plugins/amdgpu/amdgpu_drm.h b/plugins/amdgpu/amdgpu_drm.h new file mode 100644 index 000000000..9cebd072a --- /dev/null +++ b/plugins/amdgpu/amdgpu_drm.h @@ -0,0 +1,1688 @@ +/* amdgpu_drm.h -- Public header for the amdgpu driver -*- linux-c -*- + * + * Copyright 2000 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Fremont, California. + * Copyright 2002 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Kevin E. 
Martin + * Gareth Hughes + * Keith Whitwell + */ + +#ifndef __AMDGPU_DRM_H__ +#define __AMDGPU_DRM_H__ + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_AMDGPU_GEM_CREATE 0x00 +#define DRM_AMDGPU_GEM_MMAP 0x01 +#define DRM_AMDGPU_CTX 0x02 +#define DRM_AMDGPU_BO_LIST 0x03 +#define DRM_AMDGPU_CS 0x04 +#define DRM_AMDGPU_INFO 0x05 +#define DRM_AMDGPU_GEM_METADATA 0x06 +#define DRM_AMDGPU_GEM_WAIT_IDLE 0x07 +#define DRM_AMDGPU_GEM_VA 0x08 +#define DRM_AMDGPU_WAIT_CS 0x09 +#define DRM_AMDGPU_GEM_OP 0x10 +#define DRM_AMDGPU_GEM_USERPTR 0x11 +#define DRM_AMDGPU_WAIT_FENCES 0x12 +#define DRM_AMDGPU_VM 0x13 +#define DRM_AMDGPU_FENCE_TO_HANDLE 0x14 +#define DRM_AMDGPU_SCHED 0x15 +#define DRM_AMDGPU_USERQ 0x16 +#define DRM_AMDGPU_USERQ_SIGNAL 0x17 +#define DRM_AMDGPU_USERQ_WAIT 0x18 +#define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 + +#define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) +#define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) +#define DRM_IOCTL_AMDGPU_CTX DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CTX, union drm_amdgpu_ctx) +#define DRM_IOCTL_AMDGPU_BO_LIST DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_BO_LIST, union drm_amdgpu_bo_list) +#define DRM_IOCTL_AMDGPU_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CS, union drm_amdgpu_cs) +#define DRM_IOCTL_AMDGPU_INFO DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_INFO, struct drm_amdgpu_info) +#define DRM_IOCTL_AMDGPU_GEM_METADATA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_METADATA, struct drm_amdgpu_gem_metadata) +#define DRM_IOCTL_AMDGPU_GEM_WAIT_IDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_WAIT_IDLE, union drm_amdgpu_gem_wait_idle) +#define DRM_IOCTL_AMDGPU_GEM_VA DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_VA, struct drm_amdgpu_gem_va) +#define DRM_IOCTL_AMDGPU_WAIT_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_CS, union drm_amdgpu_wait_cs) +#define DRM_IOCTL_AMDGPU_GEM_OP 
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_OP, struct drm_amdgpu_gem_op) +#define DRM_IOCTL_AMDGPU_GEM_USERPTR DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_USERPTR, struct drm_amdgpu_gem_userptr) +#define DRM_IOCTL_AMDGPU_WAIT_FENCES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_FENCES, union drm_amdgpu_wait_fences) +#define DRM_IOCTL_AMDGPU_VM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_VM, union drm_amdgpu_vm) +#define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle) +#define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched) +#define DRM_IOCTL_AMDGPU_USERQ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq) +#define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) +#define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) +#define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) + +/** + * DOC: memory domains + * + * %AMDGPU_GEM_DOMAIN_CPU System memory that is not GPU accessible. + * Memory in this pool could be swapped out to disk if there is pressure. + * + * %AMDGPU_GEM_DOMAIN_GTT GPU accessible system memory, mapped into the + * GPU's virtual address space via gart. Gart memory linearizes non-contiguous + * pages of system memory, allows GPU access system memory in a linearized + * fashion. + * + * %AMDGPU_GEM_DOMAIN_VRAM Local video memory. For APUs, it is memory + * carved out by the BIOS. + * + * %AMDGPU_GEM_DOMAIN_GDS Global on-chip data storage used to share data + * across shader threads. + * + * %AMDGPU_GEM_DOMAIN_GWS Global wave sync, used to synchronize the + * execution of all the waves on a device. + * + * %AMDGPU_GEM_DOMAIN_OA Ordered append, used by 3D or Compute engines + * for appending data. 
+ * + * %AMDGPU_GEM_DOMAIN_DOORBELL Doorbell. It is an MMIO region for + * signalling user mode queues. + */ +#define AMDGPU_GEM_DOMAIN_CPU 0x1 +#define AMDGPU_GEM_DOMAIN_GTT 0x2 +#define AMDGPU_GEM_DOMAIN_VRAM 0x4 +#define AMDGPU_GEM_DOMAIN_GDS 0x8 +#define AMDGPU_GEM_DOMAIN_GWS 0x10 +#define AMDGPU_GEM_DOMAIN_OA 0x20 +#define AMDGPU_GEM_DOMAIN_DOORBELL 0x40 +#define AMDGPU_GEM_DOMAIN_MASK (AMDGPU_GEM_DOMAIN_CPU | \ + AMDGPU_GEM_DOMAIN_GTT | \ + AMDGPU_GEM_DOMAIN_VRAM | \ + AMDGPU_GEM_DOMAIN_GDS | \ + AMDGPU_GEM_DOMAIN_GWS | \ + AMDGPU_GEM_DOMAIN_OA | \ + AMDGPU_GEM_DOMAIN_DOORBELL) + +/* Flag that CPU access will be required for the case of VRAM domain */ +#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) +/* Flag that CPU access will not work, this VRAM domain is invisible */ +#define AMDGPU_GEM_CREATE_NO_CPU_ACCESS (1 << 1) +/* Flag that USWC attributes should be used for GTT */ +#define AMDGPU_GEM_CREATE_CPU_GTT_USWC (1 << 2) +/* Flag that the memory should be in VRAM and cleared */ +#define AMDGPU_GEM_CREATE_VRAM_CLEARED (1 << 3) +/* Flag that allocating the BO should use linear VRAM */ +#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS (1 << 5) +/* Flag that BO is always valid in this VM */ +#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) +/* Flag that BO sharing will be explicitly synchronized */ +#define AMDGPU_GEM_CREATE_EXPLICIT_SYNC (1 << 7) +/* Flag that indicates allocating MQD gart on GFX9, where the mtype + * for the second page onward should be set to NC. It should never + * be used by user space applications. 
+ */ +#define AMDGPU_GEM_CREATE_CP_MQD_GFX9 (1 << 8) +/* Flag that BO may contain sensitive data that must be wiped before + * releasing the memory + */ +#define AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE (1 << 9) +/* Flag that BO will be encrypted and that the TMZ bit should be + * set in the PTEs when mapping this buffer via GPUVM or + * accessing it with various hw blocks + */ +#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10) +/* Flag that BO will be used only in preemptible context, which does + * not require GTT memory accounting + */ +#define AMDGPU_GEM_CREATE_PREEMPTIBLE (1 << 11) +/* Flag that BO can be discarded under memory pressure without keeping the + * content. + */ +#define AMDGPU_GEM_CREATE_DISCARDABLE (1 << 12) +/* Flag that BO is shared coherently between multiple devices or CPU threads. + * May depend on GPU instructions to flush caches to system scope explicitly. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_COHERENT (1 << 13) +/* Flag that BO should not be cached by GPU. Coherent without having to flush + * GPU caches explicitly + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) +/* Flag that BO should be coherent across devices when using device-level + * atomics. May depend on GPU instructions to flush caches to device scope + * explicitly, promoting them to system scope automatically. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) +/* Set PTE.D and recompress during GTT->VRAM moves according to TILING flags. 
*/ +#define AMDGPU_GEM_CREATE_GFX12_DCC (1 << 16) + +struct drm_amdgpu_gem_create_in { + /** the requested memory size */ + __u64 bo_size; + /** physical start_addr alignment in bytes for some HW requirements */ + __u64 alignment; + /** the requested memory domains */ + __u64 domains; + /** allocation flags */ + __u64 domain_flags; +}; + +struct drm_amdgpu_gem_create_out { + /** returned GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +union drm_amdgpu_gem_create { + struct drm_amdgpu_gem_create_in in; + struct drm_amdgpu_gem_create_out out; +}; + +/** Opcode to create new residency list. */ +#define AMDGPU_BO_LIST_OP_CREATE 0 +/** Opcode to destroy previously created residency list */ +#define AMDGPU_BO_LIST_OP_DESTROY 1 +/** Opcode to update resource information in the list */ +#define AMDGPU_BO_LIST_OP_UPDATE 2 + +struct drm_amdgpu_bo_list_in { + /** Type of operation */ + __u32 operation; + /** Handle of list or 0 if we want to create one */ + __u32 list_handle; + /** Number of BOs in list */ + __u32 bo_number; + /** Size of each element describing BO */ + __u32 bo_info_size; + /** Pointer to array describing BOs */ + __u64 bo_info_ptr; +}; + +struct drm_amdgpu_bo_list_entry { + /** Handle of BO */ + __u32 bo_handle; + /** New (if specified) BO priority to be used during migration */ + __u32 bo_priority; +}; + +struct drm_amdgpu_bo_list_out { + /** Handle of resource list */ + __u32 list_handle; + __u32 _pad; +}; + +union drm_amdgpu_bo_list { + struct drm_amdgpu_bo_list_in in; + struct drm_amdgpu_bo_list_out out; +}; + +/* context related */ +#define AMDGPU_CTX_OP_ALLOC_CTX 1 +#define AMDGPU_CTX_OP_FREE_CTX 2 +#define AMDGPU_CTX_OP_QUERY_STATE 3 +#define AMDGPU_CTX_OP_QUERY_STATE2 4 +#define AMDGPU_CTX_OP_GET_STABLE_PSTATE 5 +#define AMDGPU_CTX_OP_SET_STABLE_PSTATE 6 + +/* GPU reset status */ +#define AMDGPU_CTX_NO_RESET 0 +/* this the context caused it */ +#define AMDGPU_CTX_GUILTY_RESET 1 +/* some other context caused it */ +#define 
AMDGPU_CTX_INNOCENT_RESET 2 +/* unknown cause */ +#define AMDGPU_CTX_UNKNOWN_RESET 3 + +/* indicate gpu reset occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET (1<<0) +/* indicate vram lost occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1) +/* indicate some job from this context once cause gpu hang */ +#define AMDGPU_CTX_QUERY2_FLAGS_GUILTY (1<<2) +/* indicate some errors are detected by RAS */ +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_CE (1<<3) +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_UE (1<<4) +/* indicate that the reset hasn't completed yet */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS (1<<5) + +/* Context priority level */ +#define AMDGPU_CTX_PRIORITY_UNSET -2048 +#define AMDGPU_CTX_PRIORITY_VERY_LOW -1023 +#define AMDGPU_CTX_PRIORITY_LOW -512 +#define AMDGPU_CTX_PRIORITY_NORMAL 0 +/* + * When used in struct drm_amdgpu_ctx_in, a priority above NORMAL requires + * CAP_SYS_NICE or DRM_MASTER +*/ +#define AMDGPU_CTX_PRIORITY_HIGH 512 +#define AMDGPU_CTX_PRIORITY_VERY_HIGH 1023 + +/* select a stable profiling pstate for perfmon tools */ +#define AMDGPU_CTX_STABLE_PSTATE_FLAGS_MASK 0xf +#define AMDGPU_CTX_STABLE_PSTATE_NONE 0 +#define AMDGPU_CTX_STABLE_PSTATE_STANDARD 1 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK 2 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK 3 +#define AMDGPU_CTX_STABLE_PSTATE_PEAK 4 + +struct drm_amdgpu_ctx_in { + /** AMDGPU_CTX_OP_* */ + __u32 op; + /** Flags */ + __u32 flags; + __u32 ctx_id; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; +}; + +union drm_amdgpu_ctx_out { + struct { + __u32 ctx_id; + __u32 _pad; + } alloc; + + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** Number of resets caused by this context so far. */ + __u32 hangs; + /** Reset status since the last call of the ioctl. 
*/ + __u32 reset_status; + } state; + + struct { + __u32 flags; + __u32 _pad; + } pstate; +}; + +union drm_amdgpu_ctx { + struct drm_amdgpu_ctx_in in; + union drm_amdgpu_ctx_out out; +}; + +/* user queue IOCTL operations */ +#define AMDGPU_USERQ_OP_CREATE 1 +#define AMDGPU_USERQ_OP_FREE 2 + +/* queue priority levels */ +/* low < normal low < normal high < high */ +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK 0x3 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT 0 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_LOW 0 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_LOW 1 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_HIGH 2 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH 3 /* admin only */ +/* for queues that need access to protected content */ +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE (1 << 2) + +/* + * This structure is a container to pass input configuration + * info for all supported userqueue related operations. + * For operation AMDGPU_USERQ_OP_CREATE: user is expected + * to set all fields, except the parameter 'queue_id'. + * For operation AMDGPU_USERQ_OP_FREE: the only input parameter expected + * to be set is 'queue_id', everything else is ignored. + */ +struct drm_amdgpu_userq_in { + /** AMDGPU_USERQ_OP_* */ + __u32 op; + /** Queue id passed for operation USERQ_OP_FREE */ + __u32 queue_id; + /** the target GPU engine to execute workload (AMDGPU_HW_IP_*) */ + __u32 ip_type; + /** + * @doorbell_handle: the handle of doorbell GEM object + * associated with this userqueue client. + */ + __u32 doorbell_handle; + /** + * @doorbell_offset: 32-bit offset of the doorbell in the doorbell bo. + * Kernel will generate absolute doorbell offset using doorbell_handle + * and doorbell_offset in the doorbell bo. + */ + __u32 doorbell_offset; + /** + * @flags: flags used for queue parameters + */ + __u32 flags; + /** + * @queue_va: Virtual address of the GPU memory which holds the queue + * object.
The queue holds the workload packets. + */ + __u64 queue_va; + /** + * @queue_size: Size of the queue in bytes, this needs to be 256-byte + * aligned. + */ + __u64 queue_size; + /** + * @rptr_va : Virtual address of the GPU memory which holds the ring RPTR. + * This object must be at least 8 byte in size and aligned to 8-byte offset. + */ + __u64 rptr_va; + /** + * @wptr_va : Virtual address of the GPU memory which holds the ring WPTR. + * This object must be at least 8 byte in size and aligned to 8-byte offset. + * + * Queue, RPTR and WPTR can come from the same object, as long as the size + * and alignment related requirements are met. + */ + __u64 wptr_va; + /** + * @mqd: MQD (memory queue descriptor) is a set of parameters which allow + * the GPU to uniquely define and identify a usermode queue. + * + * MQD data can be of different size for different GPU IP/engine and + * their respective versions/revisions, so this points to a __u64 * + * which holds IP specific MQD of this usermode queue. + */ + __u64 mqd; + /** + * @size: size of MQD data in bytes, it must match the MQD structure + * size of the respective engine/revision defined in UAPI for ex, for + * gfx11 workloads, size = sizeof(drm_amdgpu_userq_mqd_gfx11). + */ + __u64 mqd_size; +}; + +/* The structure to carry output of userqueue ops */ +struct drm_amdgpu_userq_out { + /** + * For operation AMDGPU_USERQ_OP_CREATE: This field contains a unique + * queue ID to represent the newly created userqueue in the system, otherwise + * it should be ignored. + */ + __u32 queue_id; + __u32 _pad; +}; + +union drm_amdgpu_userq { + struct drm_amdgpu_userq_in in; + struct drm_amdgpu_userq_out out; +}; + +/* GFX V11 IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_gfx11 { + /** + * @shadow_va: Virtual address of the GPU memory to hold the shadow buffer. + * Use AMDGPU_INFO_IOCTL to find the exact size of the object. 
+ */ + __u64 shadow_va; + /** + * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. + * Use AMDGPU_INFO_IOCTL to find the exact size of the object. + */ + __u64 csa_va; +}; + +/* GFX V11 SDMA IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_sdma_gfx11 { + /** + * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. + * This must be from a separate GPU object, and use AMDGPU_INFO IOCTL + * to get the size. + */ + __u64 csa_va; +}; + +/* GFX V11 Compute IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_compute_gfx11 { + /** + * @eop_va: Virtual address of the GPU memory to hold the EOP buffer. + * This must be from a separate GPU object, and use AMDGPU_INFO IOCTL + * to get the size. + */ + __u64 eop_va; +}; + +/* userq signal/wait ioctl */ +struct drm_amdgpu_userq_signal { + /** + * @queue_id: Queue handle used by the userq fence creation function + * to retrieve the WPTR. + */ + __u32 queue_id; + __u32 pad; + /** + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to be signaled. + */ + __u64 syncobj_handles; + /** + * @num_syncobj_handles: A count that represents the number of syncobj handles in + * @syncobj_handles. + */ + __u64 num_syncobj_handles; + /** + * @bo_read_handles: The list of BO handles that the submitted user queue job + * is using for read only. This will update BO fences in the kernel. + */ + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of BO handles that the submitted user queue job + * is using for write only. This will update BO fences in the kernel. + */ + __u64 bo_write_handles; + /** + * @num_bo_read_handles: A count that represents the number of read BO handles in + * @bo_read_handles. + */ + __u32 num_bo_read_handles; + /** + * @num_bo_write_handles: A count that represents the number of write BO handles in + * @bo_write_handles.
+ */ + __u32 num_bo_write_handles; +}; + +struct drm_amdgpu_userq_fence_info { + /** + * @va: A gpu address allocated for each queue which stores the + * read pointer (RPTR) value. + */ + __u64 va; + /** + * @value: A 64 bit value represents the write pointer (WPTR) of the + * queue commands which compared with the RPTR value to signal the + * fences. + */ + __u64 value; +}; + +struct drm_amdgpu_userq_wait { + /** + * @waitq_id: Queue handle used by the userq wait IOCTL to retrieve the + * wait queue and maintain the fence driver references in it. + */ + __u32 waitq_id; + __u32 pad; + /** + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 syncobj_handles; + /** + * @syncobj_timeline_handles: The list of timeline syncobj handles submitted by + * the user queue job to get the va/value pairs at given @syncobj_timeline_points. + */ + __u64 syncobj_timeline_handles; + /** + * @syncobj_timeline_points: The list of timeline syncobj points submitted by the + * user queue job for the corresponding @syncobj_timeline_handles. + */ + __u64 syncobj_timeline_points; + /** + * @bo_read_handles: The list of read BO handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of write BO handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 bo_write_handles; + /** + * @num_syncobj_timeline_handles: A count that represents the number of timeline + * syncobj handles in @syncobj_timeline_handles. + */ + __u16 num_syncobj_timeline_handles; + /** + * @num_fences: This field can be used both as input and output. As input it defines + * the maximum number of fences that can be returned and as output it will specify + * how many fences were actually returned from the ioctl. + */ + __u16 num_fences; + /** + * @num_syncobj_handles: A count that represents the number of syncobj handles in + * @syncobj_handles. 
+ */ + __u32 num_syncobj_handles; + /** + * @num_bo_read_handles: A count that represents the number of read BO handles in + * @bo_read_handles. + */ + __u32 num_bo_read_handles; + /** + * @num_bo_write_handles: A count that represents the number of write BO handles in + * @bo_write_handles. + */ + __u32 num_bo_write_handles; + /** + * @out_fences: The field is a return value from the ioctl containing the list of + * address/value pairs to wait for. + */ + __u64 out_fences; +}; + +/* vm ioctl */ +#define AMDGPU_VM_OP_RESERVE_VMID 1 +#define AMDGPU_VM_OP_UNRESERVE_VMID 2 + +struct drm_amdgpu_vm_in { + /** AMDGPU_VM_OP_* */ + __u32 op; + __u32 flags; +}; + +struct drm_amdgpu_vm_out { + /** For future use, no flags defined so far */ + __u64 flags; +}; + +union drm_amdgpu_vm { + struct drm_amdgpu_vm_in in; + struct drm_amdgpu_vm_out out; +}; + +/* sched ioctl */ +#define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE 1 +#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE 2 + +struct drm_amdgpu_sched_in { + /* AMDGPU_SCHED_OP_* */ + __u32 op; + __u32 fd; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; + __u32 ctx_id; +}; + +union drm_amdgpu_sched { + struct drm_amdgpu_sched_in in; +}; + +/* + * This is not a reliable API and you should expect it to fail for any + * number of reasons and have fallback path that do not use userptr to + * perform any operation. 
+ */ +#define AMDGPU_GEM_USERPTR_READONLY (1 << 0) +#define AMDGPU_GEM_USERPTR_ANONONLY (1 << 1) +#define AMDGPU_GEM_USERPTR_VALIDATE (1 << 2) +#define AMDGPU_GEM_USERPTR_REGISTER (1 << 3) + +struct drm_amdgpu_gem_userptr { + __u64 addr; + __u64 size; + /* AMDGPU_GEM_USERPTR_* */ + __u32 flags; + /* Resulting GEM handle */ + __u32 handle; +}; + +/* SI-CI-VI: */ +/* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */ +#define AMDGPU_TILING_ARRAY_MODE_SHIFT 0 +#define AMDGPU_TILING_ARRAY_MODE_MASK 0xf +#define AMDGPU_TILING_PIPE_CONFIG_SHIFT 4 +#define AMDGPU_TILING_PIPE_CONFIG_MASK 0x1f +#define AMDGPU_TILING_TILE_SPLIT_SHIFT 9 +#define AMDGPU_TILING_TILE_SPLIT_MASK 0x7 +#define AMDGPU_TILING_MICRO_TILE_MODE_SHIFT 12 +#define AMDGPU_TILING_MICRO_TILE_MODE_MASK 0x7 +#define AMDGPU_TILING_BANK_WIDTH_SHIFT 15 +#define AMDGPU_TILING_BANK_WIDTH_MASK 0x3 +#define AMDGPU_TILING_BANK_HEIGHT_SHIFT 17 +#define AMDGPU_TILING_BANK_HEIGHT_MASK 0x3 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_SHIFT 19 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_MASK 0x3 +#define AMDGPU_TILING_NUM_BANKS_SHIFT 21 +#define AMDGPU_TILING_NUM_BANKS_MASK 0x3 + +/* GFX9 - GFX11: */ +#define AMDGPU_TILING_SWIZZLE_MODE_SHIFT 0 +#define AMDGPU_TILING_SWIZZLE_MODE_MASK 0x1f +#define AMDGPU_TILING_DCC_OFFSET_256B_SHIFT 5 +#define AMDGPU_TILING_DCC_OFFSET_256B_MASK 0xFFFFFF +#define AMDGPU_TILING_DCC_PITCH_MAX_SHIFT 29 +#define AMDGPU_TILING_DCC_PITCH_MAX_MASK 0x3FFF +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT 43 +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK 0x1 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_SHIFT 44 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_MASK 0x1 +#define AMDGPU_TILING_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_SCANOUT_MASK 0x1 + +/* GFX12 and later: */ +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT 0 +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK 0x7 +/* These are DCC recompression settings for memory management: */ +#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT 3 
+#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK 0x3 /* 0:64B, 1:128B, 2:256B */ +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT 5 +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK 0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */ +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT 8 +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK 0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */ +/* When clearing the buffer or moving it from VRAM to GTT, don't compress and set DCC metadata + * to uncompressed. Set when parts of an allocation bypass DCC and read raw data. */ +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_SHIFT 14 +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_MASK 0x1 +/* bit gap */ +#define AMDGPU_TILING_GFX12_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_GFX12_SCANOUT_MASK 0x1 + +/* Set/Get helpers for tiling flags. */ +#define AMDGPU_TILING_SET(field, value) \ + (((__u64)(value) & AMDGPU_TILING_##field##_MASK) << AMDGPU_TILING_##field##_SHIFT) +#define AMDGPU_TILING_GET(value, field) \ + (((__u64)(value) >> AMDGPU_TILING_##field##_SHIFT) & AMDGPU_TILING_##field##_MASK) + +#define AMDGPU_GEM_METADATA_OP_SET_METADATA 1 +#define AMDGPU_GEM_METADATA_OP_GET_METADATA 2 + +/** The same structure is shared for input/output */ +struct drm_amdgpu_gem_metadata { + /** GEM Object handle */ + __u32 handle; + /** Do we want get or set metadata */ + __u32 op; + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** family specific tiling info */ + __u64 tiling_info; + __u32 data_size_bytes; + __u32 data[64]; + } data; +}; + +struct drm_amdgpu_gem_mmap_in { + /** the GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +struct drm_amdgpu_gem_mmap_out { + /** mmap offset from the vma offset manager */ + __u64 addr_ptr; +}; + +union drm_amdgpu_gem_mmap { + struct drm_amdgpu_gem_mmap_in in; + struct drm_amdgpu_gem_mmap_out out; +}; + +struct drm_amdgpu_gem_wait_idle_in { + /** GEM object handle */ + __u32 handle; + /** For future use, no flags 
defined so far */ + __u32 flags; + /** Absolute timeout to wait */ + __u64 timeout; +}; + +struct drm_amdgpu_gem_wait_idle_out { + /** BO status: 0 - BO is idle, 1 - BO is busy */ + __u32 status; + /** Returned current memory domain */ + __u32 domain; +}; + +union drm_amdgpu_gem_wait_idle { + struct drm_amdgpu_gem_wait_idle_in in; + struct drm_amdgpu_gem_wait_idle_out out; +}; + +struct drm_amdgpu_wait_cs_in { + /* Command submission handle + * handle equals 0 means none to wait for + * handle equals ~0ull means wait for the latest sequence number + */ + __u64 handle; + /** Absolute timeout to wait */ + __u64 timeout; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; +}; + +struct drm_amdgpu_wait_cs_out { + /** CS status: 0 - CS completed, 1 - CS still busy */ + __u64 status; +}; + +union drm_amdgpu_wait_cs { + struct drm_amdgpu_wait_cs_in in; + struct drm_amdgpu_wait_cs_out out; +}; + +struct drm_amdgpu_fence { + __u32 ctx_id; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u64 seq_no; +}; + +struct drm_amdgpu_wait_fences_in { + /** This points to uint64_t * which points to fences */ + __u64 fences; + __u32 fence_count; + __u32 wait_all; + __u64 timeout_ns; +}; + +struct drm_amdgpu_wait_fences_out { + __u32 status; + __u32 first_signaled; +}; + +union drm_amdgpu_wait_fences { + struct drm_amdgpu_wait_fences_in in; + struct drm_amdgpu_wait_fences_out out; +}; + +#define AMDGPU_GEM_OP_GET_GEM_CREATE_INFO 0 +#define AMDGPU_GEM_OP_SET_PLACEMENT 1 +#define AMDGPU_GEM_OP_GET_MAPPING_INFO 2 + +struct drm_amdgpu_gem_vm_entry { + /* Start of mapping (in bytes) */ + __u64 addr; + + /* Size of mapping (in bytes) */ + __u64 size; + + /* Mapping offset */ + __u64 offset; + + /* flags needed to recreate mapping */ + __u64 flags; +}; + +/* Sets or returns a value associated with a buffer. */ +struct drm_amdgpu_gem_op { + /** GEM object handle */ + __u32 handle; + /** AMDGPU_GEM_OP_* */ + __u32 op; + /** Input or return value. 
For MAPPING_INFO op: pointer to array of struct drm_amdgpu_gem_vm_entry */ + __u64 value; + /** For MAPPING_INFO op: number of mappings (in/out) */ + __u32 num_entries; + + __u32 padding; +}; + +#define AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT (1 << 0) + +struct drm_amdgpu_gem_list_handles { + /* User pointer to array of struct drm_amdgpu_gem_list_handles_entry */ + __u64 entries; + + /* Size of entries buffer / Number of handles in process (if larger than size of buffer, must retry) */ + __u32 num_entries; + + __u32 padding; +}; + +struct drm_amdgpu_gem_list_handles_entry { + /* gem handle of buffer object */ + __u32 gem_handle; + + /* Currently just one flag: IS_IMPORT */ + __u32 flags; + + /* Size of bo */ + __u64 size; + + /* Preferred domains for GEM_CREATE */ + __u64 preferred_domains; + + /* GEM_CREATE flags for re-creation of buffer */ + __u64 alloc_flags; + + /* physical start_addr alignment in bytes for some HW requirements */ + __u64 alignment; +}; + +#define AMDGPU_VA_OP_MAP 1 +#define AMDGPU_VA_OP_UNMAP 2 +#define AMDGPU_VA_OP_CLEAR 3 +#define AMDGPU_VA_OP_REPLACE 4 + +/* Delay the page table update till the next CS */ +#define AMDGPU_VM_DELAY_UPDATE (1 << 0) + +/* Mapping flags */ +/* readable mapping */ +#define AMDGPU_VM_PAGE_READABLE (1 << 1) +/* writable mapping */ +#define AMDGPU_VM_PAGE_WRITEABLE (1 << 2) +/* executable mapping, new for VI */ +#define AMDGPU_VM_PAGE_EXECUTABLE (1 << 3) +/* partially resident texture */ +#define AMDGPU_VM_PAGE_PRT (1 << 4) +/* MTYPE flags use bit 5 to 8 */ +#define AMDGPU_VM_MTYPE_MASK (0xf << 5) +/* Default MTYPE. Pre-AI must use this. Recommended for newer ASICs.
*/ +#define AMDGPU_VM_MTYPE_DEFAULT (0 << 5) +/* Use Non Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_NC (1 << 5) +/* Use Write Combine MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_WC (2 << 5) +/* Use Cache Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_CC (3 << 5) +/* Use UnCached MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_UC (4 << 5) +/* Use Read Write MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_RW (5 << 5) +/* don't allocate MALL */ +#define AMDGPU_VM_PAGE_NOALLOC (1 << 9) + +struct drm_amdgpu_gem_va { + /** GEM object handle */ + __u32 handle; + __u32 _pad; + /** AMDGPU_VA_OP_* */ + __u32 operation; + /** AMDGPU_VM_PAGE_* */ + __u32 flags; + /** va address to assign . Must be correctly aligned.*/ + __u64 va_address; + /** Specify offset inside of BO to assign. Must be correctly aligned.*/ + __u64 offset_in_bo; + /** Specify mapping size. Must be correctly aligned. */ + __u64 map_size; + /** + * vm_timeline_point is a sequence number used to add new timeline point. + */ + __u64 vm_timeline_point; + /** + * The vm page table update fence is installed in given vm_timeline_syncobj_out + * at vm_timeline_point. + */ + __u32 vm_timeline_syncobj_out; + /** the number of syncobj handles in @input_fence_syncobj_handles */ + __u32 num_syncobj_handles; + /** Array of sync object handle to wait for given input fences */ + __u64 input_fence_syncobj_handles; +}; + +#define AMDGPU_HW_IP_GFX 0 +#define AMDGPU_HW_IP_COMPUTE 1 +#define AMDGPU_HW_IP_DMA 2 +#define AMDGPU_HW_IP_UVD 3 +#define AMDGPU_HW_IP_VCE 4 +#define AMDGPU_HW_IP_UVD_ENC 5 +#define AMDGPU_HW_IP_VCN_DEC 6 +/* + * From VCN4, AMDGPU_HW_IP_VCN_ENC is re-used to support + * both encoding and decoding jobs. 
+ */ +#define AMDGPU_HW_IP_VCN_ENC 7 +#define AMDGPU_HW_IP_VCN_JPEG 8 +#define AMDGPU_HW_IP_VPE 9 +#define AMDGPU_HW_IP_NUM 10 + +#define AMDGPU_HW_IP_INSTANCE_MAX_COUNT 1 + +#define AMDGPU_CHUNK_ID_IB 0x01 +#define AMDGPU_CHUNK_ID_FENCE 0x02 +#define AMDGPU_CHUNK_ID_DEPENDENCIES 0x03 +#define AMDGPU_CHUNK_ID_SYNCOBJ_IN 0x04 +#define AMDGPU_CHUNK_ID_SYNCOBJ_OUT 0x05 +#define AMDGPU_CHUNK_ID_BO_HANDLES 0x06 +#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT 0x08 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL 0x09 +#define AMDGPU_CHUNK_ID_CP_GFX_SHADOW 0x0a + +struct drm_amdgpu_cs_chunk { + __u32 chunk_id; + __u32 length_dw; + __u64 chunk_data; +}; + +struct drm_amdgpu_cs_in { + /** Rendering context id */ + __u32 ctx_id; + /** Handle of resource list associated with CS */ + __u32 bo_list_handle; + __u32 num_chunks; + __u32 flags; + /** this points to __u64 * which point to cs chunks */ + __u64 chunks; +}; + +struct drm_amdgpu_cs_out { + __u64 handle; +}; + +union drm_amdgpu_cs { + struct drm_amdgpu_cs_in in; + struct drm_amdgpu_cs_out out; +}; + +/* Specify flags to be used for IB */ + +/* This IB should be submitted to CE */ +#define AMDGPU_IB_FLAG_CE (1<<0) + +/* Preamble flag, which means the IB could be dropped if no context switch */ +#define AMDGPU_IB_FLAG_PREAMBLE (1<<1) + +/* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */ +#define AMDGPU_IB_FLAG_PREEMPT (1<<2) + +/* The IB fence should do the L2 writeback but not invalidate any shader + * caches (L2/vL1/sL1/I$). */ +#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) + +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. + * This will reset wave ID counters for the IB. 
+ */ +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) + +/* Flag the IB as secure (TMZ) + */ +#define AMDGPU_IB_FLAGS_SECURE (1 << 5) + +/* Tell KMD to flush and invalidate caches + */ +#define AMDGPU_IB_FLAG_EMIT_MEM_SYNC (1 << 6) + +struct drm_amdgpu_cs_chunk_ib { + __u32 _pad; + /** AMDGPU_IB_FLAG_* */ + __u32 flags; + /** Virtual address to begin IB execution */ + __u64 va_start; + /** Size of submission */ + __u32 ib_bytes; + /** HW IP to submit to */ + __u32 ip_type; + /** HW IP index of the same type to submit to */ + __u32 ip_instance; + /** Ring index to submit to */ + __u32 ring; +}; + +struct drm_amdgpu_cs_chunk_dep { + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; + __u64 handle; +}; + +struct drm_amdgpu_cs_chunk_fence { + __u32 handle; + __u32 offset; +}; + +struct drm_amdgpu_cs_chunk_sem { + __u32 handle; +}; + +struct drm_amdgpu_cs_chunk_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ 0 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD 1 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD 2 + +union drm_amdgpu_fence_to_handle { + struct { + struct drm_amdgpu_fence fence; + __u32 what; + __u32 pad; + } in; + struct { + __u32 handle; + } out; +}; + +struct drm_amdgpu_cs_chunk_data { + union { + struct drm_amdgpu_cs_chunk_ib ib_data; + struct drm_amdgpu_cs_chunk_fence fence_data; + }; +}; + +#define AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW 0x1 + +struct drm_amdgpu_cs_chunk_cp_gfx_shadow { + __u64 shadow_va; + __u64 csa_va; + __u64 gds_va; + __u64 flags; +}; + +/* + * Query h/w info: Flag that this is integrated (a.k.a.
fusion) GPU + * + */ +#define AMDGPU_IDS_FLAGS_FUSION 0x1 +#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2 +#define AMDGPU_IDS_FLAGS_TMZ 0x4 +#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8 + +/* + * Query h/w info: Flag identifying VF/PF/PT mode + * + */ +#define AMDGPU_IDS_FLAGS_MODE_MASK 0x300 +#define AMDGPU_IDS_FLAGS_MODE_SHIFT 0x8 +#define AMDGPU_IDS_FLAGS_MODE_PF 0x0 +#define AMDGPU_IDS_FLAGS_MODE_VF 0x1 +#define AMDGPU_IDS_FLAGS_MODE_PT 0x2 + +/* indicate if acceleration can be working */ +#define AMDGPU_INFO_ACCEL_WORKING 0x00 +/* get the crtc_id from the mode object id? */ +#define AMDGPU_INFO_CRTC_FROM_ID 0x01 +/* query hw IP info */ +#define AMDGPU_INFO_HW_IP_INFO 0x02 +/* query hw IP instance count for the specified type */ +#define AMDGPU_INFO_HW_IP_COUNT 0x03 +/* timestamp for GL_ARB_timer_query */ +#define AMDGPU_INFO_TIMESTAMP 0x05 +/* Query the firmware version */ +#define AMDGPU_INFO_FW_VERSION 0x0e + /* Subquery id: Query VCE firmware version */ + #define AMDGPU_INFO_FW_VCE 0x1 + /* Subquery id: Query UVD firmware version */ + #define AMDGPU_INFO_FW_UVD 0x2 + /* Subquery id: Query GMC firmware version */ + #define AMDGPU_INFO_FW_GMC 0x03 + /* Subquery id: Query GFX ME firmware version */ + #define AMDGPU_INFO_FW_GFX_ME 0x04 + /* Subquery id: Query GFX PFP firmware version */ + #define AMDGPU_INFO_FW_GFX_PFP 0x05 + /* Subquery id: Query GFX CE firmware version */ + #define AMDGPU_INFO_FW_GFX_CE 0x06 + /* Subquery id: Query GFX RLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC 0x07 + /* Subquery id: Query GFX MEC firmware version */ + #define AMDGPU_INFO_FW_GFX_MEC 0x08 + /* Subquery id: Query SMC firmware version */ + #define AMDGPU_INFO_FW_SMC 0x0a + /* Subquery id: Query SDMA firmware version */ + #define AMDGPU_INFO_FW_SDMA 0x0b + /* Subquery id: Query PSP SOS firmware version */ + #define AMDGPU_INFO_FW_SOS 0x0c + /* Subquery id: Query PSP ASD firmware version */ + #define AMDGPU_INFO_FW_ASD 0x0d + /* Subquery id: Query VCN firmware 
version */ + #define AMDGPU_INFO_FW_VCN 0x0e + /* Subquery id: Query GFX RLC SRLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_CNTL 0x0f + /* Subquery id: Query GFX RLC SRLG firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_GPM_MEM 0x10 + /* Subquery id: Query GFX RLC SRLS firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_SRM_MEM 0x11 + /* Subquery id: Query DMCU firmware version */ + #define AMDGPU_INFO_FW_DMCU 0x12 + #define AMDGPU_INFO_FW_TA 0x13 + /* Subquery id: Query DMCUB firmware version */ + #define AMDGPU_INFO_FW_DMCUB 0x14 + /* Subquery id: Query TOC firmware version */ + #define AMDGPU_INFO_FW_TOC 0x15 + /* Subquery id: Query CAP firmware version */ + #define AMDGPU_INFO_FW_CAP 0x16 + /* Subquery id: Query GFX RLCP firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCP 0x17 + /* Subquery id: Query GFX RLCV firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCV 0x18 + /* Subquery id: Query MES_KIQ firmware version */ + #define AMDGPU_INFO_FW_MES_KIQ 0x19 + /* Subquery id: Query MES firmware version */ + #define AMDGPU_INFO_FW_MES 0x1a + /* Subquery id: Query IMU firmware version */ + #define AMDGPU_INFO_FW_IMU 0x1b + /* Subquery id: Query VPE firmware version */ + #define AMDGPU_INFO_FW_VPE 0x1c + +/* number of bytes moved for TTM migration */ +#define AMDGPU_INFO_NUM_BYTES_MOVED 0x0f +/* the used VRAM size */ +#define AMDGPU_INFO_VRAM_USAGE 0x10 +/* the used GTT size */ +#define AMDGPU_INFO_GTT_USAGE 0x11 +/* Information about GDS, etc. resource configuration */ +#define AMDGPU_INFO_GDS_CONFIG 0x13 +/* Query information about VRAM and GTT domains */ +#define AMDGPU_INFO_VRAM_GTT 0x14 +/* Query information about register in MMR address space*/ +#define AMDGPU_INFO_READ_MMR_REG 0x15 +/* Query information about device: rev id, family, etc. 
*/ +#define AMDGPU_INFO_DEV_INFO 0x16 +/* visible vram usage */ +#define AMDGPU_INFO_VIS_VRAM_USAGE 0x17 +/* number of TTM buffer evictions */ +#define AMDGPU_INFO_NUM_EVICTIONS 0x18 +/* Query memory about VRAM and GTT domains */ +#define AMDGPU_INFO_MEMORY 0x19 +/* Query vce clock table */ +#define AMDGPU_INFO_VCE_CLOCK_TABLE 0x1A +/* Query vbios related information */ +#define AMDGPU_INFO_VBIOS 0x1B + /* Subquery id: Query vbios size */ + #define AMDGPU_INFO_VBIOS_SIZE 0x1 + /* Subquery id: Query vbios image */ + #define AMDGPU_INFO_VBIOS_IMAGE 0x2 + /* Subquery id: Query vbios info */ + #define AMDGPU_INFO_VBIOS_INFO 0x3 +/* Query UVD handles */ +#define AMDGPU_INFO_NUM_HANDLES 0x1C +/* Query sensor related information */ +#define AMDGPU_INFO_SENSOR 0x1D + /* Subquery id: Query GPU shader clock */ + #define AMDGPU_INFO_SENSOR_GFX_SCLK 0x1 + /* Subquery id: Query GPU memory clock */ + #define AMDGPU_INFO_SENSOR_GFX_MCLK 0x2 + /* Subquery id: Query GPU temperature */ + #define AMDGPU_INFO_SENSOR_GPU_TEMP 0x3 + /* Subquery id: Query GPU load */ + #define AMDGPU_INFO_SENSOR_GPU_LOAD 0x4 + /* Subquery id: Query average GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_AVG_POWER 0x5 + /* Subquery id: Query northbridge voltage */ + #define AMDGPU_INFO_SENSOR_VDDNB 0x6 + /* Subquery id: Query graphics voltage */ + #define AMDGPU_INFO_SENSOR_VDDGFX 0x7 + /* Subquery id: Query GPU stable pstate shader clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_SCLK 0x8 + /* Subquery id: Query GPU stable pstate memory clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_MCLK 0x9 + /* Subquery id: Query GPU peak pstate shader clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK 0xa + /* Subquery id: Query GPU peak pstate memory clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK 0xb + /* Subquery id: Query input GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER 0xc +/* Number of VRAM page faults on CPU access. 
*/ +#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E +#define AMDGPU_INFO_VRAM_LOST_COUNTER 0x1F +/* query ras mask of enabled features*/ +#define AMDGPU_INFO_RAS_ENABLED_FEATURES 0x20 +/* RAS MASK: UMC (VRAM) */ +#define AMDGPU_INFO_RAS_ENABLED_UMC (1 << 0) +/* RAS MASK: SDMA */ +#define AMDGPU_INFO_RAS_ENABLED_SDMA (1 << 1) +/* RAS MASK: GFX */ +#define AMDGPU_INFO_RAS_ENABLED_GFX (1 << 2) +/* RAS MASK: MMHUB */ +#define AMDGPU_INFO_RAS_ENABLED_MMHUB (1 << 3) +/* RAS MASK: ATHUB */ +#define AMDGPU_INFO_RAS_ENABLED_ATHUB (1 << 4) +/* RAS MASK: PCIE */ +#define AMDGPU_INFO_RAS_ENABLED_PCIE (1 << 5) +/* RAS MASK: HDP */ +#define AMDGPU_INFO_RAS_ENABLED_HDP (1 << 6) +/* RAS MASK: XGMI */ +#define AMDGPU_INFO_RAS_ENABLED_XGMI (1 << 7) +/* RAS MASK: DF */ +#define AMDGPU_INFO_RAS_ENABLED_DF (1 << 8) +/* RAS MASK: SMN */ +#define AMDGPU_INFO_RAS_ENABLED_SMN (1 << 9) +/* RAS MASK: SEM */ +#define AMDGPU_INFO_RAS_ENABLED_SEM (1 << 10) +/* RAS MASK: MP0 */ +#define AMDGPU_INFO_RAS_ENABLED_MP0 (1 << 11) +/* RAS MASK: MP1 */ +#define AMDGPU_INFO_RAS_ENABLED_MP1 (1 << 12) +/* RAS MASK: FUSE */ +#define AMDGPU_INFO_RAS_ENABLED_FUSE (1 << 13) +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS 0x21 + /* Subquery id: Decode */ + #define AMDGPU_INFO_VIDEO_CAPS_DECODE 0 + /* Subquery id: Encode */ + #define AMDGPU_INFO_VIDEO_CAPS_ENCODE 1 +/* Query the max number of IBs per gang per submission */ +#define AMDGPU_INFO_MAX_IBS 0x22 +/* query last page fault info */ +#define AMDGPU_INFO_GPUVM_FAULT 0x23 +/* query FW object size and alignment */ +#define AMDGPU_INFO_UQ_FW_AREAS 0x24 + +#define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 +#define AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff +#define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 +#define AMDGPU_INFO_MMR_SH_INDEX_MASK 0xff + +struct drm_amdgpu_query_fw { + /** AMDGPU_INFO_FW_* */ + __u32 fw_type; + /** + * Index of the IP if there are more IPs of + * the same type. + */ + __u32 ip_instance; + /** + * Index of the engine. 
Whether this is used depends + * on the firmware type. (e.g. MEC, SDMA) + */ + __u32 index; + __u32 _pad; +}; + +/* Input structure for the INFO ioctl */ +struct drm_amdgpu_info { + /* Where the return value will be stored */ + __u64 return_pointer; + /* The size of the return value. Just like "size" in "snprintf", + * it limits how many bytes the kernel can write. */ + __u32 return_size; + /* The query request id. */ + __u32 query; + + union { + struct { + __u32 id; + __u32 _pad; + } mode_crtc; + + struct { + /** AMDGPU_HW_IP_* */ + __u32 type; + /** + * Index of the IP if there are more IPs of the same + * type. Ignored by AMDGPU_INFO_HW_IP_COUNT. + */ + __u32 ip_instance; + } query_hw_ip; + + struct { + __u32 dword_offset; + /** number of registers to read */ + __u32 count; + __u32 instance; + /** For future use, no flags defined so far */ + __u32 flags; + } read_mmr_reg; + + struct drm_amdgpu_query_fw query_fw; + + struct { + __u32 type; + __u32 offset; + } vbios_info; + + struct { + __u32 type; + } sensor_info; + + struct { + __u32 type; + } video_cap; + }; +}; + +struct drm_amdgpu_info_gds { + /** GDS GFX partition size */ + __u32 gds_gfx_partition_size; + /** GDS compute partition size */ + __u32 compute_partition_size; + /** total GDS memory size */ + __u32 gds_total_size; + /** GWS size per GFX partition */ + __u32 gws_per_gfx_partition; + /** GWS size per compute partition */ + __u32 gws_per_compute_partition; + /** OA size per GFX partition */ + __u32 oa_per_gfx_partition; + /** OA size per compute partition */ + __u32 oa_per_compute_partition; + __u32 _pad; +}; + +struct drm_amdgpu_info_vram_gtt { + __u64 vram_size; + __u64 vram_cpu_accessible_size; + __u64 gtt_size; +}; + +struct drm_amdgpu_heap_info { + /** max. physical memory */ + __u64 total_heap_size; + + /** Theoretical max. available memory in the given heap */ + __u64 usable_heap_size; + + /** + * Number of bytes allocated in the heap.
This includes all processes + * and private allocations in the kernel. It changes when new buffers + * are allocated, freed, and moved. It cannot be larger than + * heap_size. + */ + __u64 heap_usage; + + /** + * Theoretical possible max. size of buffer which + * could be allocated in the given heap + */ + __u64 max_allocation; +}; + +struct drm_amdgpu_memory_info { + struct drm_amdgpu_heap_info vram; + struct drm_amdgpu_heap_info cpu_accessible_vram; + struct drm_amdgpu_heap_info gtt; +}; + +struct drm_amdgpu_info_firmware { + __u32 ver; + __u32 feature; +}; + +struct drm_amdgpu_info_vbios { + __u8 name[64]; + __u8 vbios_pn[64]; + __u32 version; + __u32 pad; + __u8 vbios_ver_str[32]; + __u8 date[32]; +}; + +#define AMDGPU_VRAM_TYPE_UNKNOWN 0 +#define AMDGPU_VRAM_TYPE_GDDR1 1 +#define AMDGPU_VRAM_TYPE_DDR2 2 +#define AMDGPU_VRAM_TYPE_GDDR3 3 +#define AMDGPU_VRAM_TYPE_GDDR4 4 +#define AMDGPU_VRAM_TYPE_GDDR5 5 +#define AMDGPU_VRAM_TYPE_HBM 6 +#define AMDGPU_VRAM_TYPE_DDR3 7 +#define AMDGPU_VRAM_TYPE_DDR4 8 +#define AMDGPU_VRAM_TYPE_GDDR6 9 +#define AMDGPU_VRAM_TYPE_DDR5 10 +#define AMDGPU_VRAM_TYPE_LPDDR4 11 +#define AMDGPU_VRAM_TYPE_LPDDR5 12 +#define AMDGPU_VRAM_TYPE_HBM3E 13 + +struct drm_amdgpu_info_device { + /** PCI Device ID */ + __u32 device_id; + /** Internal chip revision: A0, A1, etc.) */ + __u32 chip_rev; + __u32 external_rev; + /** Revision id in PCI Config space */ + __u32 pci_rev; + __u32 family; + __u32 num_shader_engines; + __u32 num_shader_arrays_per_engine; + /* in KHz */ + __u32 gpu_counter_freq; + __u64 max_engine_clock; + __u64 max_memory_clock; + /* cu information */ + __u32 cu_active_number; + /* NOTE: cu_ao_mask is INVALID, DON'T use it */ + __u32 cu_ao_mask; + __u32 cu_bitmap[4][4]; + /** Render backend pipe mask. One render backend is CB+DB. 
*/ + __u32 enabled_rb_pipes_mask; + __u32 num_rb_pipes; + __u32 num_hw_gfx_contexts; + /* PCIe version (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_gen; + __u64 ids_flags; + /** Starting virtual address for UMDs. */ + __u64 virtual_address_offset; + /** The maximum virtual address */ + __u64 virtual_address_max; + /** Required alignment of virtual addresses. */ + __u32 virtual_address_alignment; + /** Page table entry - fragment size */ + __u32 pte_fragment_size; + __u32 gart_page_size; + /** constant engine ram size*/ + __u32 ce_ram_size; + /** video memory type info*/ + __u32 vram_type; + /** video memory bit width*/ + __u32 vram_bit_width; + /* vce harvesting instance */ + __u32 vce_harvest_config; + /* gfx double offchip LDS buffers */ + __u32 gc_double_offchip_lds_buf; + /* NGG Primitive Buffer */ + __u64 prim_buf_gpu_addr; + /* NGG Position Buffer */ + __u64 pos_buf_gpu_addr; + /* NGG Control Sideband */ + __u64 cntl_sb_buf_gpu_addr; + /* NGG Parameter Cache */ + __u64 param_buf_gpu_addr; + __u32 prim_buf_size; + __u32 pos_buf_size; + __u32 cntl_sb_buf_size; + __u32 param_buf_size; + /* wavefront size*/ + __u32 wave_front_size; + /* shader visible vgprs*/ + __u32 num_shader_visible_vgprs; + /* CU per shader array*/ + __u32 num_cu_per_sh; + /* number of tcc blocks*/ + __u32 num_tcc_blocks; + /* gs vgt table depth*/ + __u32 gs_vgt_table_depth; + /* gs primitive buffer depth*/ + __u32 gs_prim_buffer_depth; + /* max gs wavefront per vgt*/ + __u32 max_gs_waves_per_vgt; + /* PCIe number of lanes (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_num_lanes; + /* always on cu bitmap */ + __u32 cu_ao_bitmap[4][4]; + /** Starting high virtual address for UMDs. 
*/ + __u64 high_va_offset; + /** The maximum high virtual address */ + __u64 high_va_max; + /* gfx10 pa_sc_tile_steering_override */ + __u32 pa_sc_tile_steering_override; + /* disabled TCCs */ + __u64 tcc_disabled_mask; + __u64 min_engine_clock; + __u64 min_memory_clock; + /* The following fields are only set on gfx11+, older chips set 0. */ + __u32 tcp_cache_size; /* AKA GL0, VMEM cache */ + __u32 num_sqc_per_wgp; + __u32 sqc_data_cache_size; /* AKA SMEM cache */ + __u32 sqc_inst_cache_size; + __u32 gl1c_cache_size; + __u32 gl2c_cache_size; + __u64 mall_size; /* AKA infinity cache */ + /* high 32 bits of the rb pipes mask */ + __u32 enabled_rb_pipes_mask_hi; + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; + /* Userq IP mask (1 << AMDGPU_HW_IP_*) */ + __u32 userq_ip_mask; + __u32 pad; +}; + +struct drm_amdgpu_info_hw_ip { + /** Version of h/w IP */ + __u32 hw_ip_version_major; + __u32 hw_ip_version_minor; + /** Capabilities */ + __u64 capabilities_flags; + /** command buffer address start alignment*/ + __u32 ib_start_alignment; + /** command buffer size alignment*/ + __u32 ib_size_alignment; + /** Bitmask of available rings. Bit 0 means ring 0, etc. 
*/ + __u32 available_rings; + /** version info: bits 23:16 major, 15:8 minor, 7:0 revision */ + __u32 ip_discovery_version; + /* Userq available slots */ + __u32 userq_num_slots; +}; + +/* GFX metadata BO sizes and alignment info (in bytes) */ +struct drm_amdgpu_info_uq_fw_areas_gfx { + /* shadow area size */ + __u32 shadow_size; + /* shadow area base virtual mem alignment */ + __u32 shadow_alignment; + /* context save area size */ + __u32 csa_size; + /* context save area base virtual mem alignment */ + __u32 csa_alignment; +}; + +/* IP specific fw related information used in the + * subquery AMDGPU_INFO_UQ_FW_AREAS + */ +struct drm_amdgpu_info_uq_fw_areas { + union { + struct drm_amdgpu_info_uq_fw_areas_gfx gfx; + }; +}; + +struct drm_amdgpu_info_num_handles { + /** Max handles as supported by firmware for UVD */ + __u32 uvd_max_handles; + /** Handles currently in use for UVD */ + __u32 uvd_used_handles; +}; + +#define AMDGPU_VCE_CLOCK_TABLE_ENTRIES 6 + +struct drm_amdgpu_info_vce_clock_table_entry { + /** System clock */ + __u32 sclk; + /** Memory clock */ + __u32 mclk; + /** VCE clock */ + __u32 eclk; + __u32 pad; +}; + +struct drm_amdgpu_info_vce_clock_table { + struct drm_amdgpu_info_vce_clock_table_entry entries[AMDGPU_VCE_CLOCK_TABLE_ENTRIES]; + __u32 num_valid_entries; + __u32 pad; +}; + +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2 0 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4 1 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1 2 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC 3 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC 4 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG 5 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9 6 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1 7 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT 8 + +struct drm_amdgpu_info_video_codec_info { + __u32 valid; + __u32 max_width; + __u32 max_height; + __u32 max_pixels_per_frame; + __u32 max_level; + __u32 pad; +}; + +struct 
drm_amdgpu_info_video_caps { + struct drm_amdgpu_info_video_codec_info codec_info[AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT]; +}; + +#define AMDGPU_VMHUB_TYPE_MASK 0xff +#define AMDGPU_VMHUB_TYPE_SHIFT 0 +#define AMDGPU_VMHUB_TYPE_GFX 0 +#define AMDGPU_VMHUB_TYPE_MM0 1 +#define AMDGPU_VMHUB_TYPE_MM1 2 +#define AMDGPU_VMHUB_IDX_MASK 0xff00 +#define AMDGPU_VMHUB_IDX_SHIFT 8 + +struct drm_amdgpu_info_gpuvm_fault { + __u64 addr; + __u32 status; + __u32 vmhub; +}; + +struct drm_amdgpu_info_uq_metadata_gfx { + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; +}; + +struct drm_amdgpu_info_uq_metadata { + union { + struct drm_amdgpu_info_uq_metadata_gfx gfx; + }; +}; + +/* + * Supported GPU families + */ +#define AMDGPU_FAMILY_UNKNOWN 0 +#define AMDGPU_FAMILY_SI 110 /* Hainan, Oland, Verde, Pitcairn, Tahiti */ +#define AMDGPU_FAMILY_CI 120 /* Bonaire, Hawaii */ +#define AMDGPU_FAMILY_KV 125 /* Kaveri, Kabini, Mullins */ +#define AMDGPU_FAMILY_VI 130 /* Iceland, Tonga */ +#define AMDGPU_FAMILY_CZ 135 /* Carrizo, Stoney */ +#define AMDGPU_FAMILY_AI 141 /* Vega10 */ +#define AMDGPU_FAMILY_RV 142 /* Raven */ +#define AMDGPU_FAMILY_NV 143 /* Navi10 */ +#define AMDGPU_FAMILY_VGH 144 /* Van Gogh */ +#define AMDGPU_FAMILY_GC_11_0_0 145 /* GC 11.0.0 */ +#define AMDGPU_FAMILY_YC 146 /* Yellow Carp */ +#define AMDGPU_FAMILY_GC_11_0_1 148 /* GC 11.0.1 */ +#define AMDGPU_FAMILY_GC_10_3_6 149 /* GC 10.3.6 */ +#define AMDGPU_FAMILY_GC_10_3_7 151 /* GC 10.3.7 */ +#define AMDGPU_FAMILY_GC_11_5_0 150 /* GC 11.5.0 */ +#define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ + +/* FIXME wrong namespace! */ +struct drm_color_ctm_3x4 { + /* + * Conversion matrix with 3x4 dimensions in S31.32 sign-magnitude + * (not two's complement!) format. 
+ */ + __u64 matrix[12]; +}; + +#if defined(__cplusplus) +} +#endif + +#endif From 5eb61e1b14959acb858fea69d45bf5c8f7f53ee5 Mon Sep 17 00:00:00 2001 From: David Francis Date: Thu, 15 May 2025 09:49:24 -0400 Subject: [PATCH 246/257] plugin/amdgpu: Add drm header The amdgpu plugin usually calls drm ioctls through the libdrm wrappers. However, amdgpu restore requires dealing with dmabufs and gem handles directly, which means drm ioctls must be called directly. Add the drm.h header (from the kernel's uapi). Signed-off-by: David Francis --- plugins/amdgpu/drm.h | 1450 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1450 insertions(+) create mode 100644 plugins/amdgpu/drm.h diff --git a/plugins/amdgpu/drm.h b/plugins/amdgpu/drm.h new file mode 100644 index 000000000..84c819c17 --- /dev/null +++ b/plugins/amdgpu/drm.h @@ -0,0 +1,1450 @@ +/* + * Header for the Direct Rendering Manager + * + * Author: Rickard E. (Rik) Faith + * + * Acknowledgments: + * Dec 1999, Richard Henderson , move to generic cmpxchg. + */ + +/* + * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DRM_H_ +#define _DRM_H_ + +#if defined(__KERNEL__) + +#include +#include +typedef unsigned int drm_handle_t; + +#elif defined(__linux__) + +#include +#include +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ + +#include +#include +#include +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef size_t __kernel_size_t; +typedef unsigned long drm_handle_t; + +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ +#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ +#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ +#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? */ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect. 
+ * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). + */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char __user *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char __user *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char __user *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). 
+ */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char __user *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version __user *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. + */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. + */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). 
+ */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? */ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. + */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. 
*/ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. + */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. + */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. 
+ */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc __user *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. + */ +struct drm_buf_free { + int count; + int __user *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void __user *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. + */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void __user *virt; +#else + void __user *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub __user *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). + */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int __user *send_indices; /**< List of handles to buffers */ + int __user *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int __user *request_indices; /**< Buffer information */ + int __user *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx __user *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. 
+ */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. + */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. + */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). + */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). 
+ */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). + */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). + */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. 
+ */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +struct drm_gem_close { + /** Handle of the object to be closed. */ + __u32 handle; + __u32 pad; +}; + +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +struct drm_gem_flink { + /** Handle for the object being named */ + __u32 handle; + + /** Returned global name */ + __u32 name; +}; + +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +struct drm_gem_open { + /** Name of object being opened */ + __u32 name; + + /** Returned handle for the object */ + __u32 handle; + + /** Returned size of the object */ + __u64 size; +}; + +/* DRM_IOCTL_GEM_CHANGE_HANDLE ioctl argument type */ +struct drm_gem_change_handle { + /** Current handle of object */ + __u32 handle; + + /** Handle to change that object to */ + __u32 new_handle; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. 
+ * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. 
Starting kernel version 4.15, this capability is always set to 1. + */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. + */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. 
+ */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. + */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. + */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. + * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. + */ +#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 + +/** + * DRM_CLIENT_CAP_ATOMIC + * + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. + * + * If the driver doesn't support atomic mode-setting, enabling this capability + * will fail with -EOPNOTSUPP. + * + * This capability has been introduced in kernel version 4.0. Starting from + * kernel version 4.2, this capability is always supported for atomic-capable + * drivers. + */ +#define DRM_CLIENT_CAP_ATOMIC 3 + +/** + * DRM_CLIENT_CAP_ASPECT_RATIO + * + * If set to 1, the DRM core will provide aspect ratio information in modes. + * See ``DRM_MODE_FLAG_PIC_AR_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 4.18. 
+ */ +#define DRM_CLIENT_CAP_ASPECT_RATIO 4 + +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. The client must enable + * &DRM_CLIENT_CAP_ATOMIC first. + * + * This capability is always supported for atomic-capable drivers starting from + * kernel version 4.19. + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + +/** + * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT + * + * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and + * virtualbox) have additional restrictions for cursor planes (thus + * making cursor planes on those drivers not truly universal,) e.g. + * they need cursor planes to act like one would expect from a mouse + * cursor and have correctly set hotspot properties. + * If this client cap is not set the DRM core will hide cursor plane on + * those virtualized drivers because not setting it implies that the + * client is not capable of dealing with those extra restrictions. + * Clients which do set cursor hotspot and treat the cursor plane + * like a mouse cursor should set this property. + * The client must enable &DRM_CLIENT_CAP_ATOMIC first. + * + * Setting this property on drivers which do not special case + * cursor planes (i.e. non-virtualized drivers) will return + * EOPNOTSUPP, which can be used by userspace to gauge + * requirements of the hardware/drivers they're running on. + * + * This capability is always supported for atomic-capable virtualized + * drivers starting from kernel version 6.6. + */ +#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 + +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +struct drm_set_client_cap { + __u64 capability; + __u64 value; +}; + +#define DRM_RDWR O_RDWR +#define DRM_CLOEXEC O_CLOEXEC +struct drm_prime_handle { + __u32 handle; + + /** Flags..
only applicable for handle->fd */ + __u32 flags; + + /** Returned dmabuf file descriptor */ + __s32 fd; +}; + +struct drm_syncobj_create { + __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) + __u32 flags; +}; + +struct drm_syncobj_destroy { + __u32 handle; + __u32 pad; +}; + +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +struct drm_syncobj_handle { + __u32 handle; + __u32 flags; + + __s32 fd; + __u32 pad; +}; + +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on specific timeline point for every handles*/ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +/** + * struct drm_syncobj_eventfd + * @handle: syncobj handle. 
+ * @flags: Zero to wait for the point to be signalled, or + * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be + * available for the point. + * @point: syncobj timeline point (set to zero for binary syncobjs). + * @fd: Existing eventfd to send events to. + * @pad: Must be zero. + * + * Register an eventfd to be signalled by a syncobj. The eventfd counter will + * be incremented by one. + */ +struct drm_syncobj_eventfd { + __u32 handle; + __u32 flags; + __u64 point; + __s32 fd; + __u32 pad; +}; + + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + +#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 flags; +}; + + +/* Query current scanout sequence number */ +struct drm_crtc_get_sequence { + __u32 crtc_id; /* requested crtc_id */ + __u32 active; /* return: crtc output is active */ + __u64 sequence; /* return: most recent vblank sequence */ + __s64 sequence_ns; /* return: most recent time of first pixel out */ +}; + +/* Queue event to be delivered at specified sequence. Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence.
on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#define DRM_CLIENT_NAME_MAX_LEN 64 +struct drm_set_client_name { + __u64 name_len; + __u64 name; +}; + + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. 
+ */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK 
DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. 
+ */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct 
drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. + * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. + * + * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. + * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate + * driver preferences for dumb buffers. 
+ */ +#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) +#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) +#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) +#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) +#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) +#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) +#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) +#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) +#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) +#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) +#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) +#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) +#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) + +#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) +#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) +#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) + +#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) +#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) +#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) +#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) + +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) 
+#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + +/** + * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. + * + * This queries metadata about a framebuffer. User-space fills + * &drm_mode_fb_cmd2.fb_id as the input, and the kernel fills the rest of the + * struct as the output. + * + * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles + * will be filled with GEM buffer handles. Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. + * + * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid + * until one has a zero &drm_mode_fb_cmd2.pitches. + * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer.
+ * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/** + * DRM_IOCTL_SET_CLIENT_NAME - Attach a name to a drm_file + * + * Having a name allows for easier tracking and debugging. + * The length of the name (without null ending char) must be + * <= DRM_CLIENT_NAME_MAX_LEN. + * The call will fail if the name contains whitespaces or non-printable chars. + */ +#define DRM_IOCTL_SET_CLIENT_NAME DRM_IOWR(0xD1, struct drm_set_client_name) + +/** + * DRM_IOCTL_GEM_CHANGE_HANDLE - Move an object to a different handle + * + * Some applications (notably CRIU) need objects to have specific gem handles. + * This ioctl changes the object at one gem handle to use a new gem handle. + */ +#define DRM_IOCTL_GEM_CHANGE_HANDLE DRM_IOWR(0xD2, struct drm_gem_change_handle) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. 
+ * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_VBLANK 0x01 +/** + * DRM_EVENT_FLIP_COMPLETE - page-flip completion event + * + * This event is sent in response to an atomic commit or legacy page-flip with + * the &DRM_MODE_PAGE_FLIP_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_FLIP_COMPLETE 0x02 +/** + * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event + * + * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. + * + * The event payload is a struct drm_event_crtc_sequence. + */ +#define DRM_EVENT_CRTC_SEQUENCE 0x03 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 crtc_id; /* 0 on older kernels that do not support this */ +}; + +/* Event delivered at sequence. 
Time stamp marks when the first pixel + * of the refresh cycle leaves the display engine for the display + */ +struct drm_event_crtc_sequence { + struct drm_event base; + __u64 user_data; + __s64 time_ns; + __u64 sequence; +}; + +/* typedef area */ +#ifndef __KERNEL__ +typedef struct drm_clip_rect drm_clip_rect_t; +typedef struct drm_drawable_info drm_drawable_info_t; +typedef struct drm_tex_region drm_tex_region_t; +typedef struct drm_hw_lock drm_hw_lock_t; +typedef struct drm_version drm_version_t; +typedef struct drm_unique drm_unique_t; +typedef struct drm_list drm_list_t; +typedef struct drm_block drm_block_t; +typedef struct drm_control drm_control_t; +typedef enum drm_map_type drm_map_type_t; +typedef enum drm_map_flags drm_map_flags_t; +typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; +typedef struct drm_map drm_map_t; +typedef struct drm_client drm_client_t; +typedef enum drm_stat_type drm_stat_type_t; +typedef struct drm_stats drm_stats_t; +typedef enum drm_lock_flags drm_lock_flags_t; +typedef struct drm_lock drm_lock_t; +typedef enum drm_dma_flags drm_dma_flags_t; +typedef struct drm_buf_desc drm_buf_desc_t; +typedef struct drm_buf_info drm_buf_info_t; +typedef struct drm_buf_free drm_buf_free_t; +typedef struct drm_buf_pub drm_buf_pub_t; +typedef struct drm_buf_map drm_buf_map_t; +typedef struct drm_dma drm_dma_t; +typedef union drm_wait_vblank drm_wait_vblank_t; +typedef struct drm_agp_mode drm_agp_mode_t; +typedef enum drm_ctx_flags drm_ctx_flags_t; +typedef struct drm_ctx drm_ctx_t; +typedef struct drm_ctx_res drm_ctx_res_t; +typedef struct drm_draw drm_draw_t; +typedef struct drm_update_draw drm_update_draw_t; +typedef struct drm_auth drm_auth_t; +typedef struct drm_irq_busid drm_irq_busid_t; +typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; + +typedef struct drm_agp_buffer drm_agp_buffer_t; +typedef struct drm_agp_binding drm_agp_binding_t; +typedef struct drm_agp_info drm_agp_info_t; +typedef struct drm_scatter_gather 
drm_scatter_gather_t; +typedef struct drm_set_version drm_set_version_t; +#endif + +#if defined(__cplusplus) +} +#endif + +#endif From db0ec806d12d1435fbf2ccbcac05ec878fe0f401 Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 12 Feb 2025 09:29:21 -0500 Subject: [PATCH 247/257] plugin/amdgpu: Add handling for amdgpu drm buffer objects Buffer objects held by the amdgpu drm driver are checkpointed with the new BO_INFO and MAPPING_INFO ioctls/ioctl options. Handling is in amdgpu_plugin_drm.c. Handling of imported buffer objects may require dmabuf fds to be transferred between processes. These transfers occur over fdstore, with the handle-to-fdstore-id relationships kept in shared memory. There is a new plugin callback, RESTORE_INIT, to create the shared memory. During checkpoint, track shared buffer objects, so that buffer objects that are shared across processes can be identified. During restore, track which buffer objects have been restored. Retry restore of a drm file if a buffer object is imported and the original has not been exported yet. Skip buffer objects that have already been completed or cannot be completed in the current restore. So that drm code can use sdma_copy_bo, that function no longer requires kfd bo structs. Update the protobuf messages with new amdgpu drm information.
Signed-off-by: David Francis --- criu/include/criu-plugin.h | 3 + criu/plugin.c | 13 +- criu/servicefd.c | 2 +- plugins/amdgpu/amdgpu_plugin.c | 281 ++++++++++++++-- plugins/amdgpu/amdgpu_plugin_drm.c | 487 +++++++++++++++++++++++++++- plugins/amdgpu/amdgpu_plugin_drm.h | 12 + plugins/amdgpu/amdgpu_plugin_util.c | 84 +++++ plugins/amdgpu/amdgpu_plugin_util.h | 39 ++- plugins/amdgpu/criu-amdgpu.proto | 25 ++ 9 files changed, 900 insertions(+), 46 deletions(-) diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index ee84ccdf6..977dad655 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -62,6 +62,8 @@ enum { CR_PLUGIN_HOOK__POST_FORKING = 12, + CR_PLUGIN_HOOK__RESTORE_INIT = 13, + CR_PLUGIN_HOOK__MAX }; @@ -81,6 +83,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void); enum { CR_PLUGIN_STAGE__DUMP, diff --git a/criu/plugin.c b/criu/plugin.c index 18da0499d..a2057e9c1 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -60,6 +60,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); __assign_hook(POST_FORKING, "cr_plugin_post_forking"); + __assign_hook(RESTORE_INIT, "cr_plugin_restore_init"); #undef __assign_hook @@ -257,8 +258,16 @@ int cr_plugin_init(int stage) goto err; } - if (stage == CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins()) - goto err; + if (stage == CR_PLUGIN_STAGE__RESTORE) { + int ret; + + if (check_inventory_plugins()) + goto err; + + ret = run_plugins(RESTORE_INIT); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } exit_code = 0; err: diff --git a/criu/servicefd.c b/criu/servicefd.c 
index 06a8d3eba..dfb019066 100644 --- a/criu/servicefd.c +++ b/criu/servicefd.c @@ -313,4 +313,4 @@ int clone_service_fd(struct pstree_item *me) ret = 0; return ret; -} +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index e3b4ead3f..4be8421a0 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,12 +25,17 @@ #include "criu-plugin.h" #include "plugin.h" #include "criu-amdgpu.pb-c.h" +#include "util.h" +#include "util-pie.h" +#include "fdstore.h" #include "kfd_ioctl.h" #include "xmalloc.h" #include "criu-log.h" #include "files.h" #include "pstree.h" +#include "sockets.h" +#include "rst-malloc.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" @@ -66,6 +73,19 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +struct handle_id { + int handle; + int fdstore_id; +}; +struct shared_handle_ids { + int num_handles; + struct handle_id *handles; +}; +struct shared_handle_ids *shared_memory = NULL; + +static mutex_t *shared_memory_mutex; + +int current_pid; /* * In the case of a single process (common case), this optimization can effectively * reduce the restore latency with parallel restore. 
In the case of multiple processes, @@ -526,11 +546,11 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, - void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type) +int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free) { - uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; + uint64_t src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; amdgpu_va_handle h_va_src, h_va_dst, h_va_ib; amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib; @@ -543,10 +563,8 @@ static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, uint32_t expired; amdgpu_context_handle h_ctx; uint32_t *ib = NULL; - int j, err, shared_fd, packets_per_buffer; + int j, err, packets_per_buffer; - shared_fd = bo_bucket.dmabuf_fd; - size = bo_bucket.size; buffer_bo_size = min(size, buffer_size); packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1; src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? 
buffer_bo_size : size; @@ -757,7 +775,8 @@ err_dst_bo_map: if (err) pr_perror("dest range free failed"); err_dst_va: - err = amdgpu_bo_free(h_bo_dst); + if (!do_not_free) + err = amdgpu_bo_free(h_bo_dst); if (err) pr_perror("dest bo free failed"); err_dst_bo_prep: @@ -845,8 +864,9 @@ void *dump_bo_contents(void *_thread_data) num_bos++; /* perform sDMA based vram copy */ - ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, - SDMA_OP_VRAM_READ); + ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_READ, false); + if (ret) { pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i); break; @@ -943,8 +963,8 @@ void *restore_bo_contents(void *_thread_data) num_bos++; - ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, - SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE, false); if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); break; @@ -1053,6 +1073,134 @@ exit: return ret; } +int store_dmabuf_fd(int handle, int fd) +{ + int id; + + id = fdstore_add(fd); + mutex_lock(shared_memory_mutex); + for (int i = 0; i < shared_memory->num_handles; i++) { + if (shared_memory->handles[i].handle == handle) { + mutex_unlock(shared_memory_mutex); + return 0; + } + if (shared_memory->handles[i].handle == -1) { + shared_memory->handles[i].handle = handle; + shared_memory->handles[i].fdstore_id = id; + mutex_unlock(shared_memory_mutex); + return 0; + } + } + mutex_unlock(shared_memory_mutex); + + return -1; +} + +int amdgpu_id_for_handle(int handle) +{ + mutex_lock(shared_memory_mutex); + for (int i = 0; i < shared_memory->num_handles; i++) { + if (shared_memory->handles[i].handle == handle) { + mutex_unlock(shared_memory_mutex); + return 
shared_memory->handles[i].fdstore_id; + } + } + mutex_unlock(shared_memory_mutex); + return -1; +} + +int amdgpu_restore_init(void) +{ + if (!shared_memory) { + int protection = PROT_READ | PROT_WRITE; + int visibility = MAP_SHARED | MAP_ANONYMOUS; + size_t img_size; + FILE *img_fp = NULL; + int ret; + unsigned char *buf; + int num_handles = 0; + char img_path[PATH_MAX]; + CriuRenderNode *rd = NULL; + CriuKfd *e = NULL; + + DIR *d; + struct dirent *dir; + d = opendir("."); + if (d) { + while ((dir = readdir(d)) != NULL) { + if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) { + pr_info("CC3: Found kfd file\n"); + img_fp = open_img_file(dir->d_name, false, &img_size); + buf = xmalloc(img_size); + if (!buf) { + fclose(img_fp); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", img_path); + fclose(img_fp); + xfree(buf); + return ret; + } + + fclose(img_fp); + e = criu_kfd__unpack(NULL, img_size, buf); + num_handles += e->num_of_bos; + criu_kfd__free_unpacked(e, NULL); + xfree(buf); + } + if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) { + pr_info("CC3: Found drm file\n"); + img_fp = open_img_file(dir->d_name, false, &img_size); + buf = xmalloc(img_size); + if (!buf) { + fclose(img_fp); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", img_path); + fclose(img_fp); + xfree(buf); + return ret; + } + + fclose(img_fp); + rd = criu_render_node__unpack(NULL, img_size, buf); + num_handles += rd->num_of_bos; + criu_render_node__free_unpacked(rd, NULL); + xfree(buf); + } + } + closedir(d); + } + + if (num_handles > 0) { + shared_memory = mmap(NULL, sizeof(shared_memory), protection, visibility, -1, 0); + shared_memory->num_handles = num_handles; + shared_memory->handles = mmap(NULL, sizeof(struct handle_id) * num_handles, protection, visibility, -1, 0); + + for (int i = 0; i < num_handles; i++) { + 
shared_memory->handles[i].handle = -1; + shared_memory->handles[i].fdstore_id = -1; + } + + shared_memory_mutex = shmalloc(sizeof(*shared_memory_mutex)); + if (!shared_memory_mutex) { + pr_err("Can't create amdgpu mutex\n"); + return -1; + } + mutex_init(shared_memory_mutex); + } + } + + return 0; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, amdgpu_restore_init) + static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets, CriuKfd *e) { @@ -1095,6 +1243,8 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd { struct thread_data *thread_datas; int ret = 0, i; + amdgpu_device_handle h_dev; + uint32_t major, minor; pr_debug("Dumping %d BOs\n", args->num_bos); @@ -1118,6 +1268,19 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd boinfo->size = bo_bucket->size; boinfo->offset = bo_bucket->offset; boinfo->alloc_flags = bo_bucket->alloc_flags; + + ret = amdgpu_device_initialize(node_get_drm_render_device(sys_get_node_by_gpu_id(&src_topology, bo_bucket->gpu_id)), &major, &minor, &h_dev); + + boinfo->handle = get_gem_handle(h_dev, bo_bucket->dmabuf_fd); + + amdgpu_device_deinitialize(h_dev); + } + for (i = 0; i < e->num_of_bos; i++) { + KfdBoEntry *boinfo = e->bo_entries[i]; + + ret = record_shared_bo(boinfo->handle, false); + if (ret) + goto exit; } for (int i = 0; i < e->num_of_gpus; i++) { @@ -1457,6 +1620,29 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) } pr_info("Restore BOs Ok\n"); + + return 0; +} + +int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int fd) +{ + struct vma_metadata *vma_md; + + vma_md = xmalloc(sizeof(*vma_md)); + if (!vma_md) { + return -ENOMEM; + } + + memset(vma_md, 0, sizeof(*vma_md)); + + vma_md->old_pgoff = offset; + vma_md->vma_entry = addr; + + vma_md->new_pgoff = restored_offset; + vma_md->fd = fd; + + list_add_tail(&vma_md->list, &update_vma_info_list); + 
return 0; } @@ -1691,8 +1877,18 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); fd = node_get_drm_render_device(tp_node); - if (fd < 0) + if (fd < 0) { pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); + return -1; + } + + ret = amdgpu_plugin_drm_restore_file(fd, rd); + if (ret == 1) + *retry_needed = true; + if (ret < 0) { + fd = ret; + goto fail; + } fail: criu_render_node__free_unpacked(rd, NULL); xfree(buf); @@ -1704,12 +1900,20 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in * tp_node. */ - fd = dup(fd); - if (fd == -1) { - pr_perror("unable to duplicate the render fd"); - return -1; + + if (fd < 0) + return fd; + + if (!(*retry_needed)) { + fd = dup(fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + return fd; } - return fd; + + return 0; } fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1753,11 +1957,13 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) * This way, we know that the file descriptors we store will not conflict with file descriptors inside core * CRIU. 
*/ - fd_next = find_unused_fd_pid(e->pid); - if (fd_next <= 0) { - pr_err("Failed to find unused fd (fd:%d)\n", fd_next); - ret = -EINVAL; - goto exit; + if (fd_next == -1) { + fd_next = find_unused_fd_pid(e->pid); + if (fd_next <= 0) { + pr_err("Failed to find unused fd (fd:%d)\n", fd_next); + ret = -EINVAL; + goto exit; + } } ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology); @@ -1790,14 +1996,26 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) args.num_objects = e->num_of_objects; args.priv_data_size = e->priv_data.len; args.priv_data = (uintptr_t)e->priv_data.data; - args.op = KFD_CRIU_OP_RESTORE; + if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { pr_perror("Restore ioctl failed"); ret = -1; goto exit; } + if (ret < 0) + goto exit; + + for (int i = 0; i < args.num_bos; i++) { + struct kfd_criu_bo_bucket *bo_bucket = &((struct kfd_criu_bo_bucket *)args.bos)[i]; + KfdBoEntry *bo_entry = e->bo_entries[i]; + + if (bo_entry->handle != -1) { + store_dmabuf_fd(bo_entry->handle, bo_bucket->dmabuf_fd); + } + } + ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e); if (ret) goto exit; @@ -1940,19 +2158,14 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } } + clear_restore_state(); + close(fd); return exit_code; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) -int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, - amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) -{ - return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); -} - int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) { int ret = 0; @@ -2061,8 +2274,10 @@ void *parallel_restore_bo_contents(void *_thread_data) entry = &restore_cmd->entries[i]; fseek(bo_contents_fp, 
entry->read_offset + offset, SEEK_SET); - ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, + buffer, buffer_size, h_dev, + max_copy_size, SDMA_OP_VRAM_WRITE, false); + if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); goto err_sdma; diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index d54cd937d..199dad21e 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -19,19 +19,112 @@ #include #include "common/list.h" +#include "files.h" +#include "fdstore.h" #include "criu-amdgpu.pb-c.h" +#define __user +#include "drm.h" #include #include #include "xmalloc.h" -#include "criu-log.h" -#include "kfd_ioctl.h" +#include "amdgpu_drm.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "util.h" +#include "common/scm.h" + +int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd) +{ + uint32_t handle; + int fd = amdgpu_device_get_fd(h_dev); + + if (dmabuf_fd == -1) { + return -1; + } + + drmPrimeFDToHandle(fd, dmabuf_fd, &handle); + + return handle; +} + +int drmIoctl(int fd, unsigned long request, void *arg) +{ + int ret, max_retries = 200; + + do { + ret = ioctl(fd, request, arg); + } while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN)); + + if (ret == -1 && errno == EBADF) + /* In case pthread_atfork didn't catch it, this will + * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN. 
+ */ + pr_perror("KFD file descriptor not valid in this process"); + return ret; +} + +static int allocate_bo_entries(CriuRenderNode *e, int num_bos) +{ + e->bo_entries = xmalloc(sizeof(DrmBoEntry *) * num_bos); + if (!e->bo_entries) { + pr_err("Failed to allocate bo_info\n"); + return -ENOMEM; + } + + for (int i = 0; i < num_bos; i++) { + DrmBoEntry *entry = xzalloc(sizeof(*entry)); + + if (!entry) { + pr_err("Failed to allocate botest\n"); + return -ENOMEM; + } + + drm_bo_entry__init(entry); + + e->bo_entries[i] = entry; + e->n_bo_entries++; + } + return 0; +} + +static int allocate_vm_entries(DrmBoEntry *e, int num_vms) +{ + e->vm_entries = xmalloc(sizeof(DrmVmEntry *) * num_vms); + if (!e->vm_entries) { + pr_err("Failed to allocate bo_info\n"); + return -ENOMEM; + } + + for (int i = 0; i < num_vms; i++) { + DrmVmEntry *entry = xzalloc(sizeof(*entry)); + + if (!entry) { + pr_err("Failed to allocate botest\n"); + return -ENOMEM; + } + + drm_vm_entry__init(entry); + + e->vm_entries[i] = entry; + e->n_vm_entries++; + } + return 0; +} + +static void free_e(CriuRenderNode *e) +{ + for (int i = 0; i < e->n_bo_entries; i++) { + if (e->bo_entries[i]) + xfree(e->bo_entries[i]); + } + + xfree(e); +} int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) { @@ -60,19 +153,260 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) return 0; } +static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int drm_fd, int *dmabufs) +{ + size_t image_size = 0, max_bo_size = 0, buffer_size; + struct amdgpu_gpu_info gpu_info = { 0 }; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + uint32_t major, minor; + FILE *bo_contents_fp = NULL; + void *buffer = NULL; + char img_path[40]; + int num_bos = 0; + int i, ret = 0; + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev); + if (ret) { + pr_perror("failed to initialize device"); + goto exit; + } + plugin_log_msg("libdrm initialized successfully\n"); + + ret = 
amdgpu_query_gpu_info(h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto exit; + } + + max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + + for (i = 0; i < rd->num_of_bos; i++) { + if (rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) { + if (rd->bo_entries[i]->size > max_bo_size) + max_bo_size = rd->bo_entries[i]->size; + } + } + + buffer_size = max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto exit; + } + + for (i = 0; i < rd->num_of_bos; i++) { + if (!(rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT))) + continue; + + if (rd->bo_entries[i]->num_of_vms == 0) + continue; + + num_bos++; + + snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i); + + bo_contents_fp = open_img_file(img_path, false, &image_size); + + ret = sdma_copy_bo(dmabufs[i], rd->bo_entries[i]->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE, true); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + break; + } + plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i); + + if (bo_contents_fp) + fclose(bo_contents_fp); + } + +exit: + for (int i = 0; i < rd->num_of_bos; i++) { + if (dmabufs[i] != KFD_INVALID_FD) + close(dmabufs[i]); + } + + xfree(buffer); + + amdgpu_device_deinitialize(h_dev); + return ret; +} int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) { - CriuRenderNode rd = CRIU_RENDER_NODE__INIT; - struct tp_node *tp_node; + CriuRenderNode *rd = NULL; char path[PATH_MAX]; unsigned char *buf; int minor; int len; int ret; + size_t image_size; + struct tp_node *tp_node; + struct drm_amdgpu_gem_list_handles 
list_handles_args = { 0 }; + struct drm_amdgpu_gem_list_handles_entry *list_handles_entries; + int num_bos; + + rd = xmalloc(sizeof(*rd)); + if (!rd) { + ret = -ENOMEM; + goto exit; + } + criu_render_node__init(rd); /* Get the topology node of the DRM device */ minor = minor(drm->st_rdev); + rd->drm_render_minor = minor; + rd->id = id; + + num_bos = 8; + list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); + list_handles_args.num_entries = num_bos; + list_handles_args.entries = (uintptr_t)list_handles_entries; + + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); + if (ret && errno == EINVAL) { + pr_info("This kernel appears not to have AMDGPU_GEM_LIST_HANDLES ioctl. Consider disabling Dmabuf IPC or updating your kernel.\n"); + list_handles_args.num_entries = 0; + } else if (ret) { + pr_perror("Failed to call bo info ioctl"); + goto exit; + } + + if (list_handles_args.num_entries > num_bos) { + num_bos = list_handles_args.num_entries; + xfree(list_handles_entries); + list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); + list_handles_args.num_entries = num_bos; + list_handles_args.entries = (uintptr_t)list_handles_entries; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); + if (ret) { + pr_perror("Failed to call bo info ioctl"); + goto exit; + } + } else { + num_bos = list_handles_args.num_entries; + } + + rd->num_of_bos = num_bos; + ret = allocate_bo_entries(rd, num_bos); + if (ret) + goto exit; + + for (int i = 0; i < num_bos; i++) { + int num_vm_entries = 8; + struct drm_amdgpu_gem_vm_entry *vm_info_entries; + struct drm_amdgpu_gem_op vm_info_args = { 0 }; + DrmBoEntry *boinfo = rd->bo_entries[i]; + struct drm_amdgpu_gem_list_handles_entry handle_entry = list_handles_entries[i]; + union drm_amdgpu_gem_mmap mmap_args = { 0 }; + int dmabuf_fd; + uint32_t major, minor; + amdgpu_device_handle h_dev; + void *buffer = NULL; + char 
img_path[40]; + FILE *bo_contents_fp = NULL; + int device_fd; + + boinfo->size = handle_entry.size; + + boinfo->alloc_flags = handle_entry.alloc_flags; + boinfo->preferred_domains = handle_entry.preferred_domains; + boinfo->alignment = handle_entry.alignment; + boinfo->handle = handle_entry.gem_handle; + boinfo->is_import = (handle_entry.flags & AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT) || shared_bo_has_exporter(boinfo->handle); + + mmap_args.in.handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { + pr_perror("Error Failed to call mmap ioctl"); + ret = -1; + goto exit; + } + + boinfo->offset = mmap_args.out.addr_ptr; + + vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries); + vm_info_args.handle = handle_entry.gem_handle; + vm_info_args.num_entries = num_vm_entries; + vm_info_args.value = (uintptr_t)vm_info_entries; + vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); + if (ret) { + pr_perror("Failed to call vm info ioctl"); + goto exit; + } + + if (vm_info_args.num_entries > num_vm_entries) { + num_vm_entries = vm_info_args.num_entries; + xfree(vm_info_entries); + vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries); + vm_info_args.handle = handle_entry.gem_handle; + vm_info_args.num_entries = num_vm_entries; + vm_info_args.value = (uintptr_t)vm_info_entries; + vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); + if (ret) { + pr_perror("Failed to call vm info ioctl"); + goto exit; + } + } else { + num_vm_entries = vm_info_args.num_entries; + } + + boinfo->num_of_vms = num_vm_entries; + ret = allocate_vm_entries(boinfo, num_vm_entries); + if (ret) + goto exit; + + for (int j = 0; j < num_vm_entries; j++) { + DrmVmEntry *vminfo = boinfo->vm_entries[j]; + + boinfo->addr = vm_info_entries[j].addr; + vminfo->addr = vm_info_entries[j].addr; + 
vminfo->size = vm_info_entries[j].size; + vminfo->offset = vm_info_entries[j].offset; + vminfo->flags = vm_info_entries[j].flags; + } + + ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); + + device_fd = amdgpu_device_get_fd(h_dev); + + drmPrimeHandleToFD(device_fd, boinfo->handle, 0, &dmabuf_fd); + + snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, rd->drm_render_minor, i); + bo_contents_fp = open_img_file(img_path, true, &image_size); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), handle_entry.size); + + ret = sdma_copy_bo(dmabuf_fd, handle_entry.size, bo_contents_fp, buffer, handle_entry.size, h_dev, 0x1000, + SDMA_OP_VRAM_READ, false); + + if (dmabuf_fd != KFD_INVALID_FD) + close(dmabuf_fd); + + if (bo_contents_fp) + fclose(bo_contents_fp); + + ret = amdgpu_device_deinitialize(h_dev); + if (ret) + goto exit; + + xfree(vm_info_entries); + } + xfree(list_handles_entries); + + for (int i = 0; i < num_bos; i++) { + DrmBoEntry *boinfo = rd->bo_entries[i]; + + ret = record_shared_bo(boinfo->handle, boinfo->is_import); + if (ret) + goto exit; + } + tp_node = sys_get_node_by_render_minor(&src_topology, minor); if (!tp_node) { pr_err("Failed to find a device with minor number = %d\n", minor); @@ -80,21 +414,156 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) } /* Get the GPU_ID of the DRM device */ - rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); - if (!rd.gpu_id) { - pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id); + rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); + if (!rd->gpu_id) { + pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id); return -ENODEV; } - len = criu_render_node__get_packed_size(&rd); + len = criu_render_node__get_packed_size(rd); buf = xmalloc(len); if (!buf) return -ENOMEM; - criu_render_node__pack(&rd, buf); + criu_render_node__pack(rd, buf); snprintf(path, sizeof(path), IMG_DRM_FILE, id); ret = write_img_file(path, 
buf, len); + xfree(buf); +exit: + free_e(rd); return ret; } + +int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) +{ + int ret = 0; + bool retry_needed = false; + uint32_t major, minor; + amdgpu_device_handle h_dev; + int device_fd; + int *dmabufs = xzalloc(sizeof(int) * rd->num_of_bos); + + ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); + if (ret) { + pr_info("Error in init amdgpu device\n"); + goto exit; + } + + device_fd = amdgpu_device_get_fd(h_dev); + + for (int i = 0; i < rd->num_of_bos; i++) { + DrmBoEntry *boinfo = rd->bo_entries[i]; + int dmabuf_fd = -1; + uint32_t handle; + struct drm_gem_change_handle change_args = { 0 }; + union drm_amdgpu_gem_mmap mmap_args = { 0 }; + struct drm_amdgpu_gem_va va_args = { 0 }; + int fd_id; + + if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { + continue; + } else if (boinfo->handle != -1) { + if (boinfo->is_import) { + fd_id = amdgpu_id_for_handle(boinfo->handle); + if (fd_id == -1) { + retry_needed = true; + continue; + } + dmabuf_fd = fdstore_get(fd_id); + } + } + + if (boinfo->is_import) { + drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); + } else { + union drm_amdgpu_gem_create create_args = { 0 }; + + create_args.in.bo_size = boinfo->size; + create_args.in.alignment = boinfo->alignment; + create_args.in.domains = boinfo->preferred_domains; + create_args.in.domain_flags = boinfo->alloc_flags; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create_args) == -1) { + pr_perror("Error Failed to call create ioctl"); + ret = -1; + goto exit; + } + handle = create_args.out.handle; + + drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); + } + + change_args.handle = handle; + change_args.new_handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_GEM_CHANGE_HANDLE, &change_args) == -1) { + pr_perror("Error Failed to call change ioctl; check if the kernel has DRM_IOCTL_GEM_CHANGE_HANDLE support"); + ret = -1; + goto exit; + } + + if (!boinfo->is_import) + 
store_dmabuf_fd(boinfo->handle, dmabuf_fd); + + dmabufs[i] = dmabuf_fd; + + ret = record_completed_work(boinfo->handle, rd->drm_render_minor); + if (ret) + goto exit; + + mmap_args.in.handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { + pr_perror("Error Failed to call mmap ioctl"); + ret = -1; + goto exit; + } + + for (int j = 0; j < boinfo->num_of_vms; j++) { + DrmVmEntry *vminfo = boinfo->vm_entries[j]; + + va_args.handle = boinfo->handle; + va_args.operation = AMDGPU_VA_OP_MAP; + va_args.flags = vminfo->flags; + va_args.va_address = vminfo->addr; + va_args.offset_in_bo = vminfo->offset; + va_args.map_size = vminfo->size; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_VA, &va_args) == -1) { + pr_perror("Error Failed to call gem va ioctl"); + ret = -1; + goto exit; + } + } + + ret = save_vma_updates(boinfo->offset, boinfo->addr, mmap_args.out.addr_ptr, fd); + if (ret < 0) + goto exit; + } + + if (ret) { + pr_info("Error in deinit amdgpu device\n"); + goto exit; + } + + ret = record_completed_work(-1, rd->drm_render_minor); + if (ret) + goto exit; + + ret = amdgpu_device_deinitialize(h_dev); + + if (rd->num_of_bos > 0) { + ret = restore_bo_contents_drm(rd->drm_render_minor, rd, fd, dmabufs); + if (ret) + goto exit; + } + +exit: + if (ret < 0) + return ret; + xfree(dmabufs); + + return retry_needed; +} diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h index 6f0c1a9a6..c766def56 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.h +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -24,5 +24,17 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); */ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm); +int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd); + +int amdgpu_plugin_drm_unpause_file(int fd); + +int amdgpu_id_for_handle(int handle); + +int store_dmabuf_fd(int handle, int fd); + +int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd); + +int 
save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id); + #endif /* __AMDGPU_PLUGIN_DRM_H__ */ diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index a165fc9cd..491e7fc74 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -41,6 +41,9 @@ /* Tracks number of device files that need to be checkpointed */ static int dev_file_cnt = 0; +static LIST_HEAD(shared_bos); +static LIST_HEAD(completed_work); + /* Helper structures to encode device topology of SRC and DEST platforms */ struct tp_system src_topology; struct tp_system dest_topology; @@ -68,6 +71,87 @@ void init_gpu_count(struct tp_system *topo) dev_file_cnt = 1 + topology_gpu_count(topo); } +bool shared_bo_has_exporter(int handle) +{ + struct shared_bo *bo; + + if (handle == -1) + return false; + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == handle) { + return bo->has_exporter; + } + } + + return false; +} + +int record_shared_bo(int handle, bool is_imported) +{ + struct shared_bo *bo; + + if (handle == -1) + return 0; + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == handle) { + return 0; + } + } + bo = malloc(sizeof(struct shared_bo)); + if (!bo) + return -1; + bo->handle = handle; + bo->has_exporter = !is_imported; + list_add(&bo->l, &shared_bos); + + return 0; +} + +int record_completed_work(int handle, int id) +{ + struct restore_completed_work *work; + + work = malloc(sizeof(struct restore_completed_work)); + if (!work) + return -1; + work->handle = handle; + work->id = id; + list_add(&work->l, &completed_work); + + return 0; +} + +bool work_already_completed(int handle, int id) +{ + struct restore_completed_work *work; + + list_for_each_entry(work, &completed_work, l) { + if (work->handle == handle && work->id == id) { + return true; + } + } + + return false; +} + +void clear_restore_state() +{ + while (!list_empty(&shared_dmabuf_fds)) { + struct shared_dmabuf *st 
= list_first_entry(&shared_dmabuf_fds, struct shared_dmabuf, l); + list_del(&st->l); + close(st->dmabuf_fd); + free(st); + } + + while (!list_empty(&completed_work)) { + struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l); + list_del(&st->l); + free(st); + } +} + int read_fp(FILE *fp, void *buf, const size_t buf_len) { size_t len_read; diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h index aacca3a28..046a82fb0 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -1,6 +1,8 @@ #ifndef __AMDGPU_PLUGIN_UTIL_H__ #define __AMDGPU_PLUGIN_UTIL_H__ +#include + #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif @@ -52,7 +54,7 @@ #define IMG_DRM_FILE "amdgpu-renderD-%d.img" /* Name of file having serialized data of DRM device buffer objects (BOs) */ -#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img" +#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img" /* Helper macros to Checkpoint and Restore a ROCm file */ #define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" @@ -73,6 +75,24 @@ enum sdma_op_type { SDMA_OP_VRAM_WRITE, }; +struct dumped_fd { + struct list_head l; + int fd; + bool is_drm; +}; + +struct shared_bo { + struct list_head l; + int handle; + bool has_exporter; +}; + +struct restore_completed_work { + struct list_head l; + int handle; + int id; +}; + /* Helper structures to encode device topology of SRC and DEST platforms */ extern struct tp_system src_topology; extern struct tp_system dest_topology; @@ -101,6 +121,23 @@ bool checkpoint_is_complete(); void decrement_checkpoint_count(); void init_gpu_count(struct tp_system *topology); +bool shared_bo_has_exporter(int handle); +int record_shared_bo(int handle, bool is_imported); + +int record_shared_dmabuf_fd(int handle, int dmabuf_fd); +int dmabuf_fd_for_handle(int handle); + +int record_completed_work(int handle, int id); +bool work_already_completed(int handle, int id); + 
+void clear_restore_state(); + void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list); +int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free); + +int serve_out_dmabuf_fd(int handle, int fd); + #endif /* __AMDGPU_PLUGIN_UTIL_H__ */ diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 078b67650..565413c34 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -46,6 +46,7 @@ message kfd_bo_entry { required uint64 offset = 3; required uint32 alloc_flags = 4; required uint32 gpu_id = 5; + required uint32 handle = 6; } message criu_kfd { @@ -61,6 +62,30 @@ message criu_kfd { required bytes priv_data = 10; } +message drm_bo_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; + required uint64 alloc_flags = 4; + required uint64 alignment = 5; + required uint32 preferred_domains = 6; + required uint32 handle = 7; + required uint32 is_import = 8; + required uint32 num_of_vms = 9; + repeated drm_vm_entry vm_entries = 10; +} + +message drm_vm_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; + required uint64 flags = 4; +} + message criu_render_node { required uint32 gpu_id = 1; + required uint32 id = 2; + required uint32 drm_render_minor = 3; + required uint64 num_of_bos = 4; + repeated drm_bo_entry bo_entries = 5; } From d43217dadb9764e0342306da84f45f7a85c78bbf Mon Sep 17 00:00:00 2001 From: David Francis Date: Thu, 30 Oct 2025 22:56:37 -0700 Subject: [PATCH 248/257] plugin: Add DUMP_DEVICES_LATE callback The amdgpu plugin was counting how many files were checkpointed to determine when it should close the device files. The number of device files is not consistent; a process may have multiple copies of the drm device files open. 
Instead of doing this counting, add a new callback after all files are checkpointed, so plugins can clean up their resources at an appropriate time. Signed-off-by: David Francis --- criu/cr-dump.c | 4 +++ criu/include/criu-plugin.h | 3 ++ criu/plugin.c | 1 + plugins/amdgpu/amdgpu_plugin.c | 55 ++++++++++++----------------- plugins/amdgpu/amdgpu_plugin_util.c | 42 +++++++++++++--------- plugins/amdgpu/amdgpu_plugin_util.h | 6 ++-- 6 files changed, 60 insertions(+), 51 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 60b8e793c..4df40e9b6 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2247,6 +2247,10 @@ int cr_dump_tasks(pid_t pid) goto err; } + ret = run_plugins(DUMP_DEVICES_LATE, pid); + if (ret && ret != -ENOTSUP) + goto err; + if (parent_ie) { inventory_entry__free_unpacked(parent_ie, NULL); parent_ie = NULL; diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 977dad655..c3bea1385 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -64,6 +64,8 @@ enum { CR_PLUGIN_HOOK__RESTORE_INIT = 13, + CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14, + CR_PLUGIN_HOOK__MAX }; @@ -84,6 +86,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int id); enum { CR_PLUGIN_STAGE__DUMP, diff --git a/criu/plugin.c b/criu/plugin.c index a2057e9c1..f9322a3c2 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -61,6 +61,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); __assign_hook(POST_FORKING, "cr_plugin_post_forking"); __assign_hook(RESTORE_INIT, "cr_plugin_restore_init"); + __assign_hook(DUMP_DEVICES_LATE, "cr_plugin_dump_devices_late"); #undef __assign_hook diff --git 
a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 4be8421a0..11e410c31 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -58,13 +58,6 @@ struct vma_metadata { /************************************ Global Variables ********************************************/ -/** - * FD of KFD device used to checkpoint. On a multi-process - * tree the order of checkpointing goes from parent to child - * and so on - so saving the FD will not be overwritten - */ -static int kfd_checkpoint_fd; - static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; @@ -1050,28 +1043,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha return 0; } -static int unpause_process(int fd) +int amdgpu_unpause_processes(int pid) { int ret = 0; struct kfd_ioctl_criu_args args = { 0 }; + struct list_head *l = get_dumped_fds(); + struct dumped_fd *st; - args.op = KFD_CRIU_OP_UNPAUSE; + list_for_each_entry(st, l, l) { + if (st->is_drm) { + close(st->fd); + } else { + args.op = KFD_CRIU_OP_UNPAUSE; - ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); - if (ret) { - pr_perror("Failed to unpause process"); - goto exit; + ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args); + if (ret) { + pr_perror("Failed to unpause process"); + goto exit; + } + } } - // Reset the KFD FD - kfd_checkpoint_fd = -1; - sys_close_drm_render_devices(&src_topology); - exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? 
"Failed" : "Ok", ret); + clear_dumped_fds(); return ret; } +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, amdgpu_unpause_processes) int store_dmabuf_fd(int handle, int fd) { @@ -1401,9 +1400,6 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } - /* Initialize number of device files that will be checkpointed */ - init_gpu_count(&src_topology); - /* Check whether this plugin was called for kfd or render nodes */ if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { @@ -1415,11 +1411,9 @@ int amdgpu_plugin_dump_file(int fd, int id) if (ret) return ret; - /* Invoke unpause process if needed */ - decrement_checkpoint_count(); - if (checkpoint_is_complete()) { - ret = unpause_process(kfd_checkpoint_fd); - } + ret = record_dumped_fd(fd, true); + if (ret) + return ret; /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; @@ -1517,14 +1511,11 @@ int amdgpu_plugin_dump_file(int fd, int id) xfree(buf); -exit: - /* Restore all queues if conditions permit */ - kfd_checkpoint_fd = fd; - decrement_checkpoint_count(); - if (checkpoint_is_complete()) { - ret = unpause_process(fd); - } + ret = record_dumped_fd(fd, false); + if (ret) + goto exit; +exit: xfree((void *)args.devices); xfree((void *)args.bos); xfree((void *)args.priv_data); diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index 491e7fc74..fd59c06ad 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -38,9 +38,7 @@ #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" -/* Tracks number of device files that need to be checkpointed */ -static int dev_file_cnt = 0; - +static LIST_HEAD(dumped_fds); static LIST_HEAD(shared_bos); static LIST_HEAD(completed_work); @@ -52,23 +50,25 @@ struct tp_system dest_topology; struct device_maps checkpoint_maps; struct device_maps restore_maps; -bool checkpoint_is_complete() +int record_dumped_fd(int fd, bool 
is_drm) { - return (dev_file_cnt == 0); + int newfd = dup(fd); + + if (newfd < 0) + return newfd; + struct dumped_fd *st = malloc(sizeof(struct dumped_fd)); + if (!st) + return -1; + st->fd = newfd; + st->is_drm = is_drm; + list_add(&st->l, &dumped_fds); + + return 0; } -void decrement_checkpoint_count() +struct list_head *get_dumped_fds() { - dev_file_cnt--; -} - -void init_gpu_count(struct tp_system *topo) -{ - if (dev_file_cnt != 0) - return; - - /* We add ONE to include checkpointing of KFD device */ - dev_file_cnt = 1 + topology_gpu_count(topo); + return &dumped_fds; } bool shared_bo_has_exporter(int handle) @@ -152,6 +152,16 @@ void clear_restore_state() } } +void clear_dumped_fds() +{ + while (!list_empty(&dumped_fds)) { + struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l); + list_del(&st->l); + close(st->fd); + free(st); + } +} + int read_fp(FILE *fp, void *buf, const size_t buf_len) { size_t len_read; diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h index 046a82fb0..f20388efa 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -117,9 +117,9 @@ int read_file(const char *file_path, void *buf, const size_t buf_len); int write_img_file(char *path, const void *buf, const size_t buf_len); FILE *open_img_file(char *path, bool write, size_t *size); -bool checkpoint_is_complete(); -void decrement_checkpoint_count(); -void init_gpu_count(struct tp_system *topology); +int record_dumped_fd(int fd, bool is_drm); +struct list_head *get_dumped_fds(); +void clear_dumped_fds(); bool shared_bo_has_exporter(int handle); int record_shared_bo(int handle, bool is_imported); From 9e404e2083913cde0bad2d0396e6cc7c311a8ba4 Mon Sep 17 00:00:00 2001 From: David Francis Date: Thu, 30 Oct 2025 22:57:04 -0700 Subject: [PATCH 249/257] plugin/amdgpu: Support for checkpoint of dmabuf fds amdgpu libraries that use dmabuf fd to share GPU memory between processes close the dmabuf fds 
immediately after using them. However, it is possible that checkpoint of a process catches one of the dmabuf fds open. In that case, the amdgpu plugin needs to handle it. The checkpoint of the dmabuf fd does require the device file it was exported from to have already been dumped. To identify which device this dmabuf fd was exported from, attempt to import it on each device, then record the dmabuf handle it imports as. This handle can be used to restore it. Signed-off-by: David Francis --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 34 ++++- plugins/amdgpu/amdgpu_plugin_dmabuf.c | 207 ++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_plugin_dmabuf.h | 16 ++ plugins/amdgpu/amdgpu_plugin_drm.c | 7 +- plugins/amdgpu/amdgpu_plugin_util.c | 48 +++++- plugins/amdgpu/amdgpu_plugin_util.h | 8 +- plugins/amdgpu/criu-amdgpu.proto | 4 + 8 files changed, 306 insertions(+), 20 deletions(-) create mode 100644 plugins/amdgpu/amdgpu_plugin_dmabuf.c create mode 100644 plugins/amdgpu/amdgpu_plugin_dmabuf.h diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 870a039cd..31e177e4a 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=. 
criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 11e410c31..125aaef9a 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -38,6 +38,7 @@ #include "rst-malloc.h" #include "common/list.h" +#include "amdgpu_plugin_dmabuf.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" @@ -46,7 +47,7 @@ #include "img-streamer.h" #include "image.h" #include "cr_options.h" - +#include "util.h" struct vma_metadata { struct list_head list; uint64_t old_pgoff; @@ -1064,6 +1065,9 @@ int amdgpu_unpause_processes(int pid) } } + if (post_dump_dmabuf_check() < 0) + ret = -1; + exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? 
"Failed" : "Ok", ret); clear_dumped_fds(); @@ -1400,7 +1404,17 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } - /* Check whether this plugin was called for kfd or render nodes */ + /* Check whether this plugin was called for kfd, dmabuf or render nodes */ + ret = get_dmabuf_info(fd, &st); + if (ret < 0) { + pr_perror("Failed to get dmabuf info"); + return -1; + } else if (ret == 0) { + pr_info("Dumping dmabuf fd = %d\n", fd); + ret = amdgpu_plugin_dmabuf_dump(fd, id); + return ret; + } + if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { /* This is RenderD dumper plugin, for now just save renderD @@ -1414,7 +1428,7 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = record_dumped_fd(fd, true); if (ret) return ret; - + ret = try_dump_dmabuf_list(); /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; } @@ -1538,7 +1552,6 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) int ret = 0, bucket_index = 0; pr_debug("Restoring %d devices\n", e->num_of_gpus); - args->num_devices = e->num_of_gpus; device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices); if (!device_buckets) @@ -1822,12 +1835,17 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) * first as we assume restore_maps is already filled. Need to fix this later. 
*/ snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); - pr_info("Restoring RenderD %s\n", img_path); img_fp = open_img_file(img_path, false, &img_size); - if (!img_fp) - return -EINVAL; - + if (!img_fp) { + ret = amdgpu_plugin_dmabuf_restore(id); + if (ret == 1) { + *retry_needed = true; + return 0; + } + return ret; + } + pr_info("Restoring RenderD %s\n", img_path); pr_debug("RenderD Image file size:%ld\n", img_size); buf = xmalloc(img_size); if (!buf) { diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c new file mode 100644 index 000000000..74b5f9038 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -0,0 +1,207 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common/list.h" +#include "criu-amdgpu.pb-c.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_dmabuf.h" +#include "fdstore.h" + +#include "util.h" +#include "common/scm.h" + +struct dmabuf { + int id; + int dmabuf_fd; + struct list_head node; +}; + +static LIST_HEAD(dmabuf_list); + +/* Return < 0 for error, > 0 for "not a dmabuf" and 0 "is a dmabuf" */ +int get_dmabuf_info(int fd, struct stat *st) +{ + char path[PATH_MAX]; + + if (read_fd_link(fd, path, sizeof(path)) < 0) + return -1; + + if (strncmp(path, DMABUF_LINK, strlen(DMABUF_LINK)) != 0) + return 1; + + return 0; +} + +int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) +{ + int ret = 0; + char path[PATH_MAX]; + size_t len = 0; + unsigned char *buf = NULL; + int gem_handle; + + pr_info("TWI: Dumping dmabuf fd = %d\n", dmabuf_fd); + + gem_handle = handle_for_shared_bo_fd(dmabuf_fd); + if (gem_handle < 0) { + pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd); + return -EAGAIN; /* Retry needed */ + } + + CriuDmabufNode *node = xmalloc(sizeof(*node)); + if (!node) { + pr_err("Failed to allocate memory for 
dmabuf node\n"); + return -ENOMEM; + } + criu_dmabuf_node__init(node); + + node->gem_handle = gem_handle; + + if (node->gem_handle < 0) { + pr_err("Failed to get handle for dmabuf_fd\n"); + xfree(node); + return -EINVAL; + } + + /* Serialize metadata to a file */ + snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); + len = criu_dmabuf_node__get_packed_size(node); + buf = xmalloc(len); + if (!buf) { + pr_err("Failed to allocate buffer for dmabuf metadata\n"); + xfree(node); + return -ENOMEM; + } + criu_dmabuf_node__pack(node, buf); + ret = write_img_file(path, buf, len); + + xfree(buf); + xfree(node); + return ret; +} + +int amdgpu_plugin_dmabuf_restore(int id) +{ + char path[PATH_MAX]; + size_t img_size; + FILE *img_fp = NULL; + int ret = 0; + CriuDmabufNode *rd = NULL; + unsigned char *buf = NULL; + int fd_id; + + snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); + + pr_info("TWI: Restoring dmabuf fd, id = %d\n", id); + + /* Read serialized metadata */ + img_fp = open_img_file(path, false, &img_size); + if (!img_fp) { + pr_err("Failed to open dmabuf metadata file: %s\n", path); + return -EINVAL; + } + + pr_debug("dmabuf Image file size:%ld\n", img_size); + buf = xmalloc(img_size); + if (!buf) { + pr_perror("Failed to allocate memory"); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", path); + xfree(buf); + return ret; + } + + rd = criu_dmabuf_node__unpack(NULL, img_size, buf); + if (rd == NULL) { + pr_perror("Unable to parse the dmabuf message %d", id); + xfree(buf); + fclose(img_fp); + return -1; + } + fclose(img_fp); + + pr_info("TWI: dmabuf node gem_handle = %d\n", rd->gem_handle); + + /* Match GEM handle with shared_dmabuf list */ + fd_id = amdgpu_id_for_handle(rd->gem_handle); + if (fd_id == -1) { + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", + rd->gem_handle); + return 1; + } + int dmabuf_fd = fdstore_get(fd_id); + pr_info("TWI: dmabuf node fd_id = %d, dmabuf_fd = %d\n", fd_id, 
dmabuf_fd); + if (dmabuf_fd == -1) { + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", + rd->gem_handle); + return 1; /* Retry needed */ + } else { + pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", + dmabuf_fd, rd->gem_handle); + } + ret = dmabuf_fd; + + pr_info("Successfully restored dmabuf_fd %d\n", + dmabuf_fd); + criu_dmabuf_node__free_unpacked(rd, NULL); + xfree(buf); + return ret; +} + +int amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) +{ + int ret; + + ret = __amdgpu_plugin_dmabuf_dump(dmabuf_fd, id); + if (ret == -EAGAIN) { + struct dmabuf *b = xmalloc(sizeof(*b)); + b->id = id; + b->dmabuf_fd = dmabuf_fd; + list_add(&b->node, &dmabuf_list); + return 0; + } + return ret; +} + +int try_dump_dmabuf_list() +{ + struct dmabuf *b, *t; + list_for_each_entry_safe(b, t, &dmabuf_list, node) { + int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id); + if (ret == -EAGAIN) + continue; + else if (ret) + return ret; + list_del(&b->node); + xfree(b); + } + return 0; +} + +int post_dump_dmabuf_check() +{ + if (!list_empty(&dmabuf_list)) { + pr_err("Not all dma buffers have been dumped\n"); + return -1; + } + return 1; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.h b/plugins/amdgpu/amdgpu_plugin_dmabuf.h new file mode 100644 index 000000000..f07af7ee0 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.h @@ -0,0 +1,16 @@ + +#ifndef __AMDGPU_PLUGIN_DMABUF_H__ +#define __AMDGPU_PLUGIN_DMABUF_H__ + +#include "amdgpu_plugin_util.h" +#include "criu-amdgpu.pb-c.h" + +int amdgpu_plugin_dmabuf_dump(int fd, int id); +int amdgpu_plugin_dmabuf_restore(int id); + +int try_dump_dmabuf_list(); +int post_dump_dmabuf_check(); + +int get_dmabuf_info(int fd, struct stat *st); + +#endif /* __AMDGPU_PLUGIN_DMABUF_H__ */ \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 199dad21e..8466ca40d 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ 
b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -47,7 +47,8 @@ int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd) return -1; } - drmPrimeFDToHandle(fd, dmabuf_fd, &handle); + if (drmPrimeFDToHandle(fd, dmabuf_fd, &handle)) + return -1; return handle; } @@ -465,6 +466,7 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { continue; } else if (boinfo->handle != -1) { + pr_info("TWI: restore bo %d\n", boinfo->handle); if (boinfo->is_import) { fd_id = amdgpu_id_for_handle(boinfo->handle); if (fd_id == -1) { @@ -472,11 +474,13 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) continue; } dmabuf_fd = fdstore_get(fd_id); + pr_info("TWI: restore bo %d: fd_id %d, dmabuf_fd %d\n", boinfo->handle, fd_id, dmabuf_fd); } } if (boinfo->is_import) { drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); + pr_info("TWI: restore bo imported to handle %d\n", handle); } else { union drm_amdgpu_gem_create create_args = { 0 }; @@ -493,6 +497,7 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) handle = create_args.out.handle; drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); + pr_info("TWI: restore bo created at handle %d and exported to fd %d\n", handle, dmabuf_fd); } change_args.handle = handle; diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index fd59c06ad..a2cafa4a3 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -37,6 +37,7 @@ #include "amdgpu_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_plugin_drm.h" static LIST_HEAD(dumped_fds); static LIST_HEAD(shared_bos); @@ -109,6 +110,46 @@ int record_shared_bo(int handle, bool is_imported) return 0; } +int handle_for_shared_bo_fd(int fd) +{ + struct dumped_fd *df; + int trial_handle; + amdgpu_device_handle h_dev; + uint32_t major, minor; + struct shared_bo *bo; + + list_for_each_entry(df, 
&dumped_fds, l) { + /* see if the gem handle for fd using the hdev for df->fd is the + same as bo->handle. */ + + if (!df->is_drm) { + continue; + } + + if (amdgpu_device_initialize(df->fd, &major, &minor, &h_dev)) { + pr_err("Failed to initialize amdgpu device\n"); + continue; + } + + trial_handle = get_gem_handle(h_dev, fd); + if (trial_handle < 0) + continue; + + pr_info("TWI: Check device %d, got handle %d\n", df->fd, trial_handle); + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == trial_handle) { + pr_info("TWI: And that handle exists\n"); + return trial_handle; + } + } + + amdgpu_device_deinitialize(h_dev); + } + + return -1; +} + int record_completed_work(int handle, int id) { struct restore_completed_work *work; @@ -138,13 +179,6 @@ bool work_already_completed(int handle, int id) void clear_restore_state() { - while (!list_empty(&shared_dmabuf_fds)) { - struct shared_dmabuf *st = list_first_entry(&shared_dmabuf_fds, struct shared_dmabuf, l); - list_del(&st->l); - close(st->dmabuf_fd); - free(st); - } - while (!list_empty(&completed_work)) { struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l); list_del(&st->l); diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h index f20388efa..f5f752d0b 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -53,6 +53,9 @@ /* Name of file having serialized data of DRM device */ #define IMG_DRM_FILE "amdgpu-renderD-%d.img" +/* Name of file having serialized data of dmabuf meta */ +#define IMG_DMABUF_FILE "amdgpu-dmabuf_%d.img" + /* Name of file having serialized data of DRM device buffer objects (BOs) */ #define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img" @@ -61,6 +64,7 @@ #define HSAKMT_SHM "/hsakmt_shared_mem" #define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" #define HSAKMT_SEM "hsakmt_semaphore" +#define DMABUF_LINK "/dmabuf" /* Help macros to build sDMA command 
packets */ #define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) @@ -123,9 +127,7 @@ void clear_dumped_fds(); bool shared_bo_has_exporter(int handle); int record_shared_bo(int handle, bool is_imported); - -int record_shared_dmabuf_fd(int handle, int dmabuf_fd); -int dmabuf_fd_for_handle(int handle); +int handle_for_shared_bo_fd(int dmabuf_fd); int record_completed_work(int handle, int id); bool work_already_completed(int handle, int id); diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 565413c34..7682a8f21 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -89,3 +89,7 @@ message criu_render_node { required uint64 num_of_bos = 4; repeated drm_bo_entry bo_entries = 5; } + +message criu_dmabuf_node { + required uint32 gem_handle = 1; +} From ff35a9126e3a2d4e6f5f9f2ca89b032f9ae5bc22 Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:26:44 -0800 Subject: [PATCH 250/257] plugins/amdgpu: remove excessive debug messages The pr_info lines that begin with "CC3" and "TWI" were not meant to be included in the patch.
Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 5 +---- plugins/amdgpu/amdgpu_plugin_dmabuf.c | 22 ++++++---------------- plugins/amdgpu/amdgpu_plugin_drm.c | 4 ---- plugins/amdgpu/amdgpu_plugin_util.c | 6 +----- 4 files changed, 8 insertions(+), 29 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 125aaef9a..4640ccf88 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -48,6 +48,7 @@ #include "image.h" #include "cr_options.h" #include "util.h" + struct vma_metadata { struct list_head list; uint64_t old_pgoff; @@ -327,8 +328,6 @@ void getenv_size_t(const char *var, size_t *value) int sh = 0; size_t size; - pr_info("Value str: %s\n", value_str); - if (value_str) { size = (size_t)strtoul(value_str, &endp, 0); if (errno || value_str == endp) { @@ -1132,7 +1131,6 @@ int amdgpu_restore_init(void) if (d) { while ((dir = readdir(d)) != NULL) { if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) { - pr_info("CC3: Found kfd file\n"); img_fp = open_img_file(dir->d_name, false, &img_size); buf = xmalloc(img_size); if (!buf) { @@ -1155,7 +1153,6 @@ int amdgpu_restore_init(void) xfree(buf); } if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) { - pr_info("CC3: Found drm file\n"); img_fp = open_img_file(dir->d_name, false, &img_size); buf = xmalloc(img_size); if (!buf) { diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c index 74b5f9038..bdc107f64 100644 --- a/plugins/amdgpu/amdgpu_plugin_dmabuf.c +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -55,8 +55,6 @@ int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) unsigned char *buf = NULL; int gem_handle; - pr_info("TWI: Dumping dmabuf fd = %d\n", dmabuf_fd); - gem_handle = handle_for_shared_bo_fd(dmabuf_fd); if (gem_handle < 0) { pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd); 
@@ -107,8 +105,6 @@ int amdgpu_plugin_dmabuf_restore(int id) snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); - pr_info("TWI: Restoring dmabuf fd, id = %d\n", id); - /* Read serialized metadata */ img_fp = open_img_file(path, false, &img_size); if (!img_fp) { @@ -139,29 +135,23 @@ int amdgpu_plugin_dmabuf_restore(int id) } fclose(img_fp); - pr_info("TWI: dmabuf node gem_handle = %d\n", rd->gem_handle); - /* Match GEM handle with shared_dmabuf list */ fd_id = amdgpu_id_for_handle(rd->gem_handle); if (fd_id == -1) { - pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", - rd->gem_handle); + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); return 1; } + int dmabuf_fd = fdstore_get(fd_id); - pr_info("TWI: dmabuf node fd_id = %d, dmabuf_fd = %d\n", fd_id, dmabuf_fd); if (dmabuf_fd == -1) { - pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", - rd->gem_handle); + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); return 1; /* Retry needed */ - } else { - pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", - dmabuf_fd, rd->gem_handle); } + + pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", dmabuf_fd, rd->gem_handle); ret = dmabuf_fd; - pr_info("Successfully restored dmabuf_fd %d\n", - dmabuf_fd); + pr_info("Successfully restored dmabuf_fd %d\n", dmabuf_fd); criu_dmabuf_node__free_unpacked(rd, NULL); xfree(buf); return ret; diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 8466ca40d..00bcb7a29 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -466,7 +466,6 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { continue; } else if (boinfo->handle != -1) { - pr_info("TWI: restore bo %d\n", boinfo->handle); if (boinfo->is_import) { fd_id = amdgpu_id_for_handle(boinfo->handle); if (fd_id == -1) { @@ -474,13 +473,11 @@ int 
amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) continue; } dmabuf_fd = fdstore_get(fd_id); - pr_info("TWI: restore bo %d: fd_id %d, dmabuf_fd %d\n", boinfo->handle, fd_id, dmabuf_fd); } } if (boinfo->is_import) { drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); - pr_info("TWI: restore bo imported to handle %d\n", handle); } else { union drm_amdgpu_gem_create create_args = { 0 }; @@ -497,7 +494,6 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) handle = create_args.out.handle; drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); - pr_info("TWI: restore bo created at handle %d and exported to fd %d\n", handle, dmabuf_fd); } change_args.handle = handle; diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index a2cafa4a3..592562474 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -135,13 +135,9 @@ int handle_for_shared_bo_fd(int fd) if (trial_handle < 0) continue; - pr_info("TWI: Check device %d, got handle %d\n", df->fd, trial_handle); - list_for_each_entry(bo, &shared_bos, l) { - if (bo->handle == trial_handle) { - pr_info("TWI: And that handle exists\n"); + if (bo->handle == trial_handle) return trial_handle; - } } amdgpu_device_deinitialize(h_dev); From 690b6104321dc64dd2ff0c9f6aa6f7c093b24f65 Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:29:35 -0800 Subject: [PATCH 251/257] plugins/amdgpu: return 0 in post_dump_dmabuf_check Use `return 0` on success in `post_dump_dmabuf_check()` for consistency with other functions. 
Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_dmabuf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c index bdc107f64..11c9792e3 100644 --- a/plugins/amdgpu/amdgpu_plugin_dmabuf.c +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -179,7 +179,7 @@ int try_dump_dmabuf_list() int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id); if (ret == -EAGAIN) continue; - else if (ret) + if (ret) return ret; list_del(&b->node); xfree(b); @@ -193,5 +193,5 @@ int post_dump_dmabuf_check() pr_err("Not all dma buffers have been dumped\n"); return -1; } - return 1; -} \ No newline at end of file + return 0; +} From 77e6558ddb134e0e8cfbeb6ce3341bf9b3116ccd Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:32:03 -0800 Subject: [PATCH 252/257] plugins/amdgpu: apply code-style fixes Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 4640ccf88..83fa41724 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1406,10 +1406,10 @@ int amdgpu_plugin_dump_file(int fd, int id) if (ret < 0) { pr_perror("Failed to get dmabuf info"); return -1; - } else if (ret == 0) { + } + if (ret == 0) { pr_info("Dumping dmabuf fd = %d\n", fd); - ret = amdgpu_plugin_dmabuf_dump(fd, id); - return ret; + return amdgpu_plugin_dmabuf_dump(fd, id); } if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { @@ -1425,9 +1425,9 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = record_dumped_fd(fd, true); if (ret) return ret; - ret = try_dump_dmabuf_list(); + /* Need to return success here so that criu can call plugins for renderD nodes */ - return ret; 
+ return try_dump_dmabuf_list(); } pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev)); From 6ed49894c5da4466cc89d2fc69afce29dedd6f2e Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:32:44 -0800 Subject: [PATCH 253/257] plugins/amdgpu: add a comment for retry_needed Add a comment that explains the purpose of `retry_needed`. Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 83fa41724..36dc0b6b0 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1837,6 +1837,10 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) if (!img_fp) { ret = amdgpu_plugin_dmabuf_restore(id); if (ret == 1) { + /* This is a dmabuf fd, but the corresponding buffer object that was + * exported to make it has not yet been restored. Need to try again + * later when the buffer object exists, so it can be re-exported. 
+ */ *retry_needed = true; return 0; } From f56ccfd2d6815b499f321abf2c95a6c7cb3a1c40 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 2 Nov 2025 17:01:31 +0000 Subject: [PATCH 254/257] plugins/amdgpu: remove unused variable amdgpu_plugin_drm.c:167:6: error: variable 'num_bos' set but not used [-Werror,-Wunused-but-set-variable] 167 | int num_bos = 0; | Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_drm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 00bcb7a29..923bfcdd1 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -164,7 +164,6 @@ static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int FILE *bo_contents_fp = NULL; void *buffer = NULL; char img_path[40]; - int num_bos = 0; int i, ret = 0; ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev); @@ -206,8 +205,6 @@ static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int if (rd->bo_entries[i]->num_of_vms == 0) continue; - num_bos++; - snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i); bo_contents_fp = open_img_file(img_path, false, &image_size); From e4a5e164b4ccad7e82cef638f9510f932daea00f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 5 Nov 2025 15:12:06 +0000 Subject: [PATCH 255/257] plugins/amdgpu: update kernel headers This patch updates drm.h and amdgpu_drm.h kernel headers, and adds drm_mode.h (included by drm.h) from the rocm-7.1.0 release tag. 
Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_drm.h | 125 +++- plugins/amdgpu/drm.h | 58 +- plugins/amdgpu/drm_mode.h | 1362 +++++++++++++++++++++++++++++++++++ 3 files changed, 1523 insertions(+), 22 deletions(-) create mode 100644 plugins/amdgpu/drm_mode.h diff --git a/plugins/amdgpu/amdgpu_drm.h b/plugins/amdgpu/amdgpu_drm.h index 9cebd072a..69227a12b 100644 --- a/plugins/amdgpu/amdgpu_drm.h +++ b/plugins/amdgpu/amdgpu_drm.h @@ -58,6 +58,11 @@ extern "C" { #define DRM_AMDGPU_USERQ_SIGNAL 0x17 #define DRM_AMDGPU_USERQ_WAIT 0x18 #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 +/* not upstream */ +#define DRM_AMDGPU_GEM_DGMA 0x5c + +/* hybrid specific ioctls */ +#define DRM_AMDGPU_SEM 0x5b #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) @@ -80,6 +85,8 @@ extern "C" { #define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) +#define DRM_IOCTL_AMDGPU_GEM_DGMA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_DGMA, struct drm_amdgpu_gem_dgma) + /** * DOC: memory domains * @@ -105,7 +112,12 @@ extern "C" { * * %AMDGPU_GEM_DOMAIN_DOORBELL Doorbell. It is an MMIO region for * signalling user mode queues. + * + * %AMDGPU_GEM_DOMAIN_MMIO_REMAP MMIO remap page (special mapping for HDP flushing). 
*/ +/* hybrid specific ioctls */ +#define DRM_IOCTL_AMDGPU_SEM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_SEM, union drm_amdgpu_sem) + #define AMDGPU_GEM_DOMAIN_CPU 0x1 #define AMDGPU_GEM_DOMAIN_GTT 0x2 #define AMDGPU_GEM_DOMAIN_VRAM 0x4 @@ -113,13 +125,20 @@ extern "C" { #define AMDGPU_GEM_DOMAIN_GWS 0x10 #define AMDGPU_GEM_DOMAIN_OA 0x20 #define AMDGPU_GEM_DOMAIN_DOORBELL 0x40 +#define AMDGPU_GEM_DOMAIN_MMIO_REMAP 0x80 +#define AMDGPU_GEM_DOMAIN_DGMA 0x400 +#define AMDGPU_GEM_DOMAIN_DGMA_IMPORT 0x800 + #define AMDGPU_GEM_DOMAIN_MASK (AMDGPU_GEM_DOMAIN_CPU | \ AMDGPU_GEM_DOMAIN_GTT | \ AMDGPU_GEM_DOMAIN_VRAM | \ AMDGPU_GEM_DOMAIN_GDS | \ AMDGPU_GEM_DOMAIN_GWS | \ - AMDGPU_GEM_DOMAIN_OA | \ - AMDGPU_GEM_DOMAIN_DOORBELL) + AMDGPU_GEM_DOMAIN_OA |\ + AMDGPU_GEM_DOMAIN_DOORBELL |\ + AMDGPU_GEM_DOMAIN_MMIO_REMAP |\ + AMDGPU_GEM_DOMAIN_DGMA |\ + AMDGPU_GEM_DOMAIN_DGMA_IMPORT) /* Flag that CPU access will be required for the case of VRAM domain */ #define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) @@ -182,6 +201,14 @@ extern "C" { /* Set PTE.D and recompress during GTT->VRAM moves according to TILING flags. 
*/ #define AMDGPU_GEM_CREATE_GFX12_DCC (1 << 16) +/* hybrid specific */ +/* Flag that the memory should be in SPARSE resource */ +#define AMDGPU_GEM_CREATE_SPARSE (1ULL << 29) +/* Flag that the memory allocation should be from top of domain */ +#define AMDGPU_GEM_CREATE_TOP_DOWN (1ULL << 30) +/* Flag that the memory allocation should be pinned */ +#define AMDGPU_GEM_CREATE_NO_EVICT (1ULL << 31) + struct drm_amdgpu_gem_create_in { /** the requested memory size */ __u64 bo_size; @@ -581,6 +608,35 @@ struct drm_amdgpu_userq_wait { __u64 out_fences; }; +/* sem related */ +#define AMDGPU_SEM_OP_CREATE_SEM 1 +#define AMDGPU_SEM_OP_WAIT_SEM 2 +#define AMDGPU_SEM_OP_SIGNAL_SEM 3 +#define AMDGPU_SEM_OP_DESTROY_SEM 4 +#define AMDGPU_SEM_OP_IMPORT_SEM 5 +#define AMDGPU_SEM_OP_EXPORT_SEM 6 + +struct drm_amdgpu_sem_in { + /** AMDGPU_SEM_OP_* */ + uint32_t op; + uint32_t handle; + uint32_t ctx_id; + uint32_t ip_type; + uint32_t ip_instance; + uint32_t ring; + uint64_t seq; +}; + +union drm_amdgpu_sem_out { + int32_t fd; + uint32_t handle; +}; + +union drm_amdgpu_sem { + struct drm_amdgpu_sem_in in; + union drm_amdgpu_sem_out out; +}; + /* vm ioctl */ #define AMDGPU_VM_OP_RESERVE_VMID 1 #define AMDGPU_VM_OP_UNRESERVE_VMID 2 @@ -637,6 +693,15 @@ struct drm_amdgpu_gem_userptr { __u32 handle; }; +#define AMDGPU_GEM_DGMA_IMPORT 0 +#define AMDGPU_GEM_DGMA_QUERY_PHYS_ADDR 1 +struct drm_amdgpu_gem_dgma { + __u64 addr; + __u64 size; + __u32 op; + __u32 handle; +}; + /* SI-CI-VI: */ /* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */ #define AMDGPU_TILING_ARRAY_MODE_SHIFT 0 @@ -1084,10 +1149,11 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow { * Query h/w info: Flag that this is integrated (a.h.a. 
fusion) GPU * */ -#define AMDGPU_IDS_FLAGS_FUSION 0x1 -#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2 -#define AMDGPU_IDS_FLAGS_TMZ 0x4 -#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8 +#define AMDGPU_IDS_FLAGS_FUSION 0x01 +#define AMDGPU_IDS_FLAGS_PREEMPTION 0x02 +#define AMDGPU_IDS_FLAGS_TMZ 0x04 +#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x08 +#define AMDGPU_IDS_FLAGS_GANG_SUBMIT 0x10 /* * Query h/w info: Flag identifying VF/PF/PT mode @@ -1269,6 +1335,16 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow { /* query FW object size and alignment */ #define AMDGPU_INFO_UQ_FW_AREAS 0x24 +/* Hybrid Stack Specific Defs*/ +/* gpu capability */ +#define AMDGPU_INFO_CAPABILITY 0x50 +/* virtual range */ +#define AMDGPU_INFO_VIRTUAL_RANGE 0x51 +/* query pin memory capability */ +#define AMDGPU_CAPABILITY_PIN_MEM_FLAG (1 << 0) +/* query direct gma capability */ +#define AMDGPU_CAPABILITY_DIRECT_GMA_FLAG (1 << 1) + #define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 #define AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff #define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 @@ -1325,6 +1401,11 @@ struct drm_amdgpu_info { __u32 flags; } read_mmr_reg; + struct { + uint32_t aperture; + uint32_t _pad; + } virtual_range; + struct drm_amdgpu_query_fw query_fw; struct { @@ -1423,6 +1504,8 @@ struct drm_amdgpu_info_vbios { #define AMDGPU_VRAM_TYPE_LPDDR5 12 #define AMDGPU_VRAM_TYPE_HBM3E 13 +#define AMDGPU_VRAM_TYPE_HBM_WIDTH 4096 + struct drm_amdgpu_info_device { /** PCI Device ID */ __u32 device_id; @@ -1672,6 +1755,7 @@ struct drm_amdgpu_info_uq_metadata { #define AMDGPU_FAMILY_GC_11_5_0 150 /* GC 11.5.0 */ #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ +#ifndef HAVE_DRM_COLOR_CTM_3X4 /* FIXME wrong namespace! 
*/ struct drm_color_ctm_3x4 { /* @@ -1680,6 +1764,35 @@ struct drm_color_ctm_3x4 { */ __u64 matrix[12]; }; +#endif + +/** + * Definition of System Unified Address (SUA) apertures + */ +#define AMDGPU_SUA_APERTURE_PRIVATE 1 +#define AMDGPU_SUA_APERTURE_SHARED 2 +struct drm_amdgpu_virtual_range { + uint64_t start; + uint64_t end; +}; + +struct drm_amdgpu_capability { + __u32 flag; + __u32 direct_gma_size; +}; + +/* + * Definition of free sync enter and exit signals + * We may have more options in the future + */ +#define AMDGPU_FREESYNC_FULLSCREEN_ENTER 1 +#define AMDGPU_FREESYNC_FULLSCREEN_EXIT 2 + +struct drm_amdgpu_freesync { + __u32 op; /* AMDGPU_FREESYNC_FULLSCREEN_ENTER or */ + /* AMDGPU_FREESYNC_FULLSCREEN_EXIT */ + __u32 spare[7]; +}; #if defined(__cplusplus) } diff --git a/plugins/amdgpu/drm.h b/plugins/amdgpu/drm.h index 84c819c17..3cd5cf15e 100644 --- a/plugins/amdgpu/drm.h +++ b/plugins/amdgpu/drm.h @@ -597,40 +597,62 @@ struct drm_set_version { int drm_dd_minor; }; -/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +/** + * struct drm_gem_close - Argument for &DRM_IOCTL_GEM_CLOSE ioctl. + * @handle: Handle of the object to be closed. + * @pad: Padding. + * + * Releases the handle to an mm object. + */ struct drm_gem_close { - /** Handle of the object to be closed. */ __u32 handle; __u32 pad; }; -/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +/** + * struct drm_gem_flink - Argument for &DRM_IOCTL_GEM_FLINK ioctl. + * @handle: Handle for the object being named. + * @name: Returned global name. + * + * Create a global name for an object, returning the name. + * + * Note that the name does not hold a reference; when the object + * is freed, the name goes away. + */ struct drm_gem_flink { - /** Handle for the object being named */ __u32 handle; - - /** Returned global name */ __u32 name; }; -/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +/** + * struct drm_gem_open - Argument for &DRM_IOCTL_GEM_OPEN ioctl. + * @name: Name of object being opened.
+ * @handle: Returned handle for the object. + * @size: Returned size of the object + * + * Open an object using the global name, returning a handle and the size. + * + * This handle (of course) holds a reference to the object, so the object + * will not go away until the handle is deleted. + */ struct drm_gem_open { - /** Name of object being opened */ __u32 name; - - /** Returned handle for the object */ __u32 handle; - - /** Returned size of the object */ __u64 size; }; -/* DRM_IOCTL_GEM_CHANGE_HANDLE ioctl argument type */ +/** + * struct drm_gem_change_handle - Argument for &DRM_IOCTL_GEM_CHANGE_HANDLE ioctl. + * @handle: The handle of a gem object. + * @new_handle: An available gem handle. + * + * This ioctl changes the handle of a GEM object to the specified one. + * The new handle must be unused. On success the old handle is closed + * and all further IOCTL should refer to the new handle only. + * Calls to DRM_IOCTL_PRIME_FD_TO_HANDLE will return the new handle. + */ struct drm_gem_change_handle { - /** Current handle of object */ __u32 handle; - - /** Handle to change that object to */ __u32 new_handle; }; @@ -914,13 +936,17 @@ struct drm_syncobj_destroy { }; #define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_TIMELINE (1 << 1) #define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_TIMELINE (1 << 1) struct drm_syncobj_handle { __u32 handle; __u32 flags; __s32 fd; __u32 pad; + + __u64 point; }; struct drm_syncobj_transfer { diff --git a/plugins/amdgpu/drm_mode.h b/plugins/amdgpu/drm_mode.h new file mode 100644 index 000000000..c082810c0 --- /dev/null +++ b/plugins/amdgpu/drm_mode.h @@ -0,0 +1,1362 @@ +/* + * Copyright (c) 2007 Dave Airlie + * Copyright (c) 2007 Jakob Bornecrantz + * Copyright (c) 2008 Red Hat Inc. 
+ * Copyright (c) 2007-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA + * Copyright (c) 2007-2008 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _DRM_MODE_H +#define _DRM_MODE_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/** + * DOC: overview + * + * DRM exposes many UAPI and structure definitions to have a consistent + * and standardized interface with users. + * Userspace can refer to these structure definitions and UAPI formats + * to communicate to drivers. 
+ */ + +#define DRM_CONNECTOR_NAME_LEN 32 +#define DRM_DISPLAY_MODE_LEN 32 +#define DRM_PROP_NAME_LEN 32 + +#define DRM_MODE_TYPE_BUILTIN (1<<0) /* deprecated */ +#define DRM_MODE_TYPE_CLOCK_C ((1<<1) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_CRTC_C ((1<<2) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_PREFERRED (1<<3) +#define DRM_MODE_TYPE_DEFAULT (1<<4) /* deprecated */ +#define DRM_MODE_TYPE_USERDEF (1<<5) +#define DRM_MODE_TYPE_DRIVER (1<<6) + +#define DRM_MODE_TYPE_ALL (DRM_MODE_TYPE_PREFERRED | \ + DRM_MODE_TYPE_USERDEF | \ + DRM_MODE_TYPE_DRIVER) + +/* Video mode flags */ +/* bit compatible with the xrandr RR_ definitions (bits 0-13) + * + * ABI warning: Existing userspace really expects + * the mode flags to match the xrandr definitions. Any + * changes that don't match the xrandr definitions will + * likely need a new client cap or some other mechanism + * to avoid breaking existing userspace. This includes + * allocating new flags in the previously unused bits! + */ +#define DRM_MODE_FLAG_PHSYNC (1<<0) +#define DRM_MODE_FLAG_NHSYNC (1<<1) +#define DRM_MODE_FLAG_PVSYNC (1<<2) +#define DRM_MODE_FLAG_NVSYNC (1<<3) +#define DRM_MODE_FLAG_INTERLACE (1<<4) +#define DRM_MODE_FLAG_DBLSCAN (1<<5) +#define DRM_MODE_FLAG_CSYNC (1<<6) +#define DRM_MODE_FLAG_PCSYNC (1<<7) +#define DRM_MODE_FLAG_NCSYNC (1<<8) +#define DRM_MODE_FLAG_HSKEW (1<<9) /* hskew provided */ +#define DRM_MODE_FLAG_BCAST (1<<10) /* deprecated */ +#define DRM_MODE_FLAG_PIXMUX (1<<11) /* deprecated */ +#define DRM_MODE_FLAG_DBLCLK (1<<12) +#define DRM_MODE_FLAG_CLKDIV2 (1<<13) + /* + * When adding a new stereo mode don't forget to adjust DRM_MODE_FLAGS_3D_MAX + * (define not exposed to user space). 
+ */ +#define DRM_MODE_FLAG_3D_MASK (0x1f<<14) +#define DRM_MODE_FLAG_3D_NONE (0<<14) +#define DRM_MODE_FLAG_3D_FRAME_PACKING (1<<14) +#define DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE (2<<14) +#define DRM_MODE_FLAG_3D_LINE_ALTERNATIVE (3<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL (4<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH (5<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH (6<<14) +#define DRM_MODE_FLAG_3D_TOP_AND_BOTTOM (7<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF (8<<14) + +/* Picture aspect ratio options */ +#define DRM_MODE_PICTURE_ASPECT_NONE 0 +#define DRM_MODE_PICTURE_ASPECT_4_3 1 +#define DRM_MODE_PICTURE_ASPECT_16_9 2 +#define DRM_MODE_PICTURE_ASPECT_64_27 3 +#define DRM_MODE_PICTURE_ASPECT_256_135 4 + +/* Content type options */ +#define DRM_MODE_CONTENT_TYPE_NO_DATA 0 +#define DRM_MODE_CONTENT_TYPE_GRAPHICS 1 +#define DRM_MODE_CONTENT_TYPE_PHOTO 2 +#define DRM_MODE_CONTENT_TYPE_CINEMA 3 +#define DRM_MODE_CONTENT_TYPE_GAME 4 + +/* Aspect ratio flag bitmask (4 bits 22:19) */ +#define DRM_MODE_FLAG_PIC_AR_MASK (0x0F<<19) +#define DRM_MODE_FLAG_PIC_AR_NONE \ + (DRM_MODE_PICTURE_ASPECT_NONE<<19) +#define DRM_MODE_FLAG_PIC_AR_4_3 \ + (DRM_MODE_PICTURE_ASPECT_4_3<<19) +#define DRM_MODE_FLAG_PIC_AR_16_9 \ + (DRM_MODE_PICTURE_ASPECT_16_9<<19) +#define DRM_MODE_FLAG_PIC_AR_64_27 \ + (DRM_MODE_PICTURE_ASPECT_64_27<<19) +#define DRM_MODE_FLAG_PIC_AR_256_135 \ + (DRM_MODE_PICTURE_ASPECT_256_135<<19) + +#define DRM_MODE_FLAG_ALL (DRM_MODE_FLAG_PHSYNC | \ + DRM_MODE_FLAG_NHSYNC | \ + DRM_MODE_FLAG_PVSYNC | \ + DRM_MODE_FLAG_NVSYNC | \ + DRM_MODE_FLAG_INTERLACE | \ + DRM_MODE_FLAG_DBLSCAN | \ + DRM_MODE_FLAG_CSYNC | \ + DRM_MODE_FLAG_PCSYNC | \ + DRM_MODE_FLAG_NCSYNC | \ + DRM_MODE_FLAG_HSKEW | \ + DRM_MODE_FLAG_DBLCLK | \ + DRM_MODE_FLAG_CLKDIV2 | \ + DRM_MODE_FLAG_3D_MASK) + +/* DPMS flags */ +/* bit compatible with the xorg definitions. 
*/ +#define DRM_MODE_DPMS_ON 0 +#define DRM_MODE_DPMS_STANDBY 1 +#define DRM_MODE_DPMS_SUSPEND 2 +#define DRM_MODE_DPMS_OFF 3 + +/* Scaling mode options */ +#define DRM_MODE_SCALE_NONE 0 /* Unmodified timing (display or + software can still scale) */ +#define DRM_MODE_SCALE_FULLSCREEN 1 /* Full screen, ignore aspect */ +#define DRM_MODE_SCALE_CENTER 2 /* Centered, no scaling */ +#define DRM_MODE_SCALE_ASPECT 3 /* Full screen, preserve aspect */ + +/* Dithering mode options */ +#define DRM_MODE_DITHERING_OFF 0 +#define DRM_MODE_DITHERING_ON 1 +#define DRM_MODE_DITHERING_AUTO 2 + +/* Dirty info options */ +#define DRM_MODE_DIRTY_OFF 0 +#define DRM_MODE_DIRTY_ON 1 +#define DRM_MODE_DIRTY_ANNOTATE 2 + +/* Link Status options */ +#define DRM_MODE_LINK_STATUS_GOOD 0 +#define DRM_MODE_LINK_STATUS_BAD 1 + +/* + * DRM_MODE_ROTATE_N (N = 0, 90, 180 or 270) + * + * Signals that a drm plane has been rotated by N degrees in the + * counter-clockwise direction. + * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_ROTATE_0 (1<<0) +#define DRM_MODE_ROTATE_90 (1<<1) +#define DRM_MODE_ROTATE_180 (1<<2) +#define DRM_MODE_ROTATE_270 (1<<3) + +/* + * DRM_MODE_ROTATE_MASK + * + * Bitmask used to look for drm plane rotations. + */ +#define DRM_MODE_ROTATE_MASK (\ + DRM_MODE_ROTATE_0 | \ + DRM_MODE_ROTATE_90 | \ + DRM_MODE_ROTATE_180 | \ + DRM_MODE_ROTATE_270) + +/* + * DRM_MODE_REFLECT_X, DRM_MODE_REFLECT_Y + * + * Signals that the contents of a drm plane are reflected along the X or Y axis, + * in the same way as mirroring. + * See kerneldoc chapter "Plane Composition Properties" for more details. + * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_REFLECT_X (1<<4) +#define DRM_MODE_REFLECT_Y (1<<5) + +/* + * DRM_MODE_REFLECT_MASK + * + * Bitmask used to look for drm plane reflections.
+ */ +#define DRM_MODE_REFLECT_MASK (\ + DRM_MODE_REFLECT_X | \ + DRM_MODE_REFLECT_Y) + +/* Content Protection Flags */ +#define DRM_MODE_CONTENT_PROTECTION_UNDESIRED 0 +#define DRM_MODE_CONTENT_PROTECTION_DESIRED 1 +#define DRM_MODE_CONTENT_PROTECTION_ENABLED 2 + +/** + * struct drm_mode_modeinfo - Display mode information. + * @clock: pixel clock in kHz + * @hdisplay: horizontal display size + * @hsync_start: horizontal sync start + * @hsync_end: horizontal sync end + * @htotal: horizontal total size + * @hskew: horizontal skew + * @vdisplay: vertical display size + * @vsync_start: vertical sync start + * @vsync_end: vertical sync end + * @vtotal: vertical total size + * @vscan: vertical scan + * @vrefresh: approximate vertical refresh rate in Hz + * @flags: bitmask of misc. flags, see DRM_MODE_FLAG_* defines + * @type: bitmask of type flags, see DRM_MODE_TYPE_* defines + * @name: string describing the mode resolution + * + * This is the user-space API display mode information structure. For the + * kernel version see struct drm_display_mode. 
+ */ +struct drm_mode_modeinfo { + __u32 clock; + __u16 hdisplay; + __u16 hsync_start; + __u16 hsync_end; + __u16 htotal; + __u16 hskew; + __u16 vdisplay; + __u16 vsync_start; + __u16 vsync_end; + __u16 vtotal; + __u16 vscan; + + __u32 vrefresh; + + __u32 flags; + __u32 type; + char name[DRM_DISPLAY_MODE_LEN]; +}; + +struct drm_mode_card_res { + __u64 fb_id_ptr; + __u64 crtc_id_ptr; + __u64 connector_id_ptr; + __u64 encoder_id_ptr; + __u32 count_fbs; + __u32 count_crtcs; + __u32 count_connectors; + __u32 count_encoders; + __u32 min_width; + __u32 max_width; + __u32 min_height; + __u32 max_height; +}; + +struct drm_mode_crtc { + __u64 set_connectors_ptr; + __u32 count_connectors; + + __u32 crtc_id; /**< Id */ + __u32 fb_id; /**< Id of framebuffer */ + + __u32 x; /**< x Position on the framebuffer */ + __u32 y; /**< y Position on the framebuffer */ + + __u32 gamma_size; + __u32 mode_valid; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_PRESENT_TOP_FIELD (1<<0) +#define DRM_MODE_PRESENT_BOTTOM_FIELD (1<<1) + +/* Planes blend with or override other bits on the CRTC */ +struct drm_mode_set_plane { + __u32 plane_id; + __u32 crtc_id; + __u32 fb_id; /* fb object contains surface format type */ + __u32 flags; /* see above flags */ + + /* Signed dest location allows it to be partially off screen */ + __s32 crtc_x; + __s32 crtc_y; + __u32 crtc_w; + __u32 crtc_h; + + /* Source values are 16.16 fixed point */ + __u32 src_x; + __u32 src_y; + __u32 src_h; + __u32 src_w; +}; + +/** + * struct drm_mode_get_plane - Get plane metadata. + * + * Userspace can perform a GETPLANE ioctl to retrieve information about a + * plane. + * + * To retrieve the number of formats supported, set @count_format_types to zero + * and call the ioctl. @count_format_types will be updated with the value. + * + * To retrieve these formats, allocate an array with the memory needed to store + * @count_format_types formats. 
Point @format_type_ptr to this array and call + * the ioctl again (with @count_format_types still set to the value returned in + * the first ioctl call). + */ +struct drm_mode_get_plane { + /** + * @plane_id: Object ID of the plane whose information should be + * retrieved. Set by caller. + */ + __u32 plane_id; + + /** @crtc_id: Object ID of the current CRTC. */ + __u32 crtc_id; + /** @fb_id: Object ID of the current fb. */ + __u32 fb_id; + + /** + * @possible_crtcs: Bitmask of CRTC's compatible with the plane. CRTC's + * are created and they receive an index, which corresponds to their + * position in the bitmask. Bit N corresponds to + * :ref:`CRTC index` N. + */ + __u32 possible_crtcs; + /** @gamma_size: Never used. */ + __u32 gamma_size; + + /** @count_format_types: Number of formats. */ + __u32 count_format_types; + /** + * @format_type_ptr: Pointer to ``__u32`` array of formats that are + * supported by the plane. These formats do not require modifiers. + */ + __u64 format_type_ptr; +}; + +struct drm_mode_get_plane_res { + __u64 plane_id_ptr; + __u32 count_planes; +}; + +#define DRM_MODE_ENCODER_NONE 0 +#define DRM_MODE_ENCODER_DAC 1 +#define DRM_MODE_ENCODER_TMDS 2 +#define DRM_MODE_ENCODER_LVDS 3 +#define DRM_MODE_ENCODER_TVDAC 4 +#define DRM_MODE_ENCODER_VIRTUAL 5 +#define DRM_MODE_ENCODER_DSI 6 +#define DRM_MODE_ENCODER_DPMST 7 +#define DRM_MODE_ENCODER_DPI 8 + +struct drm_mode_get_encoder { + __u32 encoder_id; + __u32 encoder_type; + + __u32 crtc_id; /**< Id of crtc */ + + __u32 possible_crtcs; + __u32 possible_clones; +}; + +/* This is for connectors with multiple signal types. */ +/* Try to match DRM_MODE_CONNECTOR_X as closely as possible. 
*/ +enum drm_mode_subconnector { + DRM_MODE_SUBCONNECTOR_Automatic = 0, /* DVI-I, TV */ + DRM_MODE_SUBCONNECTOR_Unknown = 0, /* DVI-I, TV, DP */ + DRM_MODE_SUBCONNECTOR_VGA = 1, /* DP */ + DRM_MODE_SUBCONNECTOR_DVID = 3, /* DVI-I DP */ + DRM_MODE_SUBCONNECTOR_DVIA = 4, /* DVI-I */ + DRM_MODE_SUBCONNECTOR_Composite = 5, /* TV */ + DRM_MODE_SUBCONNECTOR_SVIDEO = 6, /* TV */ + DRM_MODE_SUBCONNECTOR_Component = 8, /* TV */ + DRM_MODE_SUBCONNECTOR_SCART = 9, /* TV */ + DRM_MODE_SUBCONNECTOR_DisplayPort = 10, /* DP */ + DRM_MODE_SUBCONNECTOR_HDMIA = 11, /* DP */ + DRM_MODE_SUBCONNECTOR_Native = 15, /* DP */ + DRM_MODE_SUBCONNECTOR_Wireless = 18, /* DP */ +}; + +#define DRM_MODE_CONNECTOR_Unknown 0 +#define DRM_MODE_CONNECTOR_VGA 1 +#define DRM_MODE_CONNECTOR_DVII 2 +#define DRM_MODE_CONNECTOR_DVID 3 +#define DRM_MODE_CONNECTOR_DVIA 4 +#define DRM_MODE_CONNECTOR_Composite 5 +#define DRM_MODE_CONNECTOR_SVIDEO 6 +#define DRM_MODE_CONNECTOR_LVDS 7 +#define DRM_MODE_CONNECTOR_Component 8 +#define DRM_MODE_CONNECTOR_9PinDIN 9 +#define DRM_MODE_CONNECTOR_DisplayPort 10 +#define DRM_MODE_CONNECTOR_HDMIA 11 +#define DRM_MODE_CONNECTOR_HDMIB 12 +#define DRM_MODE_CONNECTOR_TV 13 +#define DRM_MODE_CONNECTOR_eDP 14 +#define DRM_MODE_CONNECTOR_VIRTUAL 15 +#define DRM_MODE_CONNECTOR_DSI 16 +#define DRM_MODE_CONNECTOR_DPI 17 +#define DRM_MODE_CONNECTOR_WRITEBACK 18 +#define DRM_MODE_CONNECTOR_SPI 19 +#define DRM_MODE_CONNECTOR_USB 20 + +/** + * struct drm_mode_get_connector - Get connector metadata. + * + * User-space can perform a GETCONNECTOR ioctl to retrieve information about a + * connector. User-space is expected to retrieve encoders, modes and properties + * by performing this ioctl at least twice: the first time to retrieve the + * number of elements, the second time to retrieve the elements themselves. 
+ * + * To retrieve the number of elements, set @count_props and @count_encoders to + * zero, set @count_modes to 1, and set @modes_ptr to a temporary struct + * drm_mode_modeinfo element. + * + * To retrieve the elements, allocate arrays for @encoders_ptr, @modes_ptr, + * @props_ptr and @prop_values_ptr, then set @count_modes, @count_props and + * @count_encoders to their capacity. + * + * Performing the ioctl only twice may be racy: the number of elements may have + * changed with a hotplug event in-between the two ioctls. User-space is + * expected to retry the last ioctl until the number of elements stabilizes. + * The kernel won't fill any array which doesn't have the expected length. + * + * **Force-probing a connector** + * + * If the @count_modes field is set to zero and the DRM client is the current + * DRM master, the kernel will perform a forced probe on the connector to + * refresh the connector status, modes and EDID. A forced-probe can be slow, + * might cause flickering and the ioctl will block. + * + * User-space needs to force-probe connectors to ensure their metadata is + * up-to-date at startup and after receiving a hot-plug event. User-space + * may perform a forced-probe when the user explicitly requests it. User-space + * shouldn't perform a forced-probe in other situations. + */ +struct drm_mode_get_connector { + /** @encoders_ptr: Pointer to ``__u32`` array of object IDs. */ + __u64 encoders_ptr; + /** @modes_ptr: Pointer to struct drm_mode_modeinfo array. */ + __u64 modes_ptr; + /** @props_ptr: Pointer to ``__u32`` array of property IDs. */ + __u64 props_ptr; + /** @prop_values_ptr: Pointer to ``__u64`` array of property values. */ + __u64 prop_values_ptr; + + /** @count_modes: Number of modes. */ + __u32 count_modes; + /** @count_props: Number of properties. */ + __u32 count_props; + /** @count_encoders: Number of encoders. */ + __u32 count_encoders; + + /** @encoder_id: Object ID of the current encoder. 
*/ + __u32 encoder_id; + /** @connector_id: Object ID of the connector. */ + __u32 connector_id; + /** + * @connector_type: Type of the connector. + * + * See DRM_MODE_CONNECTOR_* defines. + */ + __u32 connector_type; + /** + * @connector_type_id: Type-specific connector number. + * + * This is not an object ID. This is a per-type connector number. Each + * (type, type_id) combination is unique across all connectors of a DRM + * device. + * + * The (type, type_id) combination is not a stable identifier: the + * type_id can change depending on the driver probe order. + */ + __u32 connector_type_id; + + /** + * @connection: Status of the connector. + * + * See enum drm_connector_status. + */ + __u32 connection; + /** @mm_width: Width of the connected sink in millimeters. */ + __u32 mm_width; + /** @mm_height: Height of the connected sink in millimeters. */ + __u32 mm_height; + /** + * @subpixel: Subpixel order of the connected sink. + * + * See enum subpixel_order. + */ + __u32 subpixel; + + /** @pad: Padding, must be zero. */ + __u32 pad; +}; + +#define DRM_MODE_PROP_PENDING (1<<0) /* deprecated, do not use */ +#define DRM_MODE_PROP_RANGE (1<<1) +#define DRM_MODE_PROP_IMMUTABLE (1<<2) +#define DRM_MODE_PROP_ENUM (1<<3) /* enumerated type with text strings */ +#define DRM_MODE_PROP_BLOB (1<<4) +#define DRM_MODE_PROP_BITMASK (1<<5) /* bitmask of enumerated types */ + +/* non-extended types: legacy bitmask, one bit per type: */ +#define DRM_MODE_PROP_LEGACY_TYPE ( \ + DRM_MODE_PROP_RANGE | \ + DRM_MODE_PROP_ENUM | \ + DRM_MODE_PROP_BLOB | \ + DRM_MODE_PROP_BITMASK) + +/* extended-types: rather than continue to consume a bit per type, + * grab a chunk of the bits to use as integer type id. 
+ */ +#define DRM_MODE_PROP_EXTENDED_TYPE 0x0000ffc0 +#define DRM_MODE_PROP_TYPE(n) ((n) << 6) +#define DRM_MODE_PROP_OBJECT DRM_MODE_PROP_TYPE(1) +#define DRM_MODE_PROP_SIGNED_RANGE DRM_MODE_PROP_TYPE(2) + +/* the PROP_ATOMIC flag is used to hide properties from userspace that + * is not aware of atomic properties. This is mostly to work around + * older userspace (DDX drivers) that read/write each prop they find, + * without being aware that this could be triggering a lengthy modeset. + */ +#define DRM_MODE_PROP_ATOMIC 0x80000000 + +/** + * struct drm_mode_property_enum - Description for an enum/bitfield entry. + * @value: numeric value for this enum entry. + * @name: symbolic name for this enum entry. + * + * See struct drm_property_enum for details. + */ +struct drm_mode_property_enum { + __u64 value; + char name[DRM_PROP_NAME_LEN]; +}; + +/** + * struct drm_mode_get_property - Get property metadata. + * + * User-space can perform a GETPROPERTY ioctl to retrieve information about a + * property. The same property may be attached to multiple objects, see + * "Modeset Base Object Abstraction". + * + * The meaning of the @values_ptr field changes depending on the property type. + * See &drm_property.flags for more details. + * + * The @enum_blob_ptr and @count_enum_blobs fields are only meaningful when the + * property has the type &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK. For + * backwards compatibility, the kernel will always set @count_enum_blobs to + * zero when the property has the type &DRM_MODE_PROP_BLOB. User-space must + * ignore these two fields if the property has a different type. + * + * User-space is expected to retrieve values and enums by performing this ioctl + * at least twice: the first time to retrieve the number of elements, the + * second time to retrieve the elements themselves. + * + * To retrieve the number of elements, set @count_values and @count_enum_blobs + * to zero, then call the ioctl. 
@count_values will be updated with the number + * of elements. If the property has the type &DRM_MODE_PROP_ENUM or + * &DRM_MODE_PROP_BITMASK, @count_enum_blobs will be updated as well. + * + * To retrieve the elements themselves, allocate an array for @values_ptr and + * set @count_values to its capacity. If the property has the type + * &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK, allocate an array for + * @enum_blob_ptr and set @count_enum_blobs to its capacity. Calling the ioctl + * again will fill the arrays. + */ +struct drm_mode_get_property { + /** @values_ptr: Pointer to a ``__u64`` array. */ + __u64 values_ptr; + /** @enum_blob_ptr: Pointer to a struct drm_mode_property_enum array. */ + __u64 enum_blob_ptr; + + /** + * @prop_id: Object ID of the property which should be retrieved. Set + * by the caller. + */ + __u32 prop_id; + /** + * @flags: ``DRM_MODE_PROP_*`` bitfield. See &drm_property.flags for + * a definition of the flags. + */ + __u32 flags; + /** + * @name: Symbolic property name. User-space should use this field to + * recognize properties. + */ + char name[DRM_PROP_NAME_LEN]; + + /** @count_values: Number of elements in @values_ptr. */ + __u32 count_values; + /** @count_enum_blobs: Number of elements in @enum_blob_ptr. 
*/ + __u32 count_enum_blobs; +}; + +struct drm_mode_connector_set_property { + __u64 value; + __u32 prop_id; + __u32 connector_id; +}; + +#define DRM_MODE_OBJECT_CRTC 0xcccccccc +#define DRM_MODE_OBJECT_CONNECTOR 0xc0c0c0c0 +#define DRM_MODE_OBJECT_ENCODER 0xe0e0e0e0 +#define DRM_MODE_OBJECT_MODE 0xdededede +#define DRM_MODE_OBJECT_PROPERTY 0xb0b0b0b0 +#define DRM_MODE_OBJECT_FB 0xfbfbfbfb +#define DRM_MODE_OBJECT_BLOB 0xbbbbbbbb +#define DRM_MODE_OBJECT_PLANE 0xeeeeeeee +#define DRM_MODE_OBJECT_ANY 0 + +struct drm_mode_obj_get_properties { + __u64 props_ptr; + __u64 prop_values_ptr; + __u32 count_props; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_obj_set_property { + __u64 value; + __u32 prop_id; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_get_blob { + __u32 blob_id; + __u32 length; + __u64 data; +}; + +struct drm_mode_fb_cmd { + __u32 fb_id; + __u32 width; + __u32 height; + __u32 pitch; + __u32 bpp; + __u32 depth; + /* driver specific handle */ + __u32 handle; +}; + +#define DRM_MODE_FB_INTERLACED (1<<0) /* for interlaced framebuffers */ +#define DRM_MODE_FB_MODIFIERS (1<<1) /* enables ->modifier[] */ + +/** + * struct drm_mode_fb_cmd2 - Frame-buffer metadata. + * + * This struct holds frame-buffer metadata. There are two ways to use it: + * + * - User-space can fill this struct and perform a &DRM_IOCTL_MODE_ADDFB2 + * ioctl to register a new frame-buffer. The new frame-buffer object ID will + * be set by the kernel in @fb_id. + * - User-space can set @fb_id and perform a &DRM_IOCTL_MODE_GETFB2 ioctl to + * fetch metadata about an existing frame-buffer. + * + * In case of planar formats, this struct allows up to 4 buffer objects with + * offsets and pitches per plane. The pitch and offset order are dictated by + * the format FourCC as defined by ``drm_fourcc.h``, e.g. 
NV12 is described as: + * + * YUV 4:2:0 image with a plane of 8-bit Y samples followed by an + * interleaved U/V plane containing 8-bit 2x2 subsampled colour difference + * samples. + * + * So it would consist of a Y plane at ``offsets[0]`` and a UV plane at + * ``offsets[1]``. + * + * To accommodate tiled, compressed, etc formats, a modifier can be specified. + * For more information see the "Format Modifiers" section. Note that even + * though it looks like we have a modifier per-plane, we in fact do not. The + * modifier for each plane must be identical. Thus all combinations of + * different data layouts for multi-plane formats must be enumerated as + * separate modifiers. + * + * All of the entries in @handles, @pitches, @offsets and @modifier must be + * zero when unused. Warning, for @offsets and @modifier zero can't be used to + * figure out whether the entry is used or not since it's a valid value (a zero + * offset is common, and a zero modifier is &DRM_FORMAT_MOD_LINEAR). + */ +struct drm_mode_fb_cmd2 { + /** @fb_id: Object ID of the frame-buffer. */ + __u32 fb_id; + /** @width: Width of the frame-buffer. */ + __u32 width; + /** @height: Height of the frame-buffer. */ + __u32 height; + /** + * @pixel_format: FourCC format code, see ``DRM_FORMAT_*`` constants in + * ``drm_fourcc.h``. + */ + __u32 pixel_format; + /** + * @flags: Frame-buffer flags (see &DRM_MODE_FB_INTERLACED and + * &DRM_MODE_FB_MODIFIERS). + */ + __u32 flags; + + /** + * @handles: GEM buffer handle, one per plane. Set to 0 if the plane is + * unused. The same handle can be used for multiple planes. + */ + __u32 handles[4]; + /** @pitches: Pitch (aka. stride) in bytes, one per plane. */ + __u32 pitches[4]; + /** @offsets: Offset into the buffer in bytes, one per plane. */ + __u32 offsets[4]; + /** + * @modifier: Format modifier, one per plane. See ``DRM_FORMAT_MOD_*`` + * constants in ``drm_fourcc.h``. All planes must use the same + * modifier. 
Ignored unless &DRM_MODE_FB_MODIFIERS is set in @flags. + */ + __u64 modifier[4]; +}; + +#define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01 +#define DRM_MODE_FB_DIRTY_ANNOTATE_FILL 0x02 +#define DRM_MODE_FB_DIRTY_FLAGS 0x03 + +#define DRM_MODE_FB_DIRTY_MAX_CLIPS 256 + +/* + * Mark a region of a framebuffer as dirty. + * + * Some hardware does not automatically update display contents + * when hardware or software draws to a framebuffer. This ioctl + * allows userspace to tell the kernel and the hardware what + * regions of the framebuffer have changed. + * + * The kernel or hardware is free to update more than just the + * region specified by the clip rects. The kernel or hardware + * may also delay and/or coalesce several calls to dirty into a + * single update. + * + * Userspace may annotate the updates; the annotations are a + * promise made by the caller that the change is either a copy + * of pixels or a fill of a single color in the region specified. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_COPY flag is given then + * the number of updated regions is half of num_clips given, + * where the clip rects are paired in src and dst. The width and + * height of each one of the pairs must match. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_FILL flag is given the caller + * promises that the region specified by the clip rects is filled + * completely with a single color as given in the color argument. + */ + +struct drm_mode_fb_dirty_cmd { + __u32 fb_id; + __u32 flags; + __u32 color; + __u32 num_clips; + __u64 clips_ptr; +}; + +struct drm_mode_mode_cmd { + __u32 connector_id; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_CURSOR_BO 0x01 +#define DRM_MODE_CURSOR_MOVE 0x02 +#define DRM_MODE_CURSOR_FLAGS 0x03 + +/* + * depending on the value in flags different members are used.
+ * + * CURSOR_BO uses + * crtc_id + * width + * height + * handle - if 0 turns the cursor off + * + * CURSOR_MOVE uses + * crtc_id + * x + * y + */ +struct drm_mode_cursor { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; +}; + +struct drm_mode_cursor2 { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; + __s32 hot_x; + __s32 hot_y; +}; + +struct drm_mode_crtc_lut { + __u32 crtc_id; + __u32 gamma_size; + + /* pointers to arrays */ + __u64 red; + __u64 green; + __u64 blue; +}; + +struct drm_color_ctm { + /* + * Conversion matrix in S31.32 sign-magnitude + * (not two's complement!) format. + * + * out matrix in + * |R| |0 1 2| |R| + * |G| = |3 4 5| x |G| + * |B| |6 7 8| |B| + */ + __u64 matrix[9]; +}; + +struct drm_color_lut { + /* + * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and + * 0xffff == 1.0. + */ + __u16 red; + __u16 green; + __u16 blue; + __u16 reserved; +}; + +/** + * struct drm_plane_size_hint - Plane size hints + * @width: The width of the plane in pixel + * @height: The height of the plane in pixel + * + * The plane SIZE_HINTS property blob contains an + * array of struct drm_plane_size_hint. + */ +struct drm_plane_size_hint { + __u16 width; + __u16 height; +}; + +/** + * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data. + * + * HDR Metadata Infoframe as per CTA 861.G spec. This is expected + * to match exactly with the spec. + * + * Userspace is expected to pass the metadata information as per + * the format described in this structure. + */ +struct hdr_metadata_infoframe { + /** + * @eotf: Electro-Optical Transfer Function (EOTF) + * used in the stream. + */ + __u8 eotf; + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u8 metadata_type; + /** + * @display_primaries: Color Primaries of the Data. 
+ * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @display_primaries.x: X coordinate of color primary. + * @display_primaries.y: Y coordinate of color primary. + */ + struct { + __u16 x, y; + } display_primaries[3]; + /** + * @white_point: White Point of Colorspace Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @white_point.x: X coordinate of whitepoint of color primary. + * @white_point.y: Y coordinate of whitepoint of color primary. + */ + struct { + __u16 x, y; + } white_point; + /** + * @max_display_mastering_luminance: Max Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_display_mastering_luminance; + /** + * @min_display_mastering_luminance: Min Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of + * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF + * represents 6.5535 cd/m2. + */ + __u16 min_display_mastering_luminance; + /** + * @max_cll: Max Content Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_cll; + /** + * @max_fall: Max Frame Average Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_fall; +}; + +/** + * struct hdr_output_metadata - HDR output metadata + * + * Metadata Information to be passed from userspace + */ +struct hdr_output_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u32 metadata_type; + /** + * @hdmi_metadata_type1: HDR Metadata Infoframe. 
+ */ + union { + struct hdr_metadata_infoframe hdmi_metadata_type1; + }; +}; + +/** + * DRM_MODE_PAGE_FLIP_EVENT + * + * Request that the kernel sends back a vblank event (see + * struct drm_event_vblank) with the &DRM_EVENT_FLIP_COMPLETE type when the + * page-flip is done. + */ +#define DRM_MODE_PAGE_FLIP_EVENT 0x01 +/** + * DRM_MODE_PAGE_FLIP_ASYNC + * + * Request that the page-flip is performed as soon as possible, i.e. with no + * delay due to waiting for vblank. This may cause tearing to be visible on + * the screen. + * + * When used with atomic uAPI, the driver will return an error if the hardware + * doesn't support performing an asynchronous page-flip for this update. + * User-space should handle this, e.g. by falling back to a regular page-flip. + * + * Note, some hardware might need to perform one last synchronous page-flip + * before being able to switch to asynchronous page-flips. As an exception, + * the driver will return success even though that first page-flip is not + * asynchronous. + */ +#define DRM_MODE_PAGE_FLIP_ASYNC 0x02 +#define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4 +#define DRM_MODE_PAGE_FLIP_TARGET_RELATIVE 0x8 +#define DRM_MODE_PAGE_FLIP_TARGET (DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE | \ + DRM_MODE_PAGE_FLIP_TARGET_RELATIVE) +/** + * DRM_MODE_PAGE_FLIP_FLAGS + * + * Bitmask of flags suitable for &drm_mode_crtc_page_flip_target.flags. + */ +#define DRM_MODE_PAGE_FLIP_FLAGS (DRM_MODE_PAGE_FLIP_EVENT | \ + DRM_MODE_PAGE_FLIP_ASYNC | \ + DRM_MODE_PAGE_FLIP_TARGET) + +/* + * Request a page flip on the specified crtc. + * + * This ioctl will ask KMS to schedule a page flip for the specified + * crtc. Once any pending rendering targeting the specified fb (as of + * ioctl time) has completed, the crtc will be reprogrammed to display + * that fb after the next vertical refresh. The ioctl returns + * immediately, but subsequent rendering to the current fb will block + * in the execbuffer ioctl until the page flip happens.
If a page + * flip is already pending when the ioctl is called, EBUSY will be + * returned. + * + * Flag DRM_MODE_PAGE_FLIP_EVENT requests that drm sends back a vblank + * event (see drm.h: struct drm_event_vblank) when the page flip is + * done. The user_data field passed in with this ioctl will be + * returned as the user_data field in the vblank event struct. + * + * Flag DRM_MODE_PAGE_FLIP_ASYNC requests that the flip happen + * 'as soon as possible', meaning that it is not delayed waiting for vblank. + * This may cause tearing on the screen. + * + * The reserved field must be zero. + */ + +struct drm_mode_crtc_page_flip { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 reserved; + __u64 user_data; +}; + +/* + * Request a page flip on the specified crtc. + * + * Same as struct drm_mode_crtc_page_flip, but supports new flags and + * re-purposes the reserved field: + * + * The sequence field must be zero unless either of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is specified. When + * the ABSOLUTE flag is specified, the sequence field denotes the absolute + * vblank sequence when the flip should take effect. When the RELATIVE + * flag is specified, the sequence field denotes the relative (to the + * current one when the ioctl is called) vblank sequence when the flip + * should take effect. NOTE: DRM_IOCTL_WAIT_VBLANK must still be used to + * make sure the vblank sequence before the target one has passed before + * calling this ioctl. The purpose of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is merely to clarify + * the target for when code dealing with a page flip runs during a + * vertical blank period. + */ + +struct drm_mode_crtc_page_flip_target { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 sequence; + __u64 user_data; +}; + +/** + * struct drm_mode_create_dumb - Create a KMS dumb buffer for scanout.
+ * @height: buffer height in pixels + * @width: buffer width in pixels + * @bpp: bits per pixel + * @flags: must be zero + * @handle: buffer object handle + * @pitch: number of bytes between two consecutive lines + * @size: size of the whole buffer in bytes + * + * User-space fills @height, @width, @bpp and @flags. If the IOCTL succeeds, + * the kernel fills @handle, @pitch and @size. + */ +struct drm_mode_create_dumb { + __u32 height; + __u32 width; + __u32 bpp; + __u32 flags; + + __u32 handle; + __u32 pitch; + __u64 size; +}; + +/* Set up for mmap of a dumb scanout buffer. */ +struct drm_mode_map_dumb { + /** Handle for the object being mapped. */ + __u32 handle; + __u32 pad; + /** + * Fake offset to use for the subsequent mmap call. + * + * This is a fixed-size type for 32/64 compatibility. + */ + __u64 offset; +}; + +struct drm_mode_destroy_dumb { + __u32 handle; +}; + +/** + * DRM_MODE_ATOMIC_TEST_ONLY + * + * Do not apply the atomic commit, instead check whether the hardware supports + * this configuration. + * + * See &drm_mode_config_funcs.atomic_check for more details on test-only + * commits. + */ +#define DRM_MODE_ATOMIC_TEST_ONLY 0x0100 +/** + * DRM_MODE_ATOMIC_NONBLOCK + * + * Do not block while applying the atomic commit. The &DRM_IOCTL_MODE_ATOMIC + * IOCTL returns immediately instead of waiting for the changes to be applied + * in hardware. Note, the driver will still check that the update can be + * applied before returning. + */ +#define DRM_MODE_ATOMIC_NONBLOCK 0x0200 +/** + * DRM_MODE_ATOMIC_ALLOW_MODESET + * + * Allow the update to result in temporary or transient visible artifacts while + * the update is being applied. Applying the update may also take significantly + * more time than a page flip. All visual artifacts will disappear by the time + * the update is completed, as signalled through the vblank event's timestamp + * (see struct drm_event_vblank). + * + * This flag must be set when the KMS update might cause visible artifacts.
+ * Without this flag such KMS update will return a EINVAL error. What kind of + * update may cause visible artifacts depends on the driver and the hardware. + * User-space that needs to know beforehand if an update might cause visible + * artifacts can use &DRM_MODE_ATOMIC_TEST_ONLY without + * &DRM_MODE_ATOMIC_ALLOW_MODESET to see if it fails. + * + * To the best of the driver's knowledge, visual artifacts are guaranteed to + * not appear when this flag is not set. Some sinks might display visual + * artifacts outside of the driver's control. + */ +#define DRM_MODE_ATOMIC_ALLOW_MODESET 0x0400 + +/** + * DRM_MODE_ATOMIC_FLAGS + * + * Bitfield of flags accepted by the &DRM_IOCTL_MODE_ATOMIC IOCTL in + * &drm_mode_atomic.flags. + */ +#define DRM_MODE_ATOMIC_FLAGS (\ + DRM_MODE_PAGE_FLIP_EVENT |\ + DRM_MODE_PAGE_FLIP_ASYNC |\ + DRM_MODE_ATOMIC_TEST_ONLY |\ + DRM_MODE_ATOMIC_NONBLOCK |\ + DRM_MODE_ATOMIC_ALLOW_MODESET) + +struct drm_mode_atomic { + __u32 flags; + __u32 count_objs; + __u64 objs_ptr; + __u64 count_props_ptr; + __u64 props_ptr; + __u64 prop_values_ptr; + __u64 reserved; + __u64 user_data; +}; + +struct drm_format_modifier_blob { +#define FORMAT_BLOB_CURRENT 1 + /* Version of this blob format */ + __u32 version; + + /* Flags */ + __u32 flags; + + /* Number of fourcc formats supported */ + __u32 count_formats; + + /* Where in this blob the formats exist (in bytes) */ + __u32 formats_offset; + + /* Number of drm_format_modifiers */ + __u32 count_modifiers; + + /* Where in this blob the modifiers exist (in bytes) */ + __u32 modifiers_offset; + + /* __u32 formats[] */ + /* struct drm_format_modifier modifiers[] */ +}; + +struct drm_format_modifier { + /* Bitmask of formats in get_plane format list this info applies to. The + * offset allows a sliding window of which 64 formats (bits). 
+ * + * Some examples: + * In today's world with < 65 formats, and formats 0, and 2 are + * supported + * 0x0000000000000005 + * ^-offset = 0, formats = 5 + * + * If the number formats grew to 128, and formats 98-102 are + * supported with the modifier: + * + * 0x0000007c00000000 0000000000000000 + * ^ + * |__offset = 64, formats = 0x7c00000000 + * + */ + __u64 formats; + __u32 offset; + __u32 pad; + + /* The modifier that applies to the >get_plane format list bitmask. */ + __u64 modifier; +}; + +/** + * struct drm_mode_create_blob - Create New blob property + * + * Create a new 'blob' data property, copying length bytes from data pointer, + * and returning new blob ID. + */ +struct drm_mode_create_blob { + /** @data: Pointer to data to copy. */ + __u64 data; + /** @length: Length of data to copy. */ + __u32 length; + /** @blob_id: Return: new property ID. */ + __u32 blob_id; +}; + +/** + * struct drm_mode_destroy_blob - Destroy user blob + * @blob_id: blob_id to destroy + * + * Destroy a user-created blob property. + * + * User-space can release blobs as soon as they do not need to refer to them by + * their blob object ID. For instance, if you are using a MODE_ID blob in an + * atomic commit and you will not make another commit re-using the same ID, you + * can destroy the blob as soon as the commit has been issued, without waiting + * for it to complete. + */ +struct drm_mode_destroy_blob { + __u32 blob_id; +}; + +/** + * struct drm_mode_create_lease - Create lease + * + * Lease mode resources, creating another drm_master. + * + * The @object_ids array must reference at least one CRTC, one connector and + * one plane if &DRM_CLIENT_CAP_UNIVERSAL_PLANES is enabled. Alternatively, + * the lease can be completely empty. 
+ */ +struct drm_mode_create_lease { + /** @object_ids: Pointer to array of object ids (__u32) */ + __u64 object_ids; + /** @object_count: Number of object ids */ + __u32 object_count; + /** @flags: flags for new FD (O_CLOEXEC, etc) */ + __u32 flags; + + /** @lessee_id: Return: unique identifier for lessee. */ + __u32 lessee_id; + /** @fd: Return: file descriptor to new drm_master file */ + __u32 fd; +}; + +/** + * struct drm_mode_list_lessees - List lessees + * + * List lesses from a drm_master. + */ +struct drm_mode_list_lessees { + /** + * @count_lessees: Number of lessees. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_lessees; + /** @pad: Padding. */ + __u32 pad; + + /** + * @lessees_ptr: Pointer to lessees. + * + * Pointer to __u64 array of lessee ids + */ + __u64 lessees_ptr; +}; + +/** + * struct drm_mode_get_lease - Get Lease + * + * Get leased objects. + */ +struct drm_mode_get_lease { + /** + * @count_objects: Number of leased objects. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_objects; + /** @pad: Padding. */ + __u32 pad; + + /** + * @objects_ptr: Pointer to objects. + * + * Pointer to __u32 array of object ids. + */ + __u64 objects_ptr; +}; + +/** + * struct drm_mode_revoke_lease - Revoke lease + */ +struct drm_mode_revoke_lease { + /** @lessee_id: Unique ID of lessee */ + __u32 lessee_id; +}; + +/** + * struct drm_mode_rect - Two dimensional rectangle. + * @x1: Horizontal starting coordinate (inclusive). + * @y1: Vertical starting coordinate (inclusive). + * @x2: Horizontal ending coordinate (exclusive). + * @y2: Vertical ending coordinate (exclusive). 
+ * + * With drm subsystem using struct drm_rect to manage rectangular area this + * export it to user-space. + * + * Currently used by drm_mode_atomic blob property FB_DAMAGE_CLIPS. + */ +struct drm_mode_rect { + __s32 x1; + __s32 y1; + __s32 x2; + __s32 y2; +}; + +/** + * struct drm_mode_closefb + * @fb_id: Framebuffer ID. + * @pad: Must be zero. + */ +struct drm_mode_closefb { + __u32 fb_id; + __u32 pad; +}; + +#if defined(__cplusplus) +} +#endif + +#endif From 29525f8cb3fa244b3b5ecf9fc92e42b9587fd9ef Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 4 Nov 2025 14:34:12 +0000 Subject: [PATCH 256/257] codespell: skip amdgpu kernel headers These header files are copied directly from the Linux kernel and contain typos. We skip these files in codespell to simplify maintenance. Signed-off-by: Radostin Stoyanov --- .codespellrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.codespellrc b/.codespellrc index e91a6d2eb..5def594b2 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -skip = ./.git,./test/pki,./tags +skip = ./.git,./test/pki,./tags,./plugins/amdgpu/amdgpu_drm.h,./plugins/amdgpu/drm.h,./plugins/amdgpu/drm_mode.h ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems From 1db7eed69fa974563abc6d7348ee93b679c06cc3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 4 Nov 2025 14:41:52 +0000 Subject: [PATCH 257/257] amdgpu: use local kernel headers instead of libdrm Use local copies of amdgpu and DRM headers for consistency. 
Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 2 +- plugins/amdgpu/amdgpu_plugin_drm.c | 2 ++ plugins/amdgpu/kfd_ioctl.h | 5 ++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 36dc0b6b0..713ffed6e 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -20,7 +20,6 @@ #include #include -#include #include "criu-plugin.h" #include "plugin.h" @@ -38,6 +37,7 @@ #include "rst-malloc.h" #include "common/list.h" +#include "amdgpu_drm.h" #include "amdgpu_plugin_dmabuf.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 923bfcdd1..3520bca7a 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -23,6 +23,8 @@ #include "fdstore.h" #include "criu-amdgpu.pb-c.h" + +/* Define __user as empty for kernel headers in user-space */ #define __user #include "drm.h" diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index 1a3bcea95..a63d453f0 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -23,9 +23,12 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include #include +/* Define __user as empty for kernel headers in user-space */ +#define __user +#include "drm.h" + /* * - 1.1 - initial version * - 1.3 - Add SMI events support